mergingtariffandindustrydata.py
mail - sayrejay@gmail.com

Purpose: Combines tariff data (processed by wtoandcaftatariffcleaning.py) and estimated shares of industrial activity in a given municipality (processed by compute_regional_employment.py)

INPUTS -

'Tariff Conversion/HS1996toISIC3/HS1996toISIC3.csv' - HS 1996 to ISIC
Revision 3 concordance table. It is unclear which script (if any) produced
this file; the source appears to be
http://wits.worldbank.org/product_concordance.html

estmunicipalindustryactivity2002.csv - Estimates municipal industrial activity at the ISIC 2-digit level for 2002 (using D.R. empresa and IPUMS data) processed by compute_regional_employment.py

estmunicipalindustryactivity2010.csv - Estimates municipal industrial activity at the ISIC 2-digit level for 2010

"cafta-dr/Output/wtoandcaftahstariff.csv" - Harmonized system 6-digit level tariff data from 1996-2015 from both WTO data and CAFTA treaty text processed by wtoandcaftatariffcleaning.py

INTERMEDIATE DATA -

OUTPUTS - 
    
    

In [3]:
import pandas as pd
import os

# Root of the project folder: the Windows location when running on 'nt',
# otherwise the Linux home-directory location.
base_dir = {
    'nt': "D:/Dropbox/Dropbox (Personal)/College/DR_Paper/",
}.get(os.name, "/home/j/Dropbox/College/DR_Paper/")

## INPUTS
# HS1996 -> ISIC Rev. 3 concordance table
hs96isic3 = base_dir + 'Tariff Conversion/HS1996toISIC3/HS1996toISIC3.csv'
# HS 6-digit tariff data (WTO + CAFTA)
wtoandcaftadata = base_dir + "cafta-dr/Output/wtoandcaftahstariff.csv"
# Estimated municipal industrial activity, one file per year
industry2002 = base_dir + 'cafta-dr/Output/estmunicipalindustryactivity2002.csv'
industry2010 = base_dir + 'cafta-dr/Output/estmunicipalindustryactivity2010.csv'

## INTERMEDIATE DATA

## OUTPUTS


In [95]:
### Convert wtoandcaftadata from Harmonized System 1996 codes to ISIC
### Revision 3 codes, then average the tariffs at the ISIC two digit level

## Build table to convert HS1996 to ISIC3 data
# Both code columns are read through str() so they stay text and can be
# sliced by character position further down.
isic3 = pd.read_csv(hs96isic3,
                    converters={'HS 1996 Product Code': str,
                                'ISIC Revision 3 Product Code': str})
isic3.columns = ['HS96', 'HSdesc', 'ISIC3', 'ISICdesc']
isic3 = isic3[['HS96', 'ISIC3']]

# Lookup table: HS1996 product code -> ISIC Rev. 3 product code
isic3conv = dict(zip(isic3['HS96'], isic3['ISIC3']))

## Convert tariff data from HS1996 to ISIC3
tariffdf = pd.read_csv(wtoandcaftadata)
# Drop number of tariff lines per HS6 good
lines = [a for a in tariffdf.columns if 'Lines' in a]
tariffdf = tariffdf.drop(lines, axis=1)

# Convert HS1996 codes in tariff data into ISIC3 codes. Any apostrophes in
# the HS6 value are stripped before the lookup and the resulting ISIC code is
# prefixed with one. An HS6 code missing from the concordance raises KeyError.
tariffdf['ISIC'] = tariffdf['HS6'].apply(
    lambda x: "'" + isic3conv[x.replace("'", "")])

### Group tariff data down to the ISIC two digit level
## This is a really unsophisticated way to group data (a plain unweighted
## mean over every HS6 row that maps to the same two digit code)
# Characters 1-2 sit right after the leading apostrophe; int() removes a
# leading zero so e.g. '01 becomes "1".
tariffdf['ISICtwodig'] = tariffdf['ISIC'].apply(lambda x: str(int(x[1:3])))

## Keep only columns that need to be merged with industrial activity data.
## Selecting them before aggregating keeps the text columns (HS6, ISIC) out
## of mean(); the earlier version dropped those columns but threw the result
## away, so they were still present when the mean was taken.
mergecols = ['2002AvgRate', 'duty2013']
tariffdf = tariffdf.groupby('ISICtwodig', as_index=False)[mergecols].mean()
tariffdf.rename(columns={'ISICtwodig': 'isic'}, inplace=True)

In [98]:
## Checking merging issues - unfortunately, seems unfixable
## Prints how many distinct ISIC 2 digit codes each data source contains.
# Only the header row of the 2002 industrial activity file is read here, so
# the check does not rely on a data frame that is built in a later cell.
induscols2002 = pd.read_csv(industry2002, nrows=0).columns

# Single-argument print(...) with %-formatting gives the same text under
# both Python 2 and Python 3.
print("ISIC 2 digit codes in conversion data: %d"
      % len({int(a[:2]) for a in isic3conv.values()}))
print("ISIC 2 digit codes in tariff data: %d" % len(tariffdf['isic']))
# Column names of at most two characters are counted as ISIC 2 digit codes
print("ISIC 2 digit codes in industrial activity data: %d"
      % len([a for a in induscols2002 if len(a) <= 2]))

ISIC 2 digit codes in conversion data: 35
ISIC 2 digit codes in tariff data: 35
ISIC 2 digit codes in industrial activity data: 62


In [102]:
### Merge tariff data with municipality level occupation data


def _industry_by_isic(path, tariffs):
    """Load one industrial activity file and attach tariffs to it.

    path    -- CSV with a 'mun' column plus one column per ISIC 2 digit code
    tariffs -- data frame with an 'isic' column and the tariff columns

    Returns a data frame with one row per ISIC code that has tariff data, one
    column per 'mun' value, and the tariff columns appended (left merge on
    'isic').
    """
    indus = pd.read_csv(path)

    ## Drop ISIC 2 digit codes that don't have corresponding tariff data
    keep = set(tariffs['isic'])
    keep.add('mun')  # the municipality identifier is never dropped
    drops = [a for a in indus.columns if a not in keep]
    indus = indus.drop(drops, axis=1)

    ## Pivot municipality level occupation data: after the transpose each row
    ## is an ISIC code (exposed by reset_index() as 'index') and each column
    ## is a municipality.
    indus = indus.set_index('mun').T.reset_index()
    indus.rename(columns={'index': 'isic'}, inplace=True)

    ## Complete merge
    return indus.merge(tariffs, on='isic', how='left')


indusdf2002 = _industry_by_isic(industry2002, tariffdf)
indusdf2010 = _industry_by_isic(industry2010, tariffdf)
## Could write this to intermediate data if I feel like it..
## Could write this to intermediate data if I feel like it..

In [None]:
## Need to fix this script - OLD
## Compute weighted average of import competing tariff for each
## municipality/province
# Paths are built from base_dir so they match the files the later income
# merge reads back (base_dir + "municitariffs.csv" / "provtariffs.csv").
comptarcsv = base_dir + 'companiestariffs.csv'
municicsv = base_dir + 'municitariffs.csv'
provcsv = base_dir + 'provtariffs.csv'

compdf = pd.read_csv(comptarcsv)

# Reduces data frame down to only "import competing" (wink) industries
# You want to look into this later, of course
# (rows whose duty2007 renders as the text 'nan' are discarded)
compdf = compdf[compdf['duty2007'].astype(str) != 'nan']

# Calculate "new" number of companies in province and municipality
compdf = compdf.drop(['MUNICINUM', 'PROVNUM'], axis=1)
# NOTE(review): assumes the first four columns include PROVINCE, MUNICIPIO
# and EMPRESAS - confirm against companiestariffs.csv
calcdf = compdf[list(compdf.columns)[:4]]
# EMPRESAS is selected explicitly (as in the province sum) so the result is
# exactly three columns regardless of what else is in calcdf.
municidf2 = calcdf.groupby(['PROVINCE', 'MUNICIPIO'],
                           as_index=False)['EMPRESAS'].sum()
provdf2 = calcdf.groupby('PROVINCE', as_index=False)['EMPRESAS'].sum()
municidf2.columns = ['PROVINCE', 'MUNICIPIO', 'MUNICINUM']
provdf2.columns = ['PROVINCE', 'PROVNUM']

compdf = compdf.merge(municidf2, on=['PROVINCE', 'MUNICIPIO'], how='left')
compdf = compdf.merge(provdf2, on=['PROVINCE'], how='left')
# Each row's share of the companies in its municipality / province
compdf['MUNSHARE'] = compdf['EMPRESAS'] / compdf['MUNICINUM']
compdf['PROVSHARE'] = compdf['EMPRESAS'] / compdf['PROVNUM']

## Calculate weighted tariff for each province and municipality
# The column list is snapshotted first, so the columns added inside the loop
# are not themselves iterated.
for col in list(compdf.columns)[4:25]:  # Might need to check columns
    compdf['wptr' + col] = compdf[col] * compdf['PROVSHARE']
    compdf['wmtr' + col] = compdf[col] * compdf['MUNSHARE']

## Sum down to municipality and province tariff averages

# Province level: sum of the province-weighted ('wptr') columns
provcols = [a for a in compdf.columns if 'wptr' in a]
provtariffdf = compdf.groupby('PROVINCE', as_index=False)[provcols].sum()
provtariffdf.to_csv(provcsv, index=False)

# Municipality level: sum of the municipality-weighted ('wmtr') columns
municicols = [a for a in compdf.columns if 'wmtr' in a]
municitariffdf = compdf.groupby('MUNICIPIO', as_index=False)[municicols].sum()
municitariffdf.to_csv(municicsv, index=False)

In [None]:
## Merge income numbers with municipality level tariff data
## Only used for test conducted on 9/13/2015

municitarf = base_dir + "municitariffs.csv"
#munemp = base_dir + "DirectoryofCompaniesandEstablishments/municipalitycodecorrespondence.csv"
income = base_dir + "MUNICIoccinc.csv"


municitarfdf = pd.read_csv(municitarf)
#munempdf = pd.read_csv(munemp,encoding='utf-8')
#munempdf = munempdf.drop([u'PROVINCE_DESC',u'MUNICIPIO_DESC'],1)
incomedf = pd.read_csv(income)
# Build the municipality key as text: PROV, a literal '0', then MUN
# NOTE(review): presumably this mirrors the MUNICIPIO codes in
# municitariffs.csv - confirm
incomedf['MUNICIPIO'] = (incomedf['PROV'].astype(str) + '0'
                         + incomedf['MUN'].astype(str))
incomedf.drop(['PROV', 'MUN'], axis=1, inplace=True)
# Assumes exactly two income columns remain ahead of MUNICIPIO
incomedf.columns = ['INC07', 'INC13', 'MUNICIPIO']


#municitarfdf = municitarfdf.merge(munempdf, on='MUNICIPIO', how='left')
# Cast the key to text so it has the same type as the income key
municitarfdf['MUNICIPIO'] = municitarfdf['MUNICIPIO'].astype(str)
municitarfdf = municitarfdf.merge(incomedf, on='MUNICIPIO', how='left')
municitarfdf.to_csv(base_dir + 'municitarincocc.csv', index=False,
                    encoding='utf-8')

## Merge income numbers with province level tariff data
## Only used for test conducted on 9/13/2015

provtarf = base_dir + "provtariffs.csv"
income = base_dir + "PROVoccinc.csv"


provtarfdf = pd.read_csv(provtarf)
incomedf = pd.read_csv(income)
# Assumes the file has exactly three columns in this order
incomedf.columns = ['PROVINCE', 'INC07', 'INC13']

provtarfdf = provtarfdf.merge(incomedf, on='PROVINCE', how='left')
provtarfdf.to_csv(base_dir + 'provtarincocc.csv', index=False,
                  encoding='utf-8')