mergingtariffandindustrydata.py
Email: sayrejay@gmail.com

Purpose: Combines tariff data (processed by wtoandcaftatariffcleaning.py) and estimated shares of industrial activity in a given municipality (processed by compute_regional_employment.py)

INPUTS -

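'Tariff Conversion/HS1996toISIC3/HS1996toISIC3.csv' - Concordance table from HS 1996 product codes to ISIC Revision 3 codes.
It is unclear which script (if any) produced this file; the source appears to be
http://wits.worldbank.org/product_concordance.html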

estmunicipalindustryactivity2002.csv - Estimates municipal industrial activity at the ISIC 2-digit level for 2002 (using D.R. empresa and IPUMS data) processed by compute_regional_employment.py

estmunicipalindustryactivity2010.csv - Estimates municipal industrial activity at the ISIC 2-digit level for 2010

"cafta-dr/Output/wtoandcaftahstariff.csv" - Harmonized system 6-digit level tariff data from 1996-2015 from both WTO data and CAFTA treaty text processed by wtoandcaftatariffcleaning.py

INTERMEDIATE DATA -

OUTPUTS - 
    
    

In [2]:
import pandas as pd
import os

# Project root: one location on Windows ('nt'), another everywhere else.
base_dir = ("D:/Dropbox/Dropbox (Personal)/College/DR_Paper/"
            if os.name == 'nt'
            else "/home/j/Dropbox/College/DR_Paper/")

## INPUTS
# Three of the four inputs live in the same output folder of the project.
_output_dir = base_dir + 'cafta-dr/Output/'

# HS1996 -> ISIC Rev. 3 concordance table
hs96isic3 = base_dir + 'Tariff Conversion/HS1996toISIC3/HS1996toISIC3.csv'
# HS 6-digit tariff data
wtoandcaftadata = _output_dir + 'wtoandcaftahstariff.csv'
# Estimated municipal industrial activity, one file per year
industry2002 = _output_dir + 'estmunicipalindustryactivity2002.csv'
industry2010 = _output_dir + 'estmunicipalindustryactivity2010.csv'

## INTERMEDIATE DATA

## OUTPUTS





In [3]:
### Convert wtoandcaftadata from Harmonized System 1996 codes to ISIC Rev. 3
### codes, then average the tariff rates at the ISIC two digit level

## Build table to convert HS1996 to ISIC3 data
# Both code columns are read through str() so they stay text rather than
# being parsed as numbers.
isic3 = pd.read_csv(hs96isic3,
                    converters={'HS 1996 Product Code': str,
                                'ISIC Revision 3 Product Code': str})
# Column names are assigned by position; the file must have exactly four columns.
isic3.columns = ['HS96', 'HSdesc', 'ISIC3', 'ISICdesc']
isic3 = isic3[['HS96', 'ISIC3']]

# HS96 code -> ISIC3 code lookup
isic3conv = dict(zip(isic3['HS96'], isic3['ISIC3']))

## Convert tariff data from HS1996 to ISIC3
tariffdf = pd.read_csv(wtoandcaftadata)
# Drop number of tariff lines per HS6 good.
# axis is given by keyword: pandas >= 2.0 no longer accepts it positionally.
lines = [a for a in tariffdf.columns if 'Lines' in a]
tariffdf = tariffdf.drop(lines, axis=1)

# Convert HS1996 codes in tariff data into ISIC3 codes.
# Apostrophes are stripped from the HS6 value before the lookup and one is
# prefixed to the resulting ISIC code. An HS6 code that is missing from the
# concordance raises KeyError.
tariffdf['ISIC'] = tariffdf['HS6'].apply(
    lambda x: "'" + isic3conv[x.replace("'", "")])

### Group tariff data down to the ISIC two digit level
## This is a really unsophisticated way to group data: a plain unweighted
## mean over every HS6 row that maps to the same two digit code.
# x[1:3] skips the apostrophe prefix; int() removes any leading zero.
tariffdf['ISICtwodig'] = tariffdf['ISIC'].apply(lambda x: str(int(x[1:3])))

## Keep only columns that need to be merged with industrial activity data.
# Averaging just these two columns leaves the text columns (HS6, ISIC) and
# the grouping key itself out of the mean, which current pandas would
# otherwise reject.
ratecols = ['2002AvgRate', 'duty2013']
tariffdf = tariffdf.groupby('ISICtwodig', as_index=False)[ratecols].mean()
tariffdf = tariffdf.rename(columns={'ISICtwodig': 'isic'})

In [6]:
## Checking merging issues - unfortunately, seems unfixable
# Each print() receives one pre-formatted string, so the cell runs the same
# way under Python 2 and Python 3.
# Two digit ISIC codes present in the concordance, computed once.
isic2conv = set(int(a[:2]) for a in isic3conv.values())
print("ISIC 2 digit codes in conversion data: %d" % len(isic2conv))
# Sorted so the listing is stable from run to run.
print("ISIC Codes: %s" % sorted(isic2conv))
print("ISIC 2 digit codes in tariff data: %d" % len(tariffdf['isic']))
print("ISIC 2 digit codes in industrial activity data:")
# Count left disabled, as in the original cell.
#print(len([a for a in list(indusdf2002.columns) if len(a) <= 2]))

 ISIC 2 digit codes in conversion data: 35
ISIC Codes: set([1, 2, 5, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 40, 74, 92, 93, 99])
ISIC 2 digit codes in tariff data: 35
ISIC 2 digit codes in industrial activity data:


In [4]:
### Merge tariff data with municipality level occupation data

def _municipal_isic_shares(path, isic_codes):
    """Load one municipal industry file and return per-municipality ISIC shares.

    Parameters
    ----------
    path : str
        CSV file with a 'mun' column and one column per ISIC two digit code.
    isic_codes : iterable of str
        ISIC two digit codes that have tariff data; every other code column
        is discarded.

    Returns
    -------
    pandas.DataFrame
        One row per retained ISIC code (column 'isic') and one column per
        value of 'mun'. Each cell is that code's value divided by the
        municipality's total over the retained codes.

    Raises
    ------
    KeyError
        If the file has no 'mun' column.
    """
    activity = pd.read_csv(path)
    wanted = set(isic_codes)
    # Keep file order; 'mun' is never an ISIC code so it is excluded here.
    codes = [col for col in activity.columns if col in wanted]
    activity = activity.set_index('mun')[codes]

    ## Compute total number of sample of workers in given municipality
    # skipna=False so a missing value makes the whole total missing, exactly
    # as adding the columns one at a time would.
    muntotal = activity.sum(axis=1, skipna=False)

    ## Compute share of occupation in a given municipality
    # A municipality whose total is zero ends up with undefined shares.
    shares = activity.div(muntotal, axis=0)

    ## Pivot municipality level occupation data.
    shares = shares.T.reset_index()
    return shares.rename(columns={'index': 'isic'})


## Drop ISIC 2 digit codes that don't have corresponding tariff data
isicintariffdf = list(tariffdf['isic'])
indusdf2002 = _municipal_isic_shares(industry2002, isicintariffdf)
indusdf2010 = _municipal_isic_shares(industry2010, isicintariffdf)  # Do this too

# ISIC codes retained for each year, in file order
codes2002 = list(indusdf2002['isic'])
codes2010 = list(indusdf2010['isic'])

## Complete merge (currently disabled)
#indusdf2002 = indusdf2002.merge(tariffdf, on='isic',how='left')
#indusdf2010 = indusdf2010.merge(tariffdf, on='isic',how='left')
## Could write this to intermediate data if I feel like it..

In [6]:
## Bare expression: evaluates the 2002 table (one row per ISIC code) so an
## interactive session displays it.
indusdf2002

## Disabled alternative view: ISIC codes as columns, one row per municipality.
#indusdf2002.set_index('isic').T

mun,isic,101,201,202,203,204,205,206,207,208,...,3103,3201,3202,3203,3204,3205,3206,3207,2002AvgRate,duty2013
0,1,0.048267,0.236842,0.048544,0.012658,0.053571,0.048544,0.053571,0.068966,0.012658,...,0.005848,0.064401,0.023994,0.048528,0.072055,0.220472,0.054752,0.137815,11.192825,2.159741
1,11,0.000385,0.005848,0.019417,0.202532,0.008929,0.019417,0.008929,0.008621,0.202532,...,0.005848,0.000737,0.000307,0.00058,0.000779,0.007874,0.0,0.006723,3.0,0.0
2,12,0.004656,0.009747,0.0,0.037975,0.035714,0.0,0.035714,0.014368,0.037975,...,0.005848,0.004913,0.003526,0.001257,0.001168,0.007874,0.009298,0.003361,3.0,0.0
3,13,0.008918,0.003899,0.009709,0.0,0.008929,0.009709,0.008929,0.005747,0.0,...,0.011696,0.005977,0.006899,0.003528,0.000389,0.007874,0.011364,0.006723,3.0,0.0
4,21,0.019731,0.003899,0.0,0.0,0.0,0.0,0.0,0.005747,0.0,...,0.005848,0.036192,0.025297,0.015951,0.017332,0.0,0.022727,0.013445,7.276794,1.02523
5,22,0.083318,0.091618,0.291262,0.151899,0.25,0.291262,0.25,0.135057,0.151899,...,0.239766,0.048352,0.079034,0.067427,0.005453,0.031496,0.027893,0.016807,13.354167,3.513889
6,23,0.017028,0.134503,0.07767,0.202532,0.160714,0.07767,0.160714,0.198276,0.202532,...,0.052632,0.042907,0.028363,0.025956,0.015969,0.094488,0.085744,0.084034,3.213889,0.1456
7,24,0.085843,0.038986,0.009709,0.025316,0.035714,0.009709,0.035714,0.057471,0.025316,...,0.005848,0.093961,0.211422,0.151385,0.111782,0.023622,0.042355,0.053782,4.108671,0.517844
8,31,0.020764,0.019493,0.009709,0.012658,0.017857,0.009709,0.017857,0.028736,0.012658,...,0.017544,0.043439,0.028747,0.045048,0.065628,0.015748,0.049587,0.013445,6.884804,0.464528
9,32,0.002309,0.031189,0.0,0.037975,0.017857,0.0,0.017857,0.045977,0.037975,...,0.005848,0.026039,0.003833,0.004978,0.119182,0.015748,0.010331,0.026891,9.797138,0.418733


In [None]:
## Need to fix this script - OLD
## Compute weighted average of import competing tariff for each
## municipality/province
## Sum down to municipality and province tariff averages

# Province level: add up, per PROVINCE, every column whose name contains 'wptr'
# NOTE(review): compdf and provcsv are not defined in this part of the file.
provcols = [name for name in compdf.columns if 'wptr' in name]
provkeeps = provcols + ['PROVINCE']
provtariffdf = compdf[provkeeps]
provtariffdf = provtariffdf.groupby('PROVINCE', as_index=False)[provcols].sum()
provtariffdf.to_csv(provcsv, index=False)

# Municipality level: add up, per MUNICIPIO, every column whose name contains 'wmtr'
# NOTE(review): compdf and municicsv are not defined in this part of the file.
municicols = [name for name in compdf.columns if 'wmtr' in name]
municikeeps = municicols + ['MUNICIPIO']
municitariffdf = compdf[municikeeps]
municitariffdf = municitariffdf.groupby('MUNICIPIO', as_index=False)[municicols].sum()
municitariffdf.to_csv(municicsv, index=False)

## Merge income numbers with municipality level tariff data
## Only used for test conducted on 9/13/2015

municitarf = base_dir + "municitariffs.csv"
#munemp = base_dir + "DirectoryofCompaniesandEstablishments/municipalitycodecorrespondence.csv"
income = base_dir + "MUNICIoccinc.csv"


municitarfdf = pd.read_csv(municitarf)
#munempdf = pd.read_csv(munemp,encoding='utf-8')
#munempdf = munempdf.drop([u'PROVINCE_DESC',u'MUNICIPIO_DESC'],1)
incomedf = pd.read_csv(income)
# Build the merge key as text: province code, a literal '0', municipality code.
# NOTE(review): a fixed '0' only yields a two digit municipality part when
# MUN is below 10 - confirm against the MUNICIPIO codes in municitariffs.csv.
incomedf['MUNICIPIO'] = incomedf['PROV'].astype(str)+'0'+incomedf['MUN'].astype(str)
# axis is given by keyword: pandas >= 2.0 no longer accepts it positionally.
incomedf = incomedf.drop(['PROV','MUN'], axis=1)
# Names are assigned by position: the two remaining income columns, then the key.
incomedf.columns = ['INC07', 'INC13','MUNICIPIO']


#municitarfdf = municitarfdf.merge(munempdf, on='MUNICIPIO', how='left')
# Cast the key to text so it compares equal to the key built above.
municitarfdf['MUNICIPIO'] = municitarfdf['MUNICIPIO'].astype(str)
# Left join: every row of the tariff table is kept.
municitarfdf = municitarfdf.merge(incomedf, on='MUNICIPIO', how='left')
municitarfdf.to_csv(base_dir + 'municitarincocc.csv',index=False,encoding='utf-8')

## Attach province level income numbers to the province level tariff data
## Only used for test conducted on 9/13/2015

provtarf = base_dir + "provtariffs.csv"
income = base_dir + "PROVoccinc.csv"

provtarfdf = pd.read_csv(provtarf)
incomedf = pd.read_csv(income)
# Names are assigned by position; the income file must have exactly three columns.
incomedf.columns = ['PROVINCE','INC07', 'INC13']

# Left join on PROVINCE: every row of the tariff table is kept.
provtarfdf = pd.merge(provtarfdf, incomedf, how='left', on='PROVINCE')
provtarfdf.to_csv(base_dir + 'provtarincocc.csv',index=False,encoding='utf-8')