012616IPUMSdataaggregation.py - Takes cleaned IPUMS data (cleaned by IPUMS/01182016ipumscleaning.py) and
aggregates the variables in the files to the provincial and municipal level, after merging geolevel with universal key

INPUTS: 
'IPUMS/ipumsclean.csv' - cleaned IPUMS data (by 01182016ipumscleaning.ipynb) for DR in 2002 and 2010
'DR Codigos/muncorrespondence.csv' - file which provides a correspondence between IPUMS geo2 names and D.R. municipality ccodes

INTERMEDIATE FILES: N/A

OUTPUTS:
'occupationbymunicipality2002.csv' - IPUMS 2002 counts of private-sector workers (with available, nonzero income) by reported ISIC 2-digit occupation in a given D.R. municipality

'occupationbymunicipality2010.csv' - IPUMS 2010 counts of private-sector workers by reported ISIC 2-digit occupation in a given D.R. municipality

'averageincomebymunicipality2002.csv' - IPUMS 2002 information on the average income of workers in the private sector for a given D.R. municipality

Jay Sayre - sayrejay@gmail.com

In [1]:
import pandas as pd
import os

# Root of the project tree: one location on the Windows ('nt') machine,
# another everywhere else.
base_dir = (
    "D:/Dropbox/Dropbox (Personal)/College/DR_Paper/"
    if os.name == 'nt'
    else "/home/j/Dropbox/College/DR_Paper/"
)

## INPUTS
# Cleaned IPUMS microdata and the IPUMS-name -> municipality-code correspondence
inputfl = ''.join((base_dir, 'IPUMS/ipumsclean.csv'))
geocodefl = ''.join((base_dir, 'cafta-dr/DR Codigos/muncorrespondence.csv'))
## OUTPUTS
# Occupation tables (2002, 2010) and the 2002 municipality income table
occumunoutput2002 = ''.join((base_dir, 'cafta-dr/Output/occupationbymunicipality2002.csv'))
occumunoutput2010 = ''.join((base_dir, 'cafta-dr/Output/occupationbymunicipality2010.csv'))
munincoutput = ''.join((base_dir, 'cafta-dr/Output/averageincomebymunicipality2002.csv'))

## Read in input files
# Both CSVs are read as UTF-8: df holds the IPUMS records, geodf the
# IPUMS-name / CODIGO correspondence.
df, geodf = [pd.read_csv(csvpath, encoding='utf-8') for csvpath in (inputfl, geocodefl)]

## Merge IPUMS data with geographic key

## Invert the CODIGO -> IPUMS name correspondence so that each IPUMS name
## points at the list of municipality codes it stands for
tempc = geodf.set_index('CODIGO')['IPUMS'].to_dict()
codesnames = {}
for codigo, ipumsname in tempc.items():
    # First sighting of a name starts its list; later ones are appended to it
    codesnames.setdefault(ipumsname, []).append(codigo)
        
## Manually add "other municipalities" categories
# Each IPUMS catch-all name is mapped to a hand-entered list of municipality
# codes; entries here replace any same-named entry already in codesnames.
othermuncodes = {
    'Other municipalities in Peravia': [1701, 1702],
    'Other municipalities in Monte Plata': [2901, 2902, 2903, 2904, 2905],
    'Other municipalities in La Altagracia': [1101, 1102],
    'Other municipalities in Duarte': [601, 602, 603, 604, 605, 606, 607],
    'Other municipalities in Maria Trinidad': [1401, 1402, 1403, 1404],
    'Other municipalities in Hermanas Mirabal': [1901, 1902, 1903],
    'Other municipalities in La Vega': [1301, 1302, 1303, 1304],
    u'Other municipalities in Monse\xf1or Nouel': [2801, 2802, 2803],
    'Other municipalities in Espaillat': [901, 902, 903, 904],
    'Other municipalities in Puerto Plata': [1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809],
    'Other municipalities in Monte Cristi': [1501, 1502, 1503, 1504, 1505, 1506],
    'Other municipalities in Valverde': [2701, 2702, 2703],
    'Other municipalities in San Juan': [2201, 2202, 2203, 2204, 2205, 2206],
    ## "El Carril" could not be located, so it gets no entry here
    ## ('El Carril': [] was considered) and its observations are dropped below
}
codesnames.update(othermuncodes)

# Drop 'El Carril' observations
df = df.loc[df["geo2_dox"].ne('El Carril')]

## Split data set into 2002 and 2010 sections, we primarily care about 2002
df2002 = df[df['year'] == 2002]
df2010 = df[df['year'] == 2010]

## 2002 income sample: private-sector workers (classwk == 2) whose inctot is
## available and nonzero (i.e. not 9999998, not 9999999 and not 0)
hasincome = ~df2002['inctot'].isin([9999998, 9999999, 0])
isprivate2002 = df2002['classwk'] == 2
incdf = df2002[hasincome & isprivate2002]

## 2010 sample: private-sector workers only (no income filter is applied)
workers2010df = df2010[df2010['classwk'] == 2]

# Upper-case IPUMS variable names; not referenced again in the visible code
cols = [
    'COUNTRY', 'YEAR', 'SAMPLE', 'SERIAL', 'PERSONS', 'HHWT', 'SUBSAMP',
    'STRATA', 'URBAN', 'REGIONW', 'GEOLEV1', 'GEO1_DO', 'GEO1_DOX',
    'GEO2_DOX', 'SUBRDO', 'AGE', 'SEX', 'NATIVITY', 'BPLCOUNTRY', 'BPLDO',
    'YRIMM', 'YRSIMM', 'SCHOOL', 'LIT', 'EDATTAIN', 'EDATTAIND', 'YRSCHOOL',
    'EDUCDO', 'EMPSTAT', 'EMPSTATD', 'OCCISCO', 'OCC', 'INDGEN', 'IND',
    'CLASSWK', 'CLASSWKD', 'EMPSECT', 'INCTOT', 'MIGRATE5', 'MIGCTRY5',
    'MIGDO', 'DISABLED', 'DISEMP',
]

In [2]:
## Calculate average income at the municipality level for 2002

## Step 1: for every municipality code, record how many income observations
## arrive under each IPUMS geoname that maps to it.
## Result: munshares[codigo] = {IPUMS geoname: number of observations}
munobs = incdf.groupby('geo2_dox')['year'].count()
munobs = dict(zip(munobs.index, munobs))
munshares = {}
for mun, nobs in munobs.items():
    for muncode in codesnames[mun]:
        # Create the inner dict on first use, then store this geoname's count
        munshares.setdefault(muncode, {})[mun] = nobs


            

# Weight municipality according to num of obs from each category
#
# After this cell munshares[codigo] is either
#   [1]                        - only one IPUMS geoname feeds this municipality
#   [own share, other share]   - the municipality's own geoname plus the
#                                province's 'Other municipalities' geoname
# Observations filed under 'Other municipalities' are spread evenly over all
# municipalities of the province before the shares are computed.

# Number of municipalities per province. Municipality codes are
# province * 100 + municipality (e.g. 601 -> province 6, 1701 -> province 17),
# so the province is taken arithmetically rather than from the first two
# characters of the code, which is wrong for three-digit codes.
munsperprovince = (geodf['CODIGO'].dropna().astype(int) // 100).value_counts().to_dict()

for muncode in munshares.keys():
    obsbyname = munshares[muncode]
    if len(obsbyname) == 1:
        munshares[muncode] = [1]
        continue
    othermuns = float(munsperprovince[int(muncode) // 100])
    # Reset for every municipality so no count leaks in from the previous one
    frstmuncount = 0.0
    othermuncount = 0.0
    for munname, nobs in obsbyname.items():
        if 'Other municipalities' in munname:
            othermuncount += nobs / othermuns
        else:
            frstmuncount += nobs
    totalcount = frstmuncount + othermuncount
    munshares[muncode] = [frstmuncount/totalcount, othermuncount/totalcount]

        
## Quick program to find row according to each municipality
def whereindf(item,dataframe,column='mun'):
    """Return the 0-based position of the first entry of ``dataframe[column]`` equal to ``item``.

    item      -- value to look for (e.g. a municipality code or 'MEAN')
    dataframe -- DataFrame to search
    column    -- name of the column to search, 'mun' by default

    Raises ValueError when ``item`` does not occur in the column.
    """
    columnvalues = dataframe[column].tolist()
    return columnvalues.index(item)

## Aggregate income data at the municipality level
# Mean inctot per IPUMS geoname, as a {geoname: mean income} dict
muninc = incdf.groupby('geo2_dox', as_index=False)['inctot'].mean()
muninc = dict(zip(muninc['geo2_dox'],muninc['inctot']))
# One row per municipality code plus a trailing 'MEAN' summary row.
# inctot starts as a float column because fractional means are written into it.
munincdf = pd.DataFrame({'mun':list(geodf['CODIGO'])+['MEAN'],'inctot':0.0})
for municiname in muninc.keys():
    isother = 'Other municipalities' in municiname
    for municicode in codesnames[municiname]:
        rownum = whereindf(municicode,munincdf)
        weights = munshares[municicode]
        if len(weights) == 1:
            # Only one geoname feeds this municipality: take its mean as is
            munincdf.loc[rownum,'inctot'] = muninc[municiname]
        else:
            # munshares[code] is [own share, 'Other municipalities' share];
            # the municipality's own mean is weighted by index 0 and the
            # province-wide "other" mean by index 1.
            weight = weights[1] if isother else weights[0]
            munincdf.loc[rownum,'inctot'] += weight*muninc[municiname]

# Mean across municipality rows only; the 'MEAN' placeholder row (still 0 at
# this point) is excluded so that it does not bias its own value.
# NOTE(review): municipalities without any observation keep inctot == 0 and
# still enter this mean - confirm that this is intended.
ismunicipality = munincdf['mun'] != 'MEAN'
munincdf.loc[whereindf('MEAN',munincdf),'inctot'] = munincdf.loc[ismunicipality,'inctot'].mean()

In [3]:
## Count occupations in each municipality for 2002 data
## (raw counts of incdf observations, not proportions; an observation filed
## under a geoname that maps to several codes is counted once for each code)
occudf = pd.DataFrame({'mun':list(geodf['CODIGO'])})
occupationcodes = sorted(incdf['occ'].unique())
## Create columns in dataframe according to each occupation code
for occ in occupationcodes:
    occudf[occ] = 0
## Row position of each municipality code (first occurrence wins), looked up
## once instead of scanning the 'mun' column for every observation
rowofmun = {}
for rownum, muncode in enumerate(occudf['mun']):
    rowofmun.setdefault(muncode, rownum)
## Construct data set: count observations per (occupation, geoname) pair once,
## then add each count to every municipality code the geoname stands for
occcounts = incdf.groupby(['occ', 'geo2_dox']).size()
for (occ, municiname), nobs in zip(occcounts.index, occcounts.values):
    for municicode in codesnames[municiname]:
        occudf.loc[rowofmun[municicode], occ] += nobs

In [4]:
## Count occupations in each municipality for 2010 data
## (raw counts of workers2010df observations, not proportions; an observation
## filed under a geoname that maps to several codes is counted once for each code)

occudf2010 = pd.DataFrame({'mun':list(geodf['CODIGO'])})
occodes2010 = sorted(workers2010df['occ'].unique())
## Create columns in dataframe according to each occupation code
for occ in occodes2010:
    occudf2010[occ] = 0
## Row position of each municipality code (first occurrence wins), looked up
## once instead of scanning the 'mun' column for every observation
rowofmun2010 = {}
for rownum, muncode in enumerate(occudf2010['mun']):
    rowofmun2010.setdefault(muncode, rownum)
## Construct data set: count observations per (occupation, geoname) pair once,
## then add each count to every municipality code the geoname stands for
occcounts2010 = workers2010df.groupby(['occ', 'geo2_dox']).size()
for (occ, municiname), nobs in zip(occcounts2010.index, occcounts2010.values):
    for municicode in codesnames[municiname]:
        occudf2010.loc[rowofmun2010[municicode], occ] += nobs

In [5]:
## Write each file to csv
# Each table goes to its configured output path, without the row index
for outframe, outpath in [
    (occudf, occumunoutput2002),
    (occudf2010, occumunoutput2010),
    (munincdf, munincoutput),
]:
    outframe.to_csv(outpath, index=False)

In [None]:
## Aggregate income data at the municipality and occupation level for 2002
## Not really necessary, I think I won't link IPUMS income and occupation data
#munincocc = incdf.groupby(['geo2_dox','occ'], as_index=False)['inctot'].mean()