compute_regional_employment.py
Jay Sayre - sayrejay@gmail.com

Computes estimated share of employment in each municipality of the Dominican Republic in a given ISIC (International Standard Industrial Classification) code using data provided by IPUMS international at the ISIC 2-digit level on workers' occupations and data provided by the DR's ONE (Oficina Nacional de Estadística) at the ISIC 4-digit level on companies (empresas) registered in the D.R.

Inputs:
DirectoryofCompaniesandEstablishments/data/mergedprov.csv - D.R. provided information on the companies (and their sizes and industrial classifications) in each municipality. Merged together from separate files in "data/" folder using a UNIX one-liner.

Intermediate data:
DirectoryofCompaniesandEstablishments/companiesandsizesbymunicipality.csv - Cleans and aggregates the input companies data (mergedprov.csv) to the municipality level, computing the number of companies of a given size in a given industry

DirectoryofCompaniesandEstablishments/municipalitycodecorrespondence.csv - Takes the input data and strips it of all information except for the geocodes and the corresponding province/municipality names

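Outputs:
cafta-dr/Output/estmunicipalindustryactivity2002.csv and cafta-dr/Output/estmunicipalindustryactivity2010.csv - Combined IPUMS and ONE estimates of workers in each sector by municipality, for 2002 and 2010 respectively (the lines that write these files are currently commented out in the script)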




In [23]:
import pandas as pd
import os

## Root of the project tree: Windows ('nt') and every other OS keep the
## Dropbox folder in different places.
tdir = ("D:/Dropbox/Dropbox (Personal)/College/DR_Paper/"
        if os.name == 'nt'
        else "/home/j/Dropbox/College/DR_Paper/")
## Sub-folder holding the ONE company directory files
sndlvl = "DirectoryofCompaniesandEstablishments/"

## Inputs
empresadata = tdir + sndlvl + "data/mergedprov.csv"
occu2002 = tdir + 'cafta-dr/Output/occupationbymunicipality2002.csv'
occu2010 = tdir + 'cafta-dr/Output/occupationbymunicipality2010.csv'

## Intermediate Data
muncodecorres = tdir + sndlvl + 'municipalitycodecorrespondence.csv'
companiesbysize = tdir + sndlvl + 'companiesandsizesbymunicipality.csv'

## Outputs
industry2002 = tdir + 'cafta-dr/Output/estmunicipalindustryactivity2002.csv'
industry2010 = tdir + 'cafta-dr/Output/estmunicipalindustryactivity2010.csv'


In [32]:
## Create the correspondence table of municipality/province names to the
## codes found in the Empresa data.
## Only need to run this once.
## NOTE: the whole cell below is wrapped in a triple-quoted string, so none
## of it executes. When enabled, it reads the Empresa data, drops the
## industry-classification columns and GEOCODE while keeping PROVINCE,
## MUNICIPIO and their *_DESC name columns, collapses to one row per
## province/municipality combination via groupby, and finally drops the
## summed EMPRESAS count. The write to `muncodecorres` is itself commented
## out inside the string.
'''
provlistdf = pd.read_csv(empresadata,encoding='utf-8')
provlistdf['ISIC'] = provlistdf['CLASS'].apply(lambda x: "'"+x[-4:])
provdrops = [u'SECT',u'DIV',u'GROUP',u'CLASS',u'SECT_DESC',
            u'DIV_DESC',u'GROUP_DESC',u'CLASS_DESC',u'GEOCODE',
            u'PROVINCE_DESC', u'MUNICIPIO_DESC']
provdrops.remove(u'PROVINCE_DESC')
provdrops.remove(u'MUNICIPIO_DESC')
provdrops.extend(['ISIC'])
provlistdf = provlistdf.drop(provdrops, 1)
provlistdf = provlistdf.groupby(['PROVINCE','MUNICIPIO','PROVINCE_DESC', 'MUNICIPIO_DESC'], as_index=False).sum()
provlistdf = provlistdf.drop('EMPRESAS', 1)

## Commented out so it won't write to file everytime
#provlistdf.to_csv(muncodecorres,index=False,encoding="utf-8")
'''

In [26]:
## Directory of Empresas
## Calculate share of companies in an industry of a given size in each
## municipality

## Load the merged ONE company directory (one CSV built from the per-province
## files).
empresadf = pd.read_csv(empresadata, encoding='utf-8')

## The ISIC 4-digit code is the last four characters of CLASS. It is stored
## with a leading apostrophe; later steps skip that first character when
## slicing out the 2-digit code.
## NOTE(review): the apostrophe presumably keeps spreadsheet software from
## reading the code as a number -- confirm.
empresadf['ISIC'] = empresadf['CLASS'].apply(lambda x: "'" + x[-4:])

## Classification hierarchy, descriptions and geocode are not needed once
## ISIC, PROVINCE and MUNICIPIO are available.
dropcols = [u'SECT', u'DIV', u'GROUP', u'CLASS', u'SECT_DESC',
            u'DIV_DESC', u'GROUP_DESC', u'CLASS_DESC', u'GEOCODE',
            u'PROVINCE_DESC', u'MUNICIPIO_DESC']

## `axis` must be passed by keyword: the positional form was deprecated and
## later removed from DataFrame.drop.
empresadf.drop(dropcols, axis=1, inplace=True)

## Weighting by company size
def csize(word):
    """Translate a company-size bracket label into a numeric weight.

    Each recognised label (e.g. u'010 a 049') maps to a single
    representative head-count: roughly the middle of the bracket, and 300
    for the open-ended '250 o más' bracket. Any value that is not one of
    the known labels is handed back unchanged.
    """
    ## (label, representative number of workers)
    brackets = (
        (u'050 a 099', 75),     # 50-99
        (u'010 a 049', 30),     # 10-49
        (u'001 a 009', 5),      # 1-9
        (u'250 o m\xe1s', 300), # 250 or more
        (u'100 a 249', 175),    # 100-249
    )
    for label, headcount in brackets:
        if word == label:
            return headcount
    return word
## Replace each size-bracket label with its numeric weight
empresadf['SIZE'] = empresadf['SIZE'].apply(csize)

## Group Empresas by Municipality, ISIC Code, and size
empresadf = empresadf.groupby(['PROVINCE','MUNICIPIO','ISIC','SIZE'], as_index=False).sum()

## Calculate total number of companies by size in province and municipality.
## EMPRESAS is selected explicitly in both cases: summing every remaining
## column would also try to aggregate the string ISIC column, and the result
## must contain exactly one value column.
municidf = empresadf.groupby(['PROVINCE','MUNICIPIO','SIZE'], as_index=False)['EMPRESAS'].sum()
provdf = empresadf.groupby(['PROVINCE','SIZE'], as_index=False)['EMPRESAS'].sum()
municidf = municidf.rename(columns={'EMPRESAS': 'MUNICINUM'})
provdf = provdf.rename(columns={'EMPRESAS': 'PROVNUM'})

## MUNICINUM / PROVNUM are only needed for the intermediate file below; they
## are dropped again before the 2-digit aggregation.
empresadf = empresadf.merge(municidf, on=['PROVINCE','MUNICIPIO','SIZE'], how='left')
empresadf = empresadf.merge(provdf, on=['PROVINCE','SIZE'], how='left')

## Saving intermediate data
#empresadf.to_csv(companiesbysize,index=False)

## Group D.R. Empresa data down to the ISIC two digit level
## ISIC starts with an apostrophe, so characters 1-2 are the first two
## digits; int() strips any leading zero before converting back to a string.
empresadf['ISICtwodig'] = empresadf['ISIC'].apply(lambda x: str(int(x[1:3])))
## NOTE(review): PROVINCE is dropped here, so the grouping below assumes
## MUNICIPIO codes are unique across provinces -- confirm.
empresadf = empresadf.drop(['ISIC','MUNICINUM','PROVNUM','PROVINCE'], axis=1)
empresadf = empresadf.groupby(['MUNICIPIO','ISICtwodig','SIZE'], as_index=False).sum()

## Used to compute estimated workers in each isic two digit:
## (representative workers per company) x (number of companies)
empresadf['ESTWORKERS'] = empresadf['SIZE'] * empresadf['EMPRESAS']
## Group down to solely ISIC 2 digit, and not company size
empresadf = empresadf.drop(['SIZE','EMPRESAS'], axis=1)
empresadf = empresadf.groupby(['MUNICIPIO','ISICtwodig'], as_index=False).sum()

## Calibrate above data with IPUMS industry data for 2002 and 2010
## IPUMS data is the output of IPUMSdataaggregation.py
occudf2002 = pd.read_csv(occu2002)
occudf2010 = pd.read_csv(occu2010)
## Drop last rows of each dataframe
#occudf2002 = occudf2002[occudf2002['mun'] != 'TOTAL']
#occudf2010 = occudf2010[occudf2010['mun'] != 'TOTAL']

In [17]:
## This part is crucial, may want to rewrite
## Combines IPUMS and D.R. estimates for workers in a given sector
def whereindf(item, dataframe, column='mun'):
    """Return the zero-based position of `item` within `dataframe[column]`.

    The column is materialised as a plain list and searched from the top,
    so the position of the first matching entry is returned. A ValueError
    is raised (by list.index) when `item` does not occur in the column.
    """
    values = list(dataframe[column])
    return values.index(item)

## Come back to this!!! Determines relative weighting of IPUMS and D.R.
## data
wht_empresa_data = 0.5

## Add the weighted Empresa worker estimate for every
## (municipality, ISIC 2-digit) pair onto the matching cell of each IPUMS
## table.
dataframes = [occudf2002, occudf2010]
for df in dataframes:
    for mun, isiccode, workers in zip(empresadf['MUNICIPIO'],
                                      empresadf['ISICtwodig'],
                                      empresadf['ESTWORKERS']):
        if isiccode not in df.columns:
            ## Created as float because fractional estimates are added to it
            df[isiccode] = 0.0
        muncode = str(mun)
        estworkers = workers * wht_empresa_data
        ## whereindf returns a row *position*; translate it into the row's
        ## index label so the update still hits the right row if the table
        ## has been filtered and no longer carries a 0..n-1 index.
        occudfindex = whereindf(muncode, df)
        df.loc[df.index[occudfindex], isiccode] += estworkers

## Total workers per municipality, computed once after every estimate has
## been added (accumulating inside the row loop would re-add the totals on
## every iteration).
## NOTE(review): `occodes2010` is not defined in the part of the file shown
## here -- it must hold the code columns of occudf2010 before this runs.
## NOTE(review): totals and shares are only computed for 2010 -- confirm
## whether 2002 needs the same treatment.
occudf2010['MUNTOTAL'] = 0
for col in occodes2010:
    occudf2010['MUNTOTAL'] += occudf2010[col]

## Compute share of occupation in a given municipality
for col in occodes2010:
    occudf2010['share'+str(col)] = occudf2010[col]/occudf2010['MUNTOTAL']

#occudf2002.to_csv(industry2002,index=False)
#occudf2010.to_csv(industry2010,index=False)

In [30]:
## Columns of the 2002 table that hold derived values (the per-code share
## columns plus the municipality total) rather than raw counts.
occ2002drops = [name for name in occudf2002.columns if 'share' in name]
occ2002drops.append('MUNTOTAL')