In [34]:
import pandas as pd
import numpy as np
import time
#import plotly.io as pio
#import plotly.express as px
#pd.options.plotting.backend = "plotly"

## Yearly Data (RAIS)

In [5]:
#Path
path = '/Users/jpmvbastos/Library/CloudStorage/OneDrive-TexasTechUniversity/Fall 2023/Causal Inference/Term Paper/Data/RAIS/'

#Loop to read each year  
start_year = 2006
end_year = 2017
years = list(range(start_year, end_year + 1))

# Empty lists to collect the name of the dfs
names_list = []

# The globals() method is necessary to name the dfs using the string generated in the loop
for year in years:
    print('Building year ' + str(year))
    name = f'raisestab{year}'  
  #Open file
    df = pd.read_csv(path+f'ESTB{year}.txt', delimiter=';', encoding='latin1', low_memory=False)
  
    df[df['Ind Atividade Ano']==1] #Exclude inactive firms 
    df['year'] = year # Create year var for panel structure 
  
 # Get the CNAE codes 
    df['CNAE2'] = df['CNAE 2.0 Subclasse'].astype(str).apply(lambda x: int(x[:-5]))

# Type of Establishment
    df['Natureza Jurídica'] = pd.to_numeric(df['Natureza Jurídica'], errors='coerce')
    df['government'] = np.where(df['Natureza Jurídica'] < 2000, 1, 0)
    df[df['government']==0] #focus on private firms
# Private Companies and Associations
    df['private'] = np.where((df['Natureza Jurídica'] > 2011) & (df['Natureza Jurídica'] < 2292),1,0)
# Individual companies
    df['individual'] = np.where(df['Tipo Estab']==3, 1, 0)

 ## Firm Size
# Define the conditions and choice
    df['small'] = np.where(df['Tamanho Estabelecimento'] < 5, 1, 0)
    df['medium'] = np.where((df['Tamanho Estabelecimento'] > 4) 
                                & (df['Tamanho Estabelecimento'] < 7), 1, 0)
    df['large'] = np.where(df['Tamanho Estabelecimento'] > 6, 1, 0)

    # Create 'transportation' variable
    df['transportation'] = 0
    df.loc[df['CNAE2'].notnull(), 'transportation'] = np.where(df['CNAE2'].isin([49, 50, 51, 52, 53, 61, 79]), 1, 0)

# Create 'accommodation' variable
    df['accommodation'] = 0
    df.loc[df['CNAE2'].notnull(), 'accommodation'] = np.where(df['CNAE2'].isin([55, 56, 59, 60, 90, 91, 92, 93, 94]), 1, 0)

# Create 'retail' variable
    df['retail'] = 0
    df.loc[df['CNAE2'].notnull(), 'retail'] = np.where(df['CNAE2'] == 47, 1, 0)

    # Create 'construction' variable
    df['construction'] = 0
    df.loc[df['CNAE2'].notnull(), 'construction'] = np.where(df['CNAE2'].isin([41, 42, 43]), 1, 0)

    sectors = ['transportation', 'accommodation', 'retail', 'construction']
    for sector in sectors:
        df[f'{sector}_emp'] = df['Qtd Vínculos Ativos'] * df[sector]
        df[f'{sector}_indiv'] = df['Qtd Vínculos Ativos'] * df[sector] * df['individual']
        df[f'{sector}_s'] = df['Qtd Vínculos Ativos'] * df[sector] * df['small']
        df[f'{sector}_m'] = df['Qtd Vínculos Ativos'] * df[sector] * df['medium']
        df[f'{sector}_l'] = df['Qtd Vínculos Ativos'] * df[sector] * df['large']


    df = df.rename(columns={'Município':'ibge_code'})
    df['ibge_code'] = df['ibge_code'].astype(int) 

    vars = ['Ind Atividade Ano', 'small', 'medium', 'large', 'private', 'individual', 'transportation',
       'accommodation', 'retail', 'construction', 'Qtd Vínculos Ativos','transportation_emp',
       'transportation_indiv', 'transportation_s', 'transportation_m',
       'transportation_l', 'accommodation_emp', 'accommodation_indiv',
       'accommodation_s', 'accommodation_m', 'accommodation_l', 'retail_emp',
       'retail_indiv', 'retail_s', 'retail_m', 'retail_l', 'construction_emp',
       'construction_indiv', 'construction_s', 'construction_m',
       'construction_l']

    df = df.groupby(['ibge_code','year'])[vars].sum()
    df.rename(columns={'Ind Atividade Ano':'total','Qtd Vínculos Ativos':'total_emp'}, inplace=True)
     
    globals()[name] = df
    names_list.append(name)
    print('Finished year ' + str(year))

# Use the list of names in _list to call all dfs and concatenate them
rais = pd.concat((globals()[name] for name in names_list), axis=0)
rais.sort_index(inplace=True)

Building year 2006
Finished year 2006
Building year 2007
Finished year 2007
Building year 2008
Finished year 2008
Building year 2009
Finished year 2009
Building year 2010
Finished year 2010
Building year 2011
Finished year 2011
Building year 2012
Finished year 2012
Building year 2013
Finished year 2013
Building year 2014
Finished year 2014
Building year 2015
Finished year 2015
Building year 2016
Finished year 2016
Building year 2017
Finished year 2017


In [6]:
rais

Unnamed: 0_level_0,Unnamed: 1_level_0,total,small,medium,large,private,individual,transportation,accommodation,retail,construction,...,retail_emp,retail_indiv,retail_s,retail_m,retail_l,construction_emp,construction_indiv,construction_s,construction_m,construction_l
ibge_code,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
110001,2006,602,729,7,1,490,110,17,114,245,11,...,459,0,411,48,0,0,0,0,0,0
110001,2007,498,734,8,1,516,113,18,113,253,12,...,435,0,415,20,0,17,8,17,0,0
110001,2008,613,770,4,1,546,113,18,185,263,20,...,495,0,495,0,0,26,4,26,0,0
110001,2009,592,749,4,1,554,119,22,79,262,22,...,535,0,514,21,0,47,0,47,0,0
110001,2010,630,879,7,1,629,129,28,131,301,32,...,573,0,548,25,0,44,1,44,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530010,2013,112816,118793,4821,1172,109879,2530,4345,16849,34425,9378,...,131919,64,72900,37877,21142,79840,342,17423,21105,41312
530010,2014,108378,119604,4827,1160,109291,2463,4493,17188,33728,9229,...,131391,69,72582,37253,21556,69197,284,16321,18806,34070
530010,2015,103829,120268,4650,1114,106008,2379,4535,17308,33108,8629,...,129015,72,72140,36346,20529,53427,277,14169,14559,24699
530010,2016,103123,121523,4429,1023,103649,2279,4537,17417,32430,8174,...,120967,61,70317,32802,17848,45972,282,11472,11511,22989


In [7]:
# Merge Municipal Data
df = pd.read_excel('/Users/jpmvbastos/Documents/GitHub/AppliedEconometrics/Causal Inference/TermProject/Data/munic_data.xlsx')
df

Unnamed: 0,Sigla,Codigo,Município,Year,bankbranches,bankdeposits,homiciderate,icms_transfers,pibmunicipal,savings,...,population,AEROPORTO DE DESTINO (NOME),AEROPORTO DE DESTINO (UF),ANO,ncountry_from,nairports_from,npassengers,host,cand,ibge_code
0,GO,5200050,ABADIA DE GOIÁS,2003,,,17.580872,786765.74,37920.338097,,...,5621.0,,,,0,0,0,0,0,520005
1,GO,5200050,ABADIA DE GOIÁS,2004,,,16.963528,934511.02,36710.710950,,...,6054.0,,,,0,0,0,0,0,520005
2,GO,5200050,ABADIA DE GOIÁS,2005,,,47.664442,928955.40,40500.359822,,...,6294.0,,,,0,0,0,0,0,520005
3,GO,5200050,ABADIA DE GOIÁS,2006,,,15.883100,815089.56,41983.969137,,...,6531.0,,,,0,0,0,0,0,520005
4,GO,5200050,ABADIA DE GOIÁS,2007,,,15.405947,920842.56,43672.812184,,...,6356.5,,,,0,0,0,0,0,520005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89531,SP,3533809,ÓLEO,2014,,,,2859740.22,56968.400922,,...,2652.0,,,,0,0,0,0,0,353380
89532,SP,3533809,ÓLEO,2015,,,,3032534.69,57057.341856,,...,2628.0,,,,0,0,0,0,0,353380
89533,SP,3533809,ÓLEO,2016,,,,3215613.05,57017.611839,,...,2605.0,,,,0,0,0,0,0,353380
89534,SP,3533809,ÓLEO,2017,,,,3498748.00,54150.480409,,...,2583.0,,,,0,0,0,0,0,353380


In [8]:
data = pd.merge(df, rais, left_on=['ibge_code','Year'], right_on=['ibge_code','year'], how='left')
data

Unnamed: 0,Sigla,Codigo,Município,Year,bankbranches,bankdeposits,homiciderate,icms_transfers,pibmunicipal,savings,...,retail_emp,retail_indiv,retail_s,retail_m,retail_l,construction_emp,construction_indiv,construction_s,construction_m,construction_l
0,GO,5200050,ABADIA DE GOIÁS,2003,,,17.580872,786765.74,37920.338097,,...,,,,,,,,,,
1,GO,5200050,ABADIA DE GOIÁS,2004,,,16.963528,934511.02,36710.710950,,...,,,,,,,,,,
2,GO,5200050,ABADIA DE GOIÁS,2005,,,47.664442,928955.40,40500.359822,,...,,,,,,,,,,
3,GO,5200050,ABADIA DE GOIÁS,2006,,,15.883100,815089.56,41983.969137,,...,60.0,0.0,60.0,0.0,0.0,12.0,0.0,12.0,0.0,0.0
4,GO,5200050,ABADIA DE GOIÁS,2007,,,15.405947,920842.56,43672.812184,,...,60.0,0.0,60.0,0.0,0.0,12.0,0.0,12.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89531,SP,3533809,ÓLEO,2014,,,,2859740.22,56968.400922,,...,17.0,0.0,17.0,0.0,0.0,14.0,0.0,14.0,0.0,0.0
89532,SP,3533809,ÓLEO,2015,,,,3032534.69,57057.341856,,...,13.0,0.0,13.0,0.0,0.0,40.0,0.0,1.0,39.0,0.0
89533,SP,3533809,ÓLEO,2016,,,,3215613.05,57017.611839,,...,16.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89534,SP,3533809,ÓLEO,2017,,,,3498748.00,54150.480409,,...,12.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Check if all years are there
df.Year.value_counts()

Year
2003    5596
2004    5596
2005    5596
2006    5596
2007    5596
2008    5596
2009    5596
2010    5596
2011    5596
2012    5596
2013    5596
2014    5596
2015    5596
2016    5596
2017    5596
2018    5596
Name: count, dtype: int64

In [10]:
# Export Yearly Data
data.to_csv('/Users/jpmvbastos/Documents/GitHub/AppliedEconometrics/Causal Inference/TermProject/Data/MainData.csv', sep=';' , index=False)

# Monthly Data (CAGED)

In [11]:
caged = pd.read_csv('/Users/jpmvbastos/Library/CloudStorage/OneDrive-TexasTechUniversity/Fall 2023/Causal Inference/Term Paper/Data/CAGED/CAGEDEST_012013.txt', encoding='latin1', delimiter=';', low_memory=False)

In [16]:
caged.columns

Index(['Admitidos/Desligados', 'Competência Declarada', 'Município',
       'Ano Declarado', 'CBO 2002 Ocupação', 'CNAE 1.0 Classe',
       'CNAE 2.0 Classe', 'CNAE 2.0 Subclas', 'Faixa Empr Início Jan',
       'Grau Instrução', 'Qtd Hora Contrat', 'IBGE Subsetor', 'Idade',
       'Ind Aprendiz', 'Ind Portador Defic', 'Raça Cor', 'Salário Mensal',
       'Saldo Mov', 'Sexo', 'Tempo Emprego', 'Tipo Estab', 'Tipo Defic',
       'Tipo Mov Desagregado', 'UF', 'Bairros SP', 'Bairros Fortaleza',
       'Bairros RJ', 'Distritos SP', 'Regiões Adm DF', 'Mesorregião',
       'Microrregião', 'Região Adm RJ', 'Região Adm SP', 'Região Corede',
       'Região Corede 04', 'Região Gov SP', 'Região Senac PR',
       'Região Senai PR', 'Região Senai SP', 'Sub-Região Senai PR'],
      dtype='object')

In [35]:
#Path
path = '/Users/jpmvbastos/Library/CloudStorage/OneDrive-TexasTechUniversity/Fall 2023/Causal Inference/Term Paper/Data/CAGED/'

# Empty lists to collect the name of the dfs
names_list = []

# The globals() method is necessary to name the dfs using the string generated in the loop
for year in [2013, 2014]:
    for month in ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", '11', '12']:
        start_time = time.time()
        print('Building month ' + month + ' of year ' + str(year))
        name = 'CAGEDEST_'+month+str(year)
    #Open file
        df = pd.read_csv(path+name+'.txt', delimiter=';', encoding='latin1', low_memory=False)

    # Focus on private firms
        df[df['IBGE Subsetor']!=24] # 24 == "Administraçao pública direta e autárquica"

    # Drop irrelevant openings/closings
        df[df['Tipo Mov Desagregado'].isin([3,6,7,8,9])==False]
                # Admissão por Transferência	3
                # Desligamento a Pedido	6
                # Desligamento por Aposentadoria	7
                # Desligamento por Morte	8
                # Desligamento por Transferência    9           

    # Drop irrelevant variables
        df.drop(columns=['Ano Declarado', 'CBO 2002 Ocupação', 'CNAE 1.0 Classe',
                     'Bairros SP', 'Bairros Fortaleza','Bairros RJ', 'Distritos SP', 
                     'Regiões Adm DF', 'Mesorregião', 'Microrregião', 'Região Adm RJ', 
                     'Região Adm SP', 'Região Corede', 'Região Corede 04', 'Região Gov SP', 
                     'Região Senac PR','Região Senai PR', 'Região Senai SP', 'Sub-Região Senai PR',
                     'Idade','Ind Aprendiz', 'Ind Portador Defic', 'Raça Cor','Saldo Mov', 'Sexo'],
                       inplace=True)  
    
    # Get the CNAE codes 
        df['CNAE2'] = df['CNAE 2.0 Subclas'].astype(str).apply(lambda x: int(x[:-5]))

    # Convert wages to float
        df['Salário Mensal'] = df['Salário Mensal'].str.replace(',','.').astype(float)

    # Hourly wage
        df['hourly_wage'] = df['Salário Mensal'] / df['Qtd Hora Contrat']

    ## Firm Size
    # Define the conditions and choice
        df['small'] = np.where(df['Faixa Empr Início Jan'] < 5, 1, 0)
        df['medium'] = np.where((df['Faixa Empr Início Jan'] > 4) 
                                    & (df['Faixa Empr Início Jan'] < 7), 1, 0)
        df['large'] = np.where(df['Faixa Empr Início Jan'] > 6, 1, 0)

     ## Totals
        # Total Hired 
        df['hired'] = np.where(df['Admitidos/Desligados']==1, 1, 0)
        # Total Fired 
        df['fired'] = np.where(df['Admitidos/Desligados']==2, 1, 0)
        # Total Temporary Hired 
        df['temp_hired'] = np.where(df['Tipo Mov Desagregado']==25, 1, 0)
        # Total Temporary Fired = df['temp_fired']
        df['temp_fired'] = np.where(df['Tipo Mov Desagregado']==43, 1, 0)
        # Total Wages
        df['th_wages'] = df['Salário Mensal'] * df['hired'] 
        df['tf_wages'] = df['Salário Mensal'] * df['fired']
        # Total Hours
        df['th_hours'] = df['Qtd Hora Contrat'] * df['hired']
        df['tf_hours'] = df['Qtd Hora Contrat'] * df['fired']

        # Small Firms
        df['small_hired'] = df['hired'] * df['small']
        df['small_fired'] = df['fired'] * df['small']
        df['small_temp_hired'] = df['temp_hired'] * df['small']
        df['small_temp_fired'] = df['temp_fired'] * df['small']
        df['small_th_wages'] = df['th_wages'] * df['small']
        df['small_tf_wages'] = df['tf_wages'] * df['small']
        df['small_th_hours'] = df['th_hours'] * df['small']
        df['small_tf_hours'] = df['tf_hours'] * df['small']
        
        # Large Firms
        df['large_hired'] = df['hired'] * df['large']
        df['large_fired'] = df['fired'] * df['large']
        df['large_temp_hired'] = df['temp_hired'] * df['large']
        df['large_temp_fired'] = df['temp_fired'] * df['large']
        df['large_th_wages'] = df['th_wages'] * df['large']
        df['large_tf_wages'] = df['tf_wages'] * df['large']
        df['large_th_hours'] = df['th_hours'] * df['large']
        df['large_tf_hours'] = df['tf_hours'] * df['large']

    # Create 'transportation' variable
        df['transportation'] = 0
        df.loc[df['CNAE2'].notnull(), 'transportation'] = np.where(df['CNAE2'].isin([49, 50, 51, 52, 53, 61, 79]), 1, 0)

    # Create 'accommodation' variable
        df['accommodation'] = 0
        df.loc[df['CNAE2'].notnull(), 'accommodation'] = np.where(df['CNAE2'].isin([55, 56, 59, 60, 90, 91, 92, 93, 94]), 1, 0)

    # Create 'retail' variable
        df['retail'] = 0
        df.loc[df['CNAE2'].notnull(), 'retail'] = np.where(df['CNAE2'] == 47, 1, 0)

    # Create 'construction' variable
        df['construction'] = 0
        df.loc[df['CNAE2'].notnull(), 'construction'] = np.where(df['CNAE2'].isin([41, 42, 43]), 1, 0)

        sectors = ['transportation', 'accommodation', 'retail', 'construction']
        for sector in sectors:
            # Total Hired 
            df[f'hired_{sector}'] = df['hired'] * df[sector]
            # Total Fired 
            df[f'fired_{sector}'] = df['fired'] * df[sector]
            # Total Temporary Hired
            df[f'temp_hired_{sector}'] = df['temp_hired'] * df[sector]
            df[f'temp_fired_{sector}'] = df['temp_fired'] * df[sector]
            # Total Wages
            df[f'th_wages_{sector}'] = df['th_wages'] * df[sector]
            df[f'tf_wages_{sector}'] = df['tf_wages'] * df[sector]
            # Total Hours
            df[f'th_hours_{sector}'] = df['th_hours'] * df[sector]
            df[f'tf_hours_{sector}'] = df['tf_hours'] * df[sector]

        # Create groupby instances 
        df = df.rename(columns={'Município':'ibge_code','Competência Declarada':'period'})
        df['ibge_code'] = df['ibge_code'].astype(int) 

        vars = ['hired', 'fired', 'small', 'medium', 'large', 'temp_hired', 'temp_fired', 
                'th_wages', 'tf_wages', 'th_hours', 'tf_hours','small_hired', 'small_fired',
                'small_temp_hired','small_temp_fired','small_th_wages','small_tf_wages',
                'small_th_hours','small_tf_hours','large_hired','large_fired','large_temp_hired',
                'large_temp_fired','large_th_wages','large_tf_wages','large_th_hours','large_tf_hours',
                'hired_transportation', 'fired_transportation', 'temp_hired_transportation',
                'temp_fired_transportation', 'th_wages_transportation', 'tf_wages_transportation',
                'th_hours_transportation', 'tf_hours_transportation', 'hired_accommodation',
                'fired_accommodation', 'temp_hired_accommodation', 'temp_fired_accommodation',
                'th_wages_accommodation', 'tf_wages_accommodation', 'th_hours_accommodation',
                'tf_hours_accommodation', 'hired_retail', 'fired_retail', 'temp_hired_retail',
                'temp_fired_retail', 'th_wages_retail', 'tf_wages_retail', 'th_hours_retail',
                'tf_hours_retail', 'hired_construction', 'fired_construction',
                'temp_hired_construction', 'temp_fired_construction', 'th_wages_construction',
                'tf_wages_construction', 'th_hours_construction', 'tf_hours_construction']

        df = df.groupby(['ibge_code','period'])[vars].sum()

        # Net number of jobs
        df['netjobs'] = df['hired'] - df['fired'] 
        # Net wages
        df['net_wages'] = df['th_wages'] - df['tf_wages']
        # Net hours
        df['net_hours'] = df['th_hours'] - df['tf_hours']
        # Percentages of temporary hires and fires
        df['th_share'] = df['temp_hired'] / df['hired']
        df['tf_share'] = df['temp_fired'] / df['fired']
        # Net Temporary Jobs
        df['net_tempjobs'] = df['temp_hired'] - df['temp_fired']
        # Temporary Net Wages
        df['net_tempwages'] = df['th_wages'] - df['tf_wages']
        # Temporary Net Hours
        df['net_temphours'] = df['th_hours'] - df['tf_hours']
        # Avg Hourly Wage
        df['avg_hourly_wage_h'] = df['th_wages'] / df['th_hours']
        df['avg_hourly_wage_f'] = df['tf_wages'] / df['tf_hours']
        for sector in sectors:
            # Net Number of Jobs
            df[f'netjobs_{sector}'] = df[f'hired_{sector}'] - df[f'fired_{sector}']
            # Net Wages
            df[f'net_wages_{sector}'] = df[f'th_wages_{sector}'] - df[f'tf_wages_{sector}']
            # Net Hours
            df[f'net_hours_{sector}'] = df[f'th_hours_{sector}'] - df[f'tf_hours_{sector}']
        
        
        globals()[name] = df
        names_list.append(name)
        print('Finished year ' + str(year))
        print("--- %s seconds ---" % (time.time() - start_time))

# Use the list of names in _list to call all dfs and concatenate them
caged = pd.concat((globals()[name] for name in names_list), axis=0)
caged.sort_index(inplace=True)

Building month 01 of year 2013
Finished year 2013
--- 15.430002927780151 seconds ---
Building month 02 of year 2013
Finished year 2013
--- 14.042168140411377 seconds ---
Building month 03 of year 2013
Finished year 2013
--- 15.128101110458374 seconds ---
Building month 04 of year 2013
Finished year 2013
--- 17.227380990982056 seconds ---
Building month 05 of year 2013
Finished year 2013
--- 16.530835151672363 seconds ---
Building month 06 of year 2013
Finished year 2013
--- 15.372475862503052 seconds ---
Building month 07 of year 2013
Finished year 2013
--- 16.336631774902344 seconds ---
Building month 08 of year 2013
Finished year 2013
--- 16.309181928634644 seconds ---
Building month 09 of year 2013
Finished year 2013
--- 15.65537405014038 seconds ---
Building month 10 of year 2013
Finished year 2013
--- 15.32451319694519 seconds ---
Building month 11 of year 2013
Finished year 2013
--- 14.499203205108643 seconds ---
Building month 12 of year 2013
Finished year 2013
--- 11.5635440349

In [36]:
caged

Unnamed: 0_level_0,Unnamed: 1_level_0,hired,fired,small,medium,large,temp_hired,temp_fired,th_wages,tf_wages,th_hours,...,net_hours_transportation,netjobs_accommodation,net_wages_accommodation,net_hours_accommodation,netjobs_retail,net_wages_retail,net_hours_retail,netjobs_construction,net_wages_construction,net_hours_construction
ibge_code,period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
110001,201301,64,86,150,0,0,0,0,51654.0,72188.0,2794,...,-88,-1,-678.0,-44,-23,-20840.0,-1004,-7,-5846.0,-308
110001,201302,80,72,152,0,0,0,0,65948.0,57729.0,3478,...,0,1,1100.0,44,17,8735.0,706,-3,-1506.0,-132
110001,201303,59,54,113,0,0,0,0,50813.0,48335.0,2502,...,0,-2,-1356.0,-88,-12,-12264.0,-528,-5,-4670.0,-220
110001,201304,94,63,157,0,0,2,0,87954.0,59800.0,4070,...,-172,-1,-1100.0,-44,12,13239.0,522,2,4933.0,88
110001,201305,126,76,202,0,0,0,0,106890.0,65035.0,5533,...,0,1,800.0,44,4,3919.0,211,8,6726.0,352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530010,201408,31706,29043,31108,13799,15842,334,259,39970532.0,38654998.0,1302932,...,-627,670,158024.0,19315,440,397928.0,18165,-948,-2008635.0,-43886
530010,201409,32070,29671,32668,13801,15272,352,193,37711369.0,38765432.0,1330403,...,2063,426,107230.0,11443,47,-236848.0,-44,-348,-1273191.0,-15379
530010,201410,28945,31440,32890,13556,13939,351,207,35010773.0,41476638.0,1217822,...,2736,-29,-245093.0,-2696,135,-146830.0,5537,-1847,-3255061.0,-79262
530010,201411,27946,28521,31001,12316,13150,326,212,32869739.0,36807442.0,1176488,...,10209,274,262867.0,10253,1338,919175.0,58199,-2204,-3478840.0,-95608


In [49]:
# Merge City Basic Info
df = pd.read_excel('/Users/jpmvbastos/Documents/GitHub/AppliedEconometrics/Causal Inference/TermProject/Data/munic_data.xlsx')
df = df[['Sigla', 'ibge_code', 'Município','host','cand']].drop_duplicates()
df

Unnamed: 0,Sigla,ibge_code,Município,host,cand
0,GO,520005,ABADIA DE GOIÁS,0,0
16,MG,310010,ABADIA DOS DOURADOS,0,0
32,GO,520010,ABADIÂNIA,0,0
48,PA,150010,ABAETETUBA,0,0
64,MG,310020,ABAETÉ,0,0
...,...,...,...,...,...
89456,RS,430155,ÁUREA,0,0
89472,PR,410115,ÂNGULO,0,0
89488,BA,290050,ÉRICO CARDOSO,0,0
89504,PA,150510,ÓBIDOS,0,0


In [52]:
data = pd.merge(df, caged.reset_index(), on='ibge_code', how='right')
data

Unnamed: 0,Sigla,ibge_code,Município,host,cand,period,hired,fired,small,medium,...,net_hours_transportation,netjobs_accommodation,net_wages_accommodation,net_hours_accommodation,netjobs_retail,net_wages_retail,net_hours_retail,netjobs_construction,net_wages_construction,net_hours_construction
0,RO,110001,ALTA FLORESTA D'OESTE,0,0,201301,64,86,150,0,...,-88,-1,-678.0,-44,-23,-20840.0,-1004,-7,-5846.0,-308
1,RO,110001,ALTA FLORESTA D'OESTE,0,0,201302,80,72,152,0,...,0,1,1100.0,44,17,8735.0,706,-3,-1506.0,-132
2,RO,110001,ALTA FLORESTA D'OESTE,0,0,201303,59,54,113,0,...,0,-2,-1356.0,-88,-12,-12264.0,-528,-5,-4670.0,-220
3,RO,110001,ALTA FLORESTA D'OESTE,0,0,201304,94,63,157,0,...,-172,-1,-1100.0,-44,12,13239.0,522,2,4933.0,88
4,RO,110001,ALTA FLORESTA D'OESTE,0,0,201305,126,76,202,0,...,0,1,800.0,44,4,3919.0,211,8,6726.0,352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126010,DF,530010,BRASÍLIA,1,0,201408,31706,29043,31108,13799,...,-627,670,158024.0,19315,440,397928.0,18165,-948,-2008635.0,-43886
126011,DF,530010,BRASÍLIA,1,0,201409,32070,29671,32668,13801,...,2063,426,107230.0,11443,47,-236848.0,-44,-348,-1273191.0,-15379
126012,DF,530010,BRASÍLIA,1,0,201410,28945,31440,32890,13556,...,2736,-29,-245093.0,-2696,135,-146830.0,5537,-1847,-3255061.0,-79262
126013,DF,530010,BRASÍLIA,1,0,201411,27946,28521,31001,12316,...,10209,274,262867.0,10253,1338,919175.0,58199,-2204,-3478840.0,-95608


In [53]:
data.to_csv('/Users/jpmvbastos/Documents/GitHub/AppliedEconometrics/Causal Inference/TermProject/Data/CagedData.csv', sep=';' , index=False)