# Creating Binary Treatment and Event Count Variables for Brazil Firm-Disaster Data

## Overview
This code processes the Brazil firm-disaster distances dataset (`brazil_est_shock_distances.csv`) to create a binary treatment variable and an event count variable. 

- `treated`: binary indicator equal to 1 if the establishment lies inside the subtype-specific radius of at least one disaster in that year, and 0 otherwise (`flood`, `storm`, `landslide`: 25 km; `earthquake`: 100 km).
- `treatment_count`: integer count of the distinct disasters whose subtype-specific radius contains the establishment in that year (0 if none).

In [4]:
import gc
import os

import numpy as np
import pandas as pd

In [5]:
# Load the firm-disaster distances table; it has one row per
# (disaster, establishment) pair with the distance between them in km.
distances_csv = './geocoded_data/brazil_est_shock_distances.csv'
df = pd.read_csv(distances_csv)
df

Unnamed: 0,year,disaster_id,est_id,lat_disaster,lng_disaster,lat_est,lng_est,disaster_type,distance_km
0,2003,2716_2003,2.460658e+12,-19.900563,-43.958439,-9.852406,-63.060539,flood,2333.858418
1,2003,2716_2003,8.464388e+13,-19.900563,-43.958439,-9.903970,-63.035419,flood,2328.562299
2,2003,2716_2003,3.477327e+13,-19.900563,-43.958439,-9.920243,-63.046216,flood,2328.656341
3,2003,2716_2003,8.462377e+13,-19.900563,-43.958439,-10.083945,-63.217735,flood,2335.658212
4,2003,2716_2003,2.286109e+13,-19.900563,-43.958439,-9.936345,-63.013974,flood,2324.705418
...,...,...,...,...,...,...,...,...,...
8999213,2017,34095_2017,1.609919e+13,-4.412389,-37.783080,-15.763823,-47.881577,flood,1676.589271
8999214,2017,34095_2017,7.420040e+13,-4.412389,-37.783080,-15.797515,-47.891887,flood,1680.105768
8999215,2017,34095_2017,6.088486e+13,-4.412389,-37.783080,-15.791174,-47.920045,flood,1681.608159
8999216,2017,34095_2017,6.088486e+13,-4.412389,-37.783080,-15.847697,-47.893227,flood,1684.341342


In [6]:
# Number of distance rows recorded for each disaster type.
disaster_type_col = df["disaster_type"]
disaster_type_col.value_counts()

disaster_type
flood         8433493
storm          353832
landslide      179163
earthquake      32730
Name: count, dtype: int64

In [7]:
# Treatment radius in km for each disaster type: the three 25 km types
# share one value, while earthquakes use a wider 100 km radius.
disaster_subtype_radii = {
    **dict.fromkeys(('flood', 'storm', 'landslide'), 25.0),
    'earthquake': 100.0,
}

In [8]:
# 1. Assign 'within_radius' based on disaster type.
#    Look up each row's radius with a vectorised map instead of a row-wise
#    apply: the table has millions of rows and apply(axis=1) calls a Python
#    function once per row.
radius_km = df['disaster_type'].map(disaster_subtype_radii)

# A disaster type without a configured radius is a configuration error; fail
# with a clear message rather than silently marking those rows as untreated.
unmapped_types = df.loc[radius_km.isna(), 'disaster_type'].unique()
if len(unmapped_types) > 0:
    raise ValueError(
        f"No radius configured for disaster type(s): {list(unmapped_types)}"
    )

# 1 when the establishment is within the type-specific radius, else 0.
# A missing distance compares as False and therefore yields 0.
df['within_radius'] = (df['distance_km'] <= radius_km).astype(int)
df.head()

Unnamed: 0,year,disaster_id,est_id,lat_disaster,lng_disaster,lat_est,lng_est,disaster_type,distance_km,within_radius
0,2003,2716_2003,2460658000000.0,-19.900563,-43.958439,-9.852406,-63.060539,flood,2333.858418,0
1,2003,2716_2003,84643880000000.0,-19.900563,-43.958439,-9.90397,-63.035419,flood,2328.562299,0
2,2003,2716_2003,34773270000000.0,-19.900563,-43.958439,-9.920243,-63.046216,flood,2328.656341,0
3,2003,2716_2003,84623770000000.0,-19.900563,-43.958439,-10.083945,-63.217735,flood,2335.658212,0
4,2003,2716_2003,22861090000000.0,-19.900563,-43.958439,-9.936345,-63.013974,flood,2324.705418,0


In [9]:
# 2. Build the binary 'treated' flag per establishment-year.
#    The max of the 0/1 'within_radius' indicator over a group is 1 exactly
#    when at least one disaster in that year was within its radius.
any_hit_by_est_year = df.groupby(['est_id', 'year'])['within_radius'].max()
treated_flags = any_hit_by_est_year.rename('treated').reset_index()
treated_flags

Unnamed: 0,est_id,year,treated
0,7.870000e+02,2006,0
1,5.844010e+05,2011,0
2,1.180001e+08,2003,0
3,1.180001e+08,2004,0
4,1.180001e+08,2006,0
...,...,...,...
549070,9.875036e+13,2013,0
549071,9.875036e+13,2014,0
549072,9.875036e+13,2015,0
549073,9.875036e+13,2016,0


In [10]:
# Number of establishment-year rows by treatment status.
treated_flags.loc[:, 'treated'].value_counts()

treated
0    520064
1     29011
Name: count, dtype: int64

In [11]:
# 3. Count distinct disasters per establishment-year.
#    Only rows inside the radius are kept, so establishment-years with no
#    hit are absent from this table.
rows_in_radius = df.loc[df['within_radius'].eq(1)]
disasters_per_est_year = (
    rows_in_radius.groupby(['est_id', 'year'])['disaster_id'].nunique()
)
treatment_counts = disasters_per_est_year.rename('treatment_count').reset_index()
treatment_counts

Unnamed: 0,est_id,year,treatment_count
0,1.392000e+09,2007,1
1,1.392000e+09,2010,1
2,3.516000e+09,2016,1
3,5.103000e+09,2010,1
4,5.275000e+09,2010,1
...,...,...,...
29006,9.852225e+13,2005,2
29007,9.852225e+13,2003,3
29008,9.852225e+13,2009,3
29009,9.852225e+13,2010,1


In [12]:
# Every treated establishment-year must have exactly one row in
# treatment_counts, so the two totals have to agree.
n_flagged_est_years = treated_flags['treated'].sum()
n_counted_est_years = len(treatment_counts)
assert n_flagged_est_years == n_counted_est_years, "The number of treated establishments should match the number of establishments with treatment counts."


In [13]:
# 4. Combine the flag and the count into one establishment-year table.
#    The left join keeps every establishment-year from treated_flags; those
#    without a match in treatment_counts get a count of 0.
est_year_treatments = (
    pd.merge(treated_flags, treatment_counts, how='left', on=['est_id', 'year'])
    .fillna({'treatment_count': 0})
    .astype({'treatment_count': int})
)
est_year_treatments

Unnamed: 0,est_id,year,treated,treatment_count
0,7.870000e+02,2006,0,0
1,5.844010e+05,2011,0,0
2,1.180001e+08,2003,0,0
3,1.180001e+08,2004,0,0
4,1.180001e+08,2006,0,0
...,...,...,...,...
549070,9.875036e+13,2013,0,0
549071,9.875036e+13,2014,0,0
549072,9.875036e+13,2015,0,0
549073,9.875036e+13,2016,0,0


In [14]:
# The merge must leave both totals unchanged relative to its inputs.
merged_treated_total = est_year_treatments['treated'].sum()
merged_count_total = est_year_treatments['treatment_count'].sum()
assert merged_treated_total == treated_flags['treated'].sum(), "The total number of treated establishments should match."
assert merged_count_total == treatment_counts['treatment_count'].sum(), "The total treatment counts should match."

In [15]:
# 5. Merge the binary treatment flag and treatment counts with the ESTB data.
# Each yearly ESTB file is processed separately because the files are too big
# to merge all at once.
estb_path = '/Users/koacow/BOSTON UNIVERSITY Dropbox/Ngoc Duy Khoa Cao/Climate Risk and Labor Market/RAIS data/firms and cities/ESTB'
outpath = '/Users/koacow/BOSTON UNIVERSITY Dropbox/Ngoc Duy Khoa Cao/Climate Risk and Labor Market/RAIS data/firms and cities/ESTB/treated'

# Create the output folder if needed so to_csv does not fail on a first run.
os.makedirs(outpath, exist_ok=True)

for fname in sorted(os.listdir(estb_path)):
    if fname.startswith("estb2") and fname.endswith(".csv"):
        year = int(fname[4:8])  # extract year from filename like estb2004.csv
        full_path = os.path.join(estb_path, fname)
        df_year = pd.read_csv(full_path, encoding='iso-8859-1')
        df_year['year'] = year

        # Only this year's treatment rows can match, so restrict the
        # right-hand table before merging (same result, smaller merge).
        year_treatments = est_year_treatments[est_year_treatments['year'] == year]

        # validate='many_to_one' makes the merge fail loudly if the treatment
        # table ever had duplicate (est_id, year) keys, which would otherwise
        # silently duplicate ESTB rows.
        df_year = df_year.merge(
            year_treatments,
            left_on=['cnpj_cei', 'year'],
            right_on=['est_id', 'year'],
            how='left',
            validate='many_to_one',
        )
        df_year = df_year.drop(columns=['est_id'])

        # Establishments with no match in the treatment table are untreated.
        df_year['treated'] = df_year['treated'].fillna(0).astype(int)
        df_year['treatment_count'] = df_year['treatment_count'].fillna(0).astype(int)

        # Format the ID as a zero-padded 14-character string.
        # - Strip only a trailing '.0' (left over when the column was read as
        #   float); an unanchored replace would remove '.0' anywhere.
        # - Keep missing IDs missing instead of writing '00000000000nan'.
        raw_ids = df_year["cnpj_cei"]
        padded_ids = (
            raw_ids.astype(str)
            .str.replace(r'\.0$', '', regex=True)
            .str.zfill(14)
        )
        df_year["cnpj_cei"] = padded_ids.where(raw_ids.notna())

        df_year.to_csv(os.path.join(outpath, f'estb_{year}_treated.csv'), index=False)
        print(df_year.head())

        # Release the large frame before loading the next year's file.
        del df_year
        gc.collect()

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


  cei_vinc       cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0        0  78900010       65226.0  00000000010235        0.0          NaN   
1        0  78957000       65226.0  00000000039055        0.0          NaN   
2        0  78961390       65226.0  00000000114952        0.0          NaN   
3        0  78932000       65226.0  00000000138037        0.0          NaN   
4        0  78977105       65226.0  00000000138541        0.0          NaN   

  databaixa dataencerramento                      email  indcei_vinc  ...  \
0       NaN              NaN  gestaodepessoas@bb.com.br          0.0  ...   
1       NaN              NaN  gestaodepessoas@bb.com.br          0.0  ...   
2       NaN              NaN  gestaodepessoas@bb.com.br          0.0  ...   
3       NaN              NaN  gestaodepessoas@bb.com.br          0.0  ...   
4       NaN              NaN  gestaodepessoas@bb.com.br          0.0  ...   

   vinculos_clt  vinculos_estat                           razaosocia

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


  cei_vinc         cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0        0   7188888.0       91111.0  00000000000191        0.0   1966-08-01   
1        0  69005300.0       65226.0  00000000000272        0.0          NaN   
2        0  66010900.0       65226.0  00000000000353        0.0          NaN   
3        0  11010908.0       65226.0  00000000000434        0.0          NaN   
4        0  28010000.0       65226.0  00000000000515        0.0          NaN   

  databaixa dataencerramento email  indcei_vinc  ...  vinculos_clt  \
0       NaN              NaN                0.0  ...           0.0   
1       NaN              NaN                0.0  ...          76.0   
2       NaN              NaN                0.0  ...         118.0   
3       NaN              NaN                0.0  ...          68.0   
4       NaN              NaN                0.0  ...         100.0   

   vinculos_estat                                     razaosocial  tamanho  \
0             0.0   

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


  cei_vinc       cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0        0  70073901       65226.0  00000000000191        0.0          NaN   
1        0  69005300       65226.0  00000000000272        0.0          NaN   
2        0  66010900       65226.0  00000000000353        0.0          NaN   
3        0  11010908       65226.0  00000000000434        0.0          NaN   
4        0  28010000       65226.0  00000000000515        0.0          NaN   

  databaixa dataencerramento email  indcei_vinc  ...  vinculos_clt  \
0       NaN              NaN                0.0  ...           0.0   
1       NaN              NaN                0.0  ...          81.0   
2       NaN              NaN                0.0  ...         124.0   
3       NaN              NaN                0.0  ...          57.0   
4       NaN              NaN                0.0  ...          80.0   

   vinculos_estat                                     razaosocial  tamanho  \
0             0.0               

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


  cei_vinc         cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0        0  70073901.0       65226.0  00000000000191        0.0          NaN   
1        0  69005300.0       65226.0  00000000000272        0.0          NaN   
2        0  66010900.0       65226.0  00000000000353        0.0          NaN   
3        0  11010908.0       65226.0  00000000000434        0.0          NaN   
4        0  28010000.0       65226.0  00000000000515        0.0          NaN   

  databaixa dataencerramento email  indcei_vinc  ...  vinculos_estat  \
0       NaN              NaN                0.0  ...             0.0   
1       NaN              NaN                0.0  ...             0.0   
2       NaN              NaN                0.0  ...             0.0   
3       NaN              NaN                0.0  ...             0.0   
4       NaN              NaN                0.0  ...             0.0   

                                      razaosocial  tamanho  tipo_estab  \
0           

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


   cei_vinc         cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0         0  70073901.0       65226.0  00000000000191        0.0          NaN   
1         0  69005300.0       65226.0  00000000000272        0.0          NaN   
2         0  66010900.0       65226.0  00000000000353        0.0          NaN   
3         0  11010908.0       65226.0  00000000000434        0.0          NaN   
4         0  28010000.0       65226.0  00000000000515        0.0          NaN   

   databaixa dataencerramento email  indcei_vinc  ...  tamanho  tipo_estab  \
0        NaN              NaN                0.0  ...      1.0        CNPJ   
1        NaN              NaN                0.0  ...      7.0        CNPJ   
2        NaN              NaN                0.0  ...      7.0        CNPJ   
3        NaN              NaN                0.0  ...      6.0        CNPJ   
4        NaN              NaN                0.0  ...      6.0        CNPJ   

   ibge_subsetor  indatividade_ano  cnae2_cl

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


  cei_vinc       cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0        0  70073901           NaN  00000000000191        0.0          NaN   
1        0  69005300           NaN  00000000000272        0.0          NaN   
2        0  66010900           NaN  00000000000353        0.0   1966-08-01   
3        0  11010908           NaN  00000000000434        0.0          NaN   
4        0  28010000           NaN  00000000000515        0.0   1966-08-01   

   databaixa dataencerramento email  indcei_vinc  ...  vinculos_estat  \
0        NaN              NaN                0.0  ...             0.0   
1        NaN              NaN                0.0  ...             0.0   
2        NaN              NaN                0.0  ...             0.0   
3        NaN              NaN                0.0  ...             0.0   
4        NaN              NaN                0.0  ...             0.0   

                                      razaosocial  tamanho  tipo_estab  \
0               BA

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


   cei_vinc         cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0         0  70200002.0       65226.0  00000000000191        0.0          NaN   
1         0  69005300.0       65226.0  00000000000272        0.0          NaN   
2         0  66010900.0       65226.0  00000000000353        0.0          NaN   
3         0  11010908.0       65226.0  00000000000434        0.0          NaN   
4         0  28010000.0       65226.0  00000000000515        0.0          NaN   

   databaixa dataencerramento email  indcei_vinc  ...  tamanho  tipo_estab  \
0        NaN              NaN                0.0  ...      1.0        CNPJ   
1        NaN              NaN                0.0  ...      7.0        CNPJ   
2        NaN              NaN                0.0  ...      7.0        CNPJ   
3        NaN              NaN                0.0  ...      6.0        CNPJ   
4        NaN              NaN                0.0  ...      6.0        CNPJ   

   ibge_subsetor  indatividade_ano  cnae2_cl

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


   cei_vinc         cep  cnae1_classe        cnpj_cei  cnpj_raiz  \
0         0  70073901.0           NaN  00000000000191        0.0   
1         0  69005300.0           NaN  00000000000272        0.0   
2         0  66010900.0           NaN  00000000000353        0.0   
3         0  11010908.0           NaN  00000000000434        0.0   
4         0  28010000.0           NaN  00000000000515        0.0   

   dataabertura  databaixa dataencerramento email  indcei_vinc  ...  tamanho  \
0           NaN        NaN              NaN                0.0  ...      1.0   
1           NaN        NaN              NaN                0.0  ...      6.0   
2           NaN        NaN              NaN                0.0  ...      7.0   
3           NaN        NaN              NaN                0.0  ...      6.0   
4           NaN        NaN              NaN                0.0  ...      6.0   

   tipo_estab  ibge_subsetor  indatividade_ano  cnae2_classe cnae2_subclasse  \
0        CNPJ    INST FINANC  

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


   cei_vinc       cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0         0  70073901         65226  00000000188484          0   1981-07-14   
1         0  70073901         65226  00000000188808          0   1980-06-27   
2         0  70073901         65226  00000000189022          0   1980-06-27   
3         0  70073901         65226  00000000189456          0   1980-06-27   
4         0  70073901         65226  00000000190039          0   1980-06-27   

  databaixa dataencerramento                                          email  \
0       NaN              NaN                             HONGKONG@BB.COM.BR   
1       NaN              NaN                          GRANDCAYMAN@BB.COM.BR   
2       NaN              NaN                                VIENA@BB.COM.BR   
3       NaN              NaN                                MIAMI@BB.COM.BR   
4       NaN              NaN                                MADRI@BB.COM.BR   

   indcei_vinc  ...  tamanho  tipo_estab  ibge_sub

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


   cei_vinc       cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0         0  76960970         26301  11264839000153   11264839   2009-10-28   
1         0  76829498         52434  11267177000175   11267177   2009-10-28   
2         0  76801234         85138  11266936000185   11266936   2009-10-29   
3         0  76974000         15814  11266980000195   11266980   2009-10-28   
4         0  76880000         52329  11266619000169   11266619   2009-10-28   

  databaixa dataencerramento                                          email  \
0       NaN              NaN             PROGRESSOCONTABILIDADE@HOTMAIL.COM   
1       NaN              NaN                  CONDORCONTABILIDADE@GLOBO.COM   
2       NaN              NaN                  ALTERNATIVAPVH.RH@HOTMAIL.COM   
3       NaN              NaN                      ESCRITORIOEXACT@GMAIL.COM   
4       NaN              NaN             DEP_PESSOAL@LIDERESCRITORIO.COM.BR   

   indcei_vinc  ...  tamanho  tipo_estab  ibge_sub

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


   cei_vinc       cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0         0  76914868         51390  03608831000139    3608831   2000-01-21   
1         0  76850000         51390  03608831000210    3608831   2000-10-11   
2         0  76963884         65242  03612764000126    3612764   2000-01-27   
3         0  76929000         52132  03609970000187    3609970   2000-01-21   
4         0  76805696         74209  03611652000231    3611652   2013-08-27   

    databaixa dataencerramento                                          email  \
0  2000-10-24              NaN         ESCRITORIOBANDEIRANTES@BANDEIRANTES.WS   
1  2000-10-11              NaN         ESCRITORIOBANDEIRANTES@BANDEIRANTES.WS   
2  2001-02-25              NaN                             RH@CREDISIS.COM.BR   
3  2005-05-07              NaN             MODELOCONTABILIDADEURUPA@GMAIL.COM   
4  2013-08-27              NaN                 ARAXACONTABILIDADE@HOTMAIL.COM   

   indcei_vinc  ...  tamanho  tipo_est

  df_year = pd.read_csv(full_path, encoding='iso-8859-1')


   cei_vinc       cep  cnae1_classe        cnpj_cei  cnpj_raiz dataabertura  \
0         0  76980000         52493  17165872000102   17165872   2012-11-13   
1         0  76824335         28126  17448741000132   17448741   2013-01-18   
2         0  76952000         50504  17686371000171   17686371   2013-03-04   
3         0  76908354         52450  17686424000154   17686424   2013-03-04   
4         0  76900970         55212  17801252000112   17801252   2013-03-22   

    databaixa dataencerramento                                          email  \
0  2012-11-13              NaN                          TIAGO_RYU@HOTMAIL.COM   
1  2013-01-18              NaN                     ALUC_ABERTURAS@HOTMAIL.COM   
2  2013-03-04              NaN                DESTAQUECONTABILIDADE@GMAIL.COM   
3  2013-03-04              NaN               ACOTEC_CONTABILIDADE@HOTMAIL.COM   
4  2013-03-22              NaN                         ESCLIDERJP@HOTMAIL.COM   

   indcei_vinc  ...  tamanho  tipo_est

In [16]:
# 6. Re-read the written files and check that the totals match the
#    establishment-year treatment table.
total_treated = 0
total_treatment_counts = 0
nan_id_count = 0

# Only these columns are needed for the checks; reading just them avoids
# loading (and type-guessing) every column of each large file.
check_cols = ['cnpj_cei', 'treated', 'treatment_count']

for fname in sorted(os.listdir(outpath)):
    if fname.startswith("estb_2") and fname.endswith(".csv"):
        full_path = os.path.join(outpath, fname)
        treated_df = pd.read_csv(
            full_path,
            encoding='iso-8859-1',
            usecols=check_cols,
            dtype={'cnpj_cei': str},  # keep IDs as text, including leading zeros
        )
        total_treated += treated_df['treated'].sum()
        total_treatment_counts += treated_df['treatment_count'].sum()

        # Count an ID as missing if the field is empty or if it is a
        # stringified NaN that was zero-padded (e.g. '00000000000nan'), which
        # isna() alone cannot detect.
        ids = treated_df["cnpj_cei"]
        nan_id_count += (ids.isna() | ids.str.endswith('nan', na=False)).sum()

assert total_treated == est_year_treatments['treated'].sum(), "Total treated establishments should match across all years."
assert total_treatment_counts == est_year_treatments['treatment_count'].sum(), "Total treatment counts should match across all years."
print(f"Total treated establishments: {total_treated}")
print(f"Total treatment counts: {total_treatment_counts}")
print(f"Total NaN CNPJ/CEI IDs across all years: {nan_id_count}")

  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')
  treated_df = pd.read_csv(full_path, encoding='iso-8859-1')


Total treated establishments: 29011
Total treatment counts: 46710
Total NaN CNPJ/CEI IDs across all years: 0
