## Purpose

Build the 1930 census dataset used to compute the outcome variables for the balance tests.

In [1]:
# Notebook parameters (edit here to change the sample definition)
age_lower_bound = 15      # lower bound of age range
age_upper_bound = 45      # upper bound of age range
distance_from_line = 300  # distance from central line

In [2]:
import pandas as pd
import pathlib
import numpy as np

# Project root: one level above the directory the notebook is run from.
root = pathlib.Path.cwd().parents[0]

In [3]:
# Load the census extract and rename the raw columns to the names used below:
# arkbild -> name, scbkod -> ref_code, fscbkod -> ref_code_birth_parish
census_column_names = {'arkbild': 'name', 'scbkod': 'ref_code', 'fscbkod': 'ref_code_birth_parish'}
df_census_1930 = (
    pd.read_parquet(root / "data/census/df_ref.parquet")
    .rename(columns=census_column_names)
)

In [4]:
# Cast both parish-code columns to nullable integers (Int64).
# errors='coerce' turns codes that cannot be parsed into missing values instead of raising.
for code_column in ('ref_code_birth_parish', 'ref_code'):
    numeric_codes = pd.to_numeric(df_census_1930[code_column], errors='coerce')
    df_census_1930[code_column] = numeric_codes.astype('Int64')

## Create treated variable

An individual counts as treated here if the code of their parish of birth is in the treated group.

In [None]:
# Attach geom_id to every person through their parish of birth.
# The crosswalk's ref_code is renamed to ref_code_birth_parish so it lines up with the census key.
crosswalk_path = root / "data/parishes/1930_census_map_merged_data.xlsx"
crosswalk_df = pd.read_excel(crosswalk_path).rename(columns={'ref_code': 'ref_code_birth_parish'})
df_census_1930 = pd.merge(df_census_1930, crosswalk_df, on='ref_code_birth_parish', how='left')
# geom_id is now available as the key for joining to the first-stage dataset.
# NOTE(review): every crosswalk column is carried over, and a left join duplicates census rows
# if the crosswalk holds more than one row per ref_code — TODO confirm the key is unique.

In [None]:
# Load the first-stage (parish-level) data and show how often each geom_id occurs.
first_stage_path = root / "data/first-stage/parish-level-power-station-data-vf.xlsx"
first_stage = pd.read_excel(first_stage_path)
first_stage["geom_id"].value_counts()

In [None]:
# Re-load the first-stage data, keep only the treatment flag and the join key,
# and rename the flag so it describes the parish of birth.
first_stage = (
    pd.read_excel(root / "data/first-stage/parish-level-power-station-data-vf.xlsx")
    [['treated', 'geom_id']]
    .rename(columns={'treated': 'electricity_parish_born'})
)

# Tabulate the flag, counting missing values as their own category.
first_stage["electricity_parish_born"].value_counts(dropna=False)

In [None]:
# Replace the census frame's existing electricity_parish_born with the first-stage value:
# drop the old column, then left-join the first-stage flag on geom_id.
df_census_1930 = (
    df_census_1930
    .drop(columns=['electricity_parish_born'])
    .merge(first_stage, on='geom_id', how='left')
)

In [None]:
# Rows without a first-stage match have a missing flag; set those to 0.
born_flag = df_census_1930["electricity_parish_born"]
df_census_1930["electricity_parish_born"] = born_flag.fillna(0)

Now repeat the same steps for the parish the individual is living in (`electricity_parish_living_in`).

In [None]:
# Re-attach geom_id, this time through the parish of residence (ref_code).
# The crosswalk already names its key ref_code, so no rename is needed.
crosswalk_df = pd.read_excel(root / "data/parishes/1930_census_map_merged_data.xlsx")
# Drop the birth-parish geom_id first so the new merge supplies the residence-parish one.
df_census_1930 = (
    df_census_1930
    .drop(columns=['geom_id'])
    .merge(crosswalk_df, on='ref_code', how='left')
)
# NOTE(review): the full crosswalk was already merged once on the birth parish; any crosswalk
# columns other than geom_id presumably come back with _x/_y suffixes here — TODO confirm.

In [None]:
# Re-load the first-stage data, keep only the treatment flag and the join key,
# and rename the flag so it describes the parish of residence.
first_stage = (
    pd.read_excel(root / "data/first-stage/parish-level-power-station-data-vf.xlsx")
    [['treated', 'geom_id']]
    .rename(columns={'treated': 'electricity_parish_living_in'})
)

In [None]:
# Remove the existing electricity_parish_living_in column so the first-stage value can replace it.
# geom_id is kept: it is the key for the merge that follows.
df_census_1930 = df_census_1930.drop(columns='electricity_parish_living_in')


In [None]:
# List the columns currently in the census frame.
# The original cell indexed the frame with an empty column name, which raises KeyError
# (unless a column literally named "" exists) and stops a Restart & Run All.
# NOTE(review): presumably a column inspection was intended — confirm.
df_census_1930.columns.tolist()

In [None]:
# Left-join the residence-parish treatment flag from first_stage, keyed on geom_id.
df_census_1930 = pd.merge(df_census_1930, first_stage, on='geom_id', how='left')

Create income variables

In [None]:
# generate log_income_incl_zero = log(1 + income_incl_zero)
# np.log1p works on the whole column at once instead of calling a Python lambda per row;
# an income of exactly 0 still maps to exactly 0, which later cells rely on.
df_census_1930['log_income_incl_zero'] = np.log1p(df_census_1930['income_incl_zero'])

In [None]:
# Mean and median of income_incl_zero within each (hisco_code, ref_code) cell,
# broadcast back to every row of that cell.
# NOTE(review): these columns hold the cell statistic for all rows, not only for rows with
# missing income — confirm that is what "imputed" should mean here.
income_by_cell = df_census_1930.groupby(['hisco_code', 'ref_code'])['income_incl_zero']
df_census_1930['income_incl_zero_imputed_mean'] = income_by_cell.transform('mean')
df_census_1930['income_incl_zero_imputed_median'] = income_by_cell.transform('median')

In [None]:
# Mean of income_incl_zero within each value of `treated`.
# NOTE(review): `treated` is not created in the cells above; presumably it comes from the
# census file or the crosswalk — confirm.
income_by_treated = df_census_1930.groupby('treated')['income_incl_zero'].mean()
print(income_by_treated)

In [None]:
# Mean of the two cell-level income columns within each value of `treated`.
imputed_income_columns = ['income_incl_zero_imputed_mean', 'income_incl_zero_imputed_median']
imputed_income_by_treated = df_census_1930.groupby('treated')[imputed_income_columns].mean()
print(imputed_income_by_treated)

In [None]:
# Fill missing log_income_incl_zero with the mean (resp. median) of its
# (ref_code, hisco_code) cell; observed values are left as they are.
log_income_by_cell = df_census_1930.groupby(['ref_code', 'hisco_code'])['log_income_incl_zero']

df_census_1930['log_income_imputed_mean'] = log_income_by_cell.transform(
    lambda cell: cell.fillna(cell.mean())
)

df_census_1930['log_income_imputed_median'] = log_income_by_cell.transform(
    lambda cell: cell.fillna(cell.median())
)


In [None]:

# In both imputed log-income columns, turn values equal to 0 into missing values.
for imputed_column in ('log_income_imputed_mean', 'log_income_imputed_median'):
    df_census_1930[imputed_column] = df_census_1930[imputed_column].replace(0, np.nan)

## Checks

In [None]:
# Mean of log_income_incl_zero within each value of `treated`.
log_income_by_treated = df_census_1930.groupby('treated')['log_income_incl_zero'].mean()
print(log_income_by_treated)


In [None]:
# Turn log_income_incl_zero values equal to 0 into missing values.
# This runs after the imputed columns were built, so those were computed with the zeros present.
raw_log_income = df_census_1930['log_income_incl_zero']
df_census_1930['log_income_incl_zero'] = raw_log_income.replace(0, np.nan)

## Create indicators for electricity parish born and electricity parish living

In [None]:
# Show the birth-parish code column.
# The raw name `fscbkod` was renamed to `ref_code_birth_parish` when the census file was loaded,
# so indexing by the old name raises KeyError.
df_census_1930["ref_code_birth_parish"]

## Filters

In [None]:
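# TODO: sample filters not implemented yet (presumably age_lower_bound / age_upper_bound and distance_from_line from the parameters cell — confirm)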

In [None]:
# Write the first 10 rows to Excel in data/temp as a quick visual check of the result.
check_path = root / "data/temp/df_census_1930_check.xlsx"
# Create data/temp if it is missing so the export does not fail on a fresh checkout.
check_path.parent.mkdir(parents=True, exist_ok=True)
df_census_1930.head(10).to_excel(check_path)

In [None]:

# Share of rows where formogh is missing (mean of the boolean missing-indicator).
df_census_1930['formogh'].isna().mean()

In [None]:
# Lists of the variables to export as a Stata file (compressed if possible).

# Income outcomes built in the income section of this notebook.
income_variables = ["log_income_incl_zero", "log_income_imputed_mean", "log_income_imputed_median"]

# Parish codes and electricity indicators built earlier in this notebook.
# The original cell referenced `location_variables` without ever defining it (NameError).
# NOTE(review): this list is a best guess at the intended contents and is not yet part of
# variables_to_keep — confirm both.
location_variables = ["ref_code", "ref_code_birth_parish", "electricity_parish_born", "electricity_parish_living_in"]

# Columns kept for the export (unchanged).
variables_to_keep = ['id', 'ref_code', 'hisclass_code_abb', 'log_income_incl_zero', 'log_income_imputed_mean', 'log_income_imputed_median']