First, import needed modules

In [50]:
import numpy as np
import pandas as pd
import os
from functools import reduce

Initalize variables

In [51]:
path = os.getcwd()
print(path)

/Users/jan/Dropbox/UP_EPQM/2222/MA/powerlinemonsters


Import inkar dataset

In [52]:
inkar_raw = pd.read_csv(f'{path}/data/controls/inkar_2021.csv', encoding='UTF-8-SIG')

Rename columns

In [53]:
col_names = ['bereich', 'indikator', 'AVS', 'year', 'wert']
inkar_raw.columns = col_names
inkar_raw.columns

Index(['bereich', 'indikator', 'AVS', 'year', 'wert'], dtype='object')

Fix the AGS

In [54]:
inkar_raw['AVS'] = inkar_raw['AVS'].astype(str)
inkar_raw['AVS'] = np.where(inkar_raw['AVS'].str.len() < 8, '0' + inkar_raw['AVS'], inkar_raw['AVS'])

Convert Wert to numeric

In [55]:
inkar_raw['value'] = inkar_raw['wert'].str.replace(',','.').astype(float)

Filter df

In [56]:
vars_to_keep = ['Bevölkerung gesamt', 'Frauenanteil', 'Einwohnerdichte',  'Arbeitslose', 'Durchschnittsalter der Bevölkerung']
inkar_selected = inkar_raw[inkar_raw['indikator'].isin(vars_to_keep)]

Pivot df

In [57]:
inkar_pivot = pd.pivot(inkar_selected, index=['AVS', 'year'], columns='indikator', values='value')
inkar_pivot = inkar_pivot.reset_index()
inkar_pivot.shape

(106214, 7)

Rename columns

In [58]:
col_names = ['AVS', 'year', 'unemployed_inkar', 'pop_inkar', 'avg_age_inkar', 'pop_density_inkar', 'female_inkar']
inkar_pivot.columns = col_names

Change Verbandsschlüssel to AGS (2013)

In [59]:
avs_trans = pd.read_csv(f'{path}/data/avs_transition.csv', converters={'AGS': str, 'AVS': str})
inkar_controls = inkar_pivot.merge(avs_trans, on='AVS')
inkar_controls['GV'] = np.where(inkar_controls['AGS'] == inkar_controls['AVS'], 0, 1)
del inkar_controls['AVS']
inkar_controls['GV'].value_counts()

1    210910
0    151892
Name: GV, dtype: int64

Inspect which obs are on Gemeindelevel

In [60]:
inkar_gv = inkar_controls[['AGS', 'GV']].groupby('AGS').first()
inkar_gv['GV'].value_counts()
#inkar_controls = inkar_controls[inkar_controls['GV'] == 0] # uncomment to keep only obs on Gemeindelevel
#inkar_controls.shape

1    7651
0    3177
Name: GV, dtype: int64

Import Regionalstatistik income datasets

In [61]:
# initialize income dictionary
income = {2007: 'Einkommen_2007.csv', 2010: 'Einkommen_2010.csv', 2013: 'Einkommen_2013.csv'}
# read in dfs
for year, file in income.items():
    income[year] = pd.read_csv(f'{path}/data/controls/{file}', encoding = 'ISO-8859-1', sep=';', converters={'AGS': str}, na_values=['-', 'x', '.'])

Income: Fix and change AGS to 2013, rename columns and compute income per capita

In [62]:
for year, df in income.items():
    # drop states
    income[year] = income[year][income[year]['AGS'].str.len() > 3]
    # fix AGS
    income[year]['AGS'] = income[year]['AGS'].astype(str)
    income[year]['AGS'] = np.where(income[year]['AGS'].str.len() < 6, income[year]['AGS'] + '0'*3, income[year]['AGS']) # add trailing 000 for kreisfreie Städte (AGS has 4-5 digits before, 7-8 after)
    income[year]['AGS'] = np.where(income[year]['AGS'].str.len() < 8, '0' + income[year]['AGS'], income[year]['AGS']) # add leading 0 for state ids < 10 (AGS has 7 digits)
    # read in AGS transition datasets for same year and previous year
    ags_trans_same = pd.read_csv(f'{path}/data/ags_transition.csv', usecols=[f'AGS_{year}', 'AGS_2013'], converters={f'AGS_{year}': str, 'AGS_2013': str})
    ags_trans_same.rename(columns = {f'AGS_{year}':'AGS_new'}, inplace = True)
    ags_trans_prev = pd.read_csv(f'{path}/data/ags_transition.csv', usecols=[f'AGS_{year-1}', 'AGS_2013'], converters={f'AGS_{year-1}': str, 'AGS_2013': str})
    ags_trans_prev.rename(columns = {f'AGS_{year-1}':'AGS_new'}, inplace = True)
    ags_trans = pd.concat([ags_trans_same, ags_trans_prev])
    # merge income[year] with ags trans
    income[year] = income[year].merge(ags_trans, left_on='AGS', right_on='AGS_new', how='left', indicator=True)
    print(income[year]._merge.value_counts())
    # if merged successfully, replace AGS
    income[year]['AGS'] = np.where(income[year]['AGS_2013'].notna(), income[year]['AGS_2013'], income[year]['AGS'])
    income[year] = income[year].drop(['AGS_2013', 'AGS_new', '_merge'], axis=1)
    # move AGS to front
    AGS = income[year].pop('AGS')
    income[year].insert(0, 'AGS', AGS)
    # rename columns
    col_names = ['AGS', 'year', 'GEN', 'taxable_persons', 'total_income', 'taxes']
    income[year].columns = col_names
    # compute income per capita
    income[year]['income_pc'] = np.log(income[year]['total_income'] / income[year]['taxable_persons'] * 1000)
    # drop duplicates
    income[year] = income[year].drop_duplicates(keep='first')
    # subset df
    income[year] = income[year][['AGS', 'year', 'income_pc']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income[year]['AGS'] = income[year]['AGS'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income[year]['AGS'] = np.where(income[year]['AGS'].str.len() < 6, income[year]['AGS'] + '0'*3, income[year]['AGS']) # add trailing 000 for kreisfreie Städte (AGS has 4-5 digits before, 7-8 after)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

both          28249
left_only      2065
right_only        0
Name: _merge, dtype: int64
both          32928
left_only      1777
right_only        0
Name: _merge, dtype: int64
both          32930
left_only      2565
right_only        0
Name: _merge, dtype: int64


Since CSDID uses the last (pre-treatment) value of controls, set income values for 2008 and 2009 to the nearest observed value

In [63]:
income[2008] = income[2007].copy()
income[2008]['year'] = 2008
income[2009] = income[2010].copy()
income[2009]['year'] = 2009

Concatenate income dfs

In [64]:
income = pd.concat(income.values())
income.head()

Unnamed: 0,AGS,year,income_pc
0,1001000,2007,10.219578
2,1002000,2007,10.198012
4,1003000,2007,10.231703
6,1004000,2007,10.178519
8,1051000,2007,10.267484


Import Religion and Foreign datasets

In [65]:
religion = pd.read_csv(f'{path}/data/controls/religion_2011.csv', encoding = 'ISO-8859-1', sep=';', converters={'AGS': str}, na_values=['-', 'x', '.'])
foreign = pd.read_csv(f'{path}/data/controls/nationalitaet_2011.csv', encoding = 'ISO-8859-1', sep=';', converters={'AGS': str}, na_values=['-', 'x', '.'])

Pivot religion df

In [66]:
religion = pd.pivot_table(religion, index=['AGS'], columns=['Religion'], values='Bevölkerung', aggfunc=np.sum, fill_value=0)
religion.rename(columns = {'Römisch-katholische Kirche': 'catholic'}, inplace = True)
religion = religion.reset_index()
religion = religion[['AGS', 'catholic']]
religion.head()

Religion,AGS,catholic
0,1,167565
1,10,619696
2,1001,5220
3,1002,17368
4,1003,17793


Convert AGS for religion and foreign to AGS in 2013

In [67]:
dfs = [foreign, religion]
# fix AGS
for i, df in enumerate(dfs):
    # drop states
    dfs[i] = dfs[i][dfs[i]['AGS'].str.len() > 3]
    dfs[i]['AGS'] = dfs[i]['AGS'].astype(str)
    dfs[i]['AGS'] = np.where(dfs[i]['AGS'].str.len() < 6, dfs[i]['AGS'] + '0'*3, dfs[i]['AGS']) # add trailing 000 for kreisfreie Städte (AGS has 4-5 digits before, 7-8 after)
    dfs[i]['AGS'] = np.where(dfs[i]['AGS'].str.len() < 8, '0' + dfs[i]['AGS'], dfs[i]['AGS']) # add leading 0 for state ids < 10 (AGS has 7 digits)
    # read in AGS transition datasets for same year and previous year
    ags_trans_same = pd.read_csv(f'{path}/data/ags_transition.csv', usecols=['AGS_2011', 'AGS_2013'], converters={'AGS_2011': str, 'AGS_2013': str})
    ags_trans_same.rename(columns = {'AGS_2011':'AGS_new'}, inplace = True)
    ags_trans_prev = pd.read_csv(f'{path}/data/ags_transition.csv', usecols=['AGS_2010', 'AGS_2013'], converters={'AGS_2010': str, 'AGS_2013': str})
    ags_trans_prev.rename(columns = {'AGS_2010':'AGS_new'}, inplace = True)
    ags_trans = pd.concat([ags_trans_same, ags_trans_prev])
    # merge dfs[i] with ags trans
    dfs[i] = dfs[i].merge(ags_trans, left_on='AGS', right_on='AGS_new', how='left', indicator=True)
    print(dfs[i]._merge.value_counts())
    # drop not merged
    dfs[i] = dfs[i][dfs[i]['_merge'] == 'both']
    # if merged successfully, replace AGS
    dfs[i]['AGS'] = np.where(dfs[i]['AGS_2013'].notna(), dfs[i]['AGS_2013'], dfs[i]['AGS'])
    dfs[i] = dfs[i].drop(['AGS_2013', 'AGS_new', '_merge'], axis=1)
    # move AGS to front
    AGS = dfs[i].pop('AGS')
    dfs[i].insert(0, 'AGS', AGS)
    # drop duplicates
    dfs[i] = dfs[i].drop_duplicates(keep='first')
religion, foreign = dfs

both          32928
left_only      1539
right_only        0
Name: _merge, dtype: int64
both          32928
left_only      1539
right_only        0
Name: _merge, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs[i]['AGS'] = dfs[i]['AGS'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfs[i]['AGS'] = np.where(dfs[i]['AGS'].str.len() < 6, dfs[i]['AGS'] + '0'*3, dfs[i]['AGS']) # add trailing 000 for kreisfreie Städte (AGS has 4-5 digits before, 7-8 after)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

Since religion and foreign data is only available in 2011 but for CSDiD I need data for 2008, 09, 10, 12, and 13 (which are reasonably close) set year to these values

In [68]:
dfs = [religion, foreign]
for i, df in enumerate(dfs):
    df_year = {2008: '', 2009: '', 2010: '', 2012:'', 2013:'', 2014:''}
    for year in df_year.keys(): 
        df_year[year] = dfs[i].copy()
        df_year[year]['year'] = year
        #print(df_year[year].columns)
    dfs[i] = pd.concat(df_year.values())
religion, foreign = dfs

Concatenate datasets and group

In [69]:
rs_dfs = [income, religion, foreign]
rs_controls = pd.concat(rs_dfs)
rs_controls = rs_controls.groupby(['AGS', 'year'], as_index=False).first()
rs_controls = rs_controls.sort_values(['AGS', 'year'])
rs_controls.shape

(84591, 7)

Compute share of foreigners and catholics, subset df

In [70]:
rs_controls['foreign'] = rs_controls['Ausländer/-innen'] / rs_controls['Bevölkerung'] * 100
rs_controls['catholic'] = rs_controls['catholic'] / rs_controls['Bevölkerung'] * 100
rs_controls = rs_controls[['AGS', 'year', 'income_pc', 'catholic', 'foreign']]
rs_controls = rs_controls[rs_controls['AGS'].str.len() == 8]
rs_controls.shape

(84591, 5)

Load BBSR datasets

In [71]:
files = ['ref-gemeinden-1990-2000.xlsx', 'ref-gemeinden-2000-2010.xlsx', 'ref-gemeinden-2010-2020.xlsx']
bbsr_dfs = {}
for year in range(1990, 2000):
   bbsr_dfs[year] = pd.read_excel(f'{path}/data/controls/{files[0]}', sheet_name=f'{year}-{year+1}', header=1, decimal = ',')
for year in range(2000, 2010):
   bbsr_dfs[year] = pd.read_excel(f'{path}/data/controls/{files[1]}', sheet_name=f'{year}-{year+1}', header=1, decimal = ',')
for year in range(2010, 2020):
    if year < 2016:
       bbsr_dfs[year] = pd.read_excel(f'{path}/data/controls/{files[2]}', sheet_name=f'{year}-{year+1}', header=1, decimal = ',')
    else: # header changed
       bbsr_dfs[year] = pd.read_excel(f'{path}/data/controls/{files[2]}', sheet_name=f'{year}-{year+1}', header=0, decimal = ',')

Rename and filter columns, Fix the AGS, insert year column

In [72]:
filtered_bbsr = {}
for year, df in bbsr_dfs.items():
    # for 1990-1996 there  is only data on the area and population
    if year <= 1996:
        col_names = ['AGS', 'Name', 'flächenprop.', 'bevölkerungsprop.', 'area_bbsr', 'pop_bbsr', 'Kennziffer.1', 'Name.1']
        bbsr_dfs[year].columns = col_names
        filtered_bbsr[year] = bbsr_dfs[year][[f'AGS', 'area_bbsr', 'pop_bbsr']]
    # for 1996-2020 there is also data on the number of employed
    elif year <= 2020:
        col_names = ['AGS', 'Name', 'flächenprop.', 'bevölkerungsprop.', 'beschäftigtenprop.', 'area_bbsr', 'pop_bbsr', 'employed_bbsr', 'Kennziffer.1', 'Name.1']
        bbsr_dfs[year].columns = col_names
        filtered_bbsr[year] = bbsr_dfs[year][[f'AGS', 'area_bbsr', 'pop_bbsr', 'employed_bbsr']]
    # insert year
    filtered_bbsr[year]['year'] = year
    # fix AGS
    filtered_bbsr[year]['AGS'] = filtered_bbsr[year]['AGS'].astype(str)
    filtered_bbsr[year]['AGS'] = np.where(filtered_bbsr[year]['AGS'].str.len() < 8, '0' + filtered_bbsr[year]['AGS'], filtered_bbsr[year]['AGS'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_bbsr[year]['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_bbsr[year]['AGS'] = filtered_bbsr[year]['AGS'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_bbsr[year]['AGS'] = np.where(filtered_bbsr[year]['AGS'].str.len() < 8, '0' + filtered_bbsr[ye

Convert AGS to 2013

In [73]:
for year, df in filtered_bbsr.items():
    # skip 2013
    if year == 2013:
        continue
    # read in AGS transition dataset
    ags_trans = pd.read_csv(f'{path}/data/ags_transition.csv', usecols=[f'AGS_{year}', 'AGS_2013'], converters={f'AGS_{year}': str, 'AGS_2013': str})
    # merge filtered_bbsr[year] with ags trans
    filtered_bbsr[year] = filtered_bbsr[year].merge(ags_trans, left_on='AGS', right_on=f'AGS_{year}', how='left', indicator=True)
    print(print(filtered_bbsr[year]._merge.value_counts()))
    # delete old AGS and rename
    filtered_bbsr[year]['AGS'] = np.where(filtered_bbsr[year]['AGS_2013'].notna(), filtered_bbsr[year]['AGS_2013'], filtered_bbsr[year]['AGS'])
    filtered_bbsr[year] = filtered_bbsr[year].drop(['AGS_2013', f'AGS_{year}', '_merge'], axis=1)
    # move AGS to front
    AGS = filtered_bbsr[year].pop('AGS')
    filtered_bbsr[year].insert(1, 'AGS', AGS)

both          16620
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16480
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16479
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16497
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16473
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16479
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16475
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16479
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16475
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16501
left_only         0
right_only        0
Name: _merge, dtype: int64
None
both          16473
left_only         0
right_only        0
Name: _merge, dtype:

Concatenate all BBSR dfs

In [74]:
bbsr_controls = pd.concat(filtered_bbsr.values())
bbsr_controls = bbsr_controls.sort_values(['AGS', 'year'])

Concatenate INKAR, RS and BBSR dfs

In [84]:
all_controls = pd.concat([inkar_controls, rs_controls, bbsr_controls])
all_controls.shape

(938180, 14)

In [85]:
grouped_controls = all_controls.groupby(['AGS', 'year'], as_index=False).first()
grouped_controls.shape

(343047, 14)

Combine variables

In [86]:
controls = grouped_controls.copy()
# Population: Inkar has exact number from 95-17 (but some on Gemeindeverbandslevel), 
# BBSR has in thousands -96, in hundreds from 96 (despite header saying it is in thousands)
controls['pop'] = np.where(controls['year'] <= 1996, controls['pop_bbsr'] * 1000, 0)
controls['pop'] = np.where(controls['year'] > 1996, controls['pop_bbsr'] * 100, controls['pop'])
# pop density can now be calculated since we have area and population for all years
controls['pop_density'] = controls['pop'] / controls['area_bbsr']
# share of employed
controls['employed'] = controls['employed_bbsr'] / controls['pop_bbsr'] * 100
# share of unemployed
controls['unemployed'] = controls['unemployed_inkar'] / controls['pop_inkar'] * 100
# avg_age: inkar has more data
controls['avg_age'] = controls['avg_age_inkar']
# same for share of females
controls['female'] = controls['female_inkar']

Inspect means to see if values makes sense

In [87]:
all_means = controls[['AGS', 'year', 'pop_density', 'employed', 'unemployed', 'foreign', 'catholic', 'avg_age', 'female', 'income_pc']].groupby('year').mean()
# Everything but employed looks fine, unemployed here is the share over total population therefore lower than official figures

In [88]:
# CSDiD needs controls for all years --> drop 'foreign', 'catholic', 'income_pc'
controls = controls[['AGS', 'year', 'pop_density', 'unemployed', 'avg_age', 'female']]
controls.head()

Unnamed: 0,AGS,year,pop_density,unemployed,avg_age,female
0,1001000,1990,1536.550745,,,
1,1001000,1991,1547.196593,,,
2,1001000,1992,1550.850461,,,
3,1001000,1993,1559.000709,,,
4,1001000,1994,1558.114812,,,


Export df

In [89]:
controls = controls.set_index(['AGS', 'year'])
controls.to_csv(f'{path}/data/controls.csv', encoding = 'utf-8-sig')

Since I cannot estimate LTW models for BY and HE in 2018 due to missing data for unemployed, female and avg_age forward fill values from 2017

In [90]:
controls['unemployed'] = controls['avg_age'].fillna(method='ffill', axis='index', limit=1)
controls['avg_age'] = controls['avg_age'].fillna(method='ffill', axis='index', limit=1)
controls['female'] = controls['female'].fillna(method='ffill', axis='index', limit=1)

In [91]:
controls.to_csv(f'{path}/data/controls_ltw.csv', encoding = 'utf-8-sig')