First, import needed modules

In [1]:
import numpy as np
import pandas as pd
import os

Initalize variables

In [2]:
path = os.getcwd()
print(path)

/Users/jan/Dropbox/UP_EPQM/2222/MA/powerlinemonsters


Import datasets

In [30]:
controls_raw = pd.read_csv(f'{path}/data/controls/inkar_2021.csv', encoding='UTF-8-SIG')

Rename columns

In [31]:
col_names = ['bereich', 'indikator', 'AVS', 'year', 'wert']
controls_raw.columns = col_names
controls_raw.columns

Index(['bereich', 'indikator', 'AVS', 'year', 'wert'], dtype='object')

Fix the AGS

In [32]:
controls_raw['AVS'] = controls_raw['AVS'].astype(str)
controls_raw['AVS'] = np.where(controls_raw['AVS'].str.len() < 8, '0' + controls_raw['AVS'], controls_raw['AVS'])

Convert Wert to numeric

In [33]:
controls_raw['value'] = controls_raw['wert'].str.replace(',','.').astype(float)
controls_raw.dtypes

bereich       object
indikator     object
AVS           object
year           int64
wert          object
value        float64
dtype: object

In [7]:
list(controls_raw['indikator'].unique())

['Langzeitarbeitslose',
 'Beschäftigtenquote',
 'Erholungsfläche je Einwohner',
 'Siedlungs- und Verkehrsfläche',
 'Bevölkerung (mit Korrektur VZ 1987/Zensus 2011)',
 'Bevölkerung gesamt',
 'Bevölkerung männlich',
 'Bevölkerung weiblich',
 'Einkommensteuer',
 'Frauenanteil',
 'Geborene',
 'Gesamtwanderungssaldo',
 'Gestorbene',
 'Gewerbesteuer',
 'Natürlicher Saldo',
 'Steuereinnahmen',
 'Steuerkraft',
 'Umsatzsteuer',
 'Bodenfläche gesamt',
 'Einwohnerdichte',
 'Neubauwohnungen in Ein- und Zweifamilienhäusern je Einwohner',
 'Neubauwohnungen in Mehrfamilienhäusern',
 'Auspendler',
 'Einpendler',
 'Pendlersaldo',
 'sozialversicherungspflichtig Beschäftigte am Arbeitsort',
 'sozialversicherungspflichtig Beschäftigte am Wohnort',
 'Anteil jüngere Arbeitslose',
 'Anteil männliche jüngere Arbeitslose',
 'Anteil männliche ältere Arbeitslose',
 'Anteil weibliche jüngere Arbeitslose',
 'Anteil weibliche ältere Arbeitslose',
 'Anteil ältere Arbeitslose',
 'Arbeitslose',
 'Arbeitslose Frauen',


In [34]:
vars_to_keep = ['Bevölkerung gesamt', 'Frauenanteil', 'Einwohnerdichte',  'Arbeitslose', 'Durchschnittsalter der Bevölkerung']
controls_selected = controls_raw[controls_raw['indikator'].isin(vars_to_keep)]

Pivot df

In [35]:
controls_pivot = pd.pivot(controls_selected, index=['AVS', 'year'], columns='indikator', values='value')
controls_pivot = controls_pivot.reset_index()
controls_pivot.shape

(106214, 7)

In [36]:
controls_pivot.columns

Index(['AVS', 'year', 'Arbeitslose', 'Bevölkerung gesamt',
       'Durchschnittsalter der Bevölkerung', 'Einwohnerdichte',
       'Frauenanteil'],
      dtype='object', name='indikator')

Rename columns and compute statistics

In [37]:
col_names = ['AVS', 'year', 'unemployed', 'pop', 'avg_age', 'pop_density', 'female']
controls_pivot.columns = col_names

In [41]:
controls = controls_pivot.copy()
controls['unemployed'] = controls['unemployed'] / controls['pop'] * 100
del controls['pop']

Change Verbandsschlüssel to AGS

In [42]:
avs_trans = pd.read_csv(f'{path}/data/avs_transition.csv', converters={'AGS': str, 'AVS': str})

In [43]:
controls = controls.merge(avs_trans, on='AVS')
del controls['AVS']

Export dataset

In [44]:
controls = controls.set_index(['AGS', 'year'])
controls.to_csv(f'{path}/data/controls.csv', encoding = 'utf-8-sig')
controls.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,unemployed,avg_age,pop_density,female
AGS,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001000,1995,,,,51.88
1001000,1996,,,1534.9,51.86
1001000,1997,,,,51.69
1001000,1998,6.825423,,,51.73
1001000,1999,6.104276,,,51.72


In [None]:
controls['pop_density'] = controls['Bevölkerung'] / controls['Area']
controls['male'] = controls['männlich'] / controls['Bevölkerung']
controls['female'] = controls['weiblich'] / controls['Bevölkerung']
controls['foreign'] = controls['Ausländer/-innen'] / controls['Bevölkerung']
controls['employed'] = controls['Beschäftigte'] / controls['Bevölkerung']
controls['unemployed'] = controls['Arbeitslose'] / controls['Bevölkerung']
controls['protestant'] = controls['Evangelisch'] / controls['Bevölkerung']
controls['catholic'] = controls['Katholisch'] / controls['Bevölkerung']
controls['religion_other'] = controls['Religion Sonstige/Keine'] / controls['Bevölkerung']
controls = controls[['AGS', 'pop_density', 'female', 'foreign', 'unemployed', 'avg_income', 'avg_age', 'catholic']]
controls.head()

Unnamed: 0,AGS,pop_density,female,foreign,unemployed,avg_income,avg_age,catholic
0,1000000,177.227657,0.514134,0.041805,0.03573,32200.0,44.5,0.059842
1,1001000,1449.735636,0.507233,0.062353,0.062328,27000.0,43.0,0.063459
2,1002000,1987.206068,0.518602,0.070281,0.053999,27500.0,41.8,0.073661
3,1003000,981.770225,0.525299,0.062562,0.052348,27600.0,44.9,0.084606
4,1004000,1078.444786,0.511838,0.060337,0.056648,26200.0,44.4,0.064972
