## Setup

In [None]:
# Change the kernel's working directory to ~/cma/CMA_Fairness_v2 so that the
# relative paths used further down (data/raw/..., ./output/...) resolve from there.
# NOTE(review): presumably this is the project checkout — adjust for your machine.
%cd ~/cma/CMA_Fairness_v2

In [None]:
import os
# Echo the directory the kernel is running in, as a sanity check on relative paths.
cwd = os.getcwd()
print("Current working directory:", cwd)

In [None]:
import numpy as np
import pandas as pd

## 01 Data Checks

In [None]:
# Read the raw siab CSV into a DataFrame; the path is relative to the
# current working directory.
raw_csv = "data/raw/siab.csv"
siab = pd.read_csv(raw_csv)

In [None]:
# Summary statistics for every column (numeric and non-numeric alike)
siab.describe(include="all")

In [None]:
# Missing values per column
siab.isnull().sum()

In [None]:
# Draw a reproducible random sample of 5,000 rows from each year.
# Every year's rows are sampled with the same seed (42); the pieces are then
# stacked in ascending year order onto a fresh 0..n-1 index.
# The groups are iterated explicitly instead of going through groupby.apply:
# newer pandas versions exclude the grouping column inside apply, which
# (combined with dropping the index) would remove 'year' from the result and
# break later groupby('year') calls on siab_s.
# Note: sample() raises ValueError if a year has fewer than 5,000 rows.
grouped = siab.groupby('year')
siab_s = pd.concat(
    [rows.sample(n = 5000, random_state = 42) for _, rows in grouped],
    ignore_index = True, # Ungroup: discard the original row labels
)

In [None]:
# Per-year summary statistics of the sampled data, covering all columns
siab_s.groupby(by="year").describe(include="all")

## 02 Descriptive Stats

In [None]:
# Work on an independent copy: a plain assignment would only alias `siab`,
# so adding columns to siab_t would silently modify the raw frame as well
# and make earlier cells non-reproducible when re-run.
siab_t = siab.copy()

In [None]:
# Derive nationality / gender indicator columns on siab_t.
#
# nongerman: 1.0 where maxdeutsch1 == 0, 0.0 otherwise, and NaN for any row
# where maxdeutsch.Missing. == 1. The column is built as float in one step so
# that NaN never has to be written into an integer column afterwards (newer
# pandas warns about / rejects that implicit upcast), and so the cell is
# idempotent when re-run.
nationality_missing = siab_t['maxdeutsch.Missing.'] == 1
siab_t['nongerman'] = np.where(
    nationality_missing,
    np.nan,
    np.where(siab_t['maxdeutsch1'] == 0, 1.0, 0.0),
)

# nongerman_male / nongerman_female: 1 when nongerman == 1 and frau1 is 0 / 1
# respectively, else 0.
# NOTE(review): rows with nongerman = NaN get 0 (not NaN) in both flags,
# because NaN == 1 evaluates to False — confirm this is intended.
is_nongerman = siab_t['nongerman'] == 1
siab_t['nongerman_male'] = np.where(is_nongerman & (siab_t['frau1'] == 0), 1, 0)
siab_t['nongerman_female'] = np.where(is_nongerman & (siab_t['frau1'] == 1), 1, 0)

In [None]:
# Mean LTUE over time: average of 'ltue' within each year, exported as a
# LaTeX table with three decimals.
desc1 = siab_t.groupby('year')[['ltue']].mean()
desc1.to_latex('./output/desc1.tex', float_format = "%.3f")

In [None]:
# Socio-demographic indicator columns summarised in the three tables below
demo_cols = ['frau1', 'nongerman', 'nongerman_male', 'nongerman_female']

# Per year: sum and non-missing count of each indicator
desc2a = siab_t.groupby(['year'])[demo_cols].agg(['sum', 'count'])
# Per year: mean of each indicator
desc2b = siab_t.groupby(['year'])[demo_cols].mean()
# Per year and ltue value: mean and non-missing count of each indicator
desc2c = siab_t.groupby(['year', 'ltue'])[demo_cols].agg(['mean', 'count'])

In [None]:
# Write the three descriptive tables to LaTeX, all with three-decimal floats.
latex_targets = {
    './output/desc2a.tex': desc2a, # Number of cases over time
    './output/desc2b.tex': desc2b, # Socio-demo over time
    './output/desc2c.tex': desc2c, # Socio-demo by LTUE over time
}
for tex_path, table in latex_targets.items():
    table.to_latex(tex_path, float_format = "%.3f")