In [1]:
# Switch to the project root so that the relative paths used further down
# ("data/...", "./output/...") resolve.
# NOTE(review): the location is relative to the current user's home directory;
# adjust it when running the notebook elsewhere.
%cd ~/cma/CMA_Fairness_v2

/dss/dsshome1/0C/ra93lal2/cma/CMA_Fairness_v2


  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import os

# Show the directory the kernel is currently running in.
working_dir = os.getcwd()
print("Current working directory:", working_dir)

Current working directory: /dss/dsshome1/0C/ra93lal2/cma/CMA_Fairness_v2


## Setup

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

## 01 Data Checks

In [4]:
# Read the raw input table; the path is relative to the working directory.
siab = pd.read_csv(filepath_or_buffer="data/raw/siab.csv")

In [None]:
# Summary statistics for every column, regardless of dtype
siab.describe(include="all")

In [None]:
# Missing values per column
siab.isnull().sum()

In [None]:
# Draw 15,000 random rows from every year. Each year is sampled with the same
# seed (42), so the draw is reproducible.
#
# The groups are iterated explicitly instead of going through `groupby.apply`:
# each sub-frame handed to `sample` then still contains the `year` column.
# `groupby.apply` operating on the grouping column is deprecated since
# pandas 2.2; once `year` is no longer passed to the applied function, dropping
# the group index afterwards would lose the column that the later cells rely on.
grouped = siab.groupby('year')
siab_s = pd.concat(
    [year_rows.sample(n = 15000, random_state = 42) for _, year_rows in grouped],
    ignore_index = True,  # flat 0..n-1 index instead of the per-group row labels
)

In [None]:
# Per-year summary statistics of the sampled data, all columns included
siab_s.groupby("year").describe(include="all")

## 02 Data Split

Train on 2010–2014 (rows taken from the 15,000-per-year sample `siab_s`), calibrate on 2015, and test on 2016 (both taken from the full `siab` data).

In [None]:
# Temporal split: years before 2015 for training, 2015 for calibration,
# 2016 for testing.
# NOTE(review): the training rows are taken from the per-year sample `siab_s`,
# whereas the calibration and test rows are taken from the unsampled `siab`
# -- confirm that this difference is intended.
siab_train = siab_s.loc[siab_s["year"] < 2015]
siab_calib = siab.loc[siab["year"] == 2015]
siab_test = siab.loc[siab["year"] == 2016]

In [None]:
# Training features: the columns at positions 4..163 of the training rows.
# Nothing is dropped here, so protected attributes inside that range are kept.
X_train = siab_train.iloc[:, 4:164]

In [None]:
# Drop protected attributes, data from 2010-2014
#X_train_s = X_train.drop(
#    columns = ['frau1', 'maxdeutsch1', 'maxdeutsch.Missing.'])

In [None]:
# Training target: the column at position 3 of the training rows, selected with
# a list so that the result stays a one-column DataFrame.
# The original note names this column `ltue` -- the positional index depends on
# the column order of the raw file, TODO confirm.
y_train = siab_train.iloc[:, [3]]

In [None]:
# Calibration data (the 2015 rows).

# Target: column at position 3 (`ltue` according to the original note), kept as
# a one-column DataFrame.
y_calib = siab_calib.iloc[:, [3]]

# Features: columns at positions 4..163. Nothing is dropped, so protected
# attributes inside that range are kept.
X_calib = siab_calib.iloc[:, 4:164]

In [None]:
# Test data (the 2016 rows).

# Target: column at position 3 (`ltue` according to the original note), kept as
# a one-column DataFrame.
y_test = siab_test.iloc[:, [3]]

# Features: columns at positions 4..163.
X_test = siab_test.iloc[:, 4:164]

## 03 Descriptive Stats

In [None]:
# Stack the training, calibration and test rows into one frame that feeds the
# descriptive tables below. `pd.concat` returns a new frame, so the columns
# added to `siab_t` afterwards do not touch the three split frames; no separate
# copy of `siab_train` is needed beforehand.
# NOTE(review): `siab_train` stems from the per-year sample while the other two
# frames stem from the full data, so per-year counts are not comparable across
# the splits -- confirm this is intended for the descriptive statistics.
siab_t = pd.concat([siab_train, siab_calib, siab_test], ignore_index=True)

In [None]:
# `nongerman`: 1 where maxdeutsch1 == 0, 0 otherwise, and NaN wherever
# maxdeutsch.Missing. == 1.
# The column is built as float with the NaNs in a single step. Creating an
# integer 0/1 column first and writing NaN into it afterwards relies on implicit
# upcasting, which pandas has deprecated (FutureWarning since pandas 2.1).
siab_t['nongerman'] = np.where(
    siab_t['maxdeutsch.Missing.'] == 1,
    np.nan,
    np.where(siab_t['maxdeutsch1'] == 0, 1.0, 0.0),
)

# Interactions of `nongerman` with `frau1` (frau1 == 0 feeds the "male" flag,
# frau1 == 1 the "female" flag).
# Note: NaN == 1 evaluates to False, so rows with missing `nongerman` get 0 in
# both flags rather than NaN.
siab_t['nongerman_male'] = np.where((siab_t['nongerman'] == 1) & (siab_t['frau1'] == 0), 1, 0)
siab_t['nongerman_female'] = np.where((siab_t['nongerman'] == 1) & (siab_t['frau1'] == 1), 1, 0)

In [None]:
# Mean LTUE over time: average of `ltue` within each year, exported as a LaTeX
# table with three decimals.
desc1 = siab_t.groupby('year')[['ltue']].mean()
desc1.to_latex('./output/desc1.tex', float_format = "%.3f")

In [None]:
# Socio-demographic indicators summarised in the three tables below
demo_cols = ['frau1', 'nongerman', 'nongerman_male', 'nongerman_female']

# Per year: sum and number of non-missing values of each indicator
desc2a = siab_t.groupby(['year'])[demo_cols].agg(['sum', 'count'])
# Per year: mean of each indicator
desc2b = siab_t.groupby(['year'])[demo_cols].mean()
# Per year and `ltue` value: mean and number of non-missing values
desc2c = siab_t.groupby(['year', 'ltue'])[demo_cols].agg(['mean', 'count'])

In [None]:
# Export the three tables as LaTeX, three decimals each
for table, tex_path in (
    (desc2a, './output/desc2a.tex'),  # Number of cases over time
    (desc2b, './output/desc2b.tex'),  # Socio-demo over time
    (desc2c, './output/desc2c.tex'),  # Socio-demo by LTUE over time
):
    table.to_latex(tex_path, float_format = "%.3f")

## Save 

In [None]:
# Write the feature matrix and the target of every split to CSV, without the
# row index. The exports of the variants without protected attributes
# (X_train_s, X_calib_s, X_test_s) are disabled.
for frame, csv_path in (
    (X_train, './data/X_train.csv'),
    (y_train, './data/y_train.csv'),
    (X_calib, './data/X_calib.csv'),
    (y_calib, './data/y_calib.csv'),
    (X_test, './data/X_test.csv'),
    (y_test, './data/y_test.csv'),
):
    frame.to_csv(csv_path, index = False)