In [2]:
# NOTE(review): hard-coded, machine-specific absolute path. The relative paths used
# later in this notebook ("output", "./data/siab.csv", "./output/...") resolve against
# this directory, so it has to be adjusted before running on any other machine.
%cd /Users/julia/Desktop/Master_Thesis/Example_Code/fairml-multiverse/Kern24_code

/Users/julia/Desktop/Master_Thesis/Example_Code/fairml-multiverse/Kern24_code


In [3]:
import os
# Show the directory that relative paths will be resolved against.
cwd = os.getcwd()
print("Current working directory:", cwd)

# Create the export folder up front; an already existing folder is not an error.
os.makedirs("output", exist_ok=True)

Current working directory: /Users/julia/Desktop/Master_Thesis/Example_Code/fairml-multiverse/Kern24_code


## Setup

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

## 01 Data Checks

In [5]:
# Read the SIAB extract from ./data (relative to the working directory) into a DataFrame.
siab = pd.read_csv(filepath_or_buffer="./data/siab.csv")

In [6]:
# Summary statistics for every column, regardless of dtype
siab.describe(include="all")

Unnamed: 0,id,year,dummy,ltue,frau1,maxdeutsch1,maxdeutsch.Missing.,f3,f4,f5,...,f150,f151,f152,f153,f154,f155,f156,f157,f158,f159
count,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0,...,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0,280000.0
mean,140000.5,2013.0,0.4997943,0.147425,0.501504,0.699771,0.100107,-0.002346,-0.00165,0.000986,...,-0.000553,-0.001988,-0.004059,-0.000486,-0.005803,0.001333,0.001124,0.000551,-0.002378,-0.000383
std,80829.182024,2.000004,0.288584,0.35453,0.499999,0.458358,0.300143,0.99889,0.999567,1.000201,...,1.000628,0.998795,1.000278,1.001334,1.000473,1.001695,1.001337,0.999421,0.999304,0.999806
min,1.0,2010.0,4.644453e-08,0.0,0.0,0.0,0.0,-4.274902,-4.575239,-4.621985,...,-4.400128,-4.386344,-4.666788,-4.480229,-4.35411,-4.253507,-4.537547,-4.538731,-4.829286,-4.509095
25%,70000.75,2011.0,0.2501032,0.0,0.0,0.0,0.0,-0.677123,-0.675971,-0.6718,...,-0.676572,-0.677664,-0.679656,-0.675132,-0.68003,-0.675568,-0.674538,-0.673398,-0.676792,-0.674914
50%,140000.5,2013.0,0.5001834,0.0,1.0,1.0,0.0,-0.000918,-0.002643,-0.000555,...,-0.002162,-0.00041,-0.003141,-0.000478,-0.006461,0.001353,0.000653,-0.000796,-0.003642,0.000452
75%,210000.25,2015.0,0.7491143,0.0,1.0,1.0,0.0,0.669508,0.673708,0.674064,...,0.676748,0.67391,0.671253,0.675551,0.669931,0.677375,0.673302,0.67541,0.670351,0.673879
max,280000.0,2016.0,0.9999962,1.0,1.0,1.0,1.0,4.813692,4.699485,5.31922,...,4.790722,4.621271,4.636764,4.282897,5.108427,5.128665,4.519926,4.253741,4.46067,4.477771


In [7]:
# Number of missing entries per column
siab.isnull().sum()

id       0
year     0
dummy    0
ltue     0
frau1    0
        ..
f155     0
f156     0
f157     0
f158     0
f159     0
Length: 164, dtype: int64

In [8]:
# Partition siab by the "year" column: `grouped` is a GroupBy object
# holding one group per distinct year value.
grouped = siab.groupby(by="year")

In [9]:
# Draw a fixed-size random sample of rows from every year.
#
# Each year is sampled independently with the same seed, exactly as the previous
# `grouped.apply(lambda x: x.sample(...))` did, so the sampled rows are unchanged.
# The groups are iterated explicitly instead of going through `GroupBy.apply`,
# because `apply` operating on the grouping column is deprecated (pandas 2.2) and
# newer pandas versions no longer pass that column to the function. In that case
# the `year` column would be lost once the index is reset.
# The result keeps a (year, original row label) MultiIndex, as before.
n_per_year = 5000   # rows sampled from each year
sample_seed = 42    # re-used for every year

siab_s = pd.concat(
    {
        year: year_rows.sample(n=n_per_year, random_state=sample_seed)
        for year, year_rows in grouped
    },
    names=["year"],
)

In [10]:
# Throw away the (year, row label) multi-index left over from the per-year sampling
# step and fall back to a plain 0..n-1 RangeIndex. The old index is dropped, not
# kept as columns.
siab_s = siab_s.reset_index(drop=True)  # ungroup

In [11]:
# Per-year descriptive statistics of the sampled data
siab_s.groupby(by="year").describe(include="all")

Unnamed: 0_level_0,id,id,id,id,id,id,id,id,dummy,dummy,...,f158,f158,f159,f159,f159,f159,f159,f159,f159,f159
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010,5000.0,19892.4062,11646.590611,2.0,9564.25,19861.0,29995.25,39993.0,5000.0,0.496591,...,0.669483,3.455664,5000.0,0.001485,1.004769,-3.732094,-0.68304,0.002983,0.656852,3.266133
2011,5000.0,59892.4062,11646.590611,40002.0,49564.25,59861.0,69995.25,79993.0,5000.0,0.505404,...,0.651727,3.215624,5000.0,-0.013944,0.987305,-3.89877,-0.674576,-0.016575,0.662719,3.258793
2012,5000.0,99892.4062,11646.590611,80002.0,89564.25,99861.0,109995.25,119993.0,5000.0,0.498969,...,0.673922,3.637529,5000.0,0.014396,1.01676,-3.352797,-0.664105,0.016133,0.703997,3.943466
2013,5000.0,139892.4062,11646.590611,120002.0,129564.25,139861.0,149995.25,159993.0,5000.0,0.49565,...,0.667805,3.69964,5000.0,-0.012504,0.99734,-3.479955,-0.69378,-0.015432,0.674611,3.599463
2014,5000.0,179892.4062,11646.590611,160002.0,169564.25,179861.0,189995.25,199993.0,5000.0,0.497612,...,0.691988,3.928698,5000.0,0.026836,0.973741,-3.371933,-0.617121,0.03034,0.681748,3.233969
2015,5000.0,219892.4062,11646.590611,200002.0,209564.25,219861.0,229995.25,239993.0,5000.0,0.49816,...,0.661263,3.810956,5000.0,-0.00623,1.009323,-3.234284,-0.672487,-0.017902,0.672486,4.306001
2016,5000.0,259892.4062,11646.590611,240002.0,249564.25,259861.0,269995.25,279993.0,5000.0,0.502798,...,0.686296,3.874077,5000.0,-0.006095,0.995972,-3.472734,-0.681238,-0.023363,0.667176,3.341128


## 02 Data Split

# Train with 2010–2015 (full training set) | 2015 only (single-year training set)
# Test with 2016

In [12]:
# Full training set: all sampled rows from years before 2016
is_train_year = siab_s["year"] < 2016
siab_train = siab_s[is_train_year]

In [13]:
# Single-year training set: sampled rows from 2015 only
siab_train_s = siab_s[siab_s["year"].eq(2015)]

In [14]:
# Test set: all rows from 2016.
# NOTE(review): this filters the full `siab` frame, not the per-year sample `siab_s`
# used for the training sets - confirm that testing on the unsampled 2016 data is intended.
siab_test = siab[siab["year"].eq(2016)]

In [15]:
# Feature matrix for the full training set (2010-2015), protected attributes included.
# The non-feature columns are removed by name instead of the former positional slice
# `iloc[:, 4:164]`, so the selection no longer depends on the exact column order or
# column count of the CSV. With the current layout (id, year, dummy, ltue, then the
# features) the result is identical.
X_train_f = siab_train.drop(columns=['id', 'year', 'dummy', 'ltue'])

In [16]:
# Feature matrix for the 2015-only training set, protected attributes included.
# The non-feature columns are removed by name instead of the former positional slice
# `iloc[:, 4:164]`, so the selection no longer depends on the exact column order or
# column count of the CSV. With the current layout (id, year, dummy, ltue, then the
# features) the result is identical.
X_train_fs = siab_train_s.drop(columns=['id', 'year', 'dummy', 'ltue'])

In [17]:
# Protected attributes that are removed for the variants without protected features.
protected_cols = ['frau1', 'maxdeutsch1', 'maxdeutsch.Missing.']

# Drop protected attributes, data from 2010-2015
X_train_s = X_train_f.drop(columns=protected_cols)
# Drop protected attributes, data from 2015
X_train_ss = X_train_fs.drop(columns=protected_cols)

# Target `ltue`, selected by name rather than by position (`iloc[:, [3]]`) so it does
# not depend on the column order. The double brackets keep a one-column DataFrame,
# as before.
# ltue from siab_train, i.e. 2010-2015
y_train = siab_train[['ltue']]
# ltue from siab_train_s, i.e. 2015
y_train_s = siab_train_s[['ltue']]

In [18]:
# Feature matrix for the 2016 test data, protected attributes included.
# The non-feature columns are removed by name instead of the former positional slice
# `iloc[:, 4:164]`, so the selection no longer depends on the exact column order or
# column count of the CSV. With the current layout the result is identical.
X_test_f = siab_test.drop(columns=['id', 'year', 'dummy', 'ltue'])
# Drop protected attributes, data from 2016
X_test_s = X_test_f.drop(columns=['frau1', 'maxdeutsch1', 'maxdeutsch.Missing.'])
# Target `ltue` from siab_test (2016), selected by name instead of `iloc[:, [3]]`.
# The double brackets keep a one-column DataFrame, as before.
y_test = siab_test[['ltue']]

## 03 Descriptive Stats

In [19]:
# Stack the training rows (siab_train) and the 2016 test rows (siab_test) into one
# new frame with a fresh 0..n-1 index. `pd.concat` already returns a new object, so
# siab_train itself is left untouched by the columns added to siab_t afterwards.
# NOTE(review): siab_train comes from the per-year sample while siab_test is taken from
# the full data, so 2016 contributes more rows than the other years - confirm intended.
siab_t = pd.concat([siab_train, siab_test], ignore_index=True)

In [20]:
# Indicator column `nongerman`: 1 where maxdeutsch1 equals 0, else 0
# (presumably maxdeutsch1 == 0 means "not proficient in German" - confirm against the codebook).
lacks_german = siab_t['maxdeutsch1'] == 0
siab_t['nongerman'] = np.where(lacks_german, 1, 0)

In [21]:
# Blank out `nongerman` (set it to NaN) wherever maxdeutsch.Missing. is 1, i.e. where
# the language information is flagged as missing.
# `Series.mask` returns a new float column with NaN in the flagged rows. The former
# `.loc[...] = np.nan` wrote NaN into an integer column in place, which relies on
# silent upcasting: recent pandas versions warn about this and are slated to raise.
language_info_missing = siab_t['maxdeutsch.Missing.'] == 1
siab_t['nongerman'] = siab_t['nongerman'].mask(language_info_missing)

In [22]:
# Indicator column `nongerman_male`: 1 where nongerman is 1 and frau1 is 0, else 0
# (presumably frau1 == 0 means male - confirm against the codebook).
# NOTE(review): rows whose `nongerman` is NaN compare unequal to 1 and therefore get 0
# here rather than NaN, unlike `nongerman` itself - confirm that this is intended.
is_nongerman = siab_t['nongerman'] == 1
is_male = siab_t['frau1'] == 0
siab_t['nongerman_male'] = np.where(is_nongerman & is_male, 1, 0)

In [23]:
# Indicator column `nongerman_female`: 1 where nongerman is 1 and frau1 is 1, else 0
# (presumably frau1 == 1 means female - confirm against the codebook).
# NOTE(review): rows whose `nongerman` is NaN compare unequal to 1 and therefore get 0
# here rather than NaN, unlike `nongerman` itself - confirm that this is intended.
is_nongerman = siab_t['nongerman'] == 1
is_female = siab_t['frau1'] == 1
siab_t['nongerman_female'] = np.where(is_nongerman & is_female, 1, 0)

In [24]:
# Mean of `ltue` within each year of the combined data: one row per year,
# a single column `ltue`.
desc1 = siab_t.groupby('year')[['ltue']].mean()

In [25]:
# Write desc1 (mean LTUE over time) as a LaTeX table into the output folder,
# with floats formatted to three decimals.
desc1.to_latex(buf='./output/desc1.tex', float_format="%.3f")

  desc1.to_latex('./output/desc1.tex', float_format = "%.3f") # Mean LTUE over time


In [26]:
# Socio-demographic indicator columns summarised in the three tables below.
socio_cols = ['frau1', 'nongerman', 'nongerman_male', 'nongerman_female']

# Per-year view of those indicator columns, shared by the first two tables.
by_year = siab_t.groupby(['year'])[socio_cols]

# Sum and non-missing count of each indicator per year
desc2a = by_year.agg(['sum', 'count'])

# Mean of each indicator per year
desc2b = by_year.mean()

# Mean and non-missing count of each indicator per (year, ltue) combination
desc2c = siab_t.groupby(['year', 'ltue'])[socio_cols].agg(['mean', 'count'])


In [27]:
# Write the three socio-demographic tables as LaTeX files (floats with three decimals).
latex_exports = [
    (desc2a, './output/desc2a.tex'),  # Number of cases over time
    (desc2b, './output/desc2b.tex'),  # Socio-demo over time
    (desc2c, './output/desc2c.tex'),  # Socio-demo by LTUE over time
]
for table, tex_path in latex_exports:
    table.to_latex(tex_path, float_format="%.3f")


  desc2a.to_latex('./output/desc2a.tex', float_format = "%.3f") # Number of cases over time
  desc2b.to_latex('./output/desc2b.tex', float_format = "%.3f") # Socio-demo over time
  desc2c.to_latex('./output/desc2c.tex', float_format = "%.3f") # Socio-demo by LTUE over time


## Save 

In [28]:
# Persist every train/test split as CSV in the output folder. The row index is not
# written. Files are written in the order listed: training sets first, then test sets.
csv_exports = [
    # training data
    (X_train_f, './output/X_train_f.csv'),
    (X_train_fs, './output/X_train_fs.csv'),
    (X_train_s, './output/X_train_s.csv'),
    (X_train_ss, './output/X_train_ss.csv'),
    (y_train, './output/y_train.csv'),
    (y_train_s, './output/y_train_s.csv'),
    # test data
    (X_test_f, './output/X_test_f.csv'),
    (X_test_s, './output/X_test_s.csv'),
    (y_test, './output/y_test.csv'),
]
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)