In [1]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import os
import wget
from sklearn.metrics import accuracy_score
from utils import *

## Overview
We have a database (ULL_database) with information about primary and secondary education students in the Canary Islands 
for 4 academic years. There is information about their academic performance and 
contextual information (about their families, teachers, and school). The database contains a subset of data 
in the form of panel data, meaning information about the same students at different points in time (ULL_panel_data).

Machine learning algorithms can be used to predict at-risk students. 
A student is considered at risk if they are anticipated to have low academic performance in the future. 
Detecting these students would allow for corrective measures to be taken in advance.

As a measure of academic performance, we have the variables "scores".
We have academic performance in Mathematics and in Spanish Language

We specify a model to predict at-risk students. Utilizing the panel data,
the model aims to forecast whether the student will be at risk in the future (in 6th grade)
based on various predictors of current academic performance (3rd grade).

Each observation (row) in ULL_panel_data is a student, with their academic performance in sixth grade 
and their predictors of academic performance from third grade (columns).

## Load and preprocess data

In [2]:
DATA = 'data/'
data = pd.read_csv(os.path.join(DATA, 'ULL_panel_data.csv'), sep=';')

In [3]:
data

Unnamed: 0,id_grade,id_student_16_19,score_MAT,level_MAT,score_LEN,level_LEN,id_student,id_student_original,id_year,id_class_group,...,p331a,p331b,p331c,p331d,p331e,p331f,p331g,p331j,pfc,rep
0,6,1,474.9944,2,385.1411,1,20342,1431,2016,A,...,4.0,4.0,4.0,4.0,4.0,3.0,,,,
1,6,2,508.8362,3,469.4856,2,2819,1432,2016,A,...,4.0,,4.0,4.0,3.0,4.0,,,,
2,6,3,590.2816,3,591.1398,3,19276,1436,2016,A,...,4.0,4.0,4.0,4.0,2.0,4.0,,,,
3,6,5,394.4247,1,493.7984,2,14078,1439,2016,A,...,,,,,,,,,,
4,6,6,530.0070,3,500.0860,3,1695,1447,2016,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15969,6,17267,599.0310,3,615.6177,4,12265,40236,2016,A,...,4.0,,4.0,4.0,4.0,3.0,,,,
15970,6,17268,538.5835,3,647.9100,4,15982,40238,2016,A,...,4.0,,4.0,4.0,4.0,4.0,,,,
15971,6,17269,537.8327,3,445.1313,2,9965,40240,2016,,...,,,,,,,,,,
15972,6,17270,468.8731,2,546.8035,3,1137,40246,2016,B,...,3.0,4.0,3.0,3.0,3.0,3.0,,,,


In [4]:
# Select only the data we want to work for
data = data[['id_student_16_19', 'score_MAT', 'score_LEN', 'score_MAT3', 'score_LEN3', 'a1',
             'mother_education', 'father_education', 'mother_occupation', 'father_occupation', 
             'inmigrant_second_gen', 'start_schooling_age', 'books', 'f12a', 'public_private', 
             'capital_island', 'd14', 'ESCS', 'id_school']]

In [5]:
# Drop observations with missing data in any of the variables that we will use in the models
# Here, synthetic data methods can be used instead to fill in missing values

missing_columns = ['score_MAT3', 'a1', 'mother_education', 'father_education',
    'mother_occupation', 'father_occupation', 'inmigrant_second_gen',
    'start_schooling_age', 'books', 'f12a', 'public_private',
    'capital_island', 'd14']

data = data.dropna(subset=missing_columns)

In [6]:
data

Unnamed: 0,id_student_16_19,score_MAT,score_LEN,score_MAT3,score_LEN3,a1,mother_education,father_education,mother_occupation,father_occupation,inmigrant_second_gen,start_schooling_age,books,f12a,public_private,capital_island,d14,ESCS,id_school
4,6,530.0070,500.0860,368.65,339.47,1,3.0,1.0,4.0,2.0,1.0,1.0,1.0,1.0,2,2,4.0,0.132756,2443
5,7,531.9280,459.4065,387.36,566.44,1,4.0,4.0,4.0,3.0,1.0,1.0,4.0,5.0,2,1,1.0,1.069410,1368
7,9,578.3741,630.4484,549.89,635.53,1,2.0,2.0,3.0,2.0,1.0,2.0,2.0,2.0,2,1,1.0,-1.166950,2500
8,10,481.1748,497.1981,592.00,668.26,2,4.0,4.0,4.0,4.0,1.0,1.0,3.0,4.0,1,1,1.0,0.976453,1610
11,13,521.5593,655.0537,490.28,524.98,2,4.0,4.0,3.0,3.0,1.0,1.0,2.0,5.0,2,1,1.0,-0.134441,1859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15968,17266,522.6458,611.0034,574.83,591.86,1,4.0,3.0,3.0,3.0,1.0,1.0,4.0,2.0,2,1,2.0,0.759928,2413
15969,17267,599.0310,615.6177,629.84,460.75,1,4.0,3.0,4.0,3.0,1.0,1.0,4.0,2.0,2,1,2.0,1.410322,2203
15970,17268,538.5835,647.9100,600.20,542.78,1,4.0,2.0,3.0,3.0,1.0,1.0,2.0,3.0,2,1,2.0,0.035227,2095
15971,17269,537.8327,445.1313,610.26,600.15,2,4.0,1.0,3.0,3.0,1.0,1.0,3.0,2.0,2,1,2.0,0.382896,2097


In [7]:
cols = data.columns 
data = pd.DataFrame(data.values.flatten().reshape(-1, data.shape[1]), columns=cols)

In [8]:
# Generate quartiles of scores in sixth grade
data['score_MATq'] = pd.qcut(data['score_MAT'], 4, labels=["1", "2", "3","4"])
data['score_MATq'] = data['score_MATq'].astype(int)
data['score_LENq'] = pd.qcut(data['score_LEN'], 4, labels=["1", "2", "3","4"])
data['score_LENq'] = data['score_LENq'].astype(int)

In [9]:
# Generate median and percentiles 25 and 75 of socioeconomic status (ESCS)
median_ESCS = data['ESCS'].median()
p25_ESCS = data['ESCS'].quantile(0.25)
p75_ESCS = data['ESCS'].quantile(0.75)

# Initialize with null values
data['ESCS_median'] = pd.Series([np.nan] * len(data))
data.loc[data['ESCS'] >= median_ESCS, 'ESCS_median'] = 2
data.loc[data['ESCS'] < median_ESCS, 'ESCS_median'] = 1
data.loc[data['ESCS_median'] == 0, 'ESCS_median'] = np.nan

# Initialize with null values
data['ESCS_p25_p75'] = pd.Series([np.nan] * len(data))
data.loc[data['ESCS'] >= p75_ESCS, 'ESCS_p25_p75'] = 2
data.loc[data['ESCS'] < p25_ESCS, 'ESCS_p25_p75'] = 1
data.loc[(data['ESCS'] >= p25_ESCS) & (data['ESCS'] < p75_ESCS), 'ESCS_p25_p75'] = np.nan

# Some data corrections to make the final results
# Variable d14 top category(4) is the "bad" category (more than 50% of teachers change school), so the results must be inverted
data['d14'] = data['d14'].apply(lambda x: 1 if x == 1 else 0)

In [10]:
data = data.drop("id_student_16_19", axis=1)
data = data.drop("id_school", axis=1)
data.columns

Index(['score_MAT', 'score_LEN', 'score_MAT3', 'score_LEN3', 'a1',
       'mother_education', 'father_education', 'mother_occupation',
       'father_occupation', 'inmigrant_second_gen', 'start_schooling_age',
       'books', 'f12a', 'public_private', 'capital_island', 'd14', 'ESCS',
       'score_MATq', 'score_LENq', 'ESCS_median', 'ESCS_p25_p75'],
      dtype='object')

In [11]:
subjects = ["MAT", "LEN", "MAT3", "LEN3"]
continuous_scores = ["score_" + x for x in subjects]

data[continuous_scores] = (data[continuous_scores] - data[continuous_scores].min()) / (data[continuous_scores].max() - data[continuous_scores].min())

In [12]:
data[continuous_scores].describe()

Unnamed: 0,score_MAT,score_LEN,score_MAT3,score_LEN3
count,8290.0,8290.0,8290.0,8290.0
mean,0.496029,0.57253,0.630309,0.674672
std,0.164199,0.154309,0.183068,0.201217
min,0.0,0.0,0.0,0.0
25%,0.382484,0.463118,0.497701,0.521731
50%,0.485401,0.574571,0.633065,0.697501
75%,0.604836,0.680854,0.757262,0.832557
max,1.0,1.0,1.0,1.0


## Models

The goal of the model is to predict the academic performance in sixth grade ($Y_t$)
using information from the same student in third grade, specifically:

1.  Academic performance in third grade ($Y_{t-1}$)

2.  Sensitive factors or circumstances ($C$)

3.  Predictors uncorrelated with circumstances, also called "effort" ($X$)

**Model 1**:    $$Y_t = α + β1Y_{t-1} + ε$$

**Model 2**:    $$Y_t = α + β1Y_{t-1} + β2C + ε$$

**Model 3**:    

> First step: $$Y_{t-1} = α + β2C + ν$$

- Recover the prediction of $Y_{t-1}$ (academic performance due to circumstances, $C$): $\hat{Y}_{t-1}$

- Recover the residual $ν$ (academic performance due to effort, $X$): $\hat{ν}$

> Second step: $$Y_t = α + β1\hat{Y}_{t-1} + β2\hat{ν} + ε$$

- Recover the prediction of $Y_t$ only due to $\hat{Y}_{t-1}$ (only due to circumstances)

- Recover the prediction of $Y_t$ only due to $\hat{ν}$ (only due to effort)

In theory...

**Model 1**: Using only the academic performance in third grade (benchmark)

**Model 2**: Using the academic performance + circumstances in third grade (less fair - more socially desirable)

**Model 3**: Using the circumstances + effort in third grade (close to Model 2)

- Prediction exclusively of circumstances of Model 3 (much less fair - much more socially desirable)
    
- Prediction exclusively of effort of Model 3 (much more fair - much less socially desirable)

Let's prove it

In [13]:
# Variables for the models
Y_t_1 = "score_MAT3"
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
split = int(len(data) * 0.8)

train_data = data.iloc[:split]
test_data = data.iloc[split:]

train_c = train_data[["a1", "mother_education", "father_education", "mother_occupation", "father_occupation", 
      "inmigrant_second_gen", "start_schooling_age", "books", "f12a", "public_private", "capital_island", "d14"]]
test_c = test_data[["a1", "mother_education", "father_education", "mother_occupation", "father_occupation", 
      "inmigrant_second_gen", "start_schooling_age", "books", "f12a", "public_private", "capital_island", "d14"]]
circumstances = ["a1", "mother_education", "father_education", "mother_occupation", "father_occupation", 
      "inmigrant_second_gen", "start_schooling_age", "books", "f12a", "public_private", "capital_island", "d14"]

# Dummy variables (all variables C are categorical variables)
train_dummy_variables = pd.get_dummies(train_c, columns=circumstances, drop_first = True)
test_dummy_variables = pd.get_dummies(test_c, columns=circumstances, drop_first = True)

# Join Y_t_1 + C
train_data_combined = pd.concat([train_data[Y_t_1], train_dummy_variables], axis=1)
test_data_combined = pd.concat([test_data[Y_t_1], test_dummy_variables], axis=1)

In [14]:
test_data

Unnamed: 0,score_MAT,score_LEN,score_MAT3,score_LEN3,a1,mother_education,father_education,mother_occupation,father_occupation,inmigrant_second_gen,...,books,f12a,public_private,capital_island,d14,ESCS,score_MATq,score_LENq,ESCS_median,ESCS_p25_p75
6632,0.765039,0.657338,0.738193,1.000000,1.0,4.0,3.0,3.0,3.0,1.0,...,4.0,5.0,2.0,1.0,0,1.191348,4,3,2.0,2.0
6633,0.462080,0.636803,0.938956,0.782606,2.0,3.0,2.0,3.0,4.0,1.0,...,2.0,5.0,2.0,1.0,0,0.536982,2,3,2.0,
6634,0.403198,0.369274,0.705725,0.355770,2.0,2.0,4.0,2.0,2.0,1.0,...,1.0,1.0,1.0,1.0,0,-0.708335,2,1,1.0,1.0
6635,0.365334,0.469595,0.526876,0.714836,2.0,2.0,4.0,3.0,3.0,1.0,...,3.0,2.0,2.0,1.0,0,0.382896,1,2,2.0,
6636,0.394645,0.443683,0.462825,0.590729,2.0,1.0,2.0,1.0,3.0,1.0,...,1.0,1.0,2.0,1.0,0,-1.627731,2,1,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8285,0.687545,0.937725,0.693894,0.705915,2.0,4.0,4.0,4.0,4.0,1.0,...,3.0,5.0,1.0,1.0,1,1.947430,4,4,2.0,2.0
8286,0.360611,0.733824,0.880918,0.681435,2.0,3.0,4.0,4.0,4.0,1.0,...,2.0,5.0,1.0,1.0,1,0.536982,1,4,2.0,
8287,0.688505,0.760862,0.575893,0.804020,2.0,3.0,4.0,3.0,3.0,1.0,...,4.0,4.0,2.0,1.0,1,1.502406,4,4,2.0,2.0
8288,0.675909,0.412674,0.421985,0.618252,1.0,4.0,3.0,3.0,3.0,1.0,...,2.0,2.0,1.0,1.0,1,0.289729,4,1,2.0,


In [15]:
test_data.shape

(1658, 21)

In [16]:
# Model 1
model1 = sm.OLS(train_data["score_MAT"], sm.add_constant(train_data[Y_t_1])).fit()
model1.summary()
test_data.loc[:, 'model1_pred'] = model1.predict(sm.add_constant(test_data[Y_t_1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.loc[:, 'model1_pred'] = model1.predict(sm.add_constant(test_data[Y_t_1]))


In [17]:
# Model 1
# model1 = sm.OLS(data["score_MAT"], sm.add_constant(data[Y_t_1])).fit()
# print(model1.summary())
# data['model1_pred'] = model1.fittedvalues

In [18]:
# Model 2
# model2 = sm.OLS(data["score_MAT"], sm.add_constant(data_combined.astype(np.float64))).fit()
# print(model2.summary())
# data['model2_pred'] = model2.fittedvalues

In [19]:
# Model 2
model2 = sm.OLS(train_data["score_MAT"], sm.add_constant(train_data_combined.astype(np.float64))).fit()
print(model2.summary())
test_data.loc[:, 'model2_pred'] = model2.predict(sm.add_constant(test_data_combined.astype(np.float64)))

                            OLS Regression Results                            
Dep. Variable:              score_MAT   R-squared:                       0.270
Model:                            OLS   Adj. R-squared:                  0.267
Method:                 Least Squares   F-statistic:                     90.43
Date:                Thu, 19 Sep 2024   Prob (F-statistic):               0.00
Time:                        14:54:29   Log-Likelihood:                 3608.5
No. Observations:                6632   AIC:                            -7161.
Df Residuals:                    6604   BIC:                            -6971.
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.loc[:, 'model2_pred'] = model2.predict(sm.add_constant(test_data_combined.astype(np.float64)))


In [20]:
# Model 3
# model3 = sm.OLS(data["score_MAT3"], sm.add_constant(dummy_variables.astype(np.float64))).fit()
# print(model3.summary())

# First step
# data['Y_t_1_hat'] = model3.fittedvalues
# data['ν_hat'] = model3.resid

# Second step
# model4 = sm.OLS(data["score_MAT"], sm.add_constant(data[["Y_t_1_hat", "ν_hat"]])).fit()
# print(model4.summary())
# data['model3_pred'] = model4.fittedvalues

# Prediction exclusively of circumstances
# data['model3_pred_circum'] = model4.params['const'] + model4.params['Y_t_1_hat'] * data['Y_t_1_hat']
# Prediction exclusively of effort
# mean_circu = data['Y_t_1_hat'].mean()
# data['mean_circu'] = mean_circu
# data['model3_pred_effort'] = (model4.params['const'] + 
                          # model4.params['ν_hat'] * data['ν_hat'] + 
                          # model4.params['Y_t_1_hat'] * mean_circu)

In [21]:
# Model 3
model3 = sm.OLS(train_data["score_MAT3"], sm.add_constant(train_dummy_variables.astype(np.float64))).fit()
print(model3.summary())

# First step
train_data.loc[:, 'Y_t_1_hat'] = model3.fittedvalues
train_data.loc[:, 'ν_hat'] = model3.resid

# Second step
model4 = sm.OLS(train_data["score_MAT"], sm.add_constant(pd.concat([train_data["Y_t_1_hat"], train_data["ν_hat"]], axis = 1))).fit()
print(model4.summary())

test_data.loc[:, 'Y_t_1_hat'] = model3.predict(sm.add_constant(test_dummy_variables.astype(np.float64)))
test_data.loc[:, 'ν_hat'] = test_data["score_MAT3"] - test_data.loc[:, 'Y_t_1_hat']

test_data.loc[:, 'model3_pred'] = model4.predict(sm.add_constant(pd.concat([test_data["Y_t_1_hat"], test_data["ν_hat"]], axis = 1)))

# Prediction exclusively of circumstances
test_data.loc[:, 'model3_pred_circum'] = model4.params['const'] + model4.params['Y_t_1_hat'] * test_data['Y_t_1_hat']
# Prediction exclusively of effort
mean_circu = test_data.loc[:, 'Y_t_1_hat'].mean()
test_data.loc[:, 'mean_circu'] = mean_circu
test_data.loc[:, 'model3_pred_effort'] = (model4.params['const'] + 
                          model4.params['ν_hat'] * test_data['ν_hat'] + 
                          model4.params['Y_t_1_hat'] * mean_circu)

                            OLS Regression Results                            
Dep. Variable:             score_MAT3   R-squared:                       0.094
Model:                            OLS   Adj. R-squared:                  0.091
Method:                 Least Squares   F-statistic:                     26.45
Date:                Thu, 19 Sep 2024   Prob (F-statistic):          1.69e-121
Time:                        14:54:29   Log-Likelihood:                 2167.9
No. Observations:                6632   AIC:                            -4282.
Df Residuals:                    6605   BIC:                            -4098.
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, 'Y_t_1_hat'] = model3.fittedvalues
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.loc[:, 'ν_hat'] = model3.resid
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.loc[:, 'Y_t_1_hat'] = model3.predict(sm.add_constant(test_dummy_variables.astype(np.float64)))
A val

In [22]:
# Transform predictions(continuous) to quartiles(categorical)

def discretise_scores(data):
    data.loc[:, 'score_MAT_discrete'] = pd.qcut(data['score_MAT'], 4, labels=["1", "2", "3","4"])
    data.loc[:, 'score_MAT_discrete'] = data['score_MAT_discrete'].astype(int)

    data.loc[:, 'score_MAT_pred1'] = pd.qcut(data['model1_pred'], 4, labels=["1", "2", "3","4"])
    data.loc[:, 'score_MAT_pred1'] = data['score_MAT_pred1'].astype(int)
    data.loc[:, 'score_MAT_pred2'] = pd.qcut(data['model2_pred'], 4, labels=["1", "2", "3","4"])
    data.loc[:, 'score_MAT_pred2'] = data['score_MAT_pred2'].astype(int)
    data.loc[:, 'score_MAT_pred3'] = pd.qcut(data['model3_pred'], 4, labels=["1", "2", "3","4"])
    data.loc[:, 'score_MAT_pred3'] = data['score_MAT_pred3'].astype(int)
    data.loc[:, 'score_MAT_pred_C'] = pd.qcut(data['model3_pred_circum'], 4, labels=["1", "2", "3","4"])
    data.loc[:, 'score_MAT_pred_C'] = data['score_MAT_pred_C'].astype(int)
    data.loc[:, 'score_MAT_pred_X'] = pd.qcut(data['model3_pred_effort'], 4, labels=["1", "2", "3","4"])
    data.loc[:, 'score_MAT_pred_X'] = data['score_MAT_pred_X'].astype(int)

    # Transform predictions(continuous) to percentiles but percentiles 2 and 3 equal (between 25th and 75th percentile)

    data.loc[:, 'score_MAT_pred1_t'] = data['score_MAT_pred1'].apply(lambda x: 1 if x == 1 else (2 if x == 2 or x == 3 else 3))
    data.loc[:, 'score_MAT_pred2_t'] = data['score_MAT_pred2'].apply(lambda x: 1 if x == 1 else (2 if x == 2 or x == 3 else 3))
    data.loc[:, 'score_MAT_pred3_t'] = data['score_MAT_pred3'].apply(lambda x: 1 if x == 1 else (2 if x == 2 or x == 3 else 3))
    data.loc[:, 'score_MAT_pred_C_t'] = data['score_MAT_pred_C'].apply(lambda x: 1 if x == 1 else (2 if x == 2 or x == 3 else 3))
    data.loc[:, 'score_MAT_pred_X_t'] = data['score_MAT_pred_X'].apply(lambda x: 1 if x == 1 else (2 if x == 2 or x == 3 else 3))

    return data

test_data = discretise_scores(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, 'score_MAT_discrete'] = pd.qcut(data['score_MAT'], 4, labels=["1", "2", "3","4"])
  data.loc[:, 'score_MAT_discrete'] = data['score_MAT_discrete'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, 'score_MAT_pred1'] = pd.qcut(data['model1_pred'], 4, labels=["1", "2", "3","4"])
  data.loc[:, 'score_MAT_pred1'] = data['score_MAT_pred1'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cav

## Results

We focus on **Equalized Odds** (Equality of opportunity).

To calculate Equalized Odds we first calculate recall or sensitivity:

$$TP / (TP + FN)$$

and then we calculate the ratio of recall among different groups to obtain Equalized Odds.

Recall is calculated for Low and High academic performance:
- **Low academic performance**: Below the median or 25th percentile
- **High academic performance**: Above the median or above 75th percentile (top 25 percent)

In [23]:
def compute_results(data):

    recall_dfs_25_75 = []
    recall_dfs_25_75.extend(compute_recall(data, ["f12a"], top_level=5))
    recall_dfs_25_75.extend(compute_recall(data, ["mother_education", "father_education", "mother_occupation", "father_occupation", "books"], top_level=4))
    recall_dfs_25_75.extend(compute_recall(data, ["start_schooling_age"], top_level=1))
    recall_dfs_25_75.extend(compute_recall(data, ["inmigrant_second_gen", "public_private", "capital_island", "a1", "ESCS_median", "ESCS_p25_p75", "d14"], top_level=1))

    recall_dfs_between25_75 = []
    recall_dfs_between25_75.extend(compute_recall_terciles(data, ["f12a"], top_level=5))
    recall_dfs_between25_75.extend(compute_recall_terciles(data, ["mother_education", "father_education", "mother_occupation", "father_occupation", "books"], top_level=4))
    recall_dfs_between25_75.extend(compute_recall_terciles(data, ["start_schooling_age"], top_level=1))
    recall_dfs_between25_75.extend(compute_recall_terciles(data, ["inmigrant_second_gen", "public_private", "capital_island", "a1", "ESCS_median", "ESCS_p25_p75", "d14"], top_level=1))

    recall_dfs_median = []
    recall_dfs_median.extend(compute_recall_median(data, ["f12a"], top_level=5))
    recall_dfs_median.extend(compute_recall_median(data, ["mother_education", "father_education", "mother_occupation", "father_occupation", "books"], top_level=4))
    recall_dfs_median.extend(compute_recall_median(data, ["start_schooling_age"], top_level=1))
    recall_dfs_median.extend(compute_recall_median(data, ["inmigrant_second_gen", "public_private", "capital_island", "a1", "ESCS_median", "ESCS_p25_p75", "d14"], top_level=1))

    # Combine DataFrames
    combined_df_25_75 = pd.concat(recall_dfs_25_75, ignore_index=True)
    combined_df_between25_75 = pd.concat(recall_dfs_between25_75, ignore_index=True)
    combined_df_median = pd.concat(recall_dfs_median, ignore_index=True)

    return combined_df_25_75, combined_df_between25_75, combined_df_median

combined_df_25_75, combined_df_between25_75, combined_df_median = compute_results(test_data)

In [24]:
# Pivot tables
pivot_combined_df_25_75 = combined_df_25_75.pivot_table(index=['Variable', 'Group', 'Percentile'], columns='Model', values='Recall').reset_index()
pivot_combined_df_25_75 = pivot_combined_df_25_75[['Variable', 'Group', 'Percentile', 'pred1', 'pred2', 'pred3', 'pred_C', 'pred_X']]
pivot_combined_df_25_75_sorted = pivot_combined_df_25_75.sort_values(by=['Percentile', 'Variable', 'Group'], ascending=[True, True, False])
pivot_combined_df_between25_75 = combined_df_between25_75.pivot_table(index=['Variable', 'Group', 'Tercile'], columns='Model', values='Recall').reset_index()
pivot_combined_df_between25_75 = pivot_combined_df_between25_75[['Variable', 'Group', 'Tercile', 'pred1_t', 'pred2_t', 'pred3_t', 'pred_C_t', 'pred_X_t']]
pivot_combined_df_between25_75_sorted = pivot_combined_df_between25_75.sort_values(by=['Tercile', 'Variable', 'Group'], ascending=[True, True, False])
pivot_combined_df_median = combined_df_median.pivot_table(index=['Variable', 'Group', 'Pair1', 'Pair2'], columns='Model', values='Recall').reset_index()
pivot_combined_df_median = pivot_combined_df_median[['Variable', 'Group', 'Pair1', 'Pair2', 'pred1', 'pred2', 'pred3', 'pred_C', 'pred_X']]
pivot_combined_df_median_sorted = pivot_combined_df_median.sort_values(by=['Pair1', 'Pair2', 'Variable', 'Group'], ascending=[True, True, True, False])

In [25]:
final_data_25_75 = []

for variable in pivot_combined_df_25_75_sorted['Variable'].unique():
    variable_df = pivot_combined_df_25_75_sorted[pivot_combined_df_25_75_sorted['Variable'] == variable]
    for percentile in variable_df['Percentile'].unique():
        top_row = variable_df[(variable_df['Group'] == 'top') & (variable_df['Percentile'] == percentile)]
        if not top_row.empty:
            top_row = top_row.iloc[0]
            temp_data = []
            for _, row in variable_df[variable_df['Percentile'] == percentile].iterrows():
                odds_row = {
                    'Variable': row['Variable'],
                    'Group': row['Group'],
                    'Percentile': row['Percentile'],
                    'pred1': row['pred1'],
                    'pred2': row['pred2'],
                    'pred3': row['pred3'],
                    'pred_C': row['pred_C'],
                    'pred_X': row['pred_X'],
                    'pred1_odds': calculate_odds(row['pred1'], top_row['pred1']),
                    'pred2_odds': calculate_odds(row['pred2'], top_row['pred2']),
                    'pred3_odds': calculate_odds(row['pred3'], top_row['pred3']),
                    'pred_C_odds': calculate_odds(row['pred_C'], top_row['pred_C']),
                    'pred_X_odds': calculate_odds(row['pred_X'], top_row['pred_X']),
                }
                temp_data.append(odds_row)
            final_data_25_75.extend(temp_data)

final_data_25_75_sorted = pd.DataFrame(final_data_25_75)

In [26]:
final_data_between25_75 = []

for variable in pivot_combined_df_between25_75_sorted['Variable'].unique():
    variable_df = pivot_combined_df_between25_75_sorted[pivot_combined_df_between25_75_sorted['Variable'] == variable]
    for tercile in variable_df['Tercile'].unique():
        top_row = variable_df[(variable_df['Group'] == 'top') & (variable_df['Tercile'] == tercile)]
        if not top_row.empty:
            top_row = top_row.iloc[0]
            temp_data = []
            for _, row in variable_df[variable_df['Tercile'] == tercile].iterrows():
                odds_row = {
                    'Variable': row['Variable'],
                    'Group': row['Group'],
                    'Tercile': row['Tercile'],
                    'pred1_t': row['pred1_t'],
                    'pred2_t': row['pred2_t'],
                    'pred3_t': row['pred3_t'],
                    'pred_C_t': row['pred_C_t'],
                    'pred_X_t': row['pred_X_t'],
                    'pred1_odds': calculate_odds(row['pred1_t'], top_row['pred1_t']),
                    'pred2_odds': calculate_odds(row['pred2_t'], top_row['pred2_t']),
                    'pred3_odds': calculate_odds(row['pred3_t'], top_row['pred3_t']),
                    'pred_C_odds': calculate_odds(row['pred_C_t'], top_row['pred_C_t']),
                    'pred_X_odds': calculate_odds(row['pred_X_t'], top_row['pred_X_t']),
                }
                temp_data.append(odds_row)
            final_data_between25_75.extend(temp_data)

final_data_between25_75_sorted = pd.DataFrame(final_data_between25_75)


In [27]:
final_data_median = []

for variable in pivot_combined_df_median_sorted['Variable'].unique():
    variable_df = pivot_combined_df_median_sorted[pivot_combined_df_median_sorted['Variable'] == variable]
    for pair in variable_df[['Pair1', 'Pair2']].drop_duplicates().values:
        pair1, pair2 = pair
        top_row = variable_df[(variable_df['Group'] == 'top') & (variable_df['Pair1'] == pair1) & (variable_df['Pair2'] == pair2)]
        if not top_row.empty:
            top_row = top_row.iloc[0]
            temp_data = []
            for _, row in variable_df[(variable_df['Pair1'] == pair1) & (variable_df['Pair2'] == pair2)].iterrows():
                odds_row = {
                    'Variable': row['Variable'],
                    'Group': row['Group'],
                    'Pair1': row['Pair1'],
                    'Pair2': row['Pair2'],
                    'pred1': row['pred1'],
                    'pred2': row['pred2'],
                    'pred3': row['pred3'],
                    'pred_C': row['pred_C'],
                    'pred_X': row['pred_X'],
                    'pred1_odds': calculate_odds(row['pred1'], top_row['pred1']),
                    'pred2_odds': calculate_odds(row['pred2'], top_row['pred2']),
                    'pred3_odds': calculate_odds(row['pred3'], top_row['pred3']),
                    'pred_C_odds': calculate_odds(row['pred_C'], top_row['pred_C']),
                    'pred_X_odds': calculate_odds(row['pred_X'], top_row['pred_X']),
                }
                temp_data.append(odds_row)
            final_data_median.extend(temp_data)

final_data_median_sorted = pd.DataFrame(final_data_median)


In [28]:
category_order = ['a1', 'mother_education', 'father_education', 'mother_occupation', 'father_occupation', 'books', 'd14', 'inmigrant_second_gen', 
                  'public_private', 'capital_island', 'start_schooling_age', 'f12a', 'ESCS_median', 'ESCS_p25_p75']

final_data_25_75_sorted['Variable'] = pd.Categorical(final_data_25_75_sorted['Variable'], categories=category_order, ordered=True)
final_data_25_75_sorted = final_data_25_75_sorted.sort_values(by='Variable')
final_data_25_75_sorted = final_data_25_75_sorted[['Variable', 'Group', 'Percentile', 'pred1', 'pred1_odds', 'pred2', 'pred2_odds', 'pred3', 'pred3_odds', 'pred_C', 'pred_C_odds', 'pred_X', 'pred_X_odds']]
final_data_25_75_sorted = final_data_25_75_sorted.sort_values(by=['Percentile', 'Variable', 'Group'], ascending=[True, True, False])
final_data_between25_75_sorted['Variable'] = pd.Categorical(final_data_between25_75_sorted['Variable'], categories=category_order, ordered=True)
final_data_between25_75_sorted = final_data_between25_75_sorted.sort_values(by='Variable')
final_data_between25_75_sorted = final_data_between25_75_sorted[['Variable', 'Group', 'Tercile', 'pred1_t', 'pred1_odds', 'pred2_t', 'pred2_odds', 'pred3_t', 'pred3_odds', 'pred_C_t', 'pred_C_odds', 'pred_X_t', 'pred_X_odds']]
final_data_between25_75_sorted = final_data_between25_75_sorted.sort_values(by=['Tercile', 'Variable', 'Group'], ascending=[True, True, False])
final_data_median_sorted['Variable'] = pd.Categorical(final_data_median_sorted['Variable'], categories=category_order, ordered=True)
final_data_median_sorted = final_data_median_sorted.sort_values(by='Variable')
final_data_median_sorted = final_data_median_sorted[['Variable', 'Group', 'Pair1', 'Pair2', 'pred1', 'pred1_odds', 'pred2', 'pred2_odds', 'pred3', 'pred3_odds', 'pred_C', 'pred_C_odds', 'pred_X', 'pred_X_odds']]
final_data_median_sorted = final_data_median_sorted.sort_values(by=['Pair1', 'Pair2', 'Variable', 'Group'], ascending=[True, True, True, False])

## Export to Excel

In [29]:
# Export to Excel
with pd.ExcelWriter(os.path.join('results', 'results.xlsx')) as writer:
    final_data_25_75_sorted.to_excel(writer, sheet_name='25_75', index=False, float_format='%.4f')
    final_data_median_sorted.to_excel(writer, sheet_name='Median', index=False, float_format='%.4f')
    final_data_between25_75_sorted.to_excel(writer, sheet_name='between25_75', index=False, float_format='%.4f')
    data.to_excel(writer, sheet_name='data', index=False, float_format='%.4f')

## IOP

Inequality of Opportunity is computed by applying an inequality index (Gini, MLD or simple variance) to the set of central moments (specifically, the mean $\mu$) for the $Y$'s conditional distributions with respect to a sensitive attribute's values. Mathematically:
$$IOP = I(w_{i} * \mu(Y|G_{i}), \ldots, w_{m} * \mu(Y|G_{m}))$$ 

$I$ is the inequality index while $G_{1} \ldots G_{m}$ are the $m$ different groups of individuals identified by the values a given senstive attribute can have. For example, if _gender_ is a sensitive attribute then the two resulting groups might be $G_{1} = male$ and $G_{2} = female$. In this situation, _IOP_ would be absent if $$I(w_{1} * \mu(Y|G_{1}), w_{2} * \mu(Y|G_{2})) = 0$$ or close to 0.

Finally, each central moment is weighted by the fraction of samples with a given value of the sensitive attribute. The weights sum up to 1. In the previous example, $w_{1}$ is the fraction of elements with $G = m$ and $w_{2}$ is the fraction of elements with $G = f$. 

In [30]:
test_data.columns

Index(['score_MAT', 'score_LEN', 'score_MAT3', 'score_LEN3', 'a1',
       'mother_education', 'father_education', 'mother_occupation',
       'father_occupation', 'inmigrant_second_gen', 'start_schooling_age',
       'books', 'f12a', 'public_private', 'capital_island', 'd14', 'ESCS',
       'score_MATq', 'score_LENq', 'ESCS_median', 'ESCS_p25_p75',
       'model1_pred', 'model2_pred', 'Y_t_1_hat', 'ν_hat', 'model3_pred',
       'model3_pred_circum', 'mean_circu', 'model3_pred_effort',
       'score_MAT_discrete', 'score_MAT_pred1', 'score_MAT_pred2',
       'score_MAT_pred3', 'score_MAT_pred_C', 'score_MAT_pred_X',
       'score_MAT_pred1_t', 'score_MAT_pred2_t', 'score_MAT_pred3_t',
       'score_MAT_pred_C_t', 'score_MAT_pred_X_t'],
      dtype='object')

## Utility

In [43]:
from sklearn.metrics import confusion_matrix

def compute_accuracy(test_data, model_pred, sa, protected_group, percentile):
    # filter gts and preds based on values of the sensitive attributes
    # protected group (normally one value) vs non protected group(s) (potentially multiple values)
    # compute accuracy for each group

    if test_data[sa].isna().sum() > 0:
        test_data = test_data.dropna(axis=0)
    else:
        test_data = test_data.dropna(axis=1)

    y_true_protected = test_data.loc[test_data[sa] == protected_group]["score_MAT_discrete"].astype(np.int64)
    y_pred_protected = test_data.loc[test_data[sa] == protected_group][model_pred].astype(np.int64)

    y_true_non_protected = test_data.loc[~(test_data[sa].astype(np.int64) == protected_group)]["score_MAT_discrete"]
    y_pred_non_protected = test_data.loc[~(test_data[sa].astype(np.int64) == protected_group)][model_pred]

    # print("#" * 25)
    # print(f"Confusion matrix, protected group ({sa}):")
    # print(confusion_matrix(y_true_protected, y_pred_protected))
    accuracy_protected = round(accuracy_score(y_true_protected, y_pred_protected), 2)
    # print(accuracy_protected)

    # print(f"Confusion matrix, non protected group ({sa}):")
    # print(confusion_matrix(y_true_non_protected, y_pred_non_protected))
    accuracy_non_protected = round(accuracy_score(y_true_non_protected, y_pred_non_protected), 2)
    # print(accuracy_non_protected)

    return accuracy_protected, accuracy_non_protected

def run_experiments(test_data, percentile, ineq_index):
    
    percentiles = test_data['score_MAT'].quantile([0.25, 0.75])
    test_data.loc[:, 'percentile_bin'] = pd.cut(test_data['score_MAT'], 
                                                bins=[test_data['score_MAT'].min()] + list(percentiles) + [test_data['score_MAT'].max()], 
                                                include_lowest=True)

    groups = [group for _, group in test_data.groupby('percentile_bin')]
    percentile_df = {"below-25": groups[0], "between-25-75": groups[1], "above-75": groups[2]}

    data = percentile_df[percentile]

    sensitive_attrs = ['a1', 'mother_education', 'father_education',
       'mother_occupation', 'father_occupation', 'inmigrant_second_gen',
       'start_schooling_age', 'books', 'f12a', 'public_private',
       'capital_island', 'd14', "ESCS_median", "ESCS_p25_p75"]

    columns = ["Model 1", "Model 2", "Model 3", "Circumstances", "Effort"]
    # model_preds_acc = [f"score_MAT_{x}" for x in ["pred1", "pred2", "pred3", "pred_C", "pred_X", "pred1_t", "pred2_t", "pred3_t", "pred_C_t", "pred_X_t"]]
    model_preds = ["model1_pred", "model2_pred", "model3_pred", "model3_pred_circum", "model3_pred_effort"]
    preds = ["pred1", "pred2", "pred3", "pred_C", "pred_X"]

    model_preds_acc = []
    columns_acc = []
    if percentile == "below-25" or percentile == "above-75":
        model_preds_acc = [f"score_MAT_{x}" for x in preds]
        columns_acc = columns
    else:
        columns_acc = [x + " terciles" for x in columns]
        model_preds_acc = [f"score_MAT_{x}_t" for x in preds]
    
    model_pred_rename = {mp: col for mp, col in zip(model_preds, columns)}
    model_pred_acc_rename = {mpa: col for mpa, col in zip(model_preds_acc, columns_acc)}

    sensitive_attrs_values = {
         "a1": 1,
         "mother_education": 4,
         "father_education": 4,
         "mother_occupation": 4,
         "father_occupation": 4,
         "inmigrant_second_gen": 1, 
         "start_schooling_age": 1, 
         "books": 4, 
         "f12a": 5,
         "public_private": 1,
         "capital_island": 1,
         "d14": 1,
         "ESCS_median": 1,
         "ESCS_p25_p75": 1
    }
    
    df = {}
    for mp in model_preds:
        col = []
        data.loc[:, "label"] = binarise_predictions(data[mp], percentile)
        for sa in sensitive_attrs:
            col.append(iop(data, sa, ineq_index=ineq_index))
        df[model_pred_rename[mp]] = col

    df_accuracy = {}
    print(model_preds_acc)
    for mp in model_preds_acc:
        col_acc = []
        for sa in sensitive_attrs:
            col_acc.append(compute_accuracy(data, mp, sa, sensitive_attrs_values[sa], percentile))
        df_accuracy[model_pred_acc_rename[mp]] = col_acc
    return pd.DataFrame(df, index=sensitive_attrs), pd.DataFrame(df_accuracy, index=sensitive_attrs)

## Below 25th percentile

In [44]:
df_below_25, df_acc_below_25 = run_experiments(test_data, percentile="below-25", ineq_index="gini")
df_below_25

  groups = [group for _, group in test_data.groupby('percentile_bin')]


['score_MAT_pred1', 'score_MAT_pred2', 'score_MAT_pred3', 'score_MAT_pred_C', 'score_MAT_pred_X']


Unnamed: 0,Model 1,Model 2,Model 3,Circumstances,Effort
a1,0.003667,0.00597,0.013288,0.064155,0.022891
mother_education,0.17727,0.171258,0.180187,0.422084,0.335228
father_education,0.317401,0.376956,0.360833,0.445067,0.205632
mother_occupation,0.617158,0.616969,0.621921,0.601008,0.641561
father_occupation,0.641799,0.635658,0.636558,0.572376,0.658388
inmigrant_second_gen,0.496497,0.496497,0.496497,0.498637,0.49345
start_schooling_age,0.464705,0.421553,0.427907,0.373072,0.504964
books,0.438157,0.516553,0.538544,0.583135,0.423724
f12a,0.574969,0.574675,0.56348,0.592797,0.575497
public_private,0.46349,0.475487,0.472556,0.48678,0.433564


In [46]:
df_acc_below_25

Unnamed: 0,Model 1,Model 2,Model 3,Circumstances,Effort
a1,"(0.56, 0.47)","(0.51, 0.48)","(0.53, 0.49)","(0.29, 0.41)","(0.53, 0.46)"
mother_education,"(0.41, 0.58)","(0.31, 0.6)","(0.31, 0.62)","(0.05, 0.52)","(0.5, 0.49)"
father_education,"(0.37, 0.56)","(0.18, 0.59)","(0.24, 0.59)","(0.02, 0.44)","(0.57, 0.48)"
mother_occupation,"(0.46, 0.53)","(0.44, 0.51)","(0.45, 0.53)","(0.23, 0.38)","(0.49, 0.5)"
father_occupation,"(0.43, 0.55)","(0.37, 0.54)","(0.39, 0.55)","(0.25, 0.38)","(0.47, 0.5)"
inmigrant_second_gen,"(0.51, 0.59)","(0.5, 0.41)","(0.51, 0.56)","(0.36, 0.19)","(0.49, 0.52)"
start_schooling_age,"(0.52, 0.51)","(0.48, 0.52)","(0.48, 0.55)","(0.22, 0.54)","(0.53, 0.44)"
books,"(0.38, 0.54)","(0.29, 0.54)","(0.32, 0.55)","(0.03, 0.41)","(0.53, 0.49)"
f12a,"(0.55, 0.5)","(0.49, 0.5)","(0.51, 0.51)","(0.28, 0.39)","(0.56, 0.46)"
public_private,"(0.39, 0.55)","(0.33, 0.54)","(0.3, 0.57)","(0.1, 0.41)","(0.47, 0.5)"


## Between 25th and 75th percentile

In [48]:
df_between_25_75, df_acc_between_25_75 = run_experiments(test_data, percentile="between-25-75", ineq_index="gini")
df_between_25_75

  groups = [group for _, group in test_data.groupby('percentile_bin')]


['score_MAT_pred1_t', 'score_MAT_pred2_t', 'score_MAT_pred3_t', 'score_MAT_pred_C_t', 'score_MAT_pred_X_t']


Unnamed: 0,Model 1,Model 2,Model 3,Circumstances,Effort
a1,0.06846,0.072167,0.072167,0.033779,0.07695
mother_education,0.477919,0.485938,0.492089,0.481957,0.497218
father_education,0.238316,0.238311,0.238311,0.237338,0.25041
mother_occupation,0.631889,0.638666,0.639523,0.653763,0.631957
father_occupation,0.643534,0.647229,0.650855,0.654883,0.642655
inmigrant_second_gen,0.494592,0.494364,0.494579,0.492614,0.494149
start_schooling_age,0.546045,0.545798,0.55233,0.553901,0.551849
books,0.394048,0.406741,0.39886,0.498889,0.376933
f12a,0.576416,0.588434,0.593112,0.573892,0.582981
public_private,0.40937,0.41229,0.414391,0.436434,0.418544


In [49]:
df_acc_between_25_75

Unnamed: 0,Model 1 terciles,Model 2 terciles,Model 3 terciles,Circumstances terciles,Effort terciles
a1,"(0.47, 0.42)","(0.46, 0.45)","(0.47, 0.43)","(0.43, 0.34)","(0.43, 0.4)"
mother_education,"(0.44, 0.45)","(0.47, 0.44)","(0.49, 0.42)","(0.53, 0.27)","(0.36, 0.46)"
father_education,"(0.43, 0.45)","(0.52, 0.42)","(0.49, 0.43)","(0.45, 0.36)","(0.33, 0.45)"
mother_occupation,"(0.39, 0.46)","(0.46, 0.45)","(0.47, 0.44)","(0.5, 0.35)","(0.31, 0.45)"
father_occupation,"(0.4, 0.47)","(0.49, 0.44)","(0.47, 0.44)","(0.47, 0.36)","(0.35, 0.44)"
inmigrant_second_gen,"(0.45, 0.43)","(0.45, 0.48)","(0.45, 0.43)","(0.4, 0.27)","(0.42, 0.4)"
start_schooling_age,"(0.43, 0.47)","(0.46, 0.46)","(0.45, 0.46)","(0.42, 0.32)","(0.41, 0.43)"
books,"(0.49, 0.43)","(0.58, 0.42)","(0.57, 0.41)","(0.58, 0.33)","(0.38, 0.43)"
f12a,"(0.46, 0.43)","(0.5, 0.43)","(0.49, 0.42)","(0.44, 0.35)","(0.4, 0.43)"
public_private,"(0.41, 0.46)","(0.43, 0.46)","(0.47, 0.44)","(0.47, 0.36)","(0.38, 0.43)"


## Above 75th percentile

In [50]:
df_above_75, df_acc_above_75 = run_experiments(test_data, percentile="above-75", ineq_index="gini")
df_above_75

  groups = [group for _, group in test_data.groupby('percentile_bin')]


['score_MAT_pred1', 'score_MAT_pred2', 'score_MAT_pred3', 'score_MAT_pred_C', 'score_MAT_pred_X']


Unnamed: 0,Model 1,Model 2,Model 3,Circumstances,Effort
a1,0.056377,0.113163,0.075386,0.113163,0.046842
mother_education,0.64487,0.69041,0.681385,0.748328,0.576701
father_education,0.510521,0.595713,0.573782,0.704153,0.410336
mother_occupation,0.633314,0.613446,0.613446,0.586998,0.656412
father_occupation,0.590974,0.552272,0.562781,0.506428,0.621794
inmigrant_second_gen,0.49491,0.493085,0.494006,0.49491,0.492146
start_schooling_age,0.606086,0.613135,0.613135,0.646822,0.575507
books,0.324147,0.319927,0.286426,0.513097,0.391308
f12a,0.614686,0.62235,0.623743,0.669968,0.610641
public_private,0.315508,0.315508,0.295681,0.183301,0.358874


In [51]:
df_acc_above_75

Unnamed: 0,Model 1,Model 2,Model 3,Circumstances,Effort
a1,"(0.52, 0.48)","(0.51, 0.47)","(0.53, 0.46)","(0.41, 0.29)","(0.47, 0.47)"
mother_education,"(0.57, 0.41)","(0.63, 0.31)","(0.63, 0.32)","(0.6, 0.03)","(0.46, 0.48)"
father_education,"(0.57, 0.45)","(0.64, 0.37)","(0.62, 0.41)","(0.63, 0.14)","(0.45, 0.49)"
mother_occupation,"(0.6, 0.46)","(0.62, 0.44)","(0.64, 0.45)","(0.64, 0.25)","(0.5, 0.46)"
father_occupation,"(0.62, 0.43)","(0.67, 0.4)","(0.65, 0.42)","(0.55, 0.25)","(0.48, 0.46)"
inmigrant_second_gen,"(0.5, 0.47)","(0.5, 0.41)","(0.51, 0.41)","(0.36, 0.25)","(0.47, 0.44)"
start_schooling_age,"(0.53, 0.41)","(0.54, 0.36)","(0.55, 0.35)","(0.42, 0.16)","(0.46, 0.49)"
books,"(0.6, 0.46)","(0.69, 0.42)","(0.67, 0.44)","(0.79, 0.19)","(0.44, 0.48)"
f12a,"(0.53, 0.48)","(0.54, 0.44)","(0.55, 0.46)","(0.46, 0.25)","(0.48, 0.46)"
public_private,"(0.51, 0.49)","(0.56, 0.46)","(0.56, 0.47)","(0.55, 0.26)","(0.46, 0.47)"


In [52]:
np_means = np.concatenate([
    df_below_25.mean(axis=0).to_numpy().reshape((1, df_below_25.shape[1])),
    df_above_75.mean(axis=0).to_numpy().reshape((1, df_above_75.shape[1])),
    df_between_25_75.mean(axis=0).to_numpy().reshape((1, df_between_25_75.shape[1]))], axis=0)

In [53]:
df_means = pd.DataFrame(np_means, 
                        index=["below_25", "above_75", "between_25_75"], 
                        columns=["Model 1", "Model 2", "Model 3", "Circumstances", "Effort"])
df_means

Unnamed: 0,Model 1,Model 2,Model 3,Circumstances,Effort
below_25,0.402238,0.420499,0.425348,0.464048,0.392087
above_75,0.415552,0.440998,0.428778,0.478373,0.391895
between_25_75,0.377333,0.381353,0.380493,0.389257,0.380224
