In [1]:
import sys
sys.path.append("..")

from utils.evaluation import *
from utils.utils import *

from data import dataset_preprocessing

from utils.evaluation import get_metrics
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso


from scipy import stats

import pandas as pd
import numpy as np
import os

import pickle

In [2]:
dataset_name = "academic_performance"
mode="cv"
RS=68
hct=10
test_ratio=0.2
val_ratio=0.1
folds=5
target = "continuous"
experiment_name = "EDM_results"

### Describe raw data

In [3]:
df = pd.read_excel(f"../data/raw/{dataset_name}/{dataset_name}.xlsx")
df = df.drop("Unnamed: 9",axis=1)
identifiers = ["COD_S11", "Cod_SPro"]
alternative_targets = ["CR_PRO", "QR_PRO", "CC_PRO", "WC_PRO", "FEP_PRO", "ENG_PRO", "QUARTILE", "PERCENTILE",
                       "2ND_DECILE", ]
df = df.drop(identifiers+alternative_targets,axis=1)

In [4]:
y_col = "G_SC"
demographic_cols = ['GENDER', 'EDU_FATHER', 'EDU_MOTHER', 'OCC_FATHER', 'OCC_MOTHER',
       'STRATUM', 'SISBEN', 'PEOPLE_HOUSE', 'INTERNET', 'TV', 'COMPUTER',
       'WASHING_MCH', 'MIC_OVEN', 'CAR', 'DVD', 'FRESH', 'PHONE', 'MOBILE','REVENUE', 'JOB', 'SCHOOL_NAME', 'SCHOOL_NAT', 'SCHOOL_TYPE','SEL', 'SEL_IHE']
perf_cols = ['MAT_S11','CR_S11', 'CC_S11', 'BIO_S11', 'ENG_S11']
activity_cols = []
other_cols = ['UNIVERSITY', 'ACADEMIC_PROGRAM']
set(df.columns)-set([y_col]+demographic_cols+perf_cols+activity_cols+other_cols)

set()

In [5]:
desc_df_dict = {"N": df.shape[0],
           "d": df.shape[1],
           "% NA": df.isna().sum().sum()/sum(df.shape),
           "Target": f"$y \in [${df[y_col].min()}..{df[y_col].max()}]",
           "Performance features": len(perf_cols),
           "Demographic features": len(demographic_cols),
           "Activity features": len(activity_cols),
           "Other features": len(other_cols),
           "Categorical features": len(df.columns[list(np.logical_and(df.nunique() > 2, df.dtypes == "object"))]),     
           "Total cardinality": df[df.columns[list(np.logical_and(df.nunique() > 2, df.dtypes == "object"))]].nunique().sum(),     
#            "High cardinality levels":  list(df.loc[:,list(df.columns[list(np.logical_and(df.nunique() >= 10, df.dtypes == "object"))])].nunique().sort_values().values),
          
}
desc_df = pd.DataFrame([desc_df_dict],index=["cortez"])
desc_df

Unnamed: 0,N,d,% NA,Target,Performance features,Demographic features,Activity features,Other features,Categorical features,Total cardinality
cortez,12411,33,0.0,$y \in [$37..247],5,25,0,2,13,3980


In [6]:
pd.DataFrame(pd.concat([df.drop(y_col,axis=1),pd.Series(df[y_col].values,index=df.index,name="target")],axis=1).corr()["target"])

Unnamed: 0,target
MAT_S11,0.643838
CR_S11,0.653572
CC_S11,0.6349
BIO_S11,0.666635
ENG_S11,0.662169
SEL,0.271465
SEL_IHE,0.3744
target,1.0


In [7]:
print(desc_df.transpose().to_latex())

\begin{tabular}{ll}
\toprule
{} &             cortez \\
\midrule
N                    &              12411 \\
d                    &                 33 \\
\% NA                 &                  0 \\
Target               &  \$y \textbackslash in [\$37..247] \\
Performance features &                  5 \\
Demographic features &                 25 \\
Activity features    &                  0 \\
Other features       &                  2 \\
Categorical features &                 13 \\
Total cardinality    &               3980 \\
\bottomrule
\end{tabular}



### Preprocessing and preparation

In [8]:
data_path = f"{mode}_RS{RS}_hct{hct}"
if mode == "cv":
    data_path += f"_{folds}folds"
elif mode == "train_test":
    data_path += f"_split{1-test_ratio*100}-{test_ratio*100}"
elif mode == "train_val_test":
    data_path += f"_split{round(100-(test_ratio+val_ratio)*100)}-{round(test_ratio*100)}-{round(val_ratio*100)}"


# If no data_dict for the configuration exists, run preprocessing, else load data_dict
if not os.path.exists(f"../data/prepared/{dataset_name}/"+data_path+"/data_dict.pickle"):
    dataset_preprocessing.process_dataset(dataset_name, target, mode, RS, hct, test_ratio, val_ratio, folds)
with open(f"../data/prepared/{dataset_name}/{data_path}/data_dict.pickle", 'rb') as handle:
        data_dict = pickle.load(handle)


## Evaluation of categorical data treatment methods

In [9]:
conditions = ["ignore", "ohe", "target", "ordinal", "catboost", "glmm"]

In [10]:
early_stopping_rounds = 10
max_evals = 50

In [11]:
if not os.path.exists(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle"):

    results_encodings = {}
    results_encodings_feature_importances = {}

    for fold in range(folds):
        target_scaler = data_dict[f"target_scaler_{fold}"]
        results_encodings[fold] = {}
        results_encodings_feature_importances[fold] = {}
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = target_scaler.inverse_transform(data_dict[f"y_test_{fold}"].reshape(-1,1)).ravel()
        y_train_val = target_scaler.inverse_transform(np.concatenate([y_train,y_val]).reshape(-1,1)).ravel()

        y_train_val_pred_base = np.ones(y_train_val.shape[0])*target_scaler.mean_[0]#*np.mean(y_train_val)
        y_test_pred_base = np.ones(y_test.shape[0])*target_scaler.mean_[0]#*np.mean(y_train_val)

        results_encodings[fold]["Baseline"] = {}
        eval_res_train = get_metrics(y_train_val, y_train_val_pred_base, target=target)
        for metric in eval_res_train.keys():
            results_encodings[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(y_test, y_test_pred_base, target=target)
        for metric in eval_res_test.keys():
            results_encodings[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]
        results_encodings[fold]["Baseline"]["RMSE Test"] = np.sqrt(results_encodings[fold]["Baseline"]["MSE Test"])
        results_encodings[fold]["Baseline"]["RMSE Train"] = np.sqrt(results_encodings[fold]["Baseline"]["MSE Train"])


        for condition in conditions:
            print(f"Preparing results for fold {fold}, condition={condition}")
            # Retrieve data
            z_cols = data_dict["z_cols"]

            X_train = data_dict[f"X_train_{fold}"]
            y_train = data_dict[f"y_train_{fold}"]

            X_val = data_dict[f"X_val_{fold}"]
            y_val = data_dict[f"y_val_{fold}"]

            X_test = data_dict[f"X_test_{fold}"]
            y_test = data_dict[f"y_test_{fold}"]

    #         Define condition data subset
            if condition != "ignore":
                z_encoded_train = data_dict[f"z_{condition}_encoded_train_{fold}"] 
                z_encoded_val = data_dict[f"z_{condition}_encoded_val_{fold}"] 
                z_encoded_test = data_dict[f"z_{condition}_encoded_test_{fold}"] 

                X_train = pd.concat([X_train,z_encoded_train],axis=1)
                X_val = pd.concat([X_val,z_encoded_val],axis=1)
                X_test = pd.concat([X_test,z_encoded_test],axis=1)

            X_train_val = pd.concat([X_train,X_val])
            y_train_val = np.concatenate([y_train,y_val])

            # Train base models
            res, feats = evaluate_lr(X_train_val, y_train_val, X_test, y_test, target=target,tune=False, seed=RS, target_scaler=target_scaler)
            results_encodings[fold]["LR_"+condition] = res
            results_encodings_feature_importances[fold]["LR_"+condition] = feats
            results_encodings[fold]["LR_"+condition]["RMSE Test"] = np.sqrt(results_encodings[fold]["LR_"+condition]["MSE Test"])
            results_encodings[fold]["LR_"+condition]["RMSE Train"] = np.sqrt(results_encodings[fold]["LR_"+condition]["MSE Train"])

            res, feats = evaluate_xgb(X_train_val, y_train_val, X_test, y_test, target, tune=False, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS, target_scaler=target_scaler)
            results_encodings[fold]["XGB_"+condition] = res
            results_encodings_feature_importances[fold]["XGB_"+condition] = feats
            results_encodings[fold]["XGB_"+condition]["RMSE Test"] = np.sqrt(results_encodings[fold]["XGB_"+condition]["MSE Test"])
            results_encodings[fold]["XGB_"+condition]["RMSE Train"] = np.sqrt(results_encodings[fold]["XGB_"+condition]["MSE Train"])

            # Train tuned models
            res, feats = evaluate_lr(X_train_val, y_train_val, X_test, y_test, target=target, max_evals=max_evals, tune=True, seed=RS, target_scaler=target_scaler)
            results_encodings[fold]["LR_"+condition+"_tuned"] = res
            results_encodings_feature_importances[fold]["LR_"+condition+"_tuned"] = feats
            results_encodings[fold]["LR_"+condition+"_tuned"]["RMSE Test"] = np.sqrt(results_encodings[fold]["LR_"+condition+"_tuned"]["MSE Test"])
            results_encodings[fold]["LR_"+condition+"_tuned"]["RMSE Train"] = np.sqrt(results_encodings[fold]["LR_"+condition+"_tuned"]["MSE Train"])

            res, feats = evaluate_xgb(X_train_val, y_train_val, X_test, y_test, target, tune=True, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS, target_scaler=target_scaler)
            results_encodings[fold]["XGB_"+condition+"_tuned"] = res
            results_encodings_feature_importances[fold]["XGB_"+condition+"_tuned"] = feats
            results_encodings[fold]["XGB_"+condition+"_tuned"]["RMSE Test"] = np.sqrt(results_encodings[fold]["XGB_"+condition+"_tuned"]["MSE Test"])
            results_encodings[fold]["XGB_"+condition+"_tuned"]["RMSE Train"] = np.sqrt(results_encodings[fold]["XGB_"+condition+"_tuned"]["MSE Train"])
    
    if not os.path.exists(f"../results/{dataset_name}/{experiment_name}"):
        os.makedirs(f"../results/{dataset_name}/{experiment_name}")
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle", 'wb') as handle:
        pickle.dump(results_encodings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings_feature_importances.pickle", 'wb') as handle:
        pickle.dump(results_encodings_feature_importances, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle", 'rb') as handle:
        results_encodings = pickle.load(handle)
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings_feature_importances.pickle", 'rb') as handle:
        results_encodings_feature_importances = pickle.load(handle)
    for fold in range(folds):
        target_scaler = data_dict[f"target_scaler_{fold}"]
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = target_scaler.inverse_transform(data_dict[f"y_test_{fold}"].reshape(-1,1)).ravel()
        y_train_val = target_scaler.inverse_transform(np.concatenate([y_train,y_val]).reshape(-1,1)).ravel()

        y_train_val_pred_base = np.ones(y_train_val.shape[0])*target_scaler.mean_[0]#*np.mean(y_train_val)
        y_test_pred_base = np.ones(y_test.shape[0])*target_scaler.mean_[0]#*np.mean(y_train_val)

        results_encodings[fold]["Baseline"] = {}
        eval_res_train = get_metrics(y_train_val, y_train_val_pred_base, target=target)
        for metric in eval_res_train.keys():
            results_encodings[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(y_test, y_test_pred_base, target=target)
        for metric in eval_res_test.keys():
            results_encodings[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]
        results_encodings[fold]["Baseline"]["RMSE Test"] = np.sqrt(results_encodings[fold]["Baseline"]["MSE Test"])
        results_encodings[fold]["Baseline"]["RMSE Train"] = np.sqrt(results_encodings[fold]["Baseline"]["MSE Train"])
        
        
results_encodings_df = pd.DataFrame(results_encodings[0]).transpose().sort_values("MSE Test",ascending=False).round(4)
results_encodings_df[["RMSE Train", "MSE Train", "R2 Train", "RMSE Test", "MSE Test", "R2 Test"]].style.highlight_min(subset=["MSE Train", "MSE Test"], color = 'lightgreen', axis = 0).highlight_max(subset=["R2 Train", "R2 Test"], color = 'lightgreen', axis = 0)

Unnamed: 0,RMSE Train,MSE Train,R2 Train,RMSE Test,MSE Test,R2 Test
Baseline,23.1128,534.2038,-0.0,23.1089,534.0217,-0.0005
XGB_catboost,8.0505,64.8113,0.8787,15.9526,254.486,0.5232
XGB_ignore,9.9244,98.4935,0.8156,15.501,240.281,0.5498
XGB_ordinal,8.4633,71.6277,0.8659,15.408,237.4057,0.5552
XGB_target,7.8332,61.3597,0.8851,15.3108,234.4205,0.5608
XGB_glmm,7.6433,58.4195,0.8906,15.294,233.906,0.5618
XGB_ohe,10.1472,102.9665,0.8073,14.834,220.0468,0.5877
LR_target_tuned,13.6004,184.9718,0.6537,14.8251,219.7836,0.5882
LR_target,13.6045,185.083,0.6535,14.8159,219.5094,0.5887
LR_ordinal_tuned,14.214,202.0374,0.6218,14.7629,217.9441,0.5917


### Performance Comparison

In [12]:
# For LR
models = ["Baseline"]+[i for i in results_encodings[0].keys() if ("tuned" in i and "LR" in i)]
metric = "RMSE Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

round_mean_at = 2
round_std_at = 2

use_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())*-1

df_mean = pd.DataFrame((-1*use_df).mean(axis=0).round(round_mean_at).astype(str) + " (" + use_df.std(axis=0).round(round_std_at).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_lr = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_lr.style.apply(negative_bold)


Unnamed: 0,Baseline,LR_ignore_tuned,LR_ohe_tuned,LR_target_tuned,LR_ordinal_tuned,LR_catboost_tuned,LR_glmm_tuned
0,23.11 (0.26),14.36 (0.26),14.11 (0.29),14.55 (0.23),14.36 (0.25),14.22 (0.29),14.13 (0.29)


In [13]:
# For LR
models = ["Baseline"]+[i for i in results_encodings[0].keys() if ("tuned" in i and "XGB" in i)]
metric = "RMSE Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

round_mean_at = 2
round_std_at = 2

use_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())*-1

df_mean = pd.DataFrame((-1*use_df).mean(axis=0).round(round_mean_at).astype(str) + " (" + use_df.std(axis=0).round(round_std_at).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_xgb = pd.DataFrame([model_dict])
    
def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_xgb.style.apply(negative_bold)


Unnamed: 0,Baseline,XGB_ignore_tuned,XGB_ohe_tuned,XGB_target_tuned,XGB_ordinal_tuned,XGB_catboost_tuned,XGB_glmm_tuned
0,23.11 (0.26),14.28 (0.25),14.11 (0.26),14.38 (0.23),14.15 (0.25),14.17 (0.28),14.04 (0.28)


In [14]:
res_df_lr.columns = [i.split("_")[1] if i != "Baseline" else "Baseline" for i in res_df_lr.columns]    
res_df_xgb.columns = [i.split("_")[1] if i != "Baseline" else "Baseline" for i in res_df_xgb.columns]    

latex_df_encodings = pd.concat([res_df_lr,res_df_xgb],axis=0)
latex_df_encodings.index = ["LR", "XGB"]
latex_df_encodings

Unnamed: 0,Baseline,ignore,ohe,target,ordinal,catboost,glmm
LR,23.11 (0.26),14.36 (0.26),14.11 (0.29),14.55 (0.23),14.36 (0.25),14.22 (0.29),14.13 (0.29)
XGB,23.11 (0.26),14.28 (0.25),14.11 (0.26),14.38 (0.23),14.15 (0.25),14.17 (0.28),14.04 (0.28)


In [15]:
print(latex_df_encodings.round(2).to_latex())


\begin{tabular}{llllllll}
\toprule
{} &      Baseline &        ignore &           ohe &        target &       ordinal &      catboost &          glmm \\
\midrule
LR  &  23.11 (0.26) &  14.36 (0.26) &  14.11 (0.29) &  14.55 (0.23) &  14.36 (0.25) &  14.22 (0.29) &  14.13 (0.29) \\
XGB &  23.11 (0.26) &  14.28 (0.25) &  14.11 (0.26) &  14.38 (0.23) &  14.15 (0.25) &  14.17 (0.28) &  14.04 (0.28) \\
\bottomrule
\end{tabular}



## Data subset comparisons

As it does not matter which encoding method is used we use 5CV-GLMM encoding for LR and Ordinal encoding for XGB

In [16]:
subsets = {"demo_only": demographic_cols,
           "perfact_only": perf_cols+activity_cols,
           "perfact_and_demo": perf_cols+activity_cols+demographic_cols,
           "all": list(df.columns)
          }

In [17]:
if not os.path.exists(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle"):

    results_subsets = {}
    results_subsets_feature_importances = {}

    for fold in range(folds):
        target_scaler = data_dict[f"target_scaler_{fold}"]
        results_subsets[fold] = {}
        results_subsets_feature_importances[fold] = {}
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = target_scaler.inverse_transform(data_dict[f"y_test_{fold}"].reshape(-1,1)).ravel()
        y_train_val = target_scaler.inverse_transform(np.concatenate([y_train,y_val]).reshape(-1,1)).ravel()

        y_train_val_pred_base = np.ones(y_train_val.shape[0])*target_scaler.mean_[0]#*np.mean(y_train_val)
        y_test_pred_base = np.ones(y_test.shape[0])*target_scaler.mean_[0]#*np.mean(y_train_val)

        results_subsets[fold]["Baseline"] = {}
        eval_res_train = get_metrics(y_train_val, y_train_val_pred_base, target=target)
        for metric in eval_res_train.keys():
            results_subsets[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(y_test, y_test_pred_base, target=target)
        for metric in eval_res_test.keys():
            results_subsets[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]
        results_subsets[fold]["Baseline"]["RMSE Test"] = np.sqrt(results_subsets[fold]["Baseline"]["MSE Test"])
        results_subsets[fold]["Baseline"]["RMSE Train"] = np.sqrt(results_subsets[fold]["Baseline"]["MSE Train"])


        for subset_key in subsets:
            print(f"Preparing results for fold {fold}, subset={subset_key}")
            # Retrieve data
            z_cols = data_dict["z_cols"]

            X_train = data_dict[f"X_train_{fold}"]
            y_train = data_dict[f"y_train_{fold}"]

            X_val = data_dict[f"X_val_{fold}"]
            y_val = data_dict[f"y_val_{fold}"]

            X_test = data_dict[f"X_test_{fold}"]
            y_test = data_dict[f"y_test_{fold}"]
        
            y_train_val = np.concatenate([y_train,y_val])

            # Define data subset for LR
            z_glmm_encoded_train = data_dict[f"z_glmm_encoded_train_{fold}"] 
            z_glmm_encoded_val = data_dict[f"z_glmm_encoded_val_{fold}"] 
            z_glmm_encoded_test = data_dict[f"z_glmm_encoded_test_{fold}"] 
            
            X_train_lr = pd.concat([X_train,z_glmm_encoded_train],axis=1)
            X_val_lr = pd.concat([X_val,z_glmm_encoded_val],axis=1)
            X_test_lr = pd.concat([X_test,z_glmm_encoded_test],axis=1)      
            X_train_val_lr = pd.concat([X_train_lr,X_val_lr])

            # Rescale GLMM
            for col in z_glmm_encoded_train.columns:
                z_mean = X_train_val_lr[col].mean()
                z_std = X_train_val_lr[col].std()
            
                X_train_val_lr[col] = (X_train_val_lr[col]-z_mean)/z_std
                X_test_lr[col] = (X_test_lr[col]-z_mean)/z_std
            
            
            # Define data subset for XGB
#             z_ordinal_encoded_train = data_dict[f"z_ordinal_encoded_train_{fold}"] 
#             z_ordinal_encoded_val = data_dict[f"z_ordinal_encoded_val_{fold}"] 
#             z_ordinal_encoded_test = data_dict[f"z_ordinal_encoded_test_{fold}"] 
#             X_train_xgb = pd.concat([X_train,z_ordinal_encoded_train],axis=1)
#             X_val_xgb = pd.concat([X_val,z_ordinal_encoded_val],axis=1)
#             X_test_xgb = pd.concat([X_test,z_ordinal_encoded_test],axis=1)
#             X_train_val_xgb = pd.concat([X_train_xgb,X_val_xgb])
            X_train_val_xgb = X_train_val_lr
            X_test_xgb = X_test_lr

            # Define data subset for evaluation
            X_train_val_lr = X_train_val_lr[[i for i in X_train_val_lr.columns if i in subsets[subset_key]]]
            X_test_lr = X_test_lr[[i for i in X_test_lr.columns if i in subsets[subset_key]]]
            X_train_val_xgb = X_train_val_xgb[[i for i in X_train_val_xgb.columns if i in subsets[subset_key]]]
            X_test_xgb = X_test_xgb[[i for i in X_test_xgb.columns if i in subsets[subset_key]]]


            # Train base models
            res, feats = evaluate_lr(X_train_val_lr, y_train_val, X_test_lr, y_test, target=target,tune=False, seed=RS, target_scaler=target_scaler)
            results_subsets[fold]["LR_"+subset_key] = res
            results_subsets_feature_importances[fold]["LR_"+subset_key] = feats
            results_subsets[fold]["LR_"+subset_key]["RMSE Test"] = np.sqrt(results_subsets[fold]["LR_"+subset_key]["MSE Test"])
            results_subsets[fold]["LR_"+subset_key]["RMSE Train"] = np.sqrt(results_subsets[fold]["LR_"+subset_key]["MSE Train"])

            res, feats = evaluate_xgb(X_train_val_xgb, y_train_val, X_test_xgb, y_test, target, tune=False, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS, target_scaler=target_scaler)
            results_subsets[fold]["XGB_"+subset_key] = res
            results_subsets_feature_importances[fold]["XGB_"+subset_key] = feats
            results_subsets[fold]["XGB_"+subset_key]["RMSE Test"] = np.sqrt(results_subsets[fold]["XGB_"+subset_key]["MSE Test"])
            results_subsets[fold]["XGB_"+subset_key]["RMSE Train"] = np.sqrt(results_subsets[fold]["XGB_"+subset_key]["MSE Train"])

            # Train tuned models
            res, feats = evaluate_lr(X_train_val_lr, y_train_val, X_test_lr, y_test, target=target, max_evals=max_evals, tune=True, seed=RS, target_scaler=target_scaler)
            results_subsets[fold]["LR_"+subset_key+"_tuned"] = res
            results_subsets_feature_importances[fold]["LR_"+subset_key+"_tuned"] = feats
            results_subsets[fold]["LR_"+subset_key+"_tuned"]["RMSE Test"] = np.sqrt(results_subsets[fold]["LR_"+subset_key+"_tuned"]["MSE Test"])
            results_subsets[fold]["LR_"+subset_key+"_tuned"]["RMSE Train"] = np.sqrt(results_subsets[fold]["LR_"+subset_key+"_tuned"]["MSE Train"])

            res, feats = evaluate_xgb(X_train_val_xgb, y_train_val, X_test_xgb, y_test, target, tune=True, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS, target_scaler=target_scaler)
            results_subsets[fold]["XGB_"+subset_key+"_tuned"] = res
            results_subsets_feature_importances[fold]["XGB_"+subset_key+"_tuned"] = feats
            results_subsets[fold]["XGB_"+subset_key+"_tuned"]["RMSE Test"] = np.sqrt(results_subsets[fold]["XGB_"+subset_key+"_tuned"]["MSE Test"])
            results_subsets[fold]["XGB_"+subset_key+"_tuned"]["RMSE Train"] = np.sqrt(results_subsets[fold]["XGB_"+subset_key+"_tuned"]["MSE Train"])
    
    if not os.path.exists(f"../results/{dataset_name}/{experiment_name}"):
        os.makedirs(f"../results/{dataset_name}/{experiment_name}")
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle", 'wb') as handle:
        pickle.dump(results_subsets, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets_feature_importances.pickle", 'wb') as handle:
        pickle.dump(results_subsets_feature_importances, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle", 'rb') as handle:
        results_subsets = pickle.load(handle)
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets_feature_importances.pickle", 'rb') as handle:
        results_subsets_feature_importances = pickle.load(handle)
    for fold in range(folds):
        target_scaler = data_dict[f"target_scaler_{fold}"]
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = target_scaler.inverse_transform(data_dict[f"y_test_{fold}"].reshape(-1,1)).ravel()
        y_train_val = target_scaler.inverse_transform(np.concatenate([y_train,y_val]).reshape(-1,1)).ravel()

        y_train_val_pred_base = np.ones(y_train_val.shape[0])*target_scaler.mean_[0]#*np.mean(y_train_val)
        y_test_pred_base = np.ones(y_test.shape[0])*target_scaler.mean_[0]#*np.mean(y_train_val)

        results_subsets[fold]["Baseline"] = {}
        eval_res_train = get_metrics(y_train_val, y_train_val_pred_base, target=target)
        for metric in eval_res_train.keys():
            results_subsets[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(y_test, y_test_pred_base, target=target)
        for metric in eval_res_test.keys():
            results_subsets[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]
        results_subsets[fold]["Baseline"]["RMSE Test"] = np.sqrt(results_subsets[fold]["Baseline"]["MSE Test"])
        results_subsets[fold]["Baseline"]["RMSE Train"] = np.sqrt(results_subsets[fold]["Baseline"]["MSE Train"])
        
        
results_subsets_df = pd.DataFrame(results_subsets[0]).transpose().sort_values("RMSE Test",ascending=False).round(4)
results_subsets_df[["RMSE Train", "MSE Train", "R2 Train", "RMSE Test", "MSE Test", "R2 Test"]].style.highlight_min(subset=["MSE Train", "MSE Test"], color = 'lightgreen', axis = 0).highlight_max(subset=["R2 Train", "R2 Test"], color = 'lightgreen', axis = 0)

Unnamed: 0,RMSE Train,MSE Train,R2 Train,RMSE Test,MSE Test,R2 Test
Baseline,23.1128,534.2038,-0.0,23.1089,534.0217,-0.0005
XGB_demo_only,12.8886,166.1155,0.689,21.995,483.7794,0.0936
LR_demo_only,20.4779,419.3447,0.215,20.6723,427.3434,0.1993
LR_demo_only_tuned,20.4787,419.3783,0.2149,20.6713,427.3018,0.1994
XGB_demo_only_tuned,19.8311,393.2722,0.2638,20.6397,425.9982,0.2018
XGB_perfact_and_demo,8.3292,69.3756,0.8701,15.6278,244.2288,0.5424
XGB_perfact_only,10.9454,119.8023,0.7757,15.3968,237.0612,0.5558
XGB_all,7.6433,58.4195,0.8906,15.294,233.906,0.5618
LR_perfact_only,14.369,206.467,0.6135,14.8292,219.9046,0.588
LR_perfact_only_tuned,14.3691,206.4711,0.6135,14.8287,219.8893,0.588


In [18]:
results_subsets_df = pd.DataFrame(results_subsets[0]).transpose().sort_values("RMSE Test",ascending=False).round(4)
results_subsets_df[["RMSE Train", "MSE Train", "R2 Train", "RMSE Test", "MSE Test", "R2 Test"]].style.highlight_min(subset=["MSE Train", "MSE Test"], color = 'lightgreen', axis = 0).highlight_max(subset=["R2 Train", "R2 Test"], color = 'lightgreen', axis = 0)

Unnamed: 0,RMSE Train,MSE Train,R2 Train,RMSE Test,MSE Test,R2 Test
Baseline,23.1128,534.2038,-0.0,23.1089,534.0217,-0.0005
XGB_demo_only,12.8886,166.1155,0.689,21.995,483.7794,0.0936
LR_demo_only,20.4779,419.3447,0.215,20.6723,427.3434,0.1993
LR_demo_only_tuned,20.4787,419.3783,0.2149,20.6713,427.3018,0.1994
XGB_demo_only_tuned,19.8311,393.2722,0.2638,20.6397,425.9982,0.2018
XGB_perfact_and_demo,8.3292,69.3756,0.8701,15.6278,244.2288,0.5424
XGB_perfact_only,10.9454,119.8023,0.7757,15.3968,237.0612,0.5558
XGB_all,7.6433,58.4195,0.8906,15.294,233.906,0.5618
LR_perfact_only,14.369,206.467,0.6135,14.8292,219.9046,0.588
LR_perfact_only_tuned,14.3691,206.4711,0.6135,14.8287,219.8893,0.588


### Performance Results

In [19]:
# For LR
models = ["Baseline"]+[i for i in results_subsets[0].keys() if ("tuned" in i and "LR" in i)]
metric = "RMSE Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())*-1

df_mean = pd.DataFrame((-1*use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_lr = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_lr.style.apply(negative_bold)


Unnamed: 0,Baseline,LR_demo_only_tuned,LR_perfact_only_tuned,LR_perfact_and_demo_tuned,LR_all_tuned
0,23.112 (0.264),20.533 (0.3),14.468 (0.222),14.352 (0.254),14.136 (0.285)


In [20]:
# For XGB
models = ["Baseline"]+[i for i in results_subsets[0].keys() if ("tuned" in i and "XGB" in i)]
metric = "RMSE Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())*-1

df_mean = pd.DataFrame((-1*use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_xgb = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_xgb.style.apply(negative_bold)


Unnamed: 0,Baseline,XGB_demo_only_tuned,XGB_perfact_only_tuned,XGB_perfact_and_demo_tuned,XGB_all_tuned
0,23.112 (0.264),20.434 (0.34),14.393 (0.196),14.275 (0.225),14.048 (0.292)


In [21]:
res_df_lr.columns = [i[3:-6] if i != "Baseline" else "Baseline" for i in res_df_lr.columns]    
res_df_xgb.columns = [i[4:-6] if i != "Baseline" else "Baseline" for i in res_df_xgb.columns]    

latex_df_subsets = pd.concat([res_df_lr,res_df_xgb],axis=0)
latex_df_subsets.index = ["LR", "XGB"]
latex_df_subsets

Unnamed: 0,Baseline,demo_only,perfact_only,perfact_and_demo,all
LR,23.112 (0.264),20.533 (0.3),14.468 (0.222),14.352 (0.254),14.136 (0.285)
XGB,23.112 (0.264),20.434 (0.34),14.393 (0.196),14.275 (0.225),14.048 (0.292)


In [22]:
print(latex_df_subsets.round(2).transpose().to_latex())


\begin{tabular}{lll}
\toprule
{} &              LR &             XGB \\
\midrule
Baseline         &  23.112 (0.264) &  23.112 (0.264) \\
demo\_only        &    20.533 (0.3) &   20.434 (0.34) \\
perfact\_only     &  14.468 (0.222) &  14.393 (0.196) \\
perfact\_and\_demo &  14.352 (0.254) &  14.275 (0.225) \\
all              &  14.136 (0.285) &  14.048 (0.292) \\
\bottomrule
\end{tabular}



### Feature Importance

In [23]:
# top_10_importances = {}

# for model in list(results_subsets_feature_importances[fold].keys()):
#     imp_df = pd.concat([results_subsets_feature_importances[fold][model] for fold in range(folds)],axis=1)

#     if "LR" in model:
#         direction = imp_df.apply(lambda x: np.sign(x))
#         imp_df = imp_df.abs()

#     imp_df = imp_df/imp_df.sum(axis=0)

#     mean_imp_df = imp_df.mean(axis=1)
#     std_imp_df = imp_df.std(axis=1)

#     mean_imp_df = mean_imp_df.sort_values(ascending=False)
#     std_imp_df = std_imp_df.loc[mean_imp_df.index]
#     final_imps = mean_imp_df[:10]
#     final_imps["Rest"] = sum(mean_imp_df[10:])
#     top_5_importances[model] = np.array([final_imps.index.values, final_imps.values])

In [24]:
demo_importances = {}
demo_importances_stds = {}

for model in list(results_subsets_feature_importances[fold].keys()):
    if "demo" in model or "all" in model:
        imp_df_all = pd.concat([results_subsets_feature_importances[fold][model] for fold in range(folds)],axis=1)
        
        if "LR" in model:
            direction = imp_df_all.apply(lambda x: np.sign(x))
            imp_df_all = imp_df_all.abs()
        if imp_df_all.sum().sum()!=0:
            imp_df = imp_df_all/imp_df_all.sum(axis=0)
        imp_df = imp_df.fillna(1/imp_df.shape[0])
#         imp_df = imp_df.loc[demographic_cols]

#         mean_imp_df = imp_df.mean(axis=1)
#         std_imp_df = imp_df.std(axis=1)

#         mean_imp_df = mean_imp_df.sort_values(ascending=False)
#         std_imp_df = std_imp_df.loc[mean_imp_df.index]
#         final_imps = mean_imp_df#[:10]
#         final_imps["Rest"] = sum(mean_imp_df[10:])
#         final_imps["Total"] = sum(mean_imp_df)
        demo_importances[model] = np.round(np.mean(imp_df.loc[demographic_cols].sum(axis=0)),2)#final_imps.values
        demo_importances_stds[model] = np.round(np.std(imp_df.loc[demographic_cols].sum(axis=0)),2)#final_imps.values


In [25]:
lr_demo_imp = pd.Series({i: demo_importances[i] for i in demo_importances if "LR" in i and "tuned" in i})
xgb_demo_imp = pd.Series({i: demo_importances[i] for i in demo_importances if "XGB" in i and "tuned" in i})
lr_demo_imp.index = [i[3:-6] for i in lr_demo_imp.index]    
xgb_demo_imp.index = [i[4:-6] for i in xgb_demo_imp.index]    

lr_demo_imp_stds = pd.Series({i: demo_importances_stds[i] for i in demo_importances_stds if "LR" in i and "tuned" in i})
xgb_demo_imp_stds = pd.Series({i: demo_importances_stds[i] for i in demo_importances_stds if "XGB" in i and "tuned" in i})
lr_demo_imp_stds.index = [i[3:-6] for i in lr_demo_imp_stds.index]    
xgb_demo_imp_stds.index = [i[4:-6] for i in xgb_demo_imp_stds.index]    


latex_df_imp = pd.DataFrame([lr_demo_imp.astype(str) + " (" + lr_demo_imp_stds.astype(str) + ")",
                             xgb_demo_imp.astype(str) + " (" + xgb_demo_imp_stds.astype(str) + ")"])
latex_df_imp.index = ["LR", "XGB"]
latex_df_imp

Unnamed: 0,demo_only,perfact_and_demo,all
LR,1.0 (0.0),0.25 (0.03),0.23 (0.02)
XGB,1.0 (0.0),0.23 (0.06),0.19 (0.09)


In [26]:
print(latex_df_imp.transpose().to_latex())

\begin{tabular}{lll}
\toprule
{} &           LR &          XGB \\
\midrule
demo\_only        &    1.0 (0.0) &    1.0 (0.0) \\
perfact\_and\_demo &  0.25 (0.03) &  0.23 (0.06) \\
all              &  0.23 (0.02) &  0.19 (0.09) \\
\bottomrule
\end{tabular}



### Mean impact of using demo features

In [27]:
np.random.seed(RS)
mean_abs_differences = []
for fold in range(folds):
    X_train = data_dict[f"X_train_{fold}"]
    y_train = data_dict[f"y_train_{fold}"]

    X_val = data_dict[f"X_val_{fold}"]
    y_val = data_dict[f"y_val_{fold}"]

    X_test = data_dict[f"X_test_{fold}"]
    y_test = data_dict[f"y_test_{fold}"]

    target_scaler = data_dict[f"target_scaler_{fold}"]

    y_train_val = np.concatenate([y_train,y_val])

    # Define data subset for LR
    z_glmm_encoded_train = data_dict[f"z_glmm_encoded_train_{fold}"] 
    z_glmm_encoded_val = data_dict[f"z_glmm_encoded_val_{fold}"] 
    z_glmm_encoded_test = data_dict[f"z_glmm_encoded_test_{fold}"] 

    X_train_lr = pd.concat([X_train,z_glmm_encoded_train],axis=1)
    X_val_lr = pd.concat([X_val,z_glmm_encoded_val],axis=1)
    X_test_lr = pd.concat([X_test,z_glmm_encoded_test],axis=1)      
    X_train_val_lr = pd.concat([X_train_lr,X_val_lr])

    # Rescale GLMM
    for col in z_glmm_encoded_train.columns:
        z_mean = X_train_val_lr[col].mean()
        z_std = X_train_val_lr[col].std()

        X_train_val_lr[col] = (X_train_val_lr[col]-z_mean)/z_std
        X_test_lr[col] = (X_test_lr[col]-z_mean)/z_std


    # Define data subset for XGB
    z_ordinal_encoded_train = data_dict[f"z_ordinal_encoded_train_{fold}"] 
    z_ordinal_encoded_val = data_dict[f"z_ordinal_encoded_val_{fold}"] 
    z_ordinal_encoded_test = data_dict[f"z_ordinal_encoded_test_{fold}"] 
    X_train_xgb = pd.concat([X_train,z_ordinal_encoded_train],axis=1)
    X_val_xgb = pd.concat([X_val,z_ordinal_encoded_val],axis=1)
    X_test_xgb = pd.concat([X_test,z_ordinal_encoded_test],axis=1)
    X_train_val_xgb = pd.concat([X_train_xgb,X_val_xgb])

    final_hyperparameters = tune_lasso(X_train_val_lr, y_train_val, max_evals=max_evals, seed=RS)
    lr = Lasso(alpha=final_hyperparameters["alpha"],
               random_state=RS)
#     lr = Lasso(alpha=0.001)
    lr.fit(X_train_val_lr,y_train_val)
    y_pred = target_scaler.inverse_transform(lr.predict(X_test_lr).reshape(-1,1)).ravel()

    is_not_demo = [i not in demographic_cols for i in X_train_val_lr.columns]
    y_pred_notdemo = target_scaler.inverse_transform(np.dot(X_test_lr.loc[:,is_not_demo],lr.coef_[is_not_demo]).reshape(-1,1)).ravel()
    mean_abs_diff = np.round(np.mean(np.abs(y_pred-y_pred_notdemo)),2)
    print(f"Mean absolute Difference w\o Demo: {mean_abs_diff}")
    print(f"RMSE Difference w\o Demo: {np.round(np.sqrt(np.mean(np.power(y_pred-y_pred_notdemo,2))),2)}")
    mean_abs_differences.append(mean_abs_diff)
    # is_demo = [i in demographic_cols for i in X_train_val_lr.columns]
    # y_pred_demo = target_scaler.inverse_transform(np.dot(X_test_lr.loc[:,is_demo],lr.coef_[is_demo]).reshape(-1,1)).ravel()

    # print(f"Mean absolute Difference with Demo: {np.mean(np.abs(y_pred-y_pred_demo))}")
    # print(f"RMSE Difference with Demo: {np.sqrt(np.mean(np.power(y_pred-y_pred_demo,2)))}")

SCORE: 0.3844196869294715                             
SCORE: 0.5687690331195043                                                       
SCORE: 0.720201745892618                                                        
SCORE: 0.4993240193909855                                                       
SCORE: 0.6722046168605272                                                       
SCORE: 0.7408250377352987                                                       
SCORE: 0.39366658306078117                                                      
SCORE: 0.40740431456859855                                                      
SCORE: 0.46113296602228404                                                      
SCORE: 0.580246050109633                                                        
SCORE: 0.6076300853509391                                                        
SCORE: 0.4423272406365726                                                        
SCORE: 0.6502727801054494                           

SCORE: 0.405763561776748                                                          
SCORE: 0.6731209443457145                                                         
100%|██████████| 50/50 [00:09<00:00,  5.14trial/s, best loss: 0.37541132975797814]
The best hyperparameters are :  

{'alpha': 0.0009711868490754763}
Mean absolute Difference w\o Demo: 0.94
RMSE Difference w\o Demo: 1.16
SCORE: 0.5162246517153749                             
SCORE: 0.37582693901262554                            
SCORE: 0.3738943920673966                                                        
SCORE: 0.3877910170552713                                                        
SCORE: 0.6329793424951399                                                       
SCORE: 0.5728079966588925                                                       
SCORE: 0.39484426447370946                                                      
SCORE: 0.6043028249581632                                                       
SCORE: 0.516202

SCORE: 0.37915908060952924                                                        
SCORE: 0.3854136797353644                                                         
SCORE: 0.4083450504658293                                                         
SCORE: 0.4238904558218972                                                         
SCORE: 0.5237190917242334                                                         
SCORE: 0.37543355488318014                                                        
SCORE: 0.5850740218688164                                                         
100%|██████████| 50/50 [00:09<00:00,  5.55trial/s, best loss: 0.37528402912229436]
The best hyperparameters are :  

{'alpha': 0.0017910537667827297}
Mean absolute Difference w\o Demo: 0.94
RMSE Difference w\o Demo: 1.17
SCORE: 0.5103659789269701                             
SCORE: 0.44366425531962517                                                      
SCORE: 0.5248404263966083                                     

In [28]:
np.mean(mean_abs_differences).round(2),np.std(mean_abs_differences).round(2)

(0.97, 0.07)