In [1]:
import sys
sys.path.append("..")

from utils.evaluation import *
from utils.utils import *

from data import dataset_preprocessing

from utils.evaluation import get_metrics
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso

from scipy import stats

import pandas as pd
import numpy as np
import os

import pickle

In [2]:
dataset_name = "OULAD"
mode="cv"
RS=68
hct=10
test_ratio=0.2
val_ratio=0.1
folds=5
target = "binary"
experiment_name = "EDM_results"

### Describe raw data

In [3]:
#########  Preprocessing #################################
if not os.path.exists(f"../data/prepared/{dataset_name}/df_prepared.pickle"):

    assessments_df = pd.read_csv(f'../data/raw/{dataset_name}/assessments.csv')
    courses_df = pd.read_csv(f'../data/raw/{dataset_name}/courses.csv')
    studentAssessment_df = pd.read_csv(f'../data/raw/{dataset_name}/studentAssessment.csv')
    studentInfo_df = pd.read_csv(f'../data/raw/{dataset_name}/studentInfo.csv')
    studentRegistration_df = pd.read_csv(f'../data/raw/{dataset_name}/studentRegistration.csv')
    studentVle_df = pd.read_csv(f'../data/raw/{dataset_name}/studentVle.csv')
    vle_df = pd.read_csv(f'../data/raw/{dataset_name}/vle.csv')


    # Remove all withdrawn
    studentInfo_df = studentInfo_df.loc[studentInfo_df.final_result!="Withdrawn"]
    studentInfo_df.shape

    # Assessment performance features
    merged_assessments_df = pd.merge(studentAssessment_df,assessments_df,on="id_assessment")

    avg_tma = [merged_assessments_df.loc[np.logical_and(merged_assessments_df.id_student==i,merged_assessments_df.assessment_type=="TMA"),"score"].mean() for i in studentInfo_df.id_student.values]
    avg_cma = [merged_assessments_df.loc[np.logical_and(merged_assessments_df.id_student==i,merged_assessments_df.assessment_type=="CMA"),"score"].mean() for i in studentInfo_df.id_student.values]
    avg_exam = [merged_assessments_df.loc[np.logical_and(merged_assessments_df.id_student==i,merged_assessments_df.assessment_type=="Exam"),"score"].mean() for i in studentInfo_df.id_student.values]

    studentInfo_df["avg_tma"] = avg_tma
    studentInfo_df["avg_cma"] = avg_cma
    studentInfo_df["avg_exam"] = avg_exam

    # Get VLE features
    vle_merged = pd.merge(studentVle_df,vle_df,on=["code_module", "code_presentation", "id_site"])
    for activity_type in vle_df.activity_type.unique():
        agg = vle_merged.loc[vle_merged.activity_type==activity_type].groupby("id_student")
        count_click_dict = dict(agg.count()["sum_click"])
        sum_click_dict = dict(agg.sum()["sum_click"])
        studentInfo_df[f"Count_Visits_{activity_type}"] = studentInfo_df["id_student"].apply(lambda x: sum_click_dict[x] if x in count_click_dict.keys() else 0)
        studentInfo_df[f"Sum_Clicks_{activity_type}"] = studentInfo_df["id_student"].apply(lambda x: sum_click_dict[x] if x in sum_click_dict.keys() else 0)

    with open(f"../data/prepared/{dataset_name}/df_prepared.pickle", 'wb') as handle:
        pickle.dump(studentInfo_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    df = studentInfo_df

else:    
    with open(f"../data/prepared/{dataset_name}/df_prepared.pickle", 'rb') as handle:
        df = pickle.load(handle)


In [4]:
df = df.drop(["id_student", "code_module", "code_presentation", "avg_exam"],axis=1)

In [5]:
df["final_result"] = df["final_result"].apply(lambda x: 0 if x=="Fail" else 1)

In [6]:
y_col = "final_result"
demographic_cols = ['gender', 'region', 'imd_band', 'age_band', 'disability', 'highest_education']
perf_cols = ["avg_cma", "avg_tma", 'num_of_prev_attempts', 'studied_credits']
activity_cols = [i for i in df.columns if "Sum_Clicks" in i] + [i for i in df.columns if "Count_Visits" in i]
other_cols = []

set(df.columns)-set([y_col]+demographic_cols+perf_cols+activity_cols+other_cols)

set()

In [7]:
desc_df_dict = {"No. of samples": df.shape[0],
           "No. of features": df.shape[1],
           "Performance features": len(perf_cols),
           "Demographic features": len(demographic_cols),
           "Activity features": len(activity_cols),
           "Other features": len(other_cols),
           "Categorical features": len(df.columns[list(np.logical_and(df.nunique() > 2, df.dtypes == "object"))]),     
           "Total cardinality": df[df.columns[list(np.logical_and(df.nunique() > 2, df.dtypes == "object"))]].nunique().sum(),     
           "% NA": df.isna().sum().sum()/sum(df.shape),
           "Target $\textbf{y} \in$": f"[1..{df[y_col].nunique()}]",
#            "High cardinality levels":  list(df.loc[:,list(df.columns[list(np.logical_and(df.nunique() >= 10, df.dtypes == "object"))])].nunique().sort_values().values),
          
}
desc_df = pd.DataFrame([desc_df_dict],index=["cortez"])
desc_df

Unnamed: 0,No. of samples,No. of features,Performance features,Demographic features,Activity features,Other features,Categorical features,Total cardinality,% NA,Target $\textbf{y} \in$
cortez,22437,51,4,6,40,0,4,31,0.480034,[1..2]


In [8]:
print(desc_df.transpose().to_latex())

\begin{tabular}{ll}
\toprule
{} &    cortez \\
\midrule
No. of samples          &     22437 \\
No. of features         &        51 \\
Performance features    &         4 \\
Demographic features    &         6 \\
Activity features       &        40 \\
Other features          &         0 \\
Categorical features    &         4 \\
Total cardinality       &        31 \\
\% NA                    &  0.480034 \\
Target \$\textbackslash textbf\{y\} \textbackslash in\$ &    [1..2] \\
\bottomrule
\end{tabular}



### Preprocessing and preparation

In [9]:
data_path = f"{mode}_RS{RS}_hct{hct}"
if mode == "cv":
    data_path += f"_{folds}folds"
elif mode == "train_test":
    data_path += f"_split{1-test_ratio*100}-{test_ratio*100}"
elif mode == "train_val_test":
    data_path += f"_split{round(100-(test_ratio+val_ratio)*100)}-{round(test_ratio*100)}-{round(val_ratio*100)}"


# If no data_dict for the configuration exists, run preprocessing, else load data_dict
if not os.path.exists(f"../data/prepared/{dataset_name}/"+data_path+"/data_dict.pickle"):
    dataset_preprocessing.process_dataset(dataset_name, target, mode, RS, hct, test_ratio, val_ratio, folds)
with open(f"../data/prepared/{dataset_name}/{data_path}/data_dict.pickle", 'rb') as handle:
        data_dict = pickle.load(handle)


## Evaluation of categorical data treatment methods

In [10]:
conditions = ["ignore", "ohe", "target", "ordinal", "catboost", "glmm"]

In [11]:
early_stopping_rounds = 10
max_evals = 50

In [12]:
if not os.path.exists(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle"):

    results_encodings = {}
    results_encodings_feature_importances = {}

    for fold in range(folds):
        results_encodings[fold] = {}
        results_encodings_feature_importances[fold] = {}
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = data_dict[f"y_test_{fold}"]
        y_train_val = np.concatenate([y_train,y_val])

        max_ = np.argmax(np.unique(y_train_val,return_counts=True)[1])
        y_train_val_pred_base = np.ones(y_train_val.shape[0])*max_
        y_test_pred_base = np.ones(y_test.shape[0])*max_

        results_encodings[fold]["Baseline"] = {}
        eval_res_train = get_metrics(y_train_val, y_train_val_pred_base, target=target)
        for metric in eval_res_train.keys():
            results_encodings[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(y_test, y_test_pred_base, target=target)
        for metric in eval_res_test.keys():
            results_encodings[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]

        for condition in conditions:
            print(f"Preparing results for fold {fold}, condition={condition}")
            # Retrieve data
            z_cols = data_dict["z_cols"]

            X_train = data_dict[f"X_train_{fold}"]
            y_train = data_dict[f"y_train_{fold}"]

            X_val = data_dict[f"X_val_{fold}"]
            y_val = data_dict[f"y_val_{fold}"]

            X_test = data_dict[f"X_test_{fold}"]
            y_test = data_dict[f"y_test_{fold}"]

            # Define condition data subset
            if condition != "ignore":
                z_encoded_train = data_dict[f"z_{condition}_encoded_train_{fold}"] 
                z_encoded_val = data_dict[f"z_{condition}_encoded_val_{fold}"] 
                z_encoded_test = data_dict[f"z_{condition}_encoded_test_{fold}"] 

                X_train = pd.concat([X_train,z_encoded_train],axis=1)
                X_val = pd.concat([X_val,z_encoded_val],axis=1)
                X_test = pd.concat([X_test,z_encoded_test],axis=1)

            X_train_val = pd.concat([X_train,X_val])
            y_train_val = np.concatenate([y_train,y_val])

            # Train base models
            res, feats = evaluate_logreg(X_train_val, y_train_val, X_test, y_test, target=target,tune=False, seed=RS)
            results_encodings[fold]["LR_"+condition] = res
            results_encodings_feature_importances[fold]["LR_"+condition] = feats

            res, feats = evaluate_xgb(X_train_val, y_train_val, X_test, y_test, target, tune=False, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_encodings[fold]["XGB_"+condition] = res
            results_encodings_feature_importances[fold]["XGB_"+condition] = feats

            # Train tuned models
            res, feats = evaluate_logreg(X_train_val, y_train_val, X_test, y_test, target=target, max_evals=max_evals, tune=True, seed=RS)
            results_encodings[fold]["LR_"+condition+"_tuned"] = res
            results_encodings_feature_importances[fold]["LR_"+condition+"_tuned"] = feats

            res, feats = evaluate_xgb(X_train_val, y_train_val, X_test, y_test, target, tune=True, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_encodings[fold]["XGB_"+condition+"_tuned"] = res
            results_encodings_feature_importances[fold]["XGB_"+condition+"_tuned"] = feats
    
    if not os.path.exists(f"../results/{dataset_name}/{experiment_name}"):
        os.makedirs(f"../results/{dataset_name}/{experiment_name}")
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle", 'wb') as handle:
        pickle.dump(results_encodings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings_feature_importances.pickle", 'wb') as handle:
        pickle.dump(results_encodings_feature_importances, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle", 'rb') as handle:
        results_encodings = pickle.load(handle)
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings_feature_importances.pickle", 'rb') as handle:
        results_encodings_feature_importances = pickle.load(handle)
        
        
results_encodings_df = pd.DataFrame(results_encodings[0]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_encodings_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
XGB_glmm_tuned,0.9107,0.9368,0.9652,0.8754,0.9119,0.9297
XGB_target_tuned,0.9317,0.9513,0.9788,0.8723,0.909,0.9278
XGB_catboost_tuned,0.912,0.9377,0.9663,0.8692,0.9084,0.9282
XGB_ohe_tuned,0.8969,0.927,0.9532,0.8685,0.9069,0.9274
XGB_ignore_tuned,0.9111,0.937,0.9662,0.8676,0.9062,0.9253
XGB_ordinal_tuned,0.9441,0.96,0.9871,0.8674,0.9055,0.9235
XGB_ignore,0.9406,0.9577,0.9866,0.8648,0.904,0.9183
XGB_glmm,0.9528,0.9662,0.9915,0.8636,0.9033,0.9222
XGB_target,0.9488,0.9634,0.9896,0.8641,0.9031,0.9238
XGB_catboost,0.9535,0.9667,0.9915,0.8603,0.9019,0.9166


In [13]:
results_encodings_df = pd.DataFrame(results_encodings[1]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_encodings_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
XGB_ignore_tuned,0.9282,0.9489,0.9766,0.8757,0.9122,0.9244
XGB_ohe_tuned,0.898,0.9279,0.9558,0.8734,0.9106,0.926
XGB_glmm_tuned,0.9293,0.9496,0.9787,0.8717,0.9094,0.9266
XGB_catboost_tuned,0.9458,0.9612,0.9875,0.8708,0.9088,0.9254
XGB_glmm,0.9538,0.9669,0.9911,0.8712,0.9088,0.9204
XGB_target,0.9499,0.9641,0.9902,0.8705,0.9087,0.9198
XGB_target_tuned,0.9219,0.9444,0.9737,0.8705,0.9085,0.9252
XGB_ordinal,0.9489,0.9634,0.9901,0.8697,0.9079,0.9197
XGB_ignore,0.9432,0.9595,0.9861,0.8685,0.9075,0.9193
XGB_ordinal_tuned,0.9496,0.9639,0.9892,0.869,0.9075,0.9206


### Performance Comparison

In [14]:
# For LR
models = ["Baseline"]+[i for i in results_encodings[0].keys() if ("tuned" in i and "LR" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(2).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_lr = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_lr.style.apply(negative_bold)


Unnamed: 0,Baseline,LR_ignore_tuned,LR_ohe_tuned,LR_target_tuned,LR_ordinal_tuned,LR_catboost_tuned,LR_glmm_tuned
0,0.81 (0.004),0.87 (0.008),0.87 (0.008),0.87 (0.008),0.87 (0.009),0.87 (0.008),0.87 (0.008)


In [15]:
# For LR
models = ["Baseline"]+[i for i in results_encodings[0].keys() if ("tuned" in i and "XGB" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(2).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_xgb = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_xgb.style.apply(negative_bold)


Unnamed: 0,Baseline,XGB_ignore_tuned,XGB_ohe_tuned,XGB_target_tuned,XGB_ordinal_tuned,XGB_catboost_tuned,XGB_glmm_tuned
0,0.81 (0.004),0.91 (0.005),0.91 (0.004),0.91 (0.004),0.91 (0.001),0.91 (0.002),0.91 (0.004)


In [16]:
res_df_lr.columns = [i.split("_")[1] if i != "Baseline" else "Baseline" for i in res_df_lr.columns]    
res_df_xgb.columns = [i.split("_")[1] if i != "Baseline" else "Baseline" for i in res_df_xgb.columns]    

latex_df_encodings = pd.concat([res_df_lr,res_df_xgb],axis=0)
latex_df_encodings.index = ["LR", "XGB"]
latex_df_encodings

Unnamed: 0,Baseline,ignore,ohe,target,ordinal,catboost,glmm
LR,0.81 (0.004),0.87 (0.008),0.87 (0.008),0.87 (0.008),0.87 (0.009),0.87 (0.008),0.87 (0.008)
XGB,0.81 (0.004),0.91 (0.005),0.91 (0.004),0.91 (0.004),0.91 (0.001),0.91 (0.002),0.91 (0.004)


In [17]:
print(latex_df_encodings.round(2).to_latex())


\begin{tabular}{llllllll}
\toprule
{} &      Baseline &        ignore &           ohe &        target &       ordinal &      catboost &          glmm \\
\midrule
LR  &  0.81 (0.004) &  0.87 (0.008) &  0.87 (0.008) &  0.87 (0.008) &  0.87 (0.009) &  0.87 (0.008) &  0.87 (0.008) \\
XGB &  0.81 (0.004) &  0.91 (0.005) &  0.91 (0.004) &  0.91 (0.004) &  0.91 (0.001) &  0.91 (0.002) &  0.91 (0.004) \\
\bottomrule
\end{tabular}



## Data Subset Comparisons

In [18]:
subsets = {"demo_only": demographic_cols,
           "perfact_only": perf_cols+activity_cols,
           "perfact_and_demo": perf_cols+activity_cols+demographic_cols,
           "all": list(df.columns)
          }

In [19]:
if not os.path.exists(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle"):

    results_subsets = {}
    results_subsets_feature_importances = {}

    for fold in range(folds):
        results_subsets[fold] = {}
        results_subsets_feature_importances[fold] = {}
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = data_dict[f"y_test_{fold}"]
        y_train_val = np.concatenate([y_train,y_val])

        max_ = np.argmax(np.unique(y_train_val,return_counts=True)[1])
        y_train_val_pred_base = np.ones(y_train_val.shape[0])*max_
        y_test_pred_base = np.ones(y_test.shape[0])*max_

        results_subsets[fold]["Baseline"] = {}
        eval_res_train = get_metrics(y_train_val, y_train_val_pred_base, target=target)
        for metric in eval_res_train.keys():
            results_subsets[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(y_test, y_test_pred_base, target=target)
        for metric in eval_res_test.keys():
            results_subsets[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]


        for subset_key in subsets:
            print(f"Preparing results for fold {fold}, subset={subset_key}")
            # Retrieve data
            z_cols = data_dict["z_cols"]

            X_train = data_dict[f"X_train_{fold}"]
            y_train = data_dict[f"y_train_{fold}"]

            X_val = data_dict[f"X_val_{fold}"]
            y_val = data_dict[f"y_val_{fold}"]

            X_test = data_dict[f"X_test_{fold}"]
            y_test = data_dict[f"y_test_{fold}"]
        
            y_train_val = np.concatenate([y_train,y_val])

            # Define data subset for LR
            z_glmm_encoded_train = data_dict[f"z_glmm_encoded_train_{fold}"] 
            z_glmm_encoded_val = data_dict[f"z_glmm_encoded_val_{fold}"] 
            z_glmm_encoded_test = data_dict[f"z_glmm_encoded_test_{fold}"] 
            X_train_lr = pd.concat([X_train,z_glmm_encoded_train],axis=1)
            X_val_lr = pd.concat([X_val,z_glmm_encoded_val],axis=1)
            X_test_lr = pd.concat([X_test,z_glmm_encoded_test],axis=1)      
            X_train_val_lr = pd.concat([X_train_lr,X_val_lr])

            # Rescale GLMM
            for col in z_glmm_encoded_train.columns:
                z_mean = X_train_val_lr[col].mean()
                z_std = X_train_val_lr[col].std()
            
                X_train_val_lr[col] = (X_train_val_lr[col]-z_mean)/z_std
                X_test_lr[col] = (X_test_lr[col]-z_mean)/z_std
                        
            
            # Define data subset for XGB
            z_ordinal_encoded_train = data_dict[f"z_ordinal_encoded_train_{fold}"] 
            z_ordinal_encoded_val = data_dict[f"z_ordinal_encoded_val_{fold}"] 
            z_ordinal_encoded_test = data_dict[f"z_ordinal_encoded_test_{fold}"] 
            X_train_xgb = pd.concat([X_train,z_ordinal_encoded_train],axis=1)
            X_val_xgb = pd.concat([X_val,z_ordinal_encoded_val],axis=1)
            X_test_xgb = pd.concat([X_test,z_ordinal_encoded_test],axis=1)
            X_train_val_xgb = pd.concat([X_train_xgb,X_val_xgb])


            # Define data subset for evaluation
            X_train_val_lr = X_train_val_lr[[i for i in X_train_val_lr.columns if any([j in i for j in subsets[subset_key]])]]
            X_test_lr = X_test_lr[[i for i in X_test_lr.columns if any([j in i for j in subsets[subset_key]])]]
            X_train_val_xgb = X_train_val_xgb[[i for i in X_train_val_xgb.columns if any([j in i for j in subsets[subset_key]])]]
            X_test_xgb = X_test_xgb[[i for i in X_test_xgb.columns if any([j in i for j in subsets[subset_key]])]]


            # Train base models
            res, feats = evaluate_logreg(X_train_val_lr, y_train_val, X_test_lr, y_test, target=target,tune=False, seed=RS)
            results_subsets[fold]["LR_"+subset_key] = res
            results_subsets_feature_importances[fold]["LR_"+subset_key] = feats

            res, feats = evaluate_xgb(X_train_val_xgb, y_train_val, X_test_xgb, y_test, target, tune=False, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_subsets[fold]["XGB_"+subset_key] = res
            results_subsets_feature_importances[fold]["XGB_"+subset_key] = feats

            # Train tuned models
            res, feats = evaluate_logreg(X_train_val_lr, y_train_val, X_test_lr, y_test, target=target, max_evals=max_evals, tune=True, seed=RS)
            results_subsets[fold]["LR_"+subset_key+"_tuned"] = res
            results_subsets_feature_importances[fold]["LR_"+subset_key+"_tuned"] = feats

            res, feats = evaluate_xgb(X_train_val_xgb, y_train_val, X_test_xgb, y_test, target, tune=True, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_subsets[fold]["XGB_"+subset_key+"_tuned"] = res
            results_subsets_feature_importances[fold]["XGB_"+subset_key+"_tuned"] = feats
    
    if not os.path.exists(f"../results/{dataset_name}/{experiment_name}"):
        os.makedirs(f"../results/{dataset_name}/{experiment_name}")
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle", 'wb') as handle:
        pickle.dump(results_subsets, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets_feature_importances.pickle", 'wb') as handle:
        pickle.dump(results_subsets_feature_importances, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle", 'rb') as handle:
        results_subsets = pickle.load(handle)
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets_feature_importances.pickle", 'rb') as handle:
        results_subsets_feature_importances = pickle.load(handle)
        
        
results_subsets_df = pd.DataFrame(results_subsets[0]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_subsets_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
XGB_all_tuned,0.9027,0.9312,0.9579,0.8694,0.9077,0.9277
XGB_perfact_and_demo_tuned,0.8968,0.9271,0.9548,0.8683,0.9071,0.9266
XGB_perfact_only_tuned,0.9053,0.9329,0.9609,0.8688,0.907,0.9243
XGB_perfact_only,0.9417,0.9584,0.9862,0.8627,0.9026,0.92
XGB_perfact_and_demo,0.9443,0.9603,0.9886,0.8607,0.901,0.9219
XGB_all,0.9443,0.9603,0.9886,0.8607,0.901,0.9219
LR_perfact_and_demo_tuned,0.8164,0.8699,0.8647,0.8209,0.8728,0.871
LR_all_tuned,0.8164,0.8699,0.8647,0.8209,0.8728,0.871
LR_perfact_and_demo,0.8164,0.8699,0.8648,0.8206,0.8726,0.871
LR_all,0.8164,0.8699,0.8648,0.8206,0.8726,0.871


### Performance Results

In [20]:
# For LR
models = ["Baseline"]+[i for i in results_subsets[0].keys() if ("tuned" in i and "LR" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_lr = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_lr.style.apply(negative_bold)


Unnamed: 0,Baseline,LR_demo_only_tuned,LR_perfact_only_tuned,LR_perfact_and_demo_tuned,LR_all_tuned
0,0.814 (0.004),0.81 (0.004),0.87 (0.008),0.869 (0.008),0.869 (0.008)


In [21]:
# For XGB
models = ["Baseline"]+[i for i in results_subsets[0].keys() if ("tuned" in i and "XGB" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_xgb = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_xgb.style.apply(negative_bold)


Unnamed: 0,Baseline,XGB_demo_only_tuned,XGB_perfact_only_tuned,XGB_perfact_and_demo_tuned,XGB_all_tuned
0,0.814 (0.004),0.809 (0.004),0.906 (0.003),0.907 (0.004),0.907 (0.003)


In [22]:
res_df_lr.columns = [i[3:-6] if i != "Baseline" else "Baseline" for i in res_df_lr.columns]    
res_df_xgb.columns = [i[4:-6] if i != "Baseline" else "Baseline" for i in res_df_xgb.columns]    

latex_df_subsets = pd.concat([res_df_lr,res_df_xgb],axis=0)
latex_df_subsets.index = ["LR", "XGB"]
latex_df_subsets

Unnamed: 0,Baseline,demo_only,perfact_only,perfact_and_demo,all
LR,0.814 (0.004),0.81 (0.004),0.87 (0.008),0.869 (0.008),0.869 (0.008)
XGB,0.814 (0.004),0.809 (0.004),0.906 (0.003),0.907 (0.004),0.907 (0.003)


In [23]:
print(latex_df_subsets.round(2).transpose().to_latex())


\begin{tabular}{lll}
\toprule
{} &             LR &            XGB \\
\midrule
Baseline         &  0.814 (0.004) &  0.814 (0.004) \\
demo\_only        &   0.81 (0.004) &  0.809 (0.004) \\
perfact\_only     &   0.87 (0.008) &  0.906 (0.003) \\
perfact\_and\_demo &  0.869 (0.008) &  0.907 (0.004) \\
all              &  0.869 (0.008) &  0.907 (0.003) \\
\bottomrule
\end{tabular}



### Feature Importance

In [24]:
# top_10_importances = {}

# for model in list(results_subsets_feature_importances[fold].keys()):
#     imp_df = pd.concat([results_subsets_feature_importances[fold][model] for fold in range(folds)],axis=1)

#     if "LR" in model:
#         direction = imp_df.apply(lambda x: np.sign(x))
#         imp_df = imp_df.abs()

#     imp_df = imp_df/imp_df.sum(axis=0)

#     mean_imp_df = imp_df.mean(axis=1)
#     std_imp_df = imp_df.std(axis=1)

#     mean_imp_df = mean_imp_df.sort_values(ascending=False)
#     std_imp_df = std_imp_df.loc[mean_imp_df.index]
#     final_imps = mean_imp_df[:10]
#     final_imps["Rest"] = sum(mean_imp_df[10:])
#     top_5_importances[model] = np.array([final_imps.index.values, final_imps.values])

In [25]:
demo_importances = {}
demo_importances_stds = {}

for model in list(results_subsets_feature_importances[0].keys()):
    if "demo" in model or "all" in model:
        imp_df_all = pd.concat([results_subsets_feature_importances[fold][model] for fold in range(folds)],axis=1)
        
        if "LR" in model:
            direction = imp_df_all.apply(lambda x: np.sign(x))
            imp_df_all = imp_df_all.abs()
        if imp_df_all.sum().sum()!=0:
            imp_df = imp_df_all/imp_df_all.sum(axis=0)
        imp_df = imp_df.fillna(1/imp_df.shape[0])
#         imp_df = imp_df.loc[demographic_cols]

#         mean_imp_df = imp_df.mean(axis=1)
#         std_imp_df = imp_df.std(axis=1)

#         mean_imp_df = mean_imp_df.sort_values(ascending=False)
#         std_imp_df = std_imp_df.loc[mean_imp_df.index]
#         final_imps = mean_imp_df#[:10]
#         final_imps["Rest"] = sum(mean_imp_df[10:])
#         final_imps["Total"] = sum(mean_imp_df)
        demo_importances[model] = np.round(np.mean(imp_df.loc[[i for i in imp_df.index if any([j in i for j in demographic_cols])]].sum(axis=0)),2)#final_imps.values
        demo_importances_stds[model] = np.round(np.std(imp_df.loc[[i for i in imp_df.index if any([j in i for j in demographic_cols])]].sum(axis=0)),2)#final_imps.values


In [26]:
lr_demo_imp = pd.Series({i: demo_importances[i] for i in demo_importances if "LR" in i and "tuned" in i})
xgb_demo_imp = pd.Series({i: demo_importances[i] for i in demo_importances if "XGB" in i and "tuned" in i})
lr_demo_imp.index = [i[3:-6] for i in lr_demo_imp.index]    
xgb_demo_imp.index = [i[4:-6] for i in xgb_demo_imp.index]    

lr_demo_imp_stds = pd.Series({i: demo_importances_stds[i] for i in demo_importances_stds if "LR" in i and "tuned" in i})
xgb_demo_imp_stds = pd.Series({i: demo_importances_stds[i] for i in demo_importances_stds if "XGB" in i and "tuned" in i})
lr_demo_imp_stds.index = [i[3:-6] for i in lr_demo_imp_stds.index]    
xgb_demo_imp_stds.index = [i[4:-6] for i in xgb_demo_imp_stds.index]    


latex_df_imp = pd.DataFrame([lr_demo_imp.astype(str) + " (" + lr_demo_imp_stds.astype(str) + ")",
                             xgb_demo_imp.astype(str) + " (" + xgb_demo_imp_stds.astype(str) + ")"])
latex_df_imp.index = ["LR", "XGB"]
latex_df_imp

Unnamed: 0,demo_only,perfact_and_demo,all
LR,1.0 (0.0),0.13 (0.0),0.13 (0.0)
XGB,1.0 (0.0),0.08 (0.0),0.08 (0.01)


In [27]:
print(latex_df_subsets.to_latex())

\begin{tabular}{llllll}
\toprule
{} &       Baseline &      demo\_only &   perfact\_only & perfact\_and\_demo &            all \\
\midrule
LR  &  0.814 (0.004) &   0.81 (0.004) &   0.87 (0.008) &    0.869 (0.008) &  0.869 (0.008) \\
XGB &  0.814 (0.004) &  0.809 (0.004) &  0.906 (0.003) &    0.907 (0.004) &  0.907 (0.003) \\
\bottomrule
\end{tabular}



### Mean impact of using demo features

In [28]:
np.random.seed(RS)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

mean_abs_differences = []
for fold in range(folds):
    X_train = data_dict[f"X_train_{fold}"]
    y_train = data_dict[f"y_train_{fold}"]

    X_val = data_dict[f"X_val_{fold}"]
    y_val = data_dict[f"y_val_{fold}"]

    X_test = data_dict[f"X_test_{fold}"]
    y_test = data_dict[f"y_test_{fold}"]

    y_train_val = np.concatenate([y_train,y_val])

    # Define data subset for LR
    z_glmm_encoded_train = data_dict[f"z_glmm_encoded_train_{fold}"] 
    z_glmm_encoded_val = data_dict[f"z_glmm_encoded_val_{fold}"] 
    z_glmm_encoded_test = data_dict[f"z_glmm_encoded_test_{fold}"] 

    X_train_lr = pd.concat([X_train,z_glmm_encoded_train],axis=1)
    X_val_lr = pd.concat([X_val,z_glmm_encoded_val],axis=1)
    X_test_lr = pd.concat([X_test,z_glmm_encoded_test],axis=1)      
    X_train_val_lr = pd.concat([X_train_lr,X_val_lr])

    # Rescale GLMM
    for col in z_glmm_encoded_train.columns:
        z_mean = X_train_val_lr[col].mean()
        z_std = X_train_val_lr[col].std()

        X_train_val_lr[col] = (X_train_val_lr[col]-z_mean)/z_std
        X_test_lr[col] = (X_test_lr[col]-z_mean)/z_std


    # Define data subset for XGB
    z_ordinal_encoded_train = data_dict[f"z_ordinal_encoded_train_{fold}"] 
    z_ordinal_encoded_val = data_dict[f"z_ordinal_encoded_val_{fold}"] 
    z_ordinal_encoded_test = data_dict[f"z_ordinal_encoded_test_{fold}"] 
    X_train_xgb = pd.concat([X_train,z_ordinal_encoded_train],axis=1)
    X_val_xgb = pd.concat([X_val,z_ordinal_encoded_val],axis=1)
    X_test_xgb = pd.concat([X_test,z_ordinal_encoded_test],axis=1)
    X_train_val_xgb = pd.concat([X_train_xgb,X_val_xgb])

    final_hyperparameters = tune_logreg_binary(X_train_val_lr, y_train_val,target, max_evals=max_evals, seed=RS)
    lr = LogisticRegression(penalty="l2",
                                       solver="lbfgs",
                                       C=final_hyperparameters["C"],
                                       max_iter=10000,
                                       random_state=RS
                                       )
#     lr = Lasso(alpha=0.001)
    lr.fit(X_train_val_lr,y_train_val)
    y_pred_logits = sigmoid(lr.predict_proba(X_test_lr)[:,1])
    y_pred = lr.predict(X_test_lr)

    is_not_demo = [i not in demographic_cols for i in X_train_val_lr.columns]
    y_pred_logits_notdemo = sigmoid(np.dot(X_test_lr.loc[:,is_not_demo],lr.coef_[0][is_not_demo]))
    y_pred_notdemo = np.round(y_pred_logits_notdemo)

    print(f"% different predictions w\o Demo: {np.mean(y_pred!=y_pred_notdemo)}")
    print(f"Mean absolute % difference w\o Demo: {np.mean(np.abs(y_pred_logits-y_pred_logits_notdemo))}")
    print(f"RMSE Difference w\o Demo: {np.sqrt(np.mean(np.power(y_pred-y_pred_notdemo,2)))}")
    mean_abs_differences.append(np.mean(y_pred!=y_pred_notdemo))

SCORE: 0.4322915300872288                             
SCORE: 0.43228128842956404                                                      
SCORE: 0.43228948172390097                                                       
SCORE: 0.4322899097654872                                                        
SCORE: 0.43228742570731615                                                       
SCORE: 0.4323942102163515                                                        
SCORE: 0.43228998861173196                                                       
SCORE: 0.43229175160722305                                                       
SCORE: 0.4323931314096486                                                        
SCORE: 0.43229218839084016                                                       
SCORE: 0.4322903296119067                                                         
SCORE: 0.4322850873154495                                                         
SCORE: 0.4322883434266374                 

SCORE: 0.43750222069208855                                                      
SCORE: 0.43768394606962086                                                      
SCORE: 0.43755657955796484                                                      
SCORE: 0.4373439297253034                                                       
100%|██████████| 50/50 [00:28<00:00,  1.78trial/s, best loss: 0.437335071326476]
The best hyperparameters are :  

{'C': 0.03399621392273323}
% different predictions w\o Demo: 0.3333333333333333
Mean absolute % difference w\o Demo: 0.29732279518306154
RMSE Difference w\o Demo: 0.5773502691896257
SCORE: 0.4294369489113656                             
SCORE: 0.42934892700162725                                                      
SCORE: 0.42944578611279083                                                       
SCORE: 0.4293473960244123                                                        
SCORE: 0.42947504162747674                                                     

SCORE: 0.43272473980120785                                                        
SCORE: 0.4327511683553663                                                         
SCORE: 0.43284082326850815                                                        
SCORE: 0.432689372370762                                                          
SCORE: 0.43290162152578854                                                        
SCORE: 0.4328778079751638                                                         
SCORE: 0.4328471053830258                                                         
SCORE: 0.4327692510914588                                                         
SCORE: 0.4328223426536826                                                         
SCORE: 0.43286027026121143                                                        
SCORE: 0.43262490047454555                                                        
100%|██████████| 50/50 [00:31<00:00,  1.60trial/s, best loss: 0.43262403786308923]
The 

In [29]:
np.mean(mean_abs_differences).round(2),np.std(mean_abs_differences).round(2)

(0.33, 0.01)

In [30]:
f1(y_test,y_pred),f1(y_test,y_pred_notdemo)

(0.8568711364717071, 0.6823873121869782)