In [1]:
import sys
sys.path.append("..")

from utils.evaluation import *
from utils.utils import *

from data import dataset_preprocessing

from utils.evaluation import get_metrics
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso

from scipy import stats

import pandas as pd
import numpy as np
import os

import pickle

In [2]:
dataset_name = "hussain"
mode="cv"
RS=1
hct=10
test_ratio=0.2
val_ratio=0.1
folds=5
target = "categorical"
experiment_name = "5CV_paper_final"

### Describe raw data

In [3]:
import arff # make sure to pip install liac-arff



dataset = arff.load(open(f"../data/raw/{dataset_name}/Sapfile1.arff", 'rt'))
df = pd.DataFrame(dataset['data'], columns=[i[0] for i in dataset["attributes"]])


In [4]:
y_col = "esp"
demographic_cols = ["as", "cst", "fmi", "fo", 'fq', 'fs', 'ge', 'ls', 'me', 'mo', 'mq', 'ms', "ss", "tt"]
perf_cols = ["tnp", "twp", "iap"]
activity_cols = ["arr", "sh", "atd"]
other_cols = ['nf'] # no. of friends
set(df.columns)-set([y_col]+demographic_cols+perf_cols+activity_cols+other_cols)

set()

In [5]:
desc_df_dict = {"No. of samples": df.shape[0],
           "No. of features": df.shape[1],
           "Performance features": len(perf_cols),
           "Demographic features": len(demographic_cols),
           "Activity features": len(activity_cols),
           "Other features": len(other_cols),
           "Categorical features": len(df.columns[list(np.logical_and(df.nunique() > 2, df.dtypes == "object"))]),     
           "Total cardinality": df[df.columns[list(np.logical_and(df.nunique() > 2, df.dtypes == "object"))]].nunique().sum(),     
           "% NA": df.isna().sum().sum()/sum(df.shape),
           "Target $\textbf{y} \in$": f"[1..{df[y_col].nunique()}]",
#            "High cardinality levels":  list(df.loc[:,list(df.columns[list(np.logical_and(df.nunique() >= 10, df.dtypes == "object"))])].nunique().sort_values().values),
          
}
desc_df = pd.DataFrame([desc_df_dict],index=["cortez"])
desc_df

Unnamed: 0,No. of samples,No. of features,Performance features,Demographic features,Activity features,Other features,Categorical features,Total cardinality,% NA,Target $\textbf{y} \in$
cortez,131,22,3,14,3,1,16,67,0.0,[1..4]


In [6]:
print(desc_df.transpose().to_latex())

\begin{tabular}{ll}
\toprule
{} &  cortez \\
\midrule
No. of samples          &     131 \\
No. of features         &      22 \\
Performance features    &       3 \\
Demographic features    &      14 \\
Activity features       &       3 \\
Other features          &       1 \\
Categorical features    &      16 \\
Total cardinality       &      67 \\
\% NA                    &     0.0 \\
Target \$\textbackslash textbf\{y\} \textbackslash in\$ &  [1..4] \\
\bottomrule
\end{tabular}



### Preprocessing and preparation

In [7]:
data_path = f"{mode}_RS{RS}_hct{hct}"
if mode == "cv":
    data_path += f"_{folds}folds"
elif mode == "train_test":
    data_path += f"_split{1-test_ratio*100}-{test_ratio*100}"
elif mode == "train_val_test":
    data_path += f"_split{round(100-(test_ratio+val_ratio)*100)}-{round(test_ratio*100)}-{round(val_ratio*100)}"


# If no data_dict for the configuration exists, run preprocessing, else load data_dict
if not os.path.exists(f"../data/prepared/{dataset_name}/"+data_path+"/data_dict.pickle"):
    dataset_preprocessing.process_dataset(dataset_name, target, mode, RS, hct, test_ratio, val_ratio, folds)
with open(f"../data/prepared/{dataset_name}/{data_path}/data_dict.pickle", 'rb') as handle:
        data_dict = pickle.load(handle)


## Evaluation of categorical data treatment methods

In [8]:
conditions = ["ignore", "ohe", "target", "ordinal", "catboost", "glmm"]

In [11]:
early_stopping_rounds = 10
max_evals = 1

In [12]:
if not os.path.exists(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle"):

    results_encodings = {}
    results_encodings_feature_importances = {}

    for fold in range(folds):
        results_encodings[fold] = {}
        results_encodings_feature_importances[fold] = {}
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = data_dict[f"y_test_{fold}"]
        y_train_val = np.concatenate([y_train,y_val])

        u,c = np.unique(y_train_val,return_counts=True)
        nb_classes = len(u)
        baseline = np.argmax(c)

        y_train_val_pred_base = np.ones(y_train_val.shape[0])*baseline
        y_test_pred_base = np.ones(y_test.shape[0])*baseline

        results_encodings[fold]["Baseline"] = {}
        eval_res_train = get_metrics(get_one_hot(y_train_val, nb_classes), get_one_hot(y_train_val_pred_base.astype(int), nb_classes), target=target)
        for metric in eval_res_train.keys():
            results_encodings[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(get_one_hot(y_test, nb_classes), get_one_hot(y_test_pred_base.astype(int), nb_classes), target=target)
        for metric in eval_res_test.keys():
            results_encodings[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]


        for condition in conditions:
            print(f"Preparing results for fold {fold}, condition={condition}")
            # Retrieve data
            z_cols = data_dict["z_cols"]

            X_train = data_dict[f"X_train_{fold}"]
            y_train = data_dict[f"y_train_{fold}"]

            X_val = data_dict[f"X_val_{fold}"]
            y_val = data_dict[f"y_val_{fold}"]

            X_test = data_dict[f"X_test_{fold}"]
            y_test = data_dict[f"y_test_{fold}"]

    ## ALL BUT PERFORMANCE:
            # Define data subset for evaluation
    #         X_train = X_train[[i for i in X_train.columns if all([j not in i for j in perf_cols])]]
    #         X_val = X_val[[i for i in X_val.columns if all([j not in i for j in perf_cols])]]
    #         X_test = X_test[[i for i in X_test.columns if all([j not in i for j in perf_cols])]]

            # Define condition data subset
    #         if condition != "ignore":
    #             z_encoded_train = data_dict[f"z_{condition}_encoded_train_{fold}"] 
    #             z_encoded_val = data_dict[f"z_{condition}_encoded_val_{fold}"] 
    #             z_encoded_test = data_dict[f"z_{condition}_encoded_test_{fold}"] 

    #             X_train = pd.concat([X_train,z_encoded_train],axis=1)
    #             X_val = pd.concat([X_val,z_encoded_val],axis=1)
    #             X_test = pd.concat([X_test,z_encoded_test],axis=1)

    # ALL BUT PERFORMANCE & ACTIVITY:
    #         Define data subset for evaluation
            X_train = X_train[[i for i in X_train.columns if all([j not in i for j in perf_cols+activity_cols])]]
            X_val = X_val[[i for i in X_val.columns if all([j not in i for j in perf_cols+activity_cols])]]
            X_test = X_test[[i for i in X_test.columns if all([j not in i for j in perf_cols+activity_cols])]]

    #         Define condition data subset
            if condition != "ignore":
                z_encoded_train = data_dict[f"z_{condition}_encoded_train_{fold}"] 
                z_encoded_val = data_dict[f"z_{condition}_encoded_val_{fold}"] 
                z_encoded_test = data_dict[f"z_{condition}_encoded_test_{fold}"] 

                X_train = pd.concat([X_train,z_encoded_train],axis=1)
                X_val = pd.concat([X_val,z_encoded_val],axis=1)
                X_test = pd.concat([X_test,z_encoded_test],axis=1)


    ## ONLY CATEGORICAL: --> Produces trash as almost never better than baseline
    #         if condition != "ignore":        
    #             X_train = data_dict[f"z_{condition}_encoded_train_{fold}"] 
    #             X_val = data_dict[f"z_{condition}_encoded_val_{fold}"] 
    #             X_test = data_dict[f"z_{condition}_encoded_test_{fold}"] 

    #         else:
    #             continue

            X_train_val = pd.concat([X_train,X_val])
            y_train_val = np.concatenate([y_train,y_val])

            # Train base models
            res, feats = evaluate_logreg(X_train_val, y_train_val, X_test, y_test, target=target,tune=False, seed=RS)
            results_encodings[fold]["LR_"+condition] = res
            results_encodings_feature_importances[fold]["LR_"+condition] = feats

            res, feats = evaluate_xgb(X_train_val, y_train_val, X_test, y_test, target, tune=False, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_encodings[fold]["XGB_"+condition] = res
            results_encodings_feature_importances[fold]["XGB_"+condition] = feats

            # Train tuned models
            res, feats = evaluate_logreg(X_train_val, y_train_val, X_test, y_test, target=target, max_evals=max_evals, tune=True, seed=RS)
            results_encodings[fold]["LR_"+condition+"_tuned"] = res
            results_encodings_feature_importances[fold]["LR_"+condition+"_tuned"] = feats

            res, feats = evaluate_xgb(X_train_val, y_train_val, X_test, y_test, target, tune=True, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_encodings[fold]["XGB_"+condition+"_tuned"] = res
            results_encodings_feature_importances[fold]["XGB_"+condition+"_tuned"] = feats
    
    if not os.path.exists(f"../results/{dataset_name}/{experiment_name}"):
        os.makedirs(f"../results/{dataset_name}/{experiment_name}")
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle", 'wb') as handle:
        pickle.dump(results_encodings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings_feature_importances.pickle", 'wb') as handle:
        pickle.dump(results_encodings_feature_importances, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle", 'rb') as handle:
        results_encodings = pickle.load(handle)
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings_feature_importances.pickle", 'rb') as handle:
        results_encodings_feature_importances = pickle.load(handle)
        
        
results_encodings_df = pd.DataFrame(results_encodings[0]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_encodings_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Preparing results for fold 0, condition=ignore
SCORE: 1.1886530664996733                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 35.71trial/s, best loss: 1.1886530664996733]
The best hyperparameters are :  

{'C': 0.055522167024990354}
Default performance on Test: 1.3305249133874055
SCORE: 1.241719925056295                                                                                               
100%|███████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.79trial/s, best loss: 1.241719925056295]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.4426839592573333, 'n_estimators': 305.0}
Test Performance after first tuning round: 1.4913895531475576
SCORE: 1.2140397339616196                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:0

100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.61trial/s, best loss: 1.0725422225737646]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.18202223522796737, 'n_estimators': 215.0, 'seed': 0, 'max_depth': 15.0, 'min_child_weight': 3.0}
Test Performance after second tuning round: 0.8983303665905547
SCORE: 1.0326391664919343                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.93trial/s, best loss: 1.0326391664919343]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.18202223522796737, 'n_estimators': 215.0, 'seed': 0, 'max_depth': 15.0, 'min_child_weight': 3.0, 'colsample_bytree': 0.6422528723260953, 'subsample': 0.6313668683972973}
Test Performance after third tuning round: 0.9647630779605811
SCORE: 1.2499736218924864                                                                                         

SCORE: 1.3845897076865545                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.12trial/s, best loss: 1.3845897076865545]
The best hyperparameters are :  

{'learning_rate': 0.0227307642018375, 'n_estimators': 419.0, 'seed': 0, 'max_depth': 4.0, 'min_child_weight': 4.0, 'colsample_bytree': 0.8957691336420902, 'subsample': 0.9623777384321047, 'gamma': 3.393384843217633, 'reg_alpha': 8.0, 'reg_lambda': 3.779063190549759}
Test Performance after last tuning round: 1.3210486163206236
Preparing results for fold 1, condition=ohe
SCORE: 0.8691431798733763                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.98trial/s, best loss: 0.8691431798733763]
The best hyperparameters are :  

{'C': 0.2730547940486917}
Default performance on Test: 1.5188810029269446
SCORE:

SCORE: 1.192321348152122                                                                                               
100%|███████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.83trial/s, best loss: 1.192321348152122]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.27188907939550405, 'n_estimators': 149.0}
Test Performance after first tuning round: 1.8192365153066818
SCORE: 1.168705431194378                                                                                               
100%|███████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.15trial/s, best loss: 1.168705431194378]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.27188907939550405, 'n_estimators': 149.0, 'seed': 0, 'max_depth': 7.0, 'min_child_weight': 5.0}
Test Performance after second tuning round: 1.6651379000338575
SCORE: 1.1503592368871889                                                                                              
10

SCORE: 1.3219954580091116                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.54trial/s, best loss: 1.3219954580091116]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.031025240948914685, 'n_estimators': 88.0, 'seed': 0, 'max_depth': 9.0, 'min_child_weight': 2.0, 'colsample_bytree': 0.7038561526341564, 'subsample': 0.5966973006706313}
Test Performance after third tuning round: 0.8994728453127354
SCORE: 1.3748060828905073                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.65trial/s, best loss: 1.3748060828905073]
The best hyperparameters are :  

{'learning_rate': 0.031025240948914685, 'n_estimators': 88.0, 'seed': 0, 'max_depth': 9.0, 'min_child_weight': 2.0, 'colsample_bytree': 0.7038561526341564, 'subsample': 0.5966973006706313,

Test Performance after last tuning round: 1.1750486921420964
Preparing results for fold 2, condition=glmm
SCORE: 0.8350927580725195                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.40trial/s, best loss: 0.8350927580725195]
The best hyperparameters are :  

{'C': 0.33365650399746993}
Default performance on Test: 1.1812612223438603
SCORE: 1.1811235584298394                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.86trial/s, best loss: 1.1811235584298394]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.2514609293272148, 'n_estimators': 389.0}
Test Performance after first tuning round: 1.2277540735586772
SCORE: 1.1202814750873187                                                                                              
100%|█████████

SCORE: 1.2343060362194085                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.78trial/s, best loss: 1.2343060362194085]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.07025861066831315, 'n_estimators': 153.0, 'seed': 0, 'max_depth': 11.0, 'min_child_weight': 6.0}
Test Performance after second tuning round: 1.0377734366091027
SCORE: 1.1416583011649917                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.63trial/s, best loss: 1.1416583011649917]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.07025861066831315, 'n_estimators': 153.0, 'seed': 0, 'max_depth': 11.0, 'min_child_weight': 6.0, 'colsample_bytree': 0.9500002375809609, 'subsample': 0.8448622810427866}
Test Performance after third tuning round: 1.07547817279

SCORE: 1.372047151238121                                                                                               
100%|███████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.38trial/s, best loss: 1.372047151238121]
The best hyperparameters are :  

{'learning_rate': 0.1750417468390788, 'n_estimators': 272.0, 'seed': 0, 'max_depth': 4.0, 'min_child_weight': 4.0, 'colsample_bytree': 0.9325921342424375, 'subsample': 0.7001895279679604, 'gamma': 6.910156556412882, 'reg_alpha': 4.0, 'reg_lambda': 2.5515511719729638}
Test Performance after last tuning round: 1.1825487046158554
Preparing results for fold 4, condition=ignore
SCORE: 1.1986935566883763                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 30.30trial/s, best loss: 1.1986935566883763]
The best hyperparameters are :  

{'C': 0.6625397169186523}
Default performance on Test: 1.488985227233453
SCO

SCORE: 1.0080844653353245                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.55trial/s, best loss: 1.0080844653353245]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.3583877038506449, 'n_estimators': 478.0}
Test Performance after first tuning round: 1.3976144241391741
SCORE: 1.1083569022478104                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.55trial/s, best loss: 1.1083569022478104]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.3583877038506449, 'n_estimators': 478.0, 'seed': 0, 'max_depth': 17.0, 'min_child_weight': 5.0}
Test Performance after second tuning round: 1.5032019229637388
SCORE: 1.077919318562937                                                                                               
100

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
XGB_target,1.0,1.0,1.0,0.7037,0.7675,0.8706
LR_glmm,0.8462,0.8385,0.9687,0.5926,0.6806,0.8661
LR_ordinal_tuned,0.7692,0.7983,0.9273,0.5185,0.6303,0.8427
LR_ordinal,0.8077,0.8272,0.9347,0.4815,0.6036,0.8348
LR_ohe,0.9231,0.9208,0.9921,0.7778,0.5985,0.9183
XGB_ohe,1.0,1.0,1.0,0.6667,0.5083,0.8564
XGB_glmm,1.0,1.0,1.0,0.6296,0.4865,0.8394
XGB_ordinal,1.0,1.0,1.0,0.5926,0.4546,0.8791
LR_target,0.7981,0.8034,0.9317,0.5556,0.4067,0.8721
XGB_ordinal_tuned,0.625,0.4151,0.8445,0.5556,0.4038,0.7006


In [13]:
results_encodings_df = pd.DataFrame(results_encodings[0]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_encodings_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
XGB_target,1.0,1.0,1.0,0.7037,0.7675,0.8706
LR_glmm,0.8462,0.8385,0.9687,0.5926,0.6806,0.8661
LR_ordinal_tuned,0.7692,0.7983,0.9273,0.5185,0.6303,0.8427
LR_ordinal,0.8077,0.8272,0.9347,0.4815,0.6036,0.8348
LR_ohe,0.9231,0.9208,0.9921,0.7778,0.5985,0.9183
XGB_ohe,1.0,1.0,1.0,0.6667,0.5083,0.8564
XGB_glmm,1.0,1.0,1.0,0.6296,0.4865,0.8394
XGB_ordinal,1.0,1.0,1.0,0.5926,0.4546,0.8791
LR_target,0.7981,0.8034,0.9317,0.5556,0.4067,0.8721
XGB_ordinal_tuned,0.625,0.4151,0.8445,0.5556,0.4038,0.7006


### Effectiveness of Parameter Tuning


In [14]:
models = results_encodings[0].keys()
metric = "F1 Test"

encodings_folds_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())
encodings_mean_df = encodings_folds_df.mean(axis=0)
encodings_std_df = encodings_folds_df.std(axis=0)

methods = sorted(list(encodings_mean_df.index))[1:]
not_tuned = ["Baseline"]+methods[::2]
tuned = ["Baseline"]+methods[1::2]

res_df_tune_comp_mean = pd.DataFrame([encodings_mean_df.loc[not_tuned].values,encodings_mean_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()
res_df_tune_comp_std = pd.DataFrame([encodings_std_df.loc[not_tuned].values,encodings_std_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()

res_df_tune_comp_mean.round(2).style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,Untuned,Tuned
Baseline,0.14,0.14
LR_catboost,0.28,0.28
LR_glmm,0.52,0.44
LR_ignore,0.29,0.25
LR_ohe,0.48,0.41
LR_ordinal,0.5,0.51
LR_target,0.53,0.4
XGB_catboost,0.24,0.16
XGB_glmm,0.52,0.25
XGB_ignore,0.25,0.14


In [15]:
latex_df = res_df_tune_comp_mean.round(2).astype(str) + " (" +  res_df_tune_comp_std.round(3).astype(str) + ")"
latex_df

Unnamed: 0,Untuned,Tuned
Baseline,0.14 (0.026),0.14 (0.026)
LR_catboost,0.28 (0.032),0.28 (0.032)
LR_glmm,0.52 (0.147),0.44 (0.113)
LR_ignore,0.29 (0.111),0.25 (0.136)
LR_ohe,0.48 (0.151),0.41 (0.14)
LR_ordinal,0.5 (0.077),0.51 (0.091)
LR_target,0.53 (0.15),0.4 (0.027)
XGB_catboost,0.24 (0.053),0.16 (0.027)
XGB_glmm,0.52 (0.088),0.25 (0.112)
XGB_ignore,0.25 (0.094),0.14 (0.026)


In [16]:
print(latex_df.to_latex())

\begin{tabular}{lll}
\toprule
{} &       Untuned &         Tuned \\
\midrule
Baseline     &  0.14 (0.026) &  0.14 (0.026) \\
LR\_catboost  &  0.28 (0.032) &  0.28 (0.032) \\
LR\_glmm      &  0.52 (0.147) &  0.44 (0.113) \\
LR\_ignore    &  0.29 (0.111) &  0.25 (0.136) \\
LR\_ohe       &  0.48 (0.151) &   0.41 (0.14) \\
LR\_ordinal   &   0.5 (0.077) &  0.51 (0.091) \\
LR\_target    &   0.53 (0.15) &   0.4 (0.027) \\
XGB\_catboost &  0.24 (0.053) &  0.16 (0.027) \\
XGB\_glmm     &  0.52 (0.088) &  0.25 (0.112) \\
XGB\_ignore   &  0.25 (0.094) &  0.14 (0.026) \\
XGB\_ohe      &  0.51 (0.082) &  0.21 (0.116) \\
XGB\_ordinal  &  0.51 (0.081) &  0.31 (0.102) \\
XGB\_target   &  0.58 (0.128) &  0.26 (0.105) \\
\bottomrule
\end{tabular}



### Performance Comparison

In [17]:
# For LR
models = ["Baseline"]+[i for i in results_encodings[0].keys() if ("tuned" in i and "LR" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_lr = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_lr.style.apply(negative_bold)


Unnamed: 0,Baseline,LR_ignore_tuned,LR_ohe_tuned,LR_target_tuned,LR_ordinal_tuned,LR_catboost_tuned,LR_glmm_tuned
0,0.145 (0.026),0.253 (0.136),0.408 (0.14),0.395 (0.027),0.51 (0.091),0.282 (0.032),0.435 (0.113)


In [18]:
# For LR
models = ["Baseline"]+[i for i in results_encodings[0].keys() if ("tuned" in i and "XGB" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_xgb = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_xgb.style.apply(negative_bold)


Unnamed: 0,Baseline,XGB_ignore_tuned,XGB_ohe_tuned,XGB_target_tuned,XGB_ordinal_tuned,XGB_catboost_tuned,XGB_glmm_tuned
0,0.145 (0.026),0.145 (0.026),0.21 (0.116),0.262 (0.105),0.312 (0.102),0.161 (0.027),0.254 (0.112)


In [19]:
res_df_lr.columns = [i.split("_")[1] if i != "Baseline" else "Baseline" for i in res_df_lr.columns]    
res_df_xgb.columns = [i.split("_")[1] if i != "Baseline" else "Baseline" for i in res_df_xgb.columns]    

latex_df_encodings = pd.concat([res_df_lr,res_df_xgb],axis=0)
latex_df_encodings.index = ["LR", "XGB"]
latex_df_encodings

Unnamed: 0,Baseline,ignore,ohe,target,ordinal,catboost,glmm
LR,0.145 (0.026),0.253 (0.136),0.408 (0.14),0.395 (0.027),0.51 (0.091),0.282 (0.032),0.435 (0.113)
XGB,0.145 (0.026),0.145 (0.026),0.21 (0.116),0.262 (0.105),0.312 (0.102),0.161 (0.027),0.254 (0.112)


In [20]:
print(latex_df_encodings.round(2).to_latex())


\begin{tabular}{llllllll}
\toprule
{} &       Baseline &         ignore &           ohe &         target &        ordinal &       catboost &           glmm \\
\midrule
LR  &  0.145 (0.026) &  0.253 (0.136) &  0.408 (0.14) &  0.395 (0.027) &   0.51 (0.091) &  0.282 (0.032) &  0.435 (0.113) \\
XGB &  0.145 (0.026) &  0.145 (0.026) &  0.21 (0.116) &  0.262 (0.105) &  0.312 (0.102) &  0.161 (0.027) &  0.254 (0.112) \\
\bottomrule
\end{tabular}



## Data Subset Comparisons

In [21]:
subsets = {"demo_only": demographic_cols,
           "performance_only": perf_cols,
           "activity_only": activity_cols,
           "activity_and_demo": activity_cols+demographic_cols,
           "performance_and_demo": perf_cols+demographic_cols,
           "all": list(df.columns)
          }

In [22]:
if not os.path.exists(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle"):

    results_subsets = {}
    results_subsets_feature_importances = {}

    for fold in range(folds):
        results_subsets[fold] = {}
        results_subsets_feature_importances[fold] = {}
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = data_dict[f"y_test_{fold}"]
        y_train_val = np.concatenate([y_train,y_val])

        u,c = np.unique(y_train_val,return_counts=True)
        nb_classes = len(u)
        baseline = np.argmax(c)

        y_train_val_pred_base = np.ones(y_train_val.shape[0])*baseline
        y_test_pred_base = np.ones(y_test.shape[0])*baseline

        results_subsets[fold]["Baseline"] = {}
        eval_res_train = get_metrics(get_one_hot(y_train_val, nb_classes), get_one_hot(y_train_val_pred_base.astype(int), nb_classes), target=target)
        for metric in eval_res_train.keys():
            results_subsets[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(get_one_hot(y_test, nb_classes), get_one_hot(y_test_pred_base.astype(int), nb_classes), target=target)
        for metric in eval_res_test.keys():
            results_subsets[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]


        for subset_key in subsets:
            print(f"Preparing results for fold {fold}, subset={subset_key}")
            # Retrieve data
            z_cols = data_dict["z_cols"]

            X_train = data_dict[f"X_train_{fold}"]
            y_train = data_dict[f"y_train_{fold}"]

            X_val = data_dict[f"X_val_{fold}"]
            y_val = data_dict[f"y_val_{fold}"]

            X_test = data_dict[f"X_test_{fold}"]
            y_test = data_dict[f"y_test_{fold}"]
        
            y_train_val = np.concatenate([y_train,y_val])

            # Define data subset for LR
            z_glmm_encoded_train = data_dict[f"z_glmm_encoded_train_{fold}"] 
            z_glmm_encoded_val = data_dict[f"z_glmm_encoded_val_{fold}"] 
            z_glmm_encoded_test = data_dict[f"z_glmm_encoded_test_{fold}"] 
            X_train_lr = pd.concat([X_train,z_glmm_encoded_train],axis=1)
            X_val_lr = pd.concat([X_val,z_glmm_encoded_val],axis=1)
            X_test_lr = pd.concat([X_test,z_glmm_encoded_test],axis=1)      
            X_train_val_lr = pd.concat([X_train_lr,X_val_lr])

            # Define data subset for XGB
            z_ordinal_encoded_train = data_dict[f"z_ordinal_encoded_train_{fold}"] 
            z_ordinal_encoded_val = data_dict[f"z_ordinal_encoded_val_{fold}"] 
            z_ordinal_encoded_test = data_dict[f"z_ordinal_encoded_test_{fold}"] 
            X_train_xgb = pd.concat([X_train,z_ordinal_encoded_train],axis=1)
            X_val_xgb = pd.concat([X_val,z_ordinal_encoded_val],axis=1)
            X_test_xgb = pd.concat([X_test,z_ordinal_encoded_test],axis=1)
            X_train_val_xgb = pd.concat([X_train_xgb,X_val_xgb])


            # Define data subset for evaluation
            X_train_val_lr = X_train_val_lr[[i for i in X_train_val_lr.columns if any([j in i for j in subsets[subset_key]])]]
            X_test_lr = X_test_lr[[i for i in X_test_lr.columns if any([j in i for j in subsets[subset_key]])]]
            X_train_val_xgb = X_train_val_xgb[[i for i in X_train_val_xgb.columns if any([j in i for j in subsets[subset_key]])]]
            X_test_xgb = X_test_xgb[[i for i in X_test_xgb.columns if any([j in i for j in subsets[subset_key]])]]


            # Train base models
            res, feats = evaluate_logreg(X_train_val_lr, y_train_val, X_test_lr, y_test, target=target,tune=False, seed=RS)
            results_subsets[fold]["LR_"+subset_key] = res
            results_subsets_feature_importances[fold]["LR_"+subset_key] = feats

            res, feats = evaluate_xgb(X_train_val_xgb, y_train_val, X_test_xgb, y_test, target, tune=False, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_subsets[fold]["XGB_"+subset_key] = res
            results_subsets_feature_importances[fold]["XGB_"+subset_key] = feats

            # Train tuned models
            res, feats = evaluate_logreg(X_train_val_lr, y_train_val, X_test_lr, y_test, target=target, max_evals=max_evals, tune=True, seed=RS)
            results_subsets[fold]["LR_"+subset_key+"_tuned"] = res
            results_subsets_feature_importances[fold]["LR_"+subset_key+"_tuned"] = feats

            res, feats = evaluate_xgb(X_train_val_xgb, y_train_val, X_test_xgb, y_test, target, tune=True, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_subsets[fold]["XGB_"+subset_key+"_tuned"] = res
            results_subsets_feature_importances[fold]["XGB_"+subset_key+"_tuned"] = feats
    
    if not os.path.exists(f"../results/{dataset_name}/{experiment_name}"):
        os.makedirs(f"../results/{dataset_name}/{experiment_name}")
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle", 'wb') as handle:
        pickle.dump(results_subsets, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets_feature_importances.pickle", 'wb') as handle:
        pickle.dump(results_subsets_feature_importances, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle", 'rb') as handle:
        results_subsets = pickle.load(handle)
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets_feature_importances.pickle", 'rb') as handle:
        results_subsets_feature_importances = pickle.load(handle)
        
        
results_subsets_df = pd.DataFrame(results_subsets[0]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_subsets_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Preparing results for fold 0, subset=demo_only
SCORE: 0.9153996771988113                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.60trial/s, best loss: 0.9153996771988113]
The best hyperparameters are :  

{'C': 0.752515516368836}
Default performance on Test: 1.854904579092599
SCORE: 1.2706343938526703                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.70trial/s, best loss: 1.2706343938526703]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.20423751643165355, 'n_estimators': 267.0}
Test Performance after first tuning round: 2.06854902737716
SCORE: 1.295591273318283                                                                                               
100%|███████████████████████████████████████████████████| 1/1 [00:00<00:00,  

100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.36trial/s, best loss: 1.1081276782436131]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.3612664035823087, 'n_estimators': 88.0, 'seed': 0, 'max_depth': 13.0, 'min_child_weight': 4.0}
Test Performance after second tuning round: 1.0409916433547988
SCORE: 1.0987592671820605                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.65trial/s, best loss: 1.0987592671820605]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.3612664035823087, 'n_estimators': 88.0, 'seed': 0, 'max_depth': 13.0, 'min_child_weight': 4.0, 'colsample_bytree': 0.88785368647772, 'subsample': 0.6311315095369214}
Test Performance after third tuning round: 1.025262927672765
SCORE: 1.3228139898213516                                                                                              
1

SCORE: 1.3620049240405963                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.75trial/s, best loss: 1.3620049240405963]
The best hyperparameters are :  

{'learning_rate': 0.37835321399535604, 'n_estimators': 135.0, 'seed': 0, 'max_depth': 13.0, 'min_child_weight': 9.0, 'colsample_bytree': 0.950401788304069, 'subsample': 0.9768179513646851, 'gamma': 1.568581674379195, 'reg_alpha': 9.0, 'reg_lambda': 2.987315422198936}
Test Performance after last tuning round: 1.3196863195580137
Preparing results for fold 1, subset=performance_only
SCORE: 1.1553035210403073                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 38.45trial/s, best loss: 1.1553035210403073]
The best hyperparameters are :  

{'C': 0.06539897457192942}
Default performance on Test: 0.95589693108

Default performance on Test: 1.1075855774280081
SCORE: 1.0977278099607708                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.72trial/s, best loss: 1.0977278099607708]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.23883918530327486, 'n_estimators': 173.0}
Test Performance after first tuning round: 1.1118675036562555
SCORE: 1.0723309246938224                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.72trial/s, best loss: 1.0723309246938224]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.23883918530327486, 'n_estimators': 173.0, 'seed': 0, 'max_depth': 5.0, 'min_child_weight': 1.0}
Test Performance after second tuning round: 1.0314023369805776
SCORE: 0.9856608072042207                                                 

SCORE: 1.3213906321473299                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.78trial/s, best loss: 1.3213906321473299]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.013346486186740673, 'n_estimators': 448.0, 'seed': 0, 'max_depth': 8.0, 'min_child_weight': 2.0, 'colsample_bytree': 0.9061075416898151, 'subsample': 0.6855037946075293}
Test Performance after third tuning round: 0.9314471495545913
SCORE: 1.3860084470426597                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.85trial/s, best loss: 1.3860084470426597]
The best hyperparameters are :  

{'learning_rate': 0.013346486186740673, 'n_estimators': 448.0, 'seed': 0, 'max_depth': 8.0, 'min_child_weight': 2.0, 'colsample_bytree': 0.9061075416898151, 'subsample': 0.685503794607529

Test Performance after last tuning round: 1.2151936686244715
Preparing results for fold 2, subset=all
SCORE: 0.8510820710720359                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 22.22trial/s, best loss: 0.8510820710720359]
The best hyperparameters are :  

{'C': 0.1644399714249691}
Default performance on Test: 0.9692623948336883
SCORE: 1.198768694516285                                                                                               
100%|███████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.91trial/s, best loss: 1.198768694516285]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.05673378115449451, 'n_estimators': 374.0}
Test Performance after first tuning round: 0.9650246930649521
SCORE: 1.1025354957666278                                                                                              
100%|█████████████

SCORE: 1.2954518427502504                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.71trial/s, best loss: 1.2954518427502504]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.07738025912171735, 'n_estimators': 115.0, 'seed': 0, 'max_depth': 14.0, 'min_child_weight': 0.0}
Test Performance after second tuning round: 1.5090994040566947
SCORE: 1.2709296021029577                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.53trial/s, best loss: 1.2709296021029577]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.07738025912171735, 'n_estimators': 115.0, 'seed': 0, 'max_depth': 14.0, 'min_child_weight': 0.0, 'colsample_bytree': 0.6203765151150111, 'subsample': 0.903117188348646}
Test Performance after third tuning round: 1.207681148546

Test Performance after third tuning round: 1.5995122314133046
SCORE: 1.2746216466830336                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.33trial/s, best loss: 1.2746216466830336]
The best hyperparameters are :  

{'learning_rate': 0.4847358406569528, 'n_estimators': 377.0, 'seed': 0, 'max_depth': 11.0, 'min_child_weight': 1.0, 'colsample_bytree': 0.563415136376966, 'subsample': 0.8804889117626642, 'gamma': 5.097567911277314, 'reg_alpha': 5.0, 'reg_lambda': 2.597477666618161}
Test Performance after last tuning round: 1.167883695877595
Preparing results for fold 4, subset=demo_only
SCORE: 0.942201292674761                                                                                               
100%|███████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.83trial/s, best loss: 0.942201292674761]
The best hyperparameters are :  

{'C': 0.52026939

100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.99trial/s, best loss: 0.9349540192697582]
The best hyperparameters are :  

{'C': 0.42784316717513793}
Default performance on Test: 1.5145929673704077
SCORE: 1.2207554047365963                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.61trial/s, best loss: 1.2207554047365963]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.2576104966589553, 'n_estimators': 312.0}
Test Performance after first tuning round: 1.6682766734209067
SCORE: 1.2474876524095522                                                                                              
100%|██████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.76trial/s, best loss: 1.2474876524095522]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.2576104966589553, 'n_estimators': 312.0, 'seed': 0, '

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
LR_all_tuned,0.8558,0.8487,0.9687,0.5926,0.6867,0.8848
LR_all,0.8654,0.8554,0.9719,0.5926,0.6867,0.8783
LR_performance_and_demo,0.8365,0.8318,0.9589,0.5926,0.6806,0.8596
LR_performance_only,0.5673,0.4897,0.7624,0.5556,0.6373,0.8989
LR_performance_only_tuned,0.5673,0.4897,0.7592,0.5556,0.6373,0.8989
XGB_performance_only,0.7692,0.7598,0.9288,0.5185,0.6102,0.8273
XGB_all,1.0,1.0,1.0,0.6296,0.4865,0.8974
LR_activity_and_demo_tuned,0.7885,0.7454,0.9488,0.6296,0.4855,0.8672
LR_activity_and_demo,0.8173,0.7937,0.9514,0.6296,0.4855,0.8672
XGB_performance_and_demo,1.0,1.0,1.0,0.5926,0.4546,0.8469


In [23]:
results_subsets_df = pd.DataFrame(results_subsets[0]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_subsets_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
LR_all_tuned,0.8558,0.8487,0.9687,0.5926,0.6867,0.8848
LR_all,0.8654,0.8554,0.9719,0.5926,0.6867,0.8783
LR_performance_and_demo,0.8365,0.8318,0.9589,0.5926,0.6806,0.8596
LR_performance_only,0.5673,0.4897,0.7624,0.5556,0.6373,0.8989
LR_performance_only_tuned,0.5673,0.4897,0.7592,0.5556,0.6373,0.8989
XGB_performance_only,0.7692,0.7598,0.9288,0.5185,0.6102,0.8273
XGB_all,1.0,1.0,1.0,0.6296,0.4865,0.8974
LR_activity_and_demo_tuned,0.7885,0.7454,0.9488,0.6296,0.4855,0.8672
LR_activity_and_demo,0.8173,0.7937,0.9514,0.6296,0.4855,0.8672
XGB_performance_and_demo,1.0,1.0,1.0,0.5926,0.4546,0.8469


### Effectiveness of Parameter Tuning

In [24]:
models = results_subsets[0].keys()
metric = "F1 Test"

subsets_folds_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())
subsets_mean_df = subsets_folds_df.mean(axis=0)
subsets_std_df = subsets_folds_df.std(axis=0)

methods = sorted(list(subsets_mean_df.index))[1:]
not_tuned = ["Baseline"]+methods[::2]
tuned = ["Baseline"]+methods[1::2]

res_df_tune_comp_mean = pd.DataFrame([subsets_mean_df.loc[not_tuned].values,subsets_mean_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()
res_df_tune_comp_std = pd.DataFrame([subsets_std_df.loc[not_tuned].values,subsets_std_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()

res_df_tune_comp_mean.round(2).style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,Untuned,Tuned
Baseline,0.14,0.14
LR_activity_and_demo,0.44,0.43
LR_activity_only,0.4,0.39
LR_all,0.52,0.52
LR_demo_only,0.42,0.44
LR_performance_and_demo,0.5,0.46
LR_performance_only,0.43,0.38
XGB_activity_and_demo,0.37,0.15
XGB_activity_only,0.29,0.14
XGB_all,0.53,0.22


In [25]:
latex_df = res_df_tune_comp_mean.round(2).astype(str) + " (" +  res_df_tune_comp_std.round(3).astype(str) + ")"
latex_df

Unnamed: 0,Untuned,Tuned
Baseline,0.14 (0.026),0.14 (0.026)
LR_activity_and_demo,0.44 (0.095),0.43 (0.084)
LR_activity_only,0.4 (0.105),0.39 (0.099)
LR_all,0.52 (0.111),0.52 (0.112)
LR_demo_only,0.42 (0.106),0.44 (0.108)
LR_performance_and_demo,0.5 (0.163),0.46 (0.125)
LR_performance_only,0.43 (0.125),0.38 (0.169)
XGB_activity_and_demo,0.37 (0.082),0.15 (0.025)
XGB_activity_only,0.29 (0.055),0.14 (0.026)
XGB_all,0.53 (0.07),0.22 (0.103)


In [26]:
res_df_tune_comp_diff = res_df_tune_comp_mean[["Tuned"]]-res_df_tune_comp_mean[["Untuned"]].values
res_df_tune_comp_diff.round(2)

Unnamed: 0,Tuned
Baseline,0.0
LR_activity_and_demo,-0.0
LR_activity_only,-0.01
LR_all,-0.0
LR_demo_only,0.03
LR_performance_and_demo,-0.05
LR_performance_only,-0.05
XGB_activity_and_demo,-0.23
XGB_activity_only,-0.15
XGB_all,-0.32


In [27]:
res_df_tune_comp_diff_lr = res_df_tune_comp_diff.loc[[i for i in res_df_tune_comp_diff.index if (i == "Baseline" or "LR" in i)]]
res_df_tune_comp_diff_xgb = res_df_tune_comp_diff.loc[[i for i in res_df_tune_comp_diff.index if (i == "Baseline" or "XGB" in i)]]

res_df_tune_comp_diff_lr.index = [i[3:] if i!="Baseline" else "Baseline" for i in res_df_tune_comp_diff_lr.index]
res_df_tune_comp_diff_xgb.index = [i[4:] if i!="Baseline" else "Baseline" for i in res_df_tune_comp_diff_xgb.index]
res_df_tune_comp_diff_xgb

Unnamed: 0,Tuned
Baseline,0.0
activity_and_demo,-0.228252
activity_only,-0.146236
all,-0.316066
demo_only,-0.129091
performance_and_demo,-0.226843
performance_only,-0.312727


In [28]:
latex_df_diff = pd.concat([res_df_tune_comp_diff_lr,res_df_tune_comp_diff_xgb],axis=1)
latex_df_diff.columns = ["LR", "XGB"]
latex_df_diff

Unnamed: 0,LR,XGB
Baseline,0.0,0.0
activity_and_demo,-0.0026,-0.228252
activity_only,-0.005416,-0.146236
all,-0.002858,-0.316066
demo_only,0.028533,-0.129091
performance_and_demo,-0.045445,-0.226843
performance_only,-0.045952,-0.312727


In [29]:
print(latex_df_diff.round(2).to_latex())

\begin{tabular}{lrr}
\toprule
{} &    LR &   XGB \\
\midrule
Baseline             &  0.00 &  0.00 \\
activity\_and\_demo    & -0.00 & -0.23 \\
activity\_only        & -0.01 & -0.15 \\
all                  & -0.00 & -0.32 \\
demo\_only            &  0.03 & -0.13 \\
performance\_and\_demo & -0.05 & -0.23 \\
performance\_only     & -0.05 & -0.31 \\
\bottomrule
\end{tabular}



### Performance Results

In [30]:
# For LR
models = ["Baseline"]+[i for i in results_subsets[0].keys() if ("tuned" in i and "LR" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_lr = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_lr.style.apply(negative_bold)


Unnamed: 0,Baseline,LR_demo_only_tuned,LR_performance_only_tuned,LR_activity_only_tuned,LR_activity_and_demo_tuned,LR_performance_and_demo_tuned,LR_all_tuned
0,0.145 (0.026),0.445 (0.108),0.379 (0.169),0.39 (0.099),0.433 (0.084),0.455 (0.125),0.52 (0.112)


In [31]:
# For XGB
models = ["Baseline"]+[i for i in results_subsets[0].keys() if ("tuned" in i and "XGB" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_xgb = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_xgb.style.apply(negative_bold)


Unnamed: 0,Baseline,XGB_demo_only_tuned,XGB_performance_only_tuned,XGB_activity_only_tuned,XGB_activity_and_demo_tuned,XGB_performance_and_demo_tuned,XGB_all_tuned
0,0.145 (0.026),0.169 (0.074),0.262 (0.111),0.145 (0.026),0.145 (0.025),0.264 (0.114),0.216 (0.103)


In [32]:
res_df_lr.columns = [i[3:-6] if i != "Baseline" else "Baseline" for i in res_df_lr.columns]    
res_df_xgb.columns = [i[4:-6] if i != "Baseline" else "Baseline" for i in res_df_xgb.columns]    

latex_df_subsets = pd.concat([res_df_lr,res_df_xgb],axis=0)
latex_df_subsets.index = ["LR", "XGB"]
latex_df_subsets

Unnamed: 0,Baseline,demo_only,performance_only,activity_only,activity_and_demo,performance_and_demo,all
LR,0.145 (0.026),0.445 (0.108),0.379 (0.169),0.39 (0.099),0.433 (0.084),0.455 (0.125),0.52 (0.112)
XGB,0.145 (0.026),0.169 (0.074),0.262 (0.111),0.145 (0.026),0.145 (0.025),0.264 (0.114),0.216 (0.103)


In [33]:
print(latex_df_subsets.round(2).transpose().to_latex())


\begin{tabular}{lll}
\toprule
{} &             LR &            XGB \\
\midrule
Baseline             &  0.145 (0.026) &  0.145 (0.026) \\
demo\_only            &  0.445 (0.108) &  0.169 (0.074) \\
performance\_only     &  0.379 (0.169) &  0.262 (0.111) \\
activity\_only        &   0.39 (0.099) &  0.145 (0.026) \\
activity\_and\_demo    &  0.433 (0.084) &  0.145 (0.025) \\
performance\_and\_demo &  0.455 (0.125) &  0.264 (0.114) \\
all                  &   0.52 (0.112) &  0.216 (0.103) \\
\bottomrule
\end{tabular}



In [34]:
X_train_val_lr

Unnamed: 0,ge,arr,ls,as,ss,fq_c0,fq_c1,fq_c2,fq_c3,twp_c0,...,tnp_c2,tnp_c3,fmi_c0,fmi_c1,fmi_c2,fmi_c3,atd_c0,atd_c1,atd_c2,atd_c3
102,1.0,1.0,1.0,0.0,0.0,0.291281,-0.145776,0.121496,0.057080,-0.775396,...,-0.003979,0.038619,0.005802,-0.036608,-0.000893,-0.400812,0.001315,-0.119226,0.002267,1.229653
117,1.0,0.0,1.0,1.0,0.0,-0.103676,-0.257282,-0.033518,-0.000324,-0.592144,...,0.000353,0.065040,0.010272,-0.016779,-0.000914,-0.394815,0.000751,-0.158153,-0.001676,-0.296369
103,1.0,1.0,1.0,0.0,0.0,0.502198,1.098808,-0.127057,0.005813,0.356272,...,-0.007294,0.038619,0.039193,-0.005874,-0.001082,-0.456516,0.000835,-0.114217,0.002101,-0.323181
44,1.0,0.0,1.0,1.0,1.0,-0.101849,-0.325024,0.013729,0.009153,-0.135188,...,-0.001678,0.683465,0.003701,-0.001564,-0.327751,0.007415,0.004118,0.080787,0.020920,0.004837
114,0.0,1.0,0.0,0.0,0.0,-0.087583,1.240984,0.013729,-0.005026,0.773648,...,-0.001678,0.683465,-0.002158,-0.002342,-0.327751,-0.006180,-0.002679,-0.015779,-0.004249,-0.002889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,1.0,1.0,0.0,1.0,1.0,0.368145,1.334509,-0.159979,-0.003733,-0.692770,...,0.000166,0.042433,-0.015118,-0.019396,-0.000779,1.202608,0.002223,-0.118191,0.002729,-0.244963
111,1.0,1.0,1.0,0.0,0.0,0.368145,-0.057942,0.210878,0.013544,-0.692770,...,-0.004281,0.042433,-0.015118,0.083121,-0.000779,-0.386979,-0.001871,0.242136,-0.003034,-0.244963
42,1.0,1.0,1.0,0.0,1.0,0.368145,-0.057942,0.210878,0.013544,0.229023,...,0.000166,0.042433,0.007322,-0.036489,-0.000779,1.202608,0.000819,0.242136,0.002729,1.254629
97,0.0,1.0,1.0,1.0,0.0,0.368145,-0.057942,-0.159979,-0.001074,-0.692770,...,0.002659,0.042433,-0.015118,0.083121,-0.000779,1.202608,-0.001871,-0.130966,0.002729,-0.298006


### Feature Importance

In [35]:
# top_10_importances = {}

# for model in list(results_subsets_feature_importances[fold].keys()):
#     imp_df = pd.concat([results_subsets_feature_importances[fold][model] for fold in range(folds)],axis=1)

#     if "LR" in model:
#         direction = imp_df.apply(lambda x: np.sign(x))
#         imp_df = imp_df.abs()

#     imp_df = imp_df/imp_df.sum(axis=0)

#     mean_imp_df = imp_df.mean(axis=1)
#     std_imp_df = imp_df.std(axis=1)

#     mean_imp_df = mean_imp_df.sort_values(ascending=False)
#     std_imp_df = std_imp_df.loc[mean_imp_df.index]
#     final_imps = mean_imp_df[:10]
#     final_imps["Rest"] = sum(mean_imp_df[10:])
#     top_5_importances[model] = np.array([final_imps.index.values, final_imps.values])

In [36]:
demo_importances = {}
demo_importances_stds = {}

for model in list(results_subsets_feature_importances[fold].keys()):
    if "demo" in model or "all" in model:
        imp_df_all = pd.concat([results_subsets_feature_importances[fold][model] for fold in range(folds)],axis=1)
        
        if "LR" in model:
            direction = imp_df_all.apply(lambda x: np.sign(x))
            imp_df_all = imp_df_all.abs()
        if imp_df_all.sum().sum()!=0:
            imp_df = imp_df_all/imp_df_all.sum(axis=0)
        imp_df = imp_df.fillna(1/imp_df.shape[0])
#         imp_df = imp_df.loc[demographic_cols]

#         mean_imp_df = imp_df.mean(axis=1)
#         std_imp_df = imp_df.std(axis=1)

#         mean_imp_df = mean_imp_df.sort_values(ascending=False)
#         std_imp_df = std_imp_df.loc[mean_imp_df.index]
#         final_imps = mean_imp_df#[:10]
#         final_imps["Rest"] = sum(mean_imp_df[10:])
#         final_imps["Total"] = sum(mean_imp_df)
        demo_importances[model] = np.round(np.mean(imp_df.loc[[i for i in imp_df.index if any([j in i for j in demographic_cols])]].sum(axis=0)),2)#final_imps.values
        demo_importances_stds[model] = np.round(np.std(imp_df.loc[[i for i in imp_df.index if any([j in i for j in demographic_cols])]].sum(axis=0)),2)#final_imps.values


In [37]:
lr_demo_imp = pd.Series({i: demo_importances[i] for i in demo_importances if "LR" in i and "tuned" in i})
xgb_demo_imp = pd.Series({i: demo_importances[i] for i in demo_importances if "XGB" in i and "tuned" in i})
lr_demo_imp.index = [i[3:-6] for i in lr_demo_imp.index]    
xgb_demo_imp.index = [i[4:-6] for i in xgb_demo_imp.index]    

lr_demo_imp_stds = pd.Series({i: demo_importances_stds[i] for i in demo_importances_stds if "LR" in i and "tuned" in i})
xgb_demo_imp_stds = pd.Series({i: demo_importances_stds[i] for i in demo_importances_stds if "XGB" in i and "tuned" in i})
lr_demo_imp_stds.index = [i[3:-6] for i in lr_demo_imp_stds.index]    
xgb_demo_imp_stds.index = [i[4:-6] for i in xgb_demo_imp_stds.index]    


latex_df_imp = pd.DataFrame([lr_demo_imp.astype(str) + " (" + lr_demo_imp_stds.astype(str) + ")",
                             xgb_demo_imp.astype(str) + " (" + xgb_demo_imp_stds.astype(str) + ")"])
latex_df_imp.index = ["LR", "XGB"]
latex_df_imp

Unnamed: 0,demo_only,activity_and_demo,performance_and_demo,all
LR,1.0 (0.0),0.81 (0.04),0.78 (0.04),0.63 (0.03)
XGB,1.0 (0.0),0.71 (0.12),0.23 (0.32),0.26 (0.32)


In [38]:
print(latex_df_subsets.to_latex())

\begin{tabular}{llllllll}
\toprule
{} &       Baseline &      demo\_only & performance\_only &  activity\_only & activity\_and\_demo & performance\_and\_demo &            all \\
\midrule
LR  &  0.145 (0.026) &  0.445 (0.108) &    0.379 (0.169) &   0.39 (0.099) &     0.433 (0.084) &        0.455 (0.125) &   0.52 (0.112) \\
XGB &  0.145 (0.026) &  0.169 (0.074) &    0.262 (0.111) &  0.145 (0.026) &     0.145 (0.025) &        0.264 (0.114) &  0.216 (0.103) \\
\bottomrule
\end{tabular}

