In [1]:
import sys
sys.path.append("..")

from utils.evaluation import *
from utils.utils import *

from data import dataset_preprocessing

from utils.evaluation import get_metrics
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso

from scipy import stats

import pandas as pd
import numpy as np
import os

import pickle

In [2]:
dataset_name = "xAPI-Edu-Data"
mode="cv"
RS=1
hct=10
test_ratio=0.2
val_ratio=0.1
folds=5
target = "categorical"
experiment_name = "5CV_paper_final"

### Describe raw data

In [3]:
df = pd.read_csv(f"../data/raw/{dataset_name}/{dataset_name}.csv",sep=",")


In [4]:
y_col = "Class"
demographic_cols = ["gender", "NationalITy", "PlaceofBirth", "Relation"]
perf_cols = []
activity_cols = ["raisedhands", "VisITedResources", "AnnouncementsView", "Discussion", 'StudentAbsenceDays']
other_cols = ['GradeID', 'ParentAnsweringSurvey', 'ParentschoolSatisfaction', 'SectionID', 'Semester', 'StageID', "Topic"]
set(df.columns)-set([y_col]+demographic_cols+perf_cols+activity_cols+other_cols)

set()

In [5]:
desc_df_dict = {"No. of samples": df.shape[0],
           "No. of features": df.shape[1],
           "Performance features": len(perf_cols),
           "Demographic features": len(demographic_cols),
           "Activity features": len(activity_cols),
           "Other features": len(other_cols),
           "Categorical features": len(df.columns[list(np.logical_and(df.nunique() > 2, df.dtypes == "object"))]),     
           "Total cardinality": df[df.columns[list(np.logical_and(df.nunique() > 2, df.dtypes == "object"))]].nunique().sum(),     
           "% NA": df.isna().sum().sum()/sum(df.shape),
           "Target $\textbf{y} \in$": f"[1..{df[y_col].nunique()}]",
#            "High cardinality levels":  list(df.loc[:,list(df.columns[list(np.logical_and(df.nunique() >= 10, df.dtypes == "object"))])].nunique().sort_values().values),
          
}
desc_df = pd.DataFrame([desc_df_dict],index=["cortez"])
desc_df

Unnamed: 0,No. of samples,No. of features,Performance features,Demographic features,Activity features,Other features,Categorical features,Total cardinality,% NA,Target $\textbf{y} \in$
cortez,480,17,0,4,5,7,7,59,0.0,[1..3]


In [6]:
print(desc_df.transpose().to_latex())

\begin{tabular}{ll}
\toprule
{} &  cortez \\
\midrule
No. of samples          &     480 \\
No. of features         &      17 \\
Performance features    &       0 \\
Demographic features    &       4 \\
Activity features       &       5 \\
Other features          &       7 \\
Categorical features    &       7 \\
Total cardinality       &      59 \\
\% NA                    &     0.0 \\
Target \$\textbackslash textbf\{y\} \textbackslash in\$ &  [1..3] \\
\bottomrule
\end{tabular}



### Preprocessing and preparation

In [7]:
data_path = f"{mode}_RS{RS}_hct{hct}"
if mode == "cv":
    data_path += f"_{folds}folds"
elif mode == "train_test":
    data_path += f"_split{1-test_ratio*100}-{test_ratio*100}"
elif mode == "train_val_test":
    data_path += f"_split{round(100-(test_ratio+val_ratio)*100)}-{round(test_ratio*100)}-{round(val_ratio*100)}"


# If no data_dict for the configuration exists, run preprocessing, else load data_dict
if not os.path.exists(f"../data/prepared/{dataset_name}/"+data_path+"/data_dict.pickle"):
    dataset_preprocessing.process_dataset(dataset_name, target, mode, RS, hct, test_ratio, val_ratio, folds)
with open(f"../data/prepared/{dataset_name}/{data_path}/data_dict.pickle", 'rb') as handle:
        data_dict = pickle.load(handle)


## Evaluation of categorical data treatment methods

In [8]:
conditions = ["ignore", "ohe", "target", "ordinal", "catboost", "glmm"]

In [11]:
early_stopping_rounds = 10
max_evals = 5

In [12]:
if not os.path.exists(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle"):

    results_encodings = {}
    results_encodings_feature_importances = {}

    for fold in range(folds):
        results_encodings[fold] = {}
        results_encodings_feature_importances[fold] = {}
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = data_dict[f"y_test_{fold}"]
        y_train_val = np.concatenate([y_train,y_val])

        u,c = np.unique(y_train_val,return_counts=True)
        nb_classes = len(u)
        baseline = np.argmax(c)

        y_train_val_pred_base = np.ones(y_train_val.shape[0])*baseline
        y_test_pred_base = np.ones(y_test.shape[0])*baseline

        results_encodings[fold]["Baseline"] = {}
        eval_res_train = get_metrics(get_one_hot(y_train_val, nb_classes), get_one_hot(y_train_val_pred_base.astype(int), nb_classes), target=target)
        for metric in eval_res_train.keys():
            results_encodings[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(get_one_hot(y_test, nb_classes), get_one_hot(y_test_pred_base.astype(int), nb_classes), target=target)
        for metric in eval_res_test.keys():
            results_encodings[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]


        for condition in conditions:
            print(f"Preparing results for fold {fold}, condition={condition}")
            # Retrieve data
            z_cols = data_dict["z_cols"]

            X_train = data_dict[f"X_train_{fold}"]
            y_train = data_dict[f"y_train_{fold}"]

            X_val = data_dict[f"X_val_{fold}"]
            y_val = data_dict[f"y_val_{fold}"]

            X_test = data_dict[f"X_test_{fold}"]
            y_test = data_dict[f"y_test_{fold}"]

    ## ALL BUT PERFORMANCE:
            # Define data subset for evaluation
    #         X_train = X_train[[i for i in X_train.columns if all([j not in i for j in perf_cols])]]
    #         X_val = X_val[[i for i in X_val.columns if all([j not in i for j in perf_cols])]]
    #         X_test = X_test[[i for i in X_test.columns if all([j not in i for j in perf_cols])]]

            # Define condition data subset
    #         if condition != "ignore":
    #             z_encoded_train = data_dict[f"z_{condition}_encoded_train_{fold}"] 
    #             z_encoded_val = data_dict[f"z_{condition}_encoded_val_{fold}"] 
    #             z_encoded_test = data_dict[f"z_{condition}_encoded_test_{fold}"] 

    #             X_train = pd.concat([X_train,z_encoded_train],axis=1)
    #             X_val = pd.concat([X_val,z_encoded_val],axis=1)
    #             X_test = pd.concat([X_test,z_encoded_test],axis=1)

    # ALL BUT PERFORMANCE & ACTIVITY:
    #         Define data subset for evaluation
            X_train = X_train[[i for i in X_train.columns if all([j not in i for j in perf_cols+activity_cols])]]
            X_val = X_val[[i for i in X_val.columns if all([j not in i for j in perf_cols+activity_cols])]]
            X_test = X_test[[i for i in X_test.columns if all([j not in i for j in perf_cols+activity_cols])]]

    #         Define condition data subset
            if condition != "ignore":
                z_encoded_train = data_dict[f"z_{condition}_encoded_train_{fold}"] 
                z_encoded_val = data_dict[f"z_{condition}_encoded_val_{fold}"] 
                z_encoded_test = data_dict[f"z_{condition}_encoded_test_{fold}"] 

                X_train = pd.concat([X_train,z_encoded_train],axis=1)
                X_val = pd.concat([X_val,z_encoded_val],axis=1)
                X_test = pd.concat([X_test,z_encoded_test],axis=1)


    ## ONLY CATEGORICAL: --> Produces trash as almost never better than baseline
    #         if condition != "ignore":        
    #             X_train = data_dict[f"z_{condition}_encoded_train_{fold}"] 
    #             X_val = data_dict[f"z_{condition}_encoded_val_{fold}"] 
    #             X_test = data_dict[f"z_{condition}_encoded_test_{fold}"] 

    #         else:
    #             continue

            X_train_val = pd.concat([X_train,X_val])
            y_train_val = np.concatenate([y_train,y_val])

            # Train base models
            res, feats = evaluate_logreg(X_train_val, y_train_val, X_test, y_test, target=target,tune=False, seed=RS)
            results_encodings[fold]["LR_"+condition] = res
            results_encodings_feature_importances[fold]["LR_"+condition] = feats

            res, feats = evaluate_xgb(X_train_val, y_train_val, X_test, y_test, target, tune=False, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_encodings[fold]["XGB_"+condition] = res
            results_encodings_feature_importances[fold]["XGB_"+condition] = feats

            # Train tuned models
            res, feats = evaluate_logreg(X_train_val, y_train_val, X_test, y_test, target=target, max_evals=max_evals, tune=True, seed=RS)
            results_encodings[fold]["LR_"+condition+"_tuned"] = res
            results_encodings_feature_importances[fold]["LR_"+condition+"_tuned"] = feats

            res, feats = evaluate_xgb(X_train_val, y_train_val, X_test, y_test, target, tune=True, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_encodings[fold]["XGB_"+condition+"_tuned"] = res
            results_encodings_feature_importances[fold]["XGB_"+condition+"_tuned"] = feats
    
    if not os.path.exists(f"../results/{dataset_name}/{experiment_name}"):
        os.makedirs(f"../results/{dataset_name}/{experiment_name}")
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle", 'wb') as handle:
        pickle.dump(results_encodings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings_feature_importances.pickle", 'wb') as handle:
        pickle.dump(results_encodings_feature_importances, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings.pickle", 'rb') as handle:
        results_encodings = pickle.load(handle)
    with open(f"../results/{dataset_name}/{experiment_name}/results_encodings_feature_importances.pickle", 'rb') as handle:
        results_encodings_feature_importances = pickle.load(handle)
        
        
results_encodings_df = pd.DataFrame(results_encodings[0]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_encodings_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Preparing results for fold 0, condition=ignore
SCORE: 0.8798341003235166                                                                                              
SCORE: 0.8783411219628767                                                                                              
SCORE: 0.8795992681771894                                                                                              
SCORE: 0.878736098145853                                                                                               
SCORE: 0.8800708660503507                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00, 22.32trial/s, best loss: 0.8783411219628767]
The best hyperparameters are :  

{'C': 0.42084757045798626}
Default performance on Test: 1.1234728318618208
SCORE: 0.9780210454034544                                                                                              
SCOR

100%|███████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.00trial/s, best loss: 0.856945866338817]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.3739805635515384, 'n_estimators': 454.0, 'seed': 0, 'max_depth': 4.0, 'min_child_weight': 4.0, 'colsample_bytree': 0.7890896057568629, 'subsample': 0.6516332526271862}
Test Performance after third tuning round: 1.3748161063200282
SCORE: 1.048764601433804                                                                                               
SCORE: 1.0350845483973157                                                                                              
SCORE: 1.0116941415168967                                                                                              
SCORE: 1.0324158838131863                                                                                              
SCORE: 1.0145463993586088                                                                              

SCORE: 0.9108753814782297                                                                                              
SCORE: 0.9151152259593136                                                                                              
SCORE: 0.8692311553803732                                                                                              
SCORE: 0.8780864245524848                                                                                              
SCORE: 0.9215494196400966                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.51trial/s, best loss: 0.8692311553803732]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.4525452424385113, 'n_estimators': 397.0, 'seed': 0, 'max_depth': 7.0, 'min_child_weight': 5.0}
Test Performance after second tuning round: 1.483721996057133
SCORE: 0.8841513268782064                               

SCORE: 0.8626258437485405                                                                                              
SCORE: 0.8505568062540055                                                                                              
SCORE: 0.8590646995580512                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00, 22.01trial/s, best loss: 0.8505568062540055]
The best hyperparameters are :  

{'C': 0.8032610235731569}
Default performance on Test: 1.6843542987805684
SCORE: 0.9308800616842079                                                                                              
SCORE: 0.9517452421072633                                                                                              
SCORE: 0.9913711298730569                                                                                              
SCORE: 0.9580707447428136                           

SCORE: 0.9807192879600727                                                                                              
SCORE: 1.0152803819409257                                                                                              
SCORE: 1.0301589735719108                                                                                              
SCORE: 0.9654007492590331                                                                                              
SCORE: 1.0295770785822747                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00,  6.99trial/s, best loss: 0.9654007492590331]
The best hyperparameters are :  

{'learning_rate': 0.38198733495557513, 'n_estimators': 435.0, 'seed': 0, 'max_depth': 2.0, 'min_child_weight': 8.0, 'colsample_bytree': 0.8902879397346706, 'subsample': 0.6138428844173514, 'gamma': 0.7990362995007777, 'reg_alpha': 5.0, 'reg_lambd

SCORE: 1.1237539280396183                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.29trial/s, best loss: 0.8783546811795672]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.4467224431775137, 'n_estimators': 294.0, 'seed': 0, 'max_depth': 5.0, 'min_child_weight': 8.0}
Test Performance after second tuning round: 1.1944304671784396
SCORE: 0.8916148344658161                                                                                              
SCORE: 0.885453886495417                                                                                               
SCORE: 0.9081556001681141                                                                                              
SCORE: 0.8972128934595679                                                                                              
SCORE: 0.8824161571464678                              

Default performance on Test: 1.489482734850217
SCORE: 0.9420023555707662                                                                                              
SCORE: 1.020354367655274                                                                                               
SCORE: 0.9524906970595651                                                                                              
SCORE: 1.0054881554802382                                                                                              
SCORE: 1.0059818139148002                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.57trial/s, best loss: 0.9420023555707662]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.07118446676738699, 'n_estimators': 336.0}
Test Performance after first tuning round: 1.4462703991661099
SCORE: 0.9585874786285178                                    

SCORE: 0.9299465599002789                                                                                              
SCORE: 0.9277801076353966                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.09trial/s, best loss: 0.9260463357250053]
The best hyperparameters are :  

{'learning_rate': 0.22050324793450163, 'n_estimators': 423.0, 'seed': 0, 'max_depth': 13.0, 'min_child_weight': 5.0, 'colsample_bytree': 0.560081285128315, 'subsample': 0.783211822897376, 'gamma': 0.9631493794002354, 'reg_alpha': 4.0, 'reg_lambda': 2.5684841370233342}
Test Performance after last tuning round: 0.8282233893070948
Preparing results for fold 2, condition=ignore
SCORE: 0.8834430563892981                                                                                              
SCORE: 0.8852208895411966                                                                                    

SCORE: 0.845144668126909                                                                                               
SCORE: 0.8442993448077225                                                                                              
SCORE: 0.868506156587791                                                                                               
SCORE: 0.8307472861207778                                                                                              
SCORE: 0.8880169765071851                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.05trial/s, best loss: 0.8307472861207778]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.3304679921219226, 'n_estimators': 143.0, 'seed': 0, 'max_depth': 6.0, 'min_child_weight': 3.0, 'colsample_bytree': 0.8786954953423174, 'subsample': 0.6690114903066742}
Test Performance after third tuning round: 1.

SCORE: 0.9317663100029557                                                                                              
SCORE: 0.9104088588030675                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.59trial/s, best loss: 0.9104088588030675]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.22820772086317553, 'n_estimators': 498.0}
Test Performance after first tuning round: 1.3857443105479852
SCORE: 0.8452834650122252                                                                                              
SCORE: 0.8889706313052343                                                                                              
SCORE: 0.8330490957290575                                                                                              
SCORE: 1.0562719467518504                                                                                   

Test Performance after last tuning round: 0.9279253863779443
Preparing results for fold 2, condition=glmm
SCORE: 0.8788830791437713                                                                                              
SCORE: 0.87361025507759                                                                                                
SCORE: 0.8735308942926319                                                                                              
SCORE: 1.0393906530858925                                                                                              
SCORE: 0.8769524809446322                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00, 14.04trial/s, best loss: 0.8735308942926319]
The best hyperparameters are :  

{'C': 0.25631029737223443}
Default performance on Test: 1.226677160295867
SCORE: 0.9778374117315074                                         

100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.93trial/s, best loss: 0.8875113889550942]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.4047510019495855, 'n_estimators': 193.0, 'seed': 0, 'max_depth': 2.0, 'min_child_weight': 5.0, 'colsample_bytree': 0.6223100489799032, 'subsample': 0.9501267912191095}
Test Performance after third tuning round: 0.9213199220724286
SCORE: 0.9474615243486308                                                                                              
SCORE: 1.0157444328665626                                                                                              
SCORE: 0.9755496836480002                                                                                              
SCORE: 1.0373673291574514                                                                                              
SCORE: 0.9422894083007624                                                                              

SCORE: 0.9451509195732687                                                                                              
SCORE: 0.923477767564548                                                                                               
SCORE: 0.9174471407364158                                                                                              
SCORE: 0.9353364878275228                                                                                              
SCORE: 0.8774877864430015                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.30trial/s, best loss: 0.8774877864430015]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.1327111432106453, 'n_estimators': 81.0, 'seed': 0, 'max_depth': 5.0, 'min_child_weight': 1.0}
Test Performance after second tuning round: 0.8352922051726703
SCORE: 0.8819222280112557                               

SCORE: 0.9007722438815021                                                                                              
SCORE: 0.8842201438321607                                                                                              
SCORE: 0.8836491858692895                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00,  8.44trial/s, best loss: 0.8828185571543591]
The best hyperparameters are :  

{'C': 0.5161294942596787}
Default performance on Test: 1.1825214653187477
SCORE: 1.0344073764434816                                                                                              
SCORE: 0.989164846149953                                                                                               
SCORE: 1.0086961091702353                                                                                              
SCORE: 1.0344227824332914                           

Test Performance after third tuning round: 1.5934893259539418
SCORE: 0.8855459781277737                                                                                              
SCORE: 1.0217347708518703                                                                                              
SCORE: 0.9772207560128903                                                                                              
SCORE: 0.9084034882719181                                                                                              
SCORE: 0.8961366281927351                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.31trial/s, best loss: 0.8855459781277737]
The best hyperparameters are :  

{'learning_rate': 0.4792466794319517, 'n_estimators': 398.0, 'seed': 0, 'max_depth': 3.0, 'min_child_weight': 6.0, 'colsample_bytree': 0.9947939789105118, 'subsample': 0.77396664006137

SCORE: 0.8842844502206562                                                                                              
SCORE: 0.8898891206647763                                                                                              
100%|███████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.12trial/s, best loss: 0.841709867101258]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.40531543101622397, 'n_estimators': 283.0, 'seed': 0, 'max_depth': 11.0, 'min_child_weight': 3.0}
Test Performance after second tuning round: 1.5761075569670862
SCORE: 0.8478840573313235                                                                                              
SCORE: 0.8694533225590882                                                                                              
SCORE: 0.9030119937665638                                                                                              
SCORE: 0.8854665665083801                            

100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.53trial/s, best loss: 0.8974981595728637]
The best hyperparameters are :  

{'C': 0.19692593286626767}
Default performance on Test: 1.3575898321473925
SCORE: 0.8658366679907944                                                                                              
SCORE: 0.9896118322232761                                                                                              
SCORE: 0.8614647465207333                                                                                              
SCORE: 0.8678570214725589                                                                                              
SCORE: 0.9886576539211095                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.11trial/s, best loss: 0.8614647465207333]
The best hyperparameters after step 1  are :  

{'l

SCORE: 1.057026018208245                                                                                               
SCORE: 0.9692096728110686                                                                                              
SCORE: 0.9136147749848439                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.04trial/s, best loss: 0.9136147749848439]
The best hyperparameters are :  

{'learning_rate': 0.4536950188121696, 'n_estimators': 160.0, 'seed': 0, 'max_depth': 1.0, 'min_child_weight': 6.0, 'colsample_bytree': 0.6946057834340434, 'subsample': 0.9881193177772198, 'gamma': 0.37990687218840985, 'reg_alpha': 3.0, 'reg_lambda': 2.0860549523124208}
Test Performance after last tuning round: 0.8616773027730001
Preparing results for fold 4, condition=glmm
SCORE: 0.8671466176073185                                                                                     

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
XGB_ohe,0.9245,0.9235,0.9926,0.5729,0.5863,0.7815
LR_ignore,0.5781,0.5788,0.7747,0.5625,0.5665,0.7174
LR_ignore_tuned,0.5807,0.5795,0.7738,0.5521,0.5606,0.7174
XGB_glmm,0.9766,0.9754,0.9994,0.5417,0.5443,0.7297
XGB_target,0.9297,0.9295,0.9937,0.5312,0.5389,0.7483
XGB_ordinal,0.9297,0.9293,0.9938,0.5208,0.5362,0.7568
LR_target_tuned,0.6042,0.5987,0.8068,0.5208,0.5264,0.7139
LR_target,0.6016,0.5967,0.8068,0.5208,0.5264,0.7137
LR_catboost_tuned,0.5885,0.5845,0.7899,0.5312,0.5215,0.7098
LR_catboost,0.5885,0.5844,0.7922,0.5312,0.5215,0.7052


In [13]:
results_encodings_df = pd.DataFrame(results_encodings[1]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_encodings_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
XGB_ohe,0.9167,0.9175,0.9919,0.6146,0.6127,0.7914
XGB_ohe_tuned,0.6016,0.5907,0.7745,0.6042,0.5967,0.7973
XGB_glmm,0.9766,0.9765,0.9994,0.5938,0.5806,0.7509
LR_ohe_tuned,0.6328,0.6339,0.8306,0.5833,0.5789,0.7699
XGB_glmm_tuned,0.7318,0.7324,0.8837,0.5833,0.5744,0.8055
XGB_ignore,0.7005,0.6995,0.8864,0.5833,0.5704,0.7204
XGB_target,0.9193,0.9199,0.9923,0.5729,0.5661,0.7628
LR_catboost,0.5781,0.5806,0.7762,0.5625,0.5645,0.7638
LR_ohe,0.6589,0.6623,0.8358,0.5625,0.5601,0.7661
LR_ignore_tuned,0.599,0.6026,0.7652,0.5521,0.559,0.7657


### Effectiveness of Parameter Tuning


In [14]:
models = results_encodings[0].keys()
metric = "F1 Test"

encodings_folds_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())
encodings_mean_df = encodings_folds_df.mean(axis=0)
encodings_std_df = encodings_folds_df.std(axis=0)

methods = sorted(list(encodings_mean_df.index))[1:]
not_tuned = ["Baseline"]+methods[::2]
tuned = ["Baseline"]+methods[1::2]

res_df_tune_comp_mean = pd.DataFrame([encodings_mean_df.loc[not_tuned].values,encodings_mean_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()
res_df_tune_comp_std = pd.DataFrame([encodings_std_df.loc[not_tuned].values,encodings_std_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()

res_df_tune_comp_mean.round(2).style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,Untuned,Tuned
Baseline,0.2,0.2
LR_catboost,0.55,0.54
LR_glmm,0.53,0.53
LR_ignore,0.54,0.55
LR_ohe,0.56,0.56
LR_ordinal,0.53,0.52
LR_target,0.53,0.55
XGB_catboost,0.51,0.51
XGB_glmm,0.55,0.55
XGB_ignore,0.53,0.54


In [15]:
models = results_encodings[0].keys()
metric = "F1 Test"

encodings_folds_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())
encodings_mean_df = encodings_folds_df.mean(axis=0)
encodings_std_df = encodings_folds_df.std(axis=0)

methods = sorted(list(encodings_mean_df.index))[1:]
not_tuned = ["Baseline"]+methods[::2]
tuned = ["Baseline"]+methods[1::2]

res_df_tune_comp_mean = pd.DataFrame([encodings_mean_df.loc[not_tuned].values,encodings_mean_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()
res_df_tune_comp_std = pd.DataFrame([encodings_std_df.loc[not_tuned].values,encodings_std_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()

res_df_tune_comp_mean.round(2).style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,Untuned,Tuned
Baseline,0.2,0.2
LR_catboost,0.55,0.54
LR_glmm,0.53,0.53
LR_ignore,0.54,0.55
LR_ohe,0.56,0.56
LR_ordinal,0.53,0.52
LR_target,0.53,0.55
XGB_catboost,0.51,0.51
XGB_glmm,0.55,0.55
XGB_ignore,0.53,0.54


In [16]:
latex_df = res_df_tune_comp_mean.round(2).astype(str) + " (" +  res_df_tune_comp_std.round(3).astype(str) + ")"
latex_df

Unnamed: 0,Untuned,Tuned
Baseline,0.2 (0.016),0.2 (0.016)
LR_catboost,0.55 (0.017),0.54 (0.014)
LR_glmm,0.53 (0.03),0.53 (0.024)
LR_ignore,0.54 (0.025),0.55 (0.028)
LR_ohe,0.56 (0.04),0.56 (0.032)
LR_ordinal,0.53 (0.027),0.52 (0.018)
LR_target,0.53 (0.008),0.55 (0.034)
XGB_catboost,0.51 (0.084),0.51 (0.083)
XGB_glmm,0.55 (0.028),0.55 (0.068)
XGB_ignore,0.53 (0.032),0.54 (0.028)


In [17]:
print(latex_df.to_latex())

\begin{tabular}{lll}
\toprule
{} &       Untuned &         Tuned \\
\midrule
Baseline     &   0.2 (0.016) &   0.2 (0.016) \\
LR\_catboost  &  0.55 (0.017) &  0.54 (0.014) \\
LR\_glmm      &   0.53 (0.03) &  0.53 (0.024) \\
LR\_ignore    &  0.54 (0.025) &  0.55 (0.028) \\
LR\_ohe       &   0.56 (0.04) &  0.56 (0.032) \\
LR\_ordinal   &  0.53 (0.027) &  0.52 (0.018) \\
LR\_target    &  0.53 (0.008) &  0.55 (0.034) \\
XGB\_catboost &  0.51 (0.084) &  0.51 (0.083) \\
XGB\_glmm     &  0.55 (0.028) &  0.55 (0.068) \\
XGB\_ignore   &  0.53 (0.032) &  0.54 (0.028) \\
XGB\_ohe      &   0.6 (0.026) &  0.56 (0.079) \\
XGB\_ordinal  &  0.58 (0.052) &  0.57 (0.047) \\
XGB\_target   &   0.6 (0.061) &   0.57 (0.07) \\
\bottomrule
\end{tabular}



### Performance Comparison

In [18]:
# For LR
models = ["Baseline"]+[i for i in results_encodings[0].keys() if ("tuned" in i and "LR" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_lr = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_lr.style.apply(negative_bold)


Unnamed: 0,Baseline,LR_ignore_tuned,LR_ohe_tuned,LR_target_tuned,LR_ordinal_tuned,LR_catboost_tuned,LR_glmm_tuned
0,0.203 (0.016),0.551 (0.028),0.558 (0.032),0.549 (0.034),0.522 (0.018),0.544 (0.014),0.534 (0.024)


In [19]:
# For LR
models = ["Baseline"]+[i for i in results_encodings[0].keys() if ("tuned" in i and "XGB" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_encodings[fold_num]).loc[metric,models] for fold_num in results_encodings.keys()],index=results_encodings.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_xgb = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_xgb.style.apply(negative_bold)


Unnamed: 0,Baseline,XGB_ignore_tuned,XGB_ohe_tuned,XGB_target_tuned,XGB_ordinal_tuned,XGB_catboost_tuned,XGB_glmm_tuned
0,0.203 (0.016),0.541 (0.028),0.558 (0.079),0.565 (0.07),0.573 (0.047),0.507 (0.083),0.551 (0.068)


In [20]:
res_df_lr.columns = [i.split("_")[1] if i != "Baseline" else "Baseline" for i in res_df_lr.columns]    
res_df_xgb.columns = [i.split("_")[1] if i != "Baseline" else "Baseline" for i in res_df_xgb.columns]    

latex_df_encodings = pd.concat([res_df_lr,res_df_xgb],axis=0)
latex_df_encodings.index = ["LR", "XGB"]
latex_df_encodings

Unnamed: 0,Baseline,ignore,ohe,target,ordinal,catboost,glmm
LR,0.203 (0.016),0.551 (0.028),0.558 (0.032),0.549 (0.034),0.522 (0.018),0.544 (0.014),0.534 (0.024)
XGB,0.203 (0.016),0.541 (0.028),0.558 (0.079),0.565 (0.07),0.573 (0.047),0.507 (0.083),0.551 (0.068)


In [21]:
print(latex_df_encodings.round(2).to_latex())


\begin{tabular}{llllllll}
\toprule
{} &       Baseline &         ignore &            ohe &         target &        ordinal &       catboost &           glmm \\
\midrule
LR  &  0.203 (0.016) &  0.551 (0.028) &  0.558 (0.032) &  0.549 (0.034) &  0.522 (0.018) &  0.544 (0.014) &  0.534 (0.024) \\
XGB &  0.203 (0.016) &  0.541 (0.028) &  0.558 (0.079) &   0.565 (0.07) &  0.573 (0.047) &  0.507 (0.083) &  0.551 (0.068) \\
\bottomrule
\end{tabular}



## Data Subset Comparisons

In [22]:
subsets = {
    "demo_only": demographic_cols,
#            "performance_only": perf_cols,
           "activity_only": activity_cols,
           "activity_and_demo": activity_cols+demographic_cols,
#            "performance_and_demo": perf_cols+demographic_cols,
           "all": list(df.columns)
          }

In [23]:
if not os.path.exists(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle"):

    results_subsets = {}
    results_subsets_feature_importances = {}

    for fold in range(folds):
        results_subsets[fold] = {}
        results_subsets_feature_importances[fold] = {}
        # Create baseline
        y_train = data_dict[f"y_train_{fold}"]
        y_val = data_dict[f"y_val_{fold}"]
        y_test = data_dict[f"y_test_{fold}"]
        y_train_val = np.concatenate([y_train,y_val])

        u,c = np.unique(y_train_val,return_counts=True)
        nb_classes = len(u)
        baseline = np.argmax(c)

        y_train_val_pred_base = np.ones(y_train_val.shape[0])*baseline
        y_test_pred_base = np.ones(y_test.shape[0])*baseline

        results_subsets[fold]["Baseline"] = {}
        eval_res_train = get_metrics(get_one_hot(y_train_val, nb_classes), get_one_hot(y_train_val_pred_base.astype(int), nb_classes), target=target)
        for metric in eval_res_train.keys():
            results_subsets[fold]["Baseline"][metric + " Train"] = eval_res_train[metric]
        eval_res_test = get_metrics(get_one_hot(y_test, nb_classes), get_one_hot(y_test_pred_base.astype(int), nb_classes), target=target)
        for metric in eval_res_test.keys():
            results_subsets[fold]["Baseline"][metric + " Test"] = eval_res_test[metric]


        for subset_key in subsets:
            print(f"Preparing results for fold {fold}, subset={subset_key}")
            # Retrieve data
            z_cols = data_dict["z_cols"]

            X_train = data_dict[f"X_train_{fold}"]
            y_train = data_dict[f"y_train_{fold}"]

            X_val = data_dict[f"X_val_{fold}"]
            y_val = data_dict[f"y_val_{fold}"]

            X_test = data_dict[f"X_test_{fold}"]
            y_test = data_dict[f"y_test_{fold}"]
        
            y_train_val = np.concatenate([y_train,y_val])

            # Define data subset for LR
            z_glmm_encoded_train = data_dict[f"z_glmm_encoded_train_{fold}"] 
            z_glmm_encoded_val = data_dict[f"z_glmm_encoded_val_{fold}"] 
            z_glmm_encoded_test = data_dict[f"z_glmm_encoded_test_{fold}"] 
            X_train_lr = pd.concat([X_train,z_glmm_encoded_train],axis=1)
            X_val_lr = pd.concat([X_val,z_glmm_encoded_val],axis=1)
            X_test_lr = pd.concat([X_test,z_glmm_encoded_test],axis=1)      
            X_train_val_lr = pd.concat([X_train_lr,X_val_lr])

            # Define data subset for XGB
            z_ordinal_encoded_train = data_dict[f"z_ordinal_encoded_train_{fold}"] 
            z_ordinal_encoded_val = data_dict[f"z_ordinal_encoded_val_{fold}"] 
            z_ordinal_encoded_test = data_dict[f"z_ordinal_encoded_test_{fold}"] 
            X_train_xgb = pd.concat([X_train,z_ordinal_encoded_train],axis=1)
            X_val_xgb = pd.concat([X_val,z_ordinal_encoded_val],axis=1)
            X_test_xgb = pd.concat([X_test,z_ordinal_encoded_test],axis=1)
            X_train_val_xgb = pd.concat([X_train_xgb,X_val_xgb])


            # Define data subset for evaluation
            X_train_val_lr = X_train_val_lr[[i for i in X_train_val_lr.columns if any([j in i for j in subsets[subset_key]])]]
            X_test_lr = X_test_lr[[i for i in X_test_lr.columns if any([j in i for j in subsets[subset_key]])]]
            X_train_val_xgb = X_train_val_xgb[[i for i in X_train_val_xgb.columns if any([j in i for j in subsets[subset_key]])]]
            X_test_xgb = X_test_xgb[[i for i in X_test_xgb.columns if any([j in i for j in subsets[subset_key]])]]


            # Train base models
            res, feats = evaluate_logreg(X_train_val_lr, y_train_val, X_test_lr, y_test, target=target,tune=False, seed=RS)
            results_subsets[fold]["LR_"+subset_key] = res
            results_subsets_feature_importances[fold]["LR_"+subset_key] = feats

            res, feats = evaluate_xgb(X_train_val_xgb, y_train_val, X_test_xgb, y_test, target, tune=False, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_subsets[fold]["XGB_"+subset_key] = res
            results_subsets_feature_importances[fold]["XGB_"+subset_key] = feats

            # Train tuned models
            res, feats = evaluate_logreg(X_train_val_lr, y_train_val, X_test_lr, y_test, target=target, max_evals=max_evals, tune=True, seed=RS)
            results_subsets[fold]["LR_"+subset_key+"_tuned"] = res
            results_subsets_feature_importances[fold]["LR_"+subset_key+"_tuned"] = feats

            res, feats = evaluate_xgb(X_train_val_xgb, y_train_val, X_test_xgb, y_test, target, tune=True, max_evals=max_evals, early_stopping_rounds=early_stopping_rounds, seed=RS)
            results_subsets[fold]["XGB_"+subset_key+"_tuned"] = res
            results_subsets_feature_importances[fold]["XGB_"+subset_key+"_tuned"] = feats
    
    if not os.path.exists(f"../results/{dataset_name}/{experiment_name}"):
        os.makedirs(f"../results/{dataset_name}/{experiment_name}")
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle", 'wb') as handle:
        pickle.dump(results_subsets, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets_feature_importances.pickle", 'wb') as handle:
        pickle.dump(results_subsets_feature_importances, handle, protocol=pickle.HIGHEST_PROTOCOL)

else:
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets.pickle", 'rb') as handle:
        results_subsets = pickle.load(handle)
    with open(f"../results/{dataset_name}/{experiment_name}/results_subsets_feature_importances.pickle", 'rb') as handle:
        results_subsets_feature_importances = pickle.load(handle)
        
        
results_subsets_df = pd.DataFrame(results_subsets[0]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_subsets_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Preparing results for fold 0, subset=demo_only
SCORE: 0.9284024518814847                                                                                              
SCORE: 0.9283580909083871                                                                                              
SCORE: 0.9237805279262592                                                                                              
SCORE: 0.9609442186439884                                                                                              
SCORE: 0.9257057048826931                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00, 27.03trial/s, best loss: 0.9237805279262592]
The best hyperparameters are :  

{'C': 0.9503755133113749}
Default performance on Test: 1.2392985448747174
SCORE: 0.9349665292961238                                                                                              
SCORE

100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.67trial/s, best loss: 0.6297444395996141]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.32831503112545807, 'n_estimators': 372.0, 'seed': 0, 'max_depth': 11.0, 'min_child_weight': 8.0, 'colsample_bytree': 0.7572645676745111, 'subsample': 0.6508300373527265}
Test Performance after third tuning round: 0.8635712573206189
SCORE: 0.70256665405861                                                                                                
SCORE: 0.7394172514247621                                                                                              
SCORE: 0.7629937779977762                                                                                              
SCORE: 0.6853960374502244                                                                                              
SCORE: 0.9486332796805794                                                                            

SCORE: 0.7326937782595276                                                                                              
SCORE: 0.6736150155072602                                                                                              
SCORE: 0.5999509887363973                                                                                              
SCORE: 0.5793830138752565                                                                                              
SCORE: 0.6426714063096729                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.76trial/s, best loss: 0.5793830138752565]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.31247310757641, 'n_estimators': 491.0, 'seed': 0, 'max_depth': 2.0, 'min_child_weight': 1.0}
Test Performance after second tuning round: 0.8262056629538046
SCORE: 0.5689616054824483                                

SCORE: 0.6385612734018467                                                                                              
SCORE: 0.6370686972993729                                                                                              
SCORE: 0.6482135888560379                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00, 14.83trial/s, best loss: 0.6370686972993729]
The best hyperparameters are :  

{'C': 0.6152112699148091}
Default performance on Test: 0.7469593248050793
SCORE: 0.718413727621394                                                                                               
SCORE: 0.6988667996851861                                                                                              
SCORE: 0.6842484856752628                                                                                              
SCORE: 0.6935299349870879                           

Test Performance after third tuning round: 0.5630515016521973
SCORE: 0.767248851125461                                                                                               
SCORE: 0.7572684669442167                                                                                              
SCORE: 0.8332324200650799                                                                                              
SCORE: 0.7330801407685773                                                                                              
SCORE: 0.7751948993945572                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.72trial/s, best loss: 0.7330801407685773]
The best hyperparameters are :  

{'learning_rate': 0.27431991854605897, 'n_estimators': 144.0, 'seed': 0, 'max_depth': 3.0, 'min_child_weight': 7.0, 'colsample_bytree': 0.7572973008430173, 'subsample': 0.9473210363734

SCORE: 0.9578826803147832                                                                                              
SCORE: 0.9672696564374048                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.08trial/s, best loss: 0.9578826803147832]
The best hyperparameters after step 2 are :  

{'learning_rate': 0.4357091598496667, 'n_estimators': 484.0, 'seed': 0, 'max_depth': 3.0, 'min_child_weight': 3.0}
Test Performance after second tuning round: 0.8308879223193593
SCORE: 0.976535189801781                                                                                               
SCORE: 0.9485329211967197                                                                                              
SCORE: 0.948240831765079                                                                                               
SCORE: 0.9450811510458286                              

100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00, 13.37trial/s, best loss: 0.5949557606055059]
The best hyperparameters are :  

{'C': 0.729922178488326}
Default performance on Test: 0.7383795236810039
SCORE: 0.68608914658954                                                                                                
SCORE: 0.683530890397126                                                                                               
SCORE: 0.6870703578636065                                                                                              
SCORE: 0.7274299356600273                                                                                              
SCORE: 0.6649230225896173                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.96trial/s, best loss: 0.6649230225896173]
The best hyperparameters after step 1  are :  

{'lea

SCORE: 0.7775991689368109                                                                                              
SCORE: 0.6418313621151266                                                                                              
SCORE: 0.8113167951232146                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.43trial/s, best loss: 0.6418313621151266]
The best hyperparameters are :  

{'learning_rate': 0.20044997198492498, 'n_estimators': 230.0, 'seed': 0, 'max_depth': 8.0, 'min_child_weight': 10.0, 'colsample_bytree': 0.8918572896091755, 'subsample': 0.7832926872518551, 'gamma': 1.2477729866084581, 'reg_alpha': 4.0, 'reg_lambda': 1.3561151146772312}
Test Performance after last tuning round: 0.6093868890526372
Preparing results for fold 3, subset=demo_only
SCORE: 0.9348298603914715                                                                                  

SCORE: 0.6887497879527299                                                                                              
SCORE: 0.6828615705371565                                                                                              
SCORE: 0.6463485788852614                                                                                              
SCORE: 0.6730916037322585                                                                                              
SCORE: 0.717170927691312                                                                                               
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.31trial/s, best loss: 0.6463485788852614]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.4867790023268621, 'n_estimators': 139.0, 'seed': 0, 'max_depth': 7.0, 'min_child_weight': 7.0, 'colsample_bytree': 0.7371179841858468, 'subsample': 0.810769344777706}
Test Performance after third tuning round: 0.7

SCORE: 0.6356155825260832                                                                                              
SCORE: 0.634204468111224                                                                                               
100%|██████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.16trial/s, best loss: 0.6291744003870596]
The best hyperparameters after step 1  are :  

{'learning_rate': 0.3545273647894992, 'n_estimators': 229.0}
Test Performance after first tuning round: 0.7029378484547038
SCORE: 0.6040365340203263                                                                                              
SCORE: 0.5777867071337375                                                                                              
SCORE: 0.6058710261250619                                                                                              
SCORE: 0.6010896614462828                                                                                    

Test Performance after last tuning round: 0.9227358543323549
Preparing results for fold 4, subset=activity_only
SCORE: 0.6187288972833442                                                                                              
SCORE: 0.5917798073490517                                                                                              
SCORE: 0.615468303603947                                                                                               
SCORE: 0.5931141781017938                                                                                              
SCORE: 0.5949930361664536                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:00<00:00, 17.24trial/s, best loss: 0.5917798073490517]
The best hyperparameters are :  

{'C': 0.8385179995092874}
Default performance on Test: 1.0076843199266508
SCORE: 0.7244776855554361                                   

SCORE: 0.6000133933132923                                                                                              
100%|██████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.69trial/s, best loss: 0.5826218616398589]
The best hyperparameters after step 3 are :  

{'learning_rate': 0.45841991593392495, 'n_estimators': 137.0, 'seed': 0, 'max_depth': 2.0, 'min_child_weight': 5.0, 'colsample_bytree': 0.6698051640038161, 'subsample': 0.5019866587213133}
Test Performance after third tuning round: 0.7209656484855153
SCORE: 0.7472486718198019                                                                                              
SCORE: 0.8032155081232475                                                                                              
SCORE: 0.7829031398645717                                                                                              
SCORE: 0.8019236277941619                                                                             

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
XGB_all,1.0,1.0,1.0,0.7812,0.7892,0.9085
LR_all_tuned,0.7943,0.7961,0.9285,0.7292,0.7372,0.8847
LR_all,0.7943,0.7961,0.9286,0.7292,0.7372,0.8846
XGB_all_tuned,0.8307,0.8305,0.9383,0.7188,0.73,0.8958
XGB_activity_and_demo,1.0,1.0,1.0,0.7083,0.7298,0.8754
LR_activity_and_demo_tuned,0.7682,0.769,0.9056,0.7083,0.7246,0.8898
LR_activity_only,0.7266,0.7292,0.883,0.6979,0.7189,0.8969
LR_activity_only_tuned,0.7292,0.7316,0.8827,0.6979,0.7189,0.896
LR_activity_and_demo,0.7682,0.7696,0.9078,0.6979,0.7157,0.8893
XGB_activity_and_demo_tuned,0.8802,0.8804,0.965,0.6771,0.6935,0.8766


In [24]:
results_subsets_df = pd.DataFrame(results_subsets[1]).transpose().sort_values("F1 Test",ascending=False).round(4)
results_subsets_df[["Accuracy Train", "F1 Train", "AUROC Train", "Accuracy Test", "F1 Test", "AUROC Test"]].style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,Accuracy Train,F1 Train,AUROC Train,Accuracy Test,F1 Test,AUROC Test
LR_all,0.7812,0.7902,0.9182,0.8229,0.8243,0.9313
XGB_activity_and_demo,1.0,1.0,1.0,0.8125,0.8136,0.9078
XGB_all_tuned,0.7839,0.7909,0.9143,0.8021,0.8061,0.9276
LR_all_tuned,0.7812,0.7906,0.9159,0.8021,0.8033,0.9338
XGB_all,1.0,1.0,1.0,0.8021,0.8023,0.9223
XGB_activity_and_demo_tuned,0.7943,0.8017,0.9187,0.7917,0.7956,0.9281
LR_activity_only_tuned,0.7188,0.7326,0.8795,0.7917,0.7948,0.91
XGB_activity_only,1.0,1.0,1.0,0.7917,0.7891,0.9078
XGB_activity_only_tuned,0.7604,0.7727,0.913,0.7812,0.786,0.9136
LR_activity_only,0.724,0.7374,0.8796,0.7812,0.7844,0.9103


### Effectiveness of Parameter Tuning

In [25]:
models = results_subsets[0].keys()
metric = "F1 Test"

subsets_folds_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())
subsets_mean_df = subsets_folds_df.mean(axis=0)
subsets_std_df = subsets_folds_df.std(axis=0)

methods = sorted(list(subsets_mean_df.index))[1:]
not_tuned = ["Baseline"]+methods[::2]
tuned = ["Baseline"]+methods[1::2]

res_df_tune_comp_mean = pd.DataFrame([subsets_mean_df.loc[not_tuned].values,subsets_mean_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()
res_df_tune_comp_std = pd.DataFrame([subsets_std_df.loc[not_tuned].values,subsets_std_df.loc[tuned]],index=["Untuned","Tuned"],columns=not_tuned).transpose()

res_df_tune_comp_mean.round(2).style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,Untuned,Tuned
Baseline,0.2,0.2
LR_activity_and_demo,0.72,0.72
LR_activity_only,0.72,0.73
LR_all,0.75,0.75
LR_demo_only,0.5,0.51
XGB_activity_and_demo,0.76,0.72
XGB_activity_only,0.72,0.7
XGB_all,0.78,0.75
XGB_demo_only,0.54,0.49


In [26]:
latex_df = res_df_tune_comp_mean.round(2).astype(str) + " (" +  res_df_tune_comp_std.round(3).astype(str) + ")"
latex_df

Unnamed: 0,Untuned,Tuned
Baseline,0.2 (0.016),0.2 (0.016)
LR_activity_and_demo,0.72 (0.022),0.72 (0.022)
LR_activity_only,0.72 (0.037),0.73 (0.04)
LR_all,0.75 (0.047),0.75 (0.045)
LR_demo_only,0.5 (0.047),0.51 (0.041)
XGB_activity_and_demo,0.76 (0.036),0.72 (0.047)
XGB_activity_only,0.72 (0.048),0.7 (0.057)
XGB_all,0.78 (0.015),0.75 (0.035)
XGB_demo_only,0.54 (0.048),0.49 (0.052)


In [27]:
res_df_tune_comp_diff = res_df_tune_comp_mean[["Tuned"]]-res_df_tune_comp_mean[["Untuned"]].values
res_df_tune_comp_diff.round(2)

Unnamed: 0,Tuned
Baseline,0.0
LR_activity_and_demo,0.0
LR_activity_only,0.0
LR_all,-0.01
LR_demo_only,0.01
XGB_activity_and_demo,-0.05
XGB_activity_only,-0.01
XGB_all,-0.03
XGB_demo_only,-0.05


In [28]:
res_df_tune_comp_diff_lr = res_df_tune_comp_diff.loc[[i for i in res_df_tune_comp_diff.index if (i == "Baseline" or "LR" in i)]]
res_df_tune_comp_diff_xgb = res_df_tune_comp_diff.loc[[i for i in res_df_tune_comp_diff.index if (i == "Baseline" or "XGB" in i)]]

res_df_tune_comp_diff_lr.index = [i[3:] if i!="Baseline" else "Baseline" for i in res_df_tune_comp_diff_lr.index]
res_df_tune_comp_diff_xgb.index = [i[4:] if i!="Baseline" else "Baseline" for i in res_df_tune_comp_diff_xgb.index]
res_df_tune_comp_diff_xgb

Unnamed: 0,Tuned
Baseline,0.0
activity_and_demo,-0.047662
activity_only,-0.013039
all,-0.033293
demo_only,-0.045561


In [29]:
latex_df_diff = pd.concat([res_df_tune_comp_diff_lr,res_df_tune_comp_diff_xgb],axis=1)
latex_df_diff.columns = ["LR", "XGB"]
latex_df_diff

Unnamed: 0,LR,XGB
Baseline,0.0,0.0
activity_and_demo,0.003645,-0.047662
activity_only,0.004012,-0.013039
all,-0.005774,-0.033293
demo_only,0.006388,-0.045561


In [30]:
print(latex_df_diff.round(2).to_latex())

\begin{tabular}{lrr}
\toprule
{} &    LR &   XGB \\
\midrule
Baseline          &  0.00 &  0.00 \\
activity\_and\_demo &  0.00 & -0.05 \\
activity\_only     &  0.00 & -0.01 \\
all               & -0.01 & -0.03 \\
demo\_only         &  0.01 & -0.05 \\
\bottomrule
\end{tabular}



### Performance Results

In [31]:
# For LR
models = ["Baseline"]+[i for i in results_subsets[0].keys() if ("tuned" in i and "LR" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_lr = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_lr.style.apply(negative_bold)


Unnamed: 0,Baseline,LR_demo_only_tuned,LR_activity_only_tuned,LR_activity_and_demo_tuned,LR_all_tuned
0,0.203 (0.016),0.508 (0.041),0.726 (0.04),0.721 (0.022),0.746 (0.045)


In [32]:
# For XGB
models = ["Baseline"]+[i for i in results_subsets[0].keys() if ("tuned" in i and "XGB" in i)]
metric = "F1 Test"

#####
dataset_res_dict = {}
best_models = {}
t_test_results = {}

use_df = pd.DataFrame([pd.DataFrame(results_subsets[fold_num]).loc[metric,models] for fold_num in results_subsets.keys()],index=results_subsets.keys())

df_mean = pd.DataFrame((use_df).mean(axis=0).round(3).astype(str) + " (" + use_df.std(axis=0).round(3).astype(str) + ")").transpose()
model_dict = {i: df_mean[i].values[0] for i in df_mean.columns}

best_model = use_df.columns[use_df.mean(axis=0).argmax()]

t_test_res = np.array([stats.ttest_rel(use_df[best_model].values, use_df[model].values)[1] for model in models]).round(3)
t_test_res[np.isnan(t_test_res)] = 1.
    
res_df_xgb = pd.DataFrame([model_dict])

def negative_bold(val):
    i = np.where(val.name==np.array(models))[0][0]
    return ["font-weight: bold"  if t_test_res[i]>=0.05 else "" for dataset_name in val.keys()]
    # Case without transpose:
#     return ["font-weight: bold"  if t_test_results[val.name][i]>=0.05 else "" for i in range(len(val))]

res_df_xgb.style.apply(negative_bold)


Unnamed: 0,Baseline,XGB_demo_only_tuned,XGB_activity_only_tuned,XGB_activity_and_demo_tuned,XGB_all_tuned
0,0.203 (0.016),0.491 (0.052),0.703 (0.057),0.716 (0.047),0.747 (0.035)


In [33]:
res_df_lr.columns = [i[3:-6] if i != "Baseline" else "Baseline" for i in res_df_lr.columns]    
res_df_xgb.columns = [i[4:-6] if i != "Baseline" else "Baseline" for i in res_df_xgb.columns]    

latex_df_subsets = pd.concat([res_df_lr,res_df_xgb],axis=0)
latex_df_subsets.index = ["LR", "XGB"]
latex_df_subsets

Unnamed: 0,Baseline,demo_only,activity_only,activity_and_demo,all
LR,0.203 (0.016),0.508 (0.041),0.726 (0.04),0.721 (0.022),0.746 (0.045)
XGB,0.203 (0.016),0.491 (0.052),0.703 (0.057),0.716 (0.047),0.747 (0.035)


In [34]:
print(latex_df_subsets.round(2).transpose().to_latex())


\begin{tabular}{lll}
\toprule
{} &             LR &            XGB \\
\midrule
Baseline          &  0.203 (0.016) &  0.203 (0.016) \\
demo\_only         &  0.508 (0.041) &  0.491 (0.052) \\
activity\_only     &   0.726 (0.04) &  0.703 (0.057) \\
activity\_and\_demo &  0.721 (0.022) &  0.716 (0.047) \\
all               &  0.746 (0.045) &  0.747 (0.035) \\
\bottomrule
\end{tabular}



### Feature Importance

In [35]:
# top_10_importances = {}

# for model in list(results_subsets_feature_importances[fold].keys()):
#     imp_df = pd.concat([results_subsets_feature_importances[fold][model] for fold in range(folds)],axis=1)

#     if "LR" in model:
#         direction = imp_df.apply(lambda x: np.sign(x))
#         imp_df = imp_df.abs()

#     imp_df = imp_df/imp_df.sum(axis=0)

#     mean_imp_df = imp_df.mean(axis=1)
#     std_imp_df = imp_df.std(axis=1)

#     mean_imp_df = mean_imp_df.sort_values(ascending=False)
#     std_imp_df = std_imp_df.loc[mean_imp_df.index]
#     final_imps = mean_imp_df[:10]
#     final_imps["Rest"] = sum(mean_imp_df[10:])
#     top_5_importances[model] = np.array([final_imps.index.values, final_imps.values])

In [37]:
demo_importances = {}
demo_importances_stds = {}

for model in list(results_subsets_feature_importances[fold].keys()):
    if "demo" in model or "all" in model:
        imp_df_all = pd.concat([results_subsets_feature_importances[fold][model] for fold in range(folds)],axis=1)
        
        if "LR" in model:
            direction = imp_df_all.apply(lambda x: np.sign(x))
            imp_df_all = imp_df_all.abs()
        if imp_df_all.sum().sum()!=0:
            imp_df = imp_df_all/imp_df_all.sum(axis=0)
        imp_df = imp_df.fillna(1/imp_df.shape[0])
#         imp_df = imp_df.loc[demographic_cols]

#         mean_imp_df = imp_df.mean(axis=1)
#         std_imp_df = imp_df.std(axis=1)

#         mean_imp_df = mean_imp_df.sort_values(ascending=False)
#         std_imp_df = std_imp_df.loc[mean_imp_df.index]
#         final_imps = mean_imp_df#[:10]
#         final_imps["Rest"] = sum(mean_imp_df[10:])
#         final_imps["Total"] = sum(mean_imp_df)
        demo_importances[model] = np.round(np.mean(imp_df.loc[[i for i in imp_df.index if any([j in i for j in demographic_cols])]].sum(axis=0)),2)#final_imps.values
        demo_importances_stds[model] = np.round(np.std(imp_df.loc[[i for i in imp_df.index if any([j in i for j in demographic_cols])]].sum(axis=0)),2)#final_imps.values


In [38]:
lr_demo_imp = pd.Series({i: demo_importances[i] for i in demo_importances if "LR" in i and "tuned" in i})
xgb_demo_imp = pd.Series({i: demo_importances[i] for i in demo_importances if "XGB" in i and "tuned" in i})
lr_demo_imp.index = [i[3:-6] for i in lr_demo_imp.index]    
xgb_demo_imp.index = [i[4:-6] for i in xgb_demo_imp.index]    

lr_demo_imp_stds = pd.Series({i: demo_importances_stds[i] for i in demo_importances_stds if "LR" in i and "tuned" in i})
xgb_demo_imp_stds = pd.Series({i: demo_importances_stds[i] for i in demo_importances_stds if "XGB" in i and "tuned" in i})
lr_demo_imp_stds.index = [i[3:-6] for i in lr_demo_imp_stds.index]    
xgb_demo_imp_stds.index = [i[4:-6] for i in xgb_demo_imp_stds.index]    


latex_df_imp = pd.DataFrame([lr_demo_imp.astype(str) + " (" + lr_demo_imp_stds.astype(str) + ")",
                             xgb_demo_imp.astype(str) + " (" + xgb_demo_imp_stds.astype(str) + ")"])
latex_df_imp.index = ["LR", "XGB"]
latex_df_imp

Unnamed: 0,demo_only,activity_and_demo,all
LR,1.0 (0.0),0.47 (0.2),0.29 (0.1)
XGB,1.0 (0.0),0.2 (0.06),0.18 (0.05)


In [39]:
print(latex_df_subsets.to_latex())

\begin{tabular}{llllll}
\toprule
{} &       Baseline &      demo\_only &  activity\_only & activity\_and\_demo &            all \\
\midrule
LR  &  0.203 (0.016) &  0.508 (0.041) &   0.726 (0.04) &     0.721 (0.022) &  0.746 (0.045) \\
XGB &  0.203 (0.016) &  0.491 (0.052) &  0.703 (0.057) &     0.716 (0.047) &  0.747 (0.035) \\
\bottomrule
\end{tabular}

