In [1]:
import sys
sys.path.append("..")
sys.path.append("../lmmnn/")

from utils.training_functions import *
from utils.evaluation import *
from data import dataset_preprocessing

import pickle

RS = 42



#### Download and save data from Pargent et al. by running "data/download_pargent2022_datasets.py before running this notebook

In [2]:
dataset_name = "academic_performance"
mode="train_val_test"
RS=42
hct=10
test_ratio=0.2
val_ratio=0.1
folds=None
target = "continuous"

data_path = f"{mode}_RS{RS}_hct{hct}"
if mode == "cv":
    data_path += f"_{folds}folds"
elif mode == "train_test":
    data_path += f"_split{1-test_ratio*100}-{test_ratio*100}"
elif mode == "train_val_test":
    data_path += f"_split{round(100-(test_ratio+val_ratio)*100)}-{round(test_ratio*100)}-{round(val_ratio*100)}"


# If no data_dict for the configuration exists, run preprocessing, else load data_dict
if not os.path.exists(f"../data/prepared/{dataset_name}/"+data_path+"/data_dict.pickle"):
    dataset_preprocessing.process_dataset(dataset_name, target, mode, RS, hct, test_ratio, val_ratio, folds)
with open(f"../data/prepared/{dataset_name}/{data_path}/data_dict.pickle", 'rb') as handle:
        data_dict = pickle.load(handle)


In [3]:
z_ohe_encoded_train = data_dict["z_ohe_encoded_train"] 
z_ohe_encoded_val = data_dict["z_ohe_encoded_val"] 
z_ohe_encoded_test = data_dict["z_ohe_encoded_test"] 

z_target_encoded_train = data_dict["z_target_encoded_train"] 
z_target_encoded_val = data_dict["z_target_encoded_val"] 
z_target_encoded_test = data_dict["z_target_encoded_test"] 

X_train = data_dict["X_train"]
Z_train = data_dict["Z_train"]
y_train = data_dict["y_train"]

X_val = data_dict["X_val"]
Z_val = data_dict["Z_val"]
y_val = data_dict["y_val"]

X_test = data_dict["X_test"]
Z_test = data_dict["Z_test"]
y_test = data_dict["y_test"]

z_cols = data_dict["z_cols"]



### Correlations to target

In [4]:
pd.DataFrame(pd.concat([X_train,pd.Series(y_train,index=X_train.index,name="target")],axis=1).corr()["target"])

Unnamed: 0,target
GENDER,0.044683
INTERNET,0.188251
TV,0.135723
COMPUTER,0.125973
WASHING_MCH,0.10752
MIC_OVEN,0.171792
CAR,0.170735
DVD,0.099642
FRESH,0.038084
PHONE,0.03413


In [5]:
config = {
    # General Parameters
    "general_parameters": {
        "target": "continuous",
        "metrics": ["mse", "r2"],
        "model_name": "tabtransformer",
                             },

    # NN Parameters
    "nn_parameters": {"epochs": 200,
                      "batch_size":  100,
                      "patience": 5,
                      "stop_metric": "val_mse"},
    
    "embed_parameters":
        {"embed_dims_method": "sqrt"}
     

        }

In [6]:
train_data = [X_train, Z_train, y_train, z_ohe_encoded_train, z_target_encoded_train]
val_data = [X_val, Z_val, y_val, z_ohe_encoded_val, z_target_encoded_val]
test_data = [X_test, Z_test, y_test, z_ohe_encoded_test, z_target_encoded_test]


In [7]:
X_train_perfonly = X_train[["MAT_S11", "CR_S11", "CC_S11", "BIO_S11", "ENG_S11"]]
X_val_perfonly = X_val[["MAT_S11", "CR_S11", "CC_S11", "BIO_S11", "ENG_S11"]]
X_test_perfonly = X_test[["MAT_S11", "CR_S11", "CC_S11", "BIO_S11", "ENG_S11"]]

z_target_encoded_train_perfonly = z_target_encoded_train[["ACADEMIC_PROGRAM", "UNIVERSITY"]]
z_target_encoded_val_perfonly = z_target_encoded_val[["ACADEMIC_PROGRAM", "UNIVERSITY"]]
z_target_encoded_test_perfonly = z_target_encoded_test[["ACADEMIC_PROGRAM", "UNIVERSITY"]]

z_ohe_encoded_train_perfonly = z_ohe_encoded_train[[i for i in z_ohe_encoded_train.columns if ("ACADEMIC_PROGRAM" in i or "UNIVERSITY" in i)]]
z_ohe_encoded_val_perfonly = z_ohe_encoded_val[[i for i in z_ohe_encoded_train.columns if ("ACADEMIC_PROGRAM" in i or "UNIVERSITY" in i)]]
z_ohe_encoded_test_perfonly = z_ohe_encoded_test[[i for i in z_ohe_encoded_train.columns if ("ACADEMIC_PROGRAM" in i or "UNIVERSITY" in i)]]

Z_train_perfonly = pd.DataFrame(Z_train,columns=z_cols)[["ACADEMIC_PROGRAM", "UNIVERSITY"]].values
Z_val_perfonly = pd.DataFrame(Z_val,columns=z_cols)[["ACADEMIC_PROGRAM", "UNIVERSITY"]].values
Z_test_perfonly = pd.DataFrame(Z_test,columns=z_cols)[["ACADEMIC_PROGRAM", "UNIVERSITY"]].values

train_data_perfonly = [X_train_perfonly, Z_train_perfonly, y_train, z_ohe_encoded_train_perfonly, z_target_encoded_train_perfonly]
val_data_perfonly = [X_val_perfonly, Z_val_perfonly, y_val, z_ohe_encoded_val_perfonly, z_target_encoded_val_perfonly]
test_data_perfonly = [X_test_perfonly, Z_test_perfonly, y_test, z_ohe_encoded_test_perfonly, z_target_encoded_test_perfonly]


In [8]:
X_train_noperf = X_train.drop(["MAT_S11", "CR_S11", "CC_S11", "BIO_S11", "ENG_S11"],axis=1)
X_val_noperf = X_val.drop(["MAT_S11", "CR_S11", "CC_S11", "BIO_S11", "ENG_S11"],axis=1)
X_test_noperf = X_test.drop(["MAT_S11", "CR_S11", "CC_S11", "BIO_S11", "ENG_S11"],axis=1)

train_data_noperf = [X_train_noperf, Z_train, y_train, z_ohe_encoded_train, z_target_encoded_train]
val_data_noperf = [X_val_noperf, Z_val, y_val, z_ohe_encoded_val, z_target_encoded_val]
test_data_noperf = [X_test_noperf, Z_test, y_test, z_ohe_encoded_test, z_target_encoded_test]


## Train the models to evaluate

In [9]:
save_path = f"../results/{dataset_name}/{data_path}"
if not os.path.exists(f"{save_path}/results_dict.pickle"):
    results_dict = train_models(train_data, val_data, test_data, config, RS=RS, save_results=False,save_path=save_path)
else:
    with open(f"{save_path}/results_dict.pickle", 'rb') as handle:
        results_dict = pickle.load(handle)

results = results_dict["results"]
model_lmmnn_info = results_dict["model_lmmnn_info"]    

Load base model
Train XGBoost without z features
Train XGBoost with target encoding
Train Linear Model without z features
Train Linear Model with target encoding
Train NN without Z features
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Train NN with target encoding
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Train NN with OHE
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200


Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Train NN with Embeddings
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Train LMMNN
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200


In [11]:
save_path = f"../results/{dataset_name}/{data_path}"
if not os.path.exists(f"{save_path}/results_dict_perfonly.pickle"):
    results_dict_perfonly = train_models(train_data_perfonly, val_data_perfonly, test_data_perfonly, config, RS=RS, save_results=False,save_path=save_path)
else:
    with open(f"{save_path}/results_dict_perfonly.pickle", 'rb') as handle:
        results_dict_perfonly = pickle.load(handle)

results_perfonly = results_dict_perfonly["results"]
model_lmmnn_info_perfonly = results_dict_perfonly["model_lmmnn_info"]    

Load base model
Train XGBoost without z features
Train XGBoost with target encoding
Train Linear Model without z features
Train Linear Model with target encoding
Train NN without Z features
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Train NN with target encoding
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200


Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Train NN with OHE
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Train NN with Embeddings
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200


Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Train LMMNN
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200


In [12]:
save_path = f"../results/{dataset_name}/{data_path}"
if not os.path.exists(f"{save_path}/results_dict_noperf.pickle"):
    results_dict_noperf = train_models(train_data_noperf, val_data_noperf, test_data_noperf, config, RS=RS, save_results=False,save_path=save_path)
else:
    with open(f"{save_path}/results_dict.pickle_noperf", 'rb') as handle:
        results_dict_noperf = pickle.load(handle)

results_noperf = results_dict_noperf["results"]
model_lmmnn_info_noperf = results_dict_noperf["model_lmmnn_info"]    

Load base model
Train XGBoost without z features
Train XGBoost with target encoding
Train Linear Model without z features
Train Linear Model with target encoding
Train NN without Z features
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Train NN with target encoding
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Train NN with OHE
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200


Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Train NN with Embeddings
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Train LMMNN
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200


## Evaluation

### Performance on whole data

In [13]:
models_use = ["LR", "XGB", "LR_te", "XGB_te", "NN", "NN_te", "NN_ohe", "NN_embed", "LMMNN"]

In [14]:
results_df = pd.DataFrame(results).transpose().loc[models_use].sort_values("MSE Test",ascending=False).round(4)


In [15]:
results_df[["MSE Train", "R2 Train", "MSE Test", "R2 Test"]].style.highlight_min(subset=["MSE Train", "MSE Test"], color = 'lightgreen', axis = 0).highlight_max(subset=["R2 Train", "R2 Test"], color = 'lightgreen', axis = 0)

Unnamed: 0,MSE Train,R2 Train,MSE Test,R2 Test
NN_embed,0.2209,0.7791,0.449,0.5384
XGB_te,0.1144,0.8856,0.4476,0.5398
NN_ohe,0.2373,0.7627,0.4334,0.5544
XGB,0.1635,0.8365,0.4305,0.5574
NN_te,0.3278,0.6722,0.4022,0.5865
LR_te,0.3419,0.6581,0.4007,0.588
LMMNN,0.2684,0.7316,0.3919,0.5971
NN,0.3673,0.6327,0.388,0.6011
LR,0.3823,0.6177,0.3866,0.6025


#### Learned variance parameters

In [16]:
pd.DataFrame(model_lmmnn_info["sigmas"][1],index=data_dict["z_cols"]).transpose().round(3)

Unnamed: 0,EDU_FATHER,EDU_MOTHER,OCC_FATHER,OCC_MOTHER,PEOPLE_HOUSE,SCHOOL_NAME,UNIVERSITY,ACADEMIC_PROGRAM
0,0.082,0.003,0.0,0.007,0.002,0.074,0.023,0.004


### Modeling results only with previous performance

In [18]:
results_df_perfonly = pd.DataFrame(results_perfonly).transpose().loc[models_use].sort_values("MSE Test",ascending=False).round(4)


In [19]:
results_df_perfonly[["MSE Train", "R2 Train", "MSE Test", "R2 Test"]].style.highlight_min(subset=["MSE Train", "MSE Test"], color = 'lightgreen', axis = 0).highlight_max(subset=["R2 Train", "R2 Test"], color = 'lightgreen', axis = 0)

Unnamed: 0,MSE Train,R2 Train,MSE Test,R2 Test
NN,0.9922,0.0078,0.9727,-0.0
LMMNN,0.5905,0.4095,0.6003,0.3828
NN_te,0.4693,0.5307,0.4769,0.5097
XGB,0.2121,0.7879,0.4346,0.5532
XGB_te,0.1647,0.8353,0.4184,0.5699
LR,0.3903,0.6097,0.3911,0.5979
NN_ohe,0.3343,0.6657,0.3903,0.5987
NN_embed,0.3598,0.6402,0.3811,0.6082
LR_te,0.3707,0.6293,0.3766,0.6128


In [23]:
pd.DataFrame(model_lmmnn_info_perfonly["sigmas"][1],index=z_cols[-2:]).transpose().round(3)

Unnamed: 0,UNIVERSITY,ACADEMIC_PROGRAM
0,0.096,0.284


### Modeling results without previous performance

In [24]:
results_df_noperf = pd.DataFrame(results_noperf).transpose().loc[models_use].sort_values("MSE Test",ascending=False).round(4)


In [25]:
results_df_noperf[["MSE Train", "R2 Train", "MSE Test", "R2 Test"]].style.highlight_min(subset=["MSE Train", "MSE Test"], color = 'lightgreen', axis = 0).highlight_max(subset=["R2 Train", "R2 Test"], color = 'lightgreen', axis = 0)

Unnamed: 0,MSE Train,R2 Train,MSE Test,R2 Test
XGB,0.535,0.465,0.8589,0.117
LR,0.8047,0.1953,0.797,0.1807
NN,0.7715,0.2285,0.7936,0.1842
NN_embed,0.3821,0.6179,0.6997,0.2806
NN_ohe,0.3823,0.6177,0.6777,0.3033
XGB_te,0.2016,0.7984,0.6681,0.3132
NN_te,0.4959,0.5041,0.64,0.342
LR_te,0.5083,0.4917,0.6348,0.3474
LMMNN,0.4495,0.5505,0.6069,0.376


In [26]:
pd.DataFrame(model_lmmnn_info_noperf["sigmas"][1],index=data_dict["z_cols"]).transpose().round(3)

Unnamed: 0,EDU_FATHER,EDU_MOTHER,OCC_FATHER,OCC_MOTHER,PEOPLE_HOUSE,SCHOOL_NAME,UNIVERSITY,ACADEMIC_PROGRAM
0,0.053,0.007,0.0,0.006,0.0,0.105,0.21,0.029
