# Projet de statistiques appliquées - ENSAE 2A

### Veillon Juliette, Andru Kilian, Lacour Xavier and Massin Keryann

### We will proceed to a succession of Two-Stage Least Squares regressions so as to try to predict our Global Health Index.

Downloading of libraries

In [3]:
# Importation des librairies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # To standardize the data
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [62]:
!pip install cvxpy
import cvxpy as cp



In [63]:
import file_04_HMLasso as hml

In [64]:
import time

import warnings
warnings.filterwarnings('ignore')
warnings.warn('DelftStack')
warnings.warn('Do not show this message')

# Data downloading, creation of the training, validation and test dataframes

In [65]:
colonnes_liste = pd.read_csv("data_03_columns_types.csv", index_col=0)

In [66]:
# Data downloading
donnees = pd.read_csv("data_03.csv")  # Takes between 10 and 15 minutes

A column needs its format changed. The cell takes some time to be executed (indicators every 1000 row has been added.

In [67]:
temp = np.where(donnees['genetic_Section_A_or_E'] == 'E', 1, np.where(donnees['genetic_Section_A_or_E'] == 'A', 0, np.nan))
donnees["genetic_Section_A_or_E"] = temp

In [68]:
Indice_New = pd.read_csv("data_tSNE_GHI.csv")
Indice_New = Indice_New[Indice_New["HHIDPN"]!=32570030]    # This observation should have been removed when creating the index but was forgotten

In [69]:
liste_colonnes_Indice = Indice_New.columns

In [70]:
# We drop the outcomes from the previous index from the X

liste_colonnes_prev_ind = []
for i in range(1,15):
    liste_colonnes_prev_ind = np.append(liste_colonnes_prev_ind, "GHI" + str(i))

X = donnees.drop(["GHI1"], axis = 1)
for i in range(1,14):
    X = X.drop([liste_colonnes_prev_ind[i]], axis = 1)

In [71]:
# We only keep data where the individual answered the last wave and that answering was continuing from wave i to 14
# It is impossible to predict something we do not know
Y_bis = Indice_New[Indice_New['tSNE_GHI14'].isna()==False]

# We erase non continuous-responding rows
# There is no continuous-responding if there is an index in wave i and none in wave i+1
for i in range(1,14):
    Y_bis = Y_bis[((Y_bis["tSNE_GHI" + str(i)].isna()==False)&(Y_bis["tSNE_GHI" + str(i+1)].isna()==True))==False]

# We keep the sames rows for X
X_bis = X[X["HHIDPN"].isin(Y_bis["HHIDPN"].values)]

In [72]:
# We split our data in 3 sub-dataset
X_train, X_test, Y_train, Y_test = train_test_split(X_bis, Y_bis, test_size=0.2, random_state=18)
X_test, X_valid, Y_test, Y_valid = train_test_split(X_test, Y_test, test_size=0.5, random_state = 6)

# Machine Learning

On va essayer ici de performer des régressions 2SLS successives afin de présuire l'indice en vague 14 en prenant en compte les vagues précédentes.

In [73]:
# The objective here is to make a dataset where we observe if each variable exists at each wave
temporal_variables = {}
waves_columns = [col for col in X_train.columns if "genetic_" not in col and col[1] in "123456789"]
for col in waves_columns:
  char = col[0] # R or H
  if col[2] in "01234":
    wave = col[1:3]
    suffix = col[3:]
  else:
    wave = col[1]
    suffix = col[2:]
  variable = char + 'w' + suffix
  
  if variable not in temporal_variables.keys():
    temporal_variables[variable] = np.zeros((14), dtype=bool)
  
  temporal_variables[variable][int(wave)-1] = True

temporal_variables = pd.DataFrame(temporal_variables)

In [74]:
# Données intemporelles
non_waves_columns = [col for col in X_train.columns if col not in waves_columns]
# We drop waves-related values
#temp = np.append(np.arange(15), 18)
#non_waves_columns = np.delete(non_waves_columns, temp)
for i in range(15):
    non_waves_columns.pop(0)
non_waves_columns.pop(3)

'HHID'

In [75]:
# We define a function that only keeps the outcomes that exist from a wave i
def Vector_without_na(vect):
    sol = np.empty(0)
    emplacement = np.empty(0)
    for i in range(len(vect)):
        if np.isnan(vect[i]) == False:
            sol = np.append(sol, vect[i])
            emplacement = np.append(emplacement, i)
    return sol, emplacement

# Recreate a vector with a greater size with nan in places that are not in emp
def Reshape_to_prev_size(vect, emp, nv_taille):
    '''Takes a vector vect, the list emp of the positions of the vector value and the size of the new vector nv_taille '''
    res = np.empty(nv_taille)
    
    if len(vect) == 0:
        res[:] = np.nan
        return res
        
        
    j = 0
    for i in range(nv_taille):
        if int(emp[j]) == i:
            res[i] = vect[j]
            if j<len(emp)-1:
                j = j+1
        else :
            res[i] = np.nan

    return res

# Tests if applying the two functions gives back the original vector
def test_Reshapes(n, proba):
    test = np.empty(n)
    for i in range(len(test)):
        if np.random.binomial(1,proba, 1) == 1:
            test[i] = np.random.poisson(1)
        else :
            test[i] = np.nan
            
    test2, tai = Vector_without_na(test)
    test3 = Reshape_to_prev_size(test2, tai, len(test))
    rep = True
    if len(test3) != len(test):
        rep = False
    else :
        for i in range(len(test)):
            if np.isnan(test[i]) == False:
                if test[i] != test3[i]:
                    rep = False
                    break
            elif np.isnan(test3[i]) == False:
                rep = False
                break
                
    if rep == False:
        print("Erreur, vecteurs non égaux")

#test for random values    np.random.uniform(0,1)    
for k in range(1, 100):
    test_Reshapes(1000, 0.5)

In [76]:
# The numpy mean function had a strange attitude so we implemented it ourselves
def moyenne_vect(vect):
    a = 0
    b = 0
    for i in range(len(vect)):
        if np.isnan(vect[i]) == False:
            a = a + vect[i]
            b = b + 1
    if b>0:
        moy = a/b
        return moy
    else:
        return 0
    
# Same for median
def mediane_vect(vect):
    vect2 = Vector_without_na(vect)
    if len(vect2)==0:
        print("Vecteur vide")
    
    return np.median(vect2)
    
# Every explaining variable is needed to predict the outcomes, so we had to impute them, here by the mean
# However, due to our Lasso function, coefficients associated to variables ith high missing variables rates will be less important
def impute_nan_numpy(X_n, type_imp):
    for i in range(X_n.shape[1]):
        
        if len(Vector_without_na(X_n[:, i]))==0:
            print("Vecteur vide")
        
        if type_imp == "Mean" :
            remp = moyenne_vect(X_n[:, i])
        elif type_imp == "Median" :
            remp = mediane_vect(X_n[:, i])
            
            #There is a bug where some medians are still nans - still not resolved
            # We put in this case the mean so that it does not crash
            if np.isnan(remp)==True:
                remp =  moyenne_vect(X_n[:, i])
        else : 
            print("Type implémenté inexistant.")
        
        X_n[:, i] = np.nan_to_num(X_n[:,i], copy=True, nan = remp)
        #print(np.count_nonzero(np.isnan(X_n[i, :]))==len(X_n[i,:]))

    return X_n

In [77]:
# Let's make the functions that calulate the lasso vector
# The function centers and normalizes the matrix x

def Calculate_lasso_Keryann(x, y, mu):
    
    # Lasso initiation
    hml.ERRORS_HANDLING = "ignore"
    lasso_res = hml.HMLasso(mu)
    
    # We create the estimator
    lasso_res.fit(x, y)
    
    return lasso_res


def Calculate_lasso_impute(x, y, mu, t_imput):
    
        x = impute_nan_numpy(x, t_imput)
        
        # Initialisation and model computation
        clf = linear_model.Lasso(alpha=mu)
        clf.fit(x, y)
        return clf

# Takes account of all the lasso regression types
def Calculate_lasso(x, y, mu, t_lass, t_imput):
    
    if x.shape[0] != y.shape[0]:
        print("x and y do not have the same number of rows")
    
    # We process the explaining variables
    scaler = StandardScaler() #(with_std=False)
    x = scaler.fit_transform(x)
    
    # We center the outcomes
    y = y - moyenne_vect(y)
    
    if t_lass==0:
        return Calculate_lasso_Keryann(x,y,mu)
    if t_lass==1:
        return Calculate_lasso_impute(x, y, mu, t_imput)
    else :
        print("The chosen lasso regression does not exist")

Let's make the main function.

In [78]:
# Function that predict each wave i index with predicted values from the previous waves, non-temporal variables and variables from wave i
def Lasso_2SLS(X, Y, mu, tps, type_lasso, type_imputation, gen_var):
    liste = []
    all_lasso = []
    var_intemp = 1*non_waves_columns
    temps = time.localtime()
    res = 0
    max_iter = 15
    print("Fonction commencée à ", (int(temps.tm_hour) + 2)%24, "h", temps.tm_min)
    
    
    for i in range(1,max_iter):
        
        # We add an item to the list of explaining variables
        columns_wave_i = [col.replace('w', str(i)) for col in temporal_variables.T[i-1].index[temporal_variables.T[i-1]] if col != "GHIw"]
        if i == max_iter - 1 and gen_var == True :
            liste.append(X.loc[X["INW"+str(i)] == 1, columns_wave_i + var_intemp])
        else :
            liste.append(X.loc[X["INW"+str(i)] == 1, columns_wave_i])
    
        # Creation of the vector of outputs
        y_train_i, emp_i = Vector_without_na(Y["tSNE_GHI" + str(i)].values)
    
        # Computation of the lasso
        lasso = Calculate_lasso(liste[i-1], y_train_i, mu, type_lasso, type_imputation)
        
        # We add the lasso to the list of lasso
        all_lasso.append(lasso)
    
        # Let's predict the index at wave i
        temp = liste[i-1].to_numpy()
        temp = impute_nan_numpy(temp, type_imputation)    # We impute missing values for the prediction - here with the mean
        # We predict the outcomes
        y_pred_i = lasso.predict(temp)
    
        # We have to align the predicted values from the wave i-1 to the set from the wave i
        # We will put the predicted outputs back to the size of X et add it to the latter              
        y_pred_i_aligned = Reshape_to_prev_size(y_pred_i , emp_i , X.shape[0])
    
        if i<max_iter-1:
            # We add it to X
            X["Pred_GHI"+str(i)] = y_pred_i_aligned
            var_intemp.append("Pred_GHI" + str(i))
            
        if i==max_iter-1:
            res = y_pred_i_aligned
            
        if tps == True:
            temps = time.localtime()
            print("Wave ", i, " terminée à ", (int(temps.tm_hour) + 2)%24, "h", temps.tm_min)
            
    return all_lasso, res

## Application on the testing and validation sets

In [79]:
# Compute the variance of a vector
def variance_vect(vect):
    a = 0
    b = 0
    moyenne = moyenne_vect(vect)
    for i in range(len(vect)):
        if np.isnan(vect[i]) == False:
            a = a + (vect[i]-moyenne)**2
            b = b + 1
    if b>0:
        moy = a/b
        return moy
    else:
        return 0

# Compute the R² of a predicted vector compared with the original outcome
def R_square(original, pred, affiche):
    a = variance_vect(original)
    b = variance_vect(pred)
    R_2 = b/a
    
    if affiche == True:
        print("R² = ", R_2)
    else :
        return R_2

# Compute the MSE or the RMSE
def squared_error(vect1, vect2, affiche, mean, root):
    if len(vect1) != len(vect2):
        print("Both vectors need to be the same length")
        return -1
    else:
        a = 0
        moy1 = moyenne_vect(vect1)
        moy2 = moyenne_vect(vect2)
                         
        for i in range(len(vect1)):
                         
            if np.isnan(vect1[i])==True :
                vect1[i] = moy1
                         
            if np.isnan(vect2[i])==True :
                vect2[i] = moy2
                         
            a = a + (vect1[i] - vect2[i])**2
        
        if mean==True:
            a = a/len(vect1)
            
        if root == True:
            a = np.sqrt(a)
                         
        if affiche==True:
            texte = "Squared-error"
            if mean==True :
                texte = "Mean" + texte
            if root==True:
                texte = "Root" + texte
                  
            print("Squared-error = ", a)
        else :
            return a
        
# Prend un vecteur et renvoie le nombre d'éléments non nuls de ce vecteur
# eps_ou_non est un booléen.
#S'il vaut True, on ne considère pas une églité stricte, mais si la distance entre deux valurs est supérieure à eps ou non
def nombre_Non_Nuls(vect, eps = 0, eps_ou_non = False):
    
    non_nul = 0
    if eps_ou_non == False :
        for i in range(len(vect)):
            if vect[i] != 0 :
                non_nul = non_nul + 1
                
    else :
        for i in range (len(vect)):
            if (abs(vect[i])) > 0 :
                non_nul = non_nul + 1
            
    return non_nul

In [80]:
# Apply a model to some data
def guess_outcome(X, Y, model, type_imputation, gen_incl):
    liste = []
    var_intemp = 1*non_waves_columns
    res = 0
    max_iter = 15    
    
    for i in range(1,max_iter):
        
        # We add an item to the list of explaining variables
        columns_wave_i = [col.replace('w', str(i)) for col in temporal_variables.T[i-1].index[temporal_variables.T[i-1]] if col != "GHIw"]
        if i == max_iter - 1 and gen_incl == True :
            liste.append(X.loc[X["INW"+str(i)] == 1, columns_wave_i + var_intemp])
        else :
            liste.append(X.loc[X["INW"+str(i)] == 1, columns_wave_i])
        
        # Scaling list in i-1
        #We normalize and center X_v
        scaler = StandardScaler() #(with_std=False)
        liste[i-1] = scaler.fit_transform(liste[i-1])
    
    
        # Creation of the vector of outputs
        y_train_i, emp_i = Vector_without_na(Y["tSNE_GHI" + str(i)].values)
    
        # Let's predict the index at wave i
        temp = liste[i-1]
        temp = impute_nan_numpy(temp, type_imputation)    # We impute missing values for the prediction - here with the mean
        # We predict the outcomes
        y_pred_i = model[i-1].predict(temp)
    
        # We have to align the predicted values from the wave i-1 to the set from the wave i
        # We will put the predicted outputs back to the size of X et add it to the latter              
        y_pred_i_aligned = Reshape_to_prev_size(y_pred_i , emp_i , X.shape[0])
    
        if i<max_iter-1:
            # We add it to X
            X["Pred_GHI"+str(i)] = y_pred_i_aligned
            var_intemp.append("Pred_GHI" + str(i))
            
        if i==max_iter-1:
            res = y_pred_i_aligned
            
    return res

### Validation tests
We will test different values as $\mu$, the penalty weight, so as to find a good, if not the best.

In [132]:
# return the major informations suming up a model
def return_summary_model(X_t, Y_t, X_v, Y_v, mu, tps, type_lasso, type_imputation, gen_var):
    lasso, pred = Lasso_2SLS(X_t, Y_t, mu, tps, type_lasso, type_imputation, gen_var)
    Y_t_centered = Y_t["tSNE_GHI14"].values - moyenne_vect(Y_t["tSNE_GHI14"].values)
    Y_v_centered =  Y_v["tSNE_GHI14"].values - moyenne_vect(Y_v["tSNE_GHI14"].values)
                         
    pred_v = guess_outcome(X_v, Y_v, lasso, type_imputation, gen_var)
                         
    #R2 = R_square(Y_v_centered, pred_v, False)
    R2 = r2_score(Y_v_centered, pred_v)
    #err_tr = squared_error(pred, Y_t_centered, False, True, True)
    err_tr = mean_squared_error(pred, Y_t_centered, squared=False)
    #err_val = squared_error(pred_v, Y_v_centered, False, True, True)
    err_val = mean_squared_error(pred_v, Y_v_centered, squared=False)
                         
    return R2, err_tr, err_val, lasso

# Sum up a list of models by calculating a R², some RMSE and the list of non-null coefficients
# Return a dataframe
def summaries_models_dataframe(X_t, Y_t, X_v, Y_v, l_mu, tps,tps2, type_lasso, l_imputation, gen_var):
    Stats_Modeles = pd.DataFrame(columns = ["Modèle", "RMSE - Training set", "RMSE - Validation Set", "Sum RMSE", "R² Validation", "Coefficients non nuls"])
    
    for i in l_imputation :
        for j in l_mu :
            a,b, c, lasso = return_summary_model(X_t, Y_t, X_v, Y_v, j, tps, type_lasso, i, gen_var)
            ifgen = ""
            if gen_var == True :
                ifgen = " génétique"
                
            d = "Lasso" + ifgen + " (μ = " + str(j) + ", imputation = "
            
            if type_lasso == 0:
                d = d + "HMLasso"
            elif type_lasso == 1:
                d = d + str(l_imputation) + ")"
                
            if type_lasso == 0:
                non_nul = np.nan
            else :
                non_nul = nombre_Non_Nuls(lasso[13].coef_)
            
            tempo = pd.DataFrame([[d, b, c, b+c, a, non_nul]], columns = ["Modèle", "RMSE - Training set", "RMSE - Validation Set", "Sum RMSE", "R² Validation", "Coefficients non nuls"])
            Stats_Modeles = pd.concat([Stats_Modeles, tempo], ignore_index = True)
                
            if tps2==True:
                temps = time.localtime()
                print("Modèle ", i , " et " , j, " terminée à ", (int(temps.tm_hour) + 2)%24, "h", temps.tm_min)
    return Stats_Modeles

In [91]:
# The code computing the different models has been regrouped here for clarity
# However, we do not advise to run this cell as it would take more than a day to run

list_imputation = ["Mean"]

# Traditionnal Lasso
list_mu = [0, 0.0001, 0.05, 0.1, 0.5, 1, 2, 4, 7, 8, 9, 10]
list_imputation = ["Mean"]

Liste_modeles = summaries_models_dataframe(X_train, Y_train, X_valid, Y_valid, list_mu, False, True, 1, list_imputation, True)
Liste_modeles2 = summaries_models_dataframe(X_train, Y_train, X_valid, Y_valid, list_mu, False, True, 1, list_imputation, False)

Liste_modeles = pd.concat([Liste_modeles, Liste_modeles2], ignore_index = True)

list_mu = [0.5]
list_imputation = ["Median"]

Liste_modeles2 = summaries_models_dataframe(X_train, Y_train, X_valid, Y_valid, list_mu, False, True, 1, list_imputation, False)
Liste_modeles = pd.concat([Liste_modeles, Liste_modeles2], ignore_index = True)

# HMLasso
list_mu = [1, 8, 10]
list_imputation = ["Mean"]

Liste_modeles2 = summaries_models_dataframe(X_train, Y_train, X_valid, Y_valid, list_mu, False, True, 0, list_imputation, True)
Liste_modeles = pd.concat([Liste_modeles, Liste_modeles2], ignore_index = True)

Liste_modeles2 = summaries_models_dataframe(X_train, Y_train, X_valid, Y_valid, list_mu, False, True, 0, list_imputation, False)
Liste_modeles = pd.concat([Liste_modeles, Liste_modeles2], ignore_index = True)

Fonction commencée à  14 h 42
Wave  1  terminée à  14 h 43
Wave  2  terminée à  14 h 44
Wave  3  terminée à  14 h 45
Wave  4  terminée à  14 h 47
Wave  5  terminée à  14 h 49
Wave  6  terminée à  14 h 51
Wave  7  terminée à  14 h 54
Wave  8  terminée à  14 h 56
Wave  9  terminée à  14 h 59
Wave  10  terminée à  15 h 4
Wave  11  terminée à  15 h 9
Wave  12  terminée à  15 h 15
Wave  13  terminée à  15 h 24
Wave  14  terminée à  15 h 36
Modèle  Mean  et  0.0001  terminée à  15 h 39
Fonction commencée à  15 h 39
Wave  1  terminée à  15 h 40
Wave  2  terminée à  15 h 41
Wave  3  terminée à  15 h 42
Wave  4  terminée à  15 h 43
Wave  5  terminée à  15 h 45
Wave  6  terminée à  15 h 47
Wave  7  terminée à  15 h 50
Wave  8  terminée à  15 h 53
Wave  9  terminée à  15 h 56
Wave  10  terminée à  16 h 1
Wave  11  terminée à  16 h 6
Wave  12  terminée à  16 h 12
Wave  13  terminée à  16 h 22
Wave  14  terminée à  16 h 33
Modèle  Mean  et  0  terminée à  16 h 36
Fonction commencée à  16 h 36
Wave 

In [140]:
Liste_modeles

Unnamed: 0,Modèle,RMSE - Training set,RMSE - Validation Set,Sum RMSE,R² Validation,Coefficients non nuls
0,"Lasso génétique (μ = 0.0001, imputation = ['Me...",732831400.0,48.539115,732831500.0,0.206183,1029.0
1,"Lasso génétique (μ = 0, imputation = ['Mean'])",744800800.0,48.511893,744800800.0,0.207073,1076.0
2,"Lasso génétique (μ = 0.05, imputation = ['Mean'])",260818600.0,47.785586,260818600.0,0.230639,696.0
3,"Lasso (μ = 0.0001, imputation = ['Mean'])",4447409.0,47.726365,4447456.0,0.232544,783.0
4,"Lasso (μ = 0, imputation = ['Mean'])",4511077.0,47.728325,4511124.0,0.232481,830.0
5,"Lasso (μ = 0.05, imputation = ['Mean'])",4129901.0,47.268454,4129949.0,0.2472,511.0
6,"Lasso génétique (μ = 0.5, imputation = ['Mean'])",1941538.0,46.897554,1941585.0,0.258968,219.0
7,"Lasso génétique (μ = 1, imputation = ['Mean'])",1909420.0,47.104458,1909467.0,0.252415,99.0
8,"Lasso génétique (μ = 0.1, imputation = ['Mean'])",79512460.0,47.350143,79512510.0,0.244596,597.0
9,"Lasso (μ = 0.5, imputation = ['Mean'])",1480352.0,47.04423,1480399.0,0.254326,191.0


In [131]:
# We export the results to have a lasting back-up
Liste_modeles.to_csv('Lassos_Liste.csv', index = True)

In [142]:
Liste_modeles.sort_values(by = ["R² Validation"], ascending=True)

Unnamed: 0,Modèle,RMSE - Training set,RMSE - Validation Set,Sum RMSE,R² Validation,Coefficients non nuls
29,"Lasso (μ = 10, imputation = Keryann's",286.2465,50.505661,336.7522,0.140558,
28,"Lasso génétique (μ = 10, imputation = Keryann's",12453.55,50.505607,12504.06,0.14056,
23,"Lasso (μ = 10, imputation = ['Mean'])",52.33941,50.47913,102.8185,0.141461,8.0
22,"Lasso (μ = 9, imputation = ['Mean'])",52.13823,50.180218,102.3184,0.151598,10.0
20,"Lasso génétique (μ = 10, imputation = ['Mean'])",57.21386,49.892759,107.1066,0.16129,8.0
21,"Lasso (μ = 8, imputation = ['Mean'])",51.93599,49.81879,101.7548,0.163775,11.0
27,"Lasso (μ = 8, imputation = Keryann's",82.78353,49.806971,132.5905,0.164172,
26,"Lasso génétique (μ = 8, imputation = Keryann's",166.7299,49.806359,216.5363,0.164193,
18,"Lasso génétique (μ = 8, imputation = ['Mean'])",102.7797,49.571993,152.3517,0.17204,12.0
19,"Lasso génétique (μ = 9, imputation = ['Mean'])",90.71367,49.570819,140.2845,0.172079,9.0


We will keep one model. The one maximizing the R² of the validation set is the same one which minimises the validation set RMSE. We consider for a model both cases: with and without the genetic data.
This method makes us keep the following model: The mean imputated lasso with $\mu = 0.5$.

### Testing set

We will apply the model kept to the testing set.

In [128]:
list_mu = [0.5]
list_imputation = ["Mean"]

Liste_modeles_testing_T = summaries_models_dataframe(X_train, Y_train, X_test, Y_test, list_mu, False, True, 1, list_imputation, True)
Liste_modeles_testing_F = summaries_models_dataframe(X_train, Y_train, X_test, Y_test, list_mu, True, True, 1, list_imputation, False)

Fonction commencée à  21 h 28
Wave  1  terminée à  21 h 30
Wave  2  terminée à  21 h 31
Wave  3  terminée à  21 h 32
Wave  4  terminée à  21 h 34
Wave  5  terminée à  21 h 36
Wave  6  terminée à  21 h 38
Wave  7  terminée à  21 h 42
Wave  8  terminée à  21 h 45
Wave  9  terminée à  21 h 49
Wave  10  terminée à  21 h 54
Wave  11  terminée à  22 h 0
Wave  12  terminée à  22 h 6
Wave  13  terminée à  22 h 15
Wave  14  terminée à  22 h 28
Modèle  Mean  et  0.5  terminée à  22 h 31
Fonction commencée à  22 h 31
Wave  1  terminée à  22 h 32
Wave  2  terminée à  22 h 33
Wave  3  terminée à  22 h 35
Wave  4  terminée à  22 h 37
Wave  5  terminée à  22 h 39
Wave  6  terminée à  22 h 41
Wave  7  terminée à  22 h 44
Wave  8  terminée à  22 h 48
Wave  9  terminée à  22 h 52
Wave  10  terminée à  22 h 58
Wave  11  terminée à  23 h 4
Wave  12  terminée à  23 h 10
Wave  13  terminée à  23 h 18
Wave  14  terminée à  23 h 30
Modèle  Mean  et  10  terminée à  23 h 33
Fonction commencée à  23 h 33
Wave  

In [12]:
Liste_modeles_testing = pd.concat([Liste_modeles_testing_F, Liste_modeles_testing_T], ignore_index = True)
Liste_modeles_testing = Liste_modeles_testing.rename(columns={"RMSE - Validation Set" : "RMSE - Testing Set", "R² Validation" : "R² Testing"})
Liste_modeles_testing

Unnamed: 0,Modèle,RMSE - Training set,RMSE - Testing Set,Sum RMSE,R² Testing,Coefficients non nuls
0,"Lasso (μ = 0.5, imputation = ['Mean'])",1480352.0,48.27828,1480400.0,0.276727,191
1,"Lasso génétique (μ = 0.5, imputation = ['Mean'])",1941538.0,48.032847,1941586.0,0.284062,219


We can see a slight improvement from the $R^2$ when we add the genetic components but this is only by 0.07 which makes it difficult to defend the fact that the considered genetic markers have an impact on the health level.

In [134]:
# We export the results to have a lasting back-up
Liste_modeles_testing.to_csv('Lassos_Liste_Testing.csv', index = True)