Import of libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # To standardize the data
import cvxpy as cp
from sklearn import linear_model

Import of the HMLasso function

In [3]:
### Adapt the path "C:/Users/Kilian/Desktop/ENSAE/STATAPP" to run the cell

import sys
sys.path.insert(1, 'C:/Users/Kilian/Desktop/ENSAE/STATAPP/Projet_Statapp/pretreatment')

import file_04_HMLasso as hml

## Data downloading and separation of the dataset

Dataset containing the types of each column from data_03.csv

In [4]:
# Load the lookup table describing each data_03.csv column (columns "Name" and "Type").
columns_types = pd.read_csv("data_03_columns_types.csv", index_col=0)
# Preview the first three rows.
columns_types.head(3)

Unnamed: 0,Name,Type
0,HHIDPN,Cont
1,HHID,Char
2,PN,Char


Downloading the data with social and genetic variables.

In [5]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so mixed-type columns (e.g. "genetic_Section_A_or_E")
# are parsed consistently rather than triggering a mixed-dtype warning.
data = pd.read_csv("data_03.csv", low_memory=False)

  data = pd.read_csv("data_03.csv")


The column "genetic_Section_A_or_E" has mixed types, so we recode it as a numeric column ('E' → 1, 'A' → 0, any other value → NaN).

In [6]:
# Recode the section letter: 'E' -> 1, 'A' -> 0, anything else -> NaN.
section_raw = data['genetic_Section_A_or_E']
temporary = np.where(
    section_raw == 'E',
    1,
    np.where(section_raw == 'A', 0, np.nan),
)

In [7]:
# Overwrite the mixed-type column with its numeric recoding computed in `temporary`.
data["genetic_Section_A_or_E"] = temporary

Now we add the health index created by t-SNE

In [8]:
# Load the t-SNE health index table (joined to `data` on "HHIDPN" in a later cell).
tSNE_GHI = pd.read_csv("data_tSNE_GHI.csv")

We merge the t-SNE health index to the data

In [9]:
# Left join: keep every row of `data` and attach the t-SNE index columns by HHIDPN.
data = pd.merge(data, tSNE_GHI, on='HHIDPN', how='left')

The final outcome to predict is tSNE_GHI14, so we only keep individuals who were interviewed during the last wave (14th wave)

In [10]:
# Keep only the individuals whose wave-14 outcome is available.
has_wave14_outcome = data['tSNE_GHI14'].notna()
data_bis = data.loc[has_wave14_outcome]

Number of individuals present in every wave (no missing value in the t-SNE health index table).

In [11]:
# Rows with no missing value in any column of the t-SNE index table.
len(tSNE_GHI[tSNE_GHI.notnull().all(axis=1)])

3396

We select the outcome tSNE_GHI

In [12]:
# Outcome frame: the identifier plus the t-SNE health index of each of the 14 waves.
outcome_columns = [f"tSNE_GHI{wave}" for wave in range(1, 15)]
Y = data_bis[["HHIDPN", *outcome_columns]]

We drop the previous health index GHIw (columns GHI1 to GHI14) from the data, since it won't be used as the outcome.

We drop the outcome to create the matrix X.

In [13]:
# Explanatory matrix: everything except the old GHIw index and the tSNE_GHIw outcome.
ghi_columns = [f"GHI{wave}" for wave in range(1, 15)]
tsne_ghi_columns = [f"tSNE_GHI{wave}" for wave in range(1, 15)]
X = data_bis.drop(ghi_columns, axis=1).drop(tsne_ghi_columns, axis=1)

Now we split the dataset into training, validation and test sets.

In [14]:
# First hold out 20 % of the rows, then cut that hold-out in half: test / validation.
X_train, X_holdout, Y_train, Y_holdout = train_test_split(
    X, Y, test_size=0.2, random_state=18
)
X_test, X_valid, Y_test, Y_valid = train_test_split(
    X_holdout, Y_holdout, test_size=0.5, random_state=6
)

Smaller sets while coding

In [14]:
# Development shortcut: keep only the first tenth of every split.
# NOTE(review): re-running this cell shrinks the splits again, since it overwrites them.
nb_test = len(X_test.index) // 10
nb_train = len(X_train.index) // 10
nb_valid = len(X_valid.index) // 10

X_train, Y_train = X_train.iloc[:nb_train], Y_train.iloc[:nb_train]
X_valid, Y_valid = X_valid.iloc[:nb_valid], Y_valid.iloc[:nb_valid]
X_test, Y_test = X_test.iloc[:nb_test], Y_test.iloc[:nb_test]

## Machine learning

The objective here is to make a dataset where we observe if each variable exists at each wave

In [36]:
def dataset_temporal_variables(X_train, add_tSNE_GHIw):
    """Tabulate, for every wave-specific variable, the waves in which it exists.

    A column is treated as wave-specific when its name does not contain
    "genetic_" and its second character is a digit 1-9. The wave number is read
    from the second character, or from the second and third characters when the
    third one is in "01234". The generic variable name is the column name with
    the wave number replaced by 'w' (e.g. "R10AGE" -> "RwAGE").

    Parameters
    ----------
    X_train : DataFrame whose column names are inspected (values are not read).
    add_tSNE_GHIw : when truthy, add a "tSNE_GHIw" variable flagged present in
        all 14 waves and append "tSNE_GHI1".."tSNE_GHI14" to the column list.

    Returns
    -------
    (temporal_variables, waves_columns) : a 14-row boolean DataFrame (row k is
        wave k+1, one column per generic variable) and the list of wave-specific
        column names.
    """
    waves_columns = [
        name for name in X_train.columns
        if "genetic_" not in name and name[1] in "123456789"
    ]

    presence = {}
    for name in waves_columns:
        # The wave number takes two characters when the third character is 0-4.
        n_digits = 2 if name[2] in "01234" else 1
        wave_number = int(name[1:1 + n_digits])
        generic_name = name[0] + 'w' + name[1 + n_digits:]

        if generic_name not in presence:
            presence[generic_name] = np.zeros((14), dtype=bool)
        presence[generic_name][wave_number - 1] = True

    temporal_variables = pd.DataFrame(presence)

    # The outcome columns are not in X_train, so they are registered by hand.
    if add_tSNE_GHIw:
        temporal_variables["tSNE_GHIw"] = np.ones((14), dtype=bool)
        waves_columns += ["tSNE_GHI" + str(w) for w in range(1, 15)]

    return (temporal_variables, waves_columns)

In [37]:
# Timeless data
def timeless_variables(X_train,waves_columns):
    """Return the columns of X_train that are neither wave-specific nor bookkeeping.

    Parameters
    ----------
    X_train : DataFrame whose column names are filtered.
    waves_columns : iterable of wave-specific column names to exclude.

    Returns
    -------
    list of column names, in X_train's column order, excluding `waves_columns`,
    the identifiers ("HHIDPN", "PN", "HHID", "RAHHIDPN") and the interview flags
    "INW1".."INW14".

    Identifier or flag columns that are absent from X_train are simply ignored
    (no ValueError), so the function also works on frames that were already
    trimmed.
    """
    # A set gives O(1) membership tests instead of scanning a list per column.
    excluded = set(waves_columns)
    excluded.update(["HHIDPN", "PN", "HHID", "RAHHIDPN"])
    excluded.update("INW" + str(i + 1) for i in range(14))

    return [col for col in X_train.columns if col not in excluded]

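We put the explanatory variables of each wave into a list of datasets (one dataset per wave). Time-invariant variables are added only to the dataset of the last wave, to avoid duplicated column labels when the waves are merged.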

In [38]:
import random

def list_wave(X_train, reduced):
    """Split X_train into one DataFrame per wave (14 in total).

    The DataFrame of wave w holds "HHIDPN" plus every wave-specific column that
    exists at wave w, restricted to the rows where "INW<w>" equals 1. Timeless
    columns are appended to the last wave only, to avoid duplicated labels when
    the per-wave frames are merged later on.

    Parameters
    ----------
    X_train : DataFrame with "HHIDPN", "INW1".."INW14" and the explanatory columns.
    reduced : when truthy, work on a small subset of variables (for quick runs):
        the generic variables at positions 1..14 and the last four, plus up to
        five timeless columns drawn at random.

    Returns
    -------
    list of 14 DataFrames, index k holding wave k+1.
    """
    (temporal_variables, waves_columns) = dataset_temporal_variables(X_train, True)
    non_waves_columns = timeless_variables(X_train, waves_columns)

    # Reduce the number of variables to speed things up while coding.
    if reduced:
        n_variables = temporal_variables.shape[1]
        requested = list(range(1, 15)) + [-k for k in range(1, 5)]
        # Ignore positions that do not exist and keep each column only once, so
        # small frames neither raise nor yield duplicated column labels.
        positions = list(dict.fromkeys(
            p % n_variables for p in requested if -n_variables <= p < n_variables
        ))
        temporal_variables = temporal_variables.iloc[:, positions]
        # random.sample draws WITHOUT replacement: random.choices could return
        # the same column twice and create duplicated labels.
        non_waves_columns = random.sample(
            non_waves_columns, k=min(5, len(non_waves_columns))
        )

    liste = []    # len = 14
    for i in range(14):
        wave = i + 1
        present = temporal_variables.iloc[i]
        # Generic names are <letter>w<suffix>; rebuild the real column name by
        # position so that a 'w' inside the suffix is never touched.
        columns_wave_i = ["HHIDPN"] + [
            variable[0] + str(wave) + variable[2:]
            for variable, is_present in present.items()
            if is_present and variable != "tSNE_GHIw"
        ]
        # Timeless variables go only to the last wave, to avoid duplicated labels.
        if wave == 14:
            columns_wave_i = columns_wave_i + list(non_waves_columns)

        liste.append(X_train.loc[X_train["INW" + str(wave)] == 1, columns_wave_i])

    return liste
    

### Lasso selection

We initialize the selection with a first Lasso on the first wave.

In [39]:
def initialize_Lasso(liste, Y_train, HMLasso, method, mu, limit):
    """Run the first variable selection (wave 1) and return the retained columns.

    Parameters
    ----------
    liste : list of per-wave DataFrames; only liste[0] is used here. It must
        contain an "HHIDPN" column as its first column.
    Y_train : DataFrame with "HHIDPN" and, in column position 1, the wave-1 outcome.
    HMLasso : truthy -> select with apply_HMLasso (missing values left as is);
        falsy -> impute with Na_imputation(method) then select with apply_Lasso.
    method : imputation method forwarded to Na_imputation (plain Lasso only).
    mu : regularisation strength forwarded to the Lasso fit.
    limit : HMLasso only; a variable is kept when |coefficient| > 10**limit.

    Returns
    -------
    DataFrame: liste[0] restricted to "HHIDPN" and the selected columns (all rows).

    Rows are paired with their outcome through HHIDPN rather than by position,
    so the fit stays correct even if the set of interviewed individuals and the
    set of individuals with a wave-1 outcome differ. Rows without a wave-1
    outcome are left out of the fit.
    """
    print("wave",1)

    scaler = StandardScaler()#(with_std=False)
    hml.ERRORS_HANDLING = "ignore"

    #Prepare data
    first_wave = liste[0]
    outcome_name = Y_train.columns[1]

    # validate= guards against duplicated HHIDPN in Y_train, which would
    # silently duplicate rows of the design matrix.
    aligned = first_wave[["HHIDPN"]].merge(
        Y_train[["HHIDPN", outcome_name]],
        how="left", on="HHIDPN", validate="many_to_one",
    )
    has_outcome = aligned[outcome_name].notna().to_numpy()

    X_train1 = first_wave.drop("HHIDPN",axis=1).loc[has_outcome]
    Y_train1 = aligned.loc[has_outcome, outcome_name].values
    Y_train1 = (Y_train1 - np.mean(Y_train1))/np.std(Y_train1)

    #HMLasso
    if HMLasso:
        #Standardize X_train
        X_train1 = scaler.fit_transform(X_train1)

        coefficients = apply_HMLasso(X_train1, Y_train1,mu)

        #Variables to keep
        var_to_keep = coefficients > 10**(limit)

    #Common Lasso
    else:
        X_train1 = Na_imputation(X_train1, method)

        #Standardize X_train
        X_train1 = scaler.fit_transform(X_train1)

        coefficients = apply_Lasso(X_train1, Y_train1, mu)

        #Variables to keep
        var_to_keep = coefficients != 0

    #Selection of variables
    print("Variables kept :", int(np.count_nonzero(var_to_keep)))
    # Position 0 is "HHIDPN", which is always kept.
    var_to_keep = np.insert(var_to_keep,0,True)

    selected = first_wave[first_wave.columns[var_to_keep]]

    return selected

In [40]:
def apply_HMLasso(X, Y, mu):
    """Fit hml.HMLasso(mu) on (X, Y) and return the absolute values of its `beta_opt`."""
    model = hml.HMLasso(mu)
    model.fit(X, Y)

    beta = model.beta_opt.copy()
    return np.abs(beta)

def apply_Lasso(X,Y, mu):
    """Fit a scikit-learn Lasso with alpha=mu on (X, Y) and return its `coef_` array."""
    estimator = linear_model.Lasso(alpha=mu)
    estimator.fit(X, Y)
    return estimator.coef_

In [41]:
def Na_imputation(X, method):
    """Return a copy of X whose missing values are imputed with `method`.

    Parameters
    ----------
    X : DataFrame to impute (not modified).
    method : name of the imputation method; only "mean" (column mean) is
        implemented.

    Raises
    ------
    ValueError : if `method` is not supported. Previously the function returned
        None in that case, which only failed later with an unrelated error.
    """
    if method == "mean":
        # numeric_only=True: non-numeric columns have no mean and are left as is
        # instead of making the whole call fail.
        return X.fillna(X.mean(numeric_only=True))
    raise ValueError(
        f"Unsupported imputation method: {method!r} (only 'mean' is implemented)"
    )

Function that imputes, by the column mean, the missing values created by the merge, without touching the NaN values that were already there before the merge.

In [42]:
def Na_management(df1, df2, index):
    """Outer-merge df1 and df2 on `index`, mean-imputing only the gaps created by the merge.

    Steps:
      1. outer merge of df1 and df2 on the key column(s) `index`;
      2. every NaN of the merged frame is replaced by its column mean;
      3. the original cells of df1 and df2 are written back on top, their own
         NaN being temporarily encoded as the string "NaN" so that `update`
         (which skips missing values) carries them over;
      4. the "NaN" strings are turned back into np.nan.

    Returns the merged frame with `index` restored as a regular column.

    NOTE(review): writing the "NaN" string into numeric columns may change their
    dtype (e.g. to object) depending on the pandas version — confirm.
    NOTE(review): assumes df1 and df2 share no column other than `index`;
    otherwise the merge would suffix the names and `update` would not match them.
    """

    merged = df1.merge(df2, how='outer', on = index)

    # Key-indexed views of the inputs, used below to write original values back.
    df1_index = df1.set_index(index)
    df2_index = df2.set_index(index)

    # Fill every missing cell (pre-existing or created by the merge) with the column mean.
    merged = merged.fillna(merged.mean())
    merged = merged.set_index(index)

    # Restore df1's cells, including its original NaN (carried as the "NaN" sentinel).
    df1_index = df1_index.fillna("NaN")
    merged.update(df1_index)

    # Same for df2.
    df2_index = df2_index.fillna("NaN")
    merged.update(df2_index)

    # Turn the sentinel back into real missing values.
    merged = merged.replace("NaN",np.nan)

    merged = merged.reset_index()

    return merged

Function to select variables by HMLasso

In [43]:
def Lasso_selection(X_train, Y_train, HMLasso, method, mu, limit, reduced):
    """Select variables wave after wave with a (HM)Lasso.

    Wave 1 is handled by initialize_Lasso. For each following wave, the
    variables kept so far are merged with the variables of the new wave
    (Na_management), a Lasso is fitted on the outcome of that wave, and only the
    variables with a retained coefficient are carried over to the next wave.

    Parameters
    ----------
    X_train : DataFrame of explanatory variables (with "HHIDPN" and "INW1".."INW14").
        It is NOT modified: all-NaN columns are dropped on a new frame.
    Y_train : DataFrame with "HHIDPN" in column 0 and the outcome of wave w in
        column position w.
    HMLasso : truthy -> HMLasso on the non-imputed data; falsy -> mean-style
        imputation (`method`) followed by a plain Lasso.
    method : imputation method forwarded to Na_imputation (plain Lasso only).
    mu : regularisation strength.
    limit : HMLasso only; a variable is kept when |coefficient| > 10**limit.
    reduced : forwarded to list_wave (work on a small subset of variables).

    Returns
    -------
    DataFrame with "HHIDPN" and the variables selected after the last wave.
    """
    #If a column contains only NaN values, we drop it. A new frame is built
    #(no inplace=True) so that the caller's DataFrame is left untouched.
    empty_col = [col for col in X_train.columns if X_train[col].isnull().all()]
    if empty_col:
        X_train = X_train.drop(empty_col, axis=1)

    liste = list_wave(X_train, reduced)

    print("Lasso selection")

    selected = initialize_Lasso(liste, Y_train, HMLasso, method, mu, limit)

    scaler = StandardScaler()#(with_std=False)
    hml.ERRORS_HANDLING = "ignore"

    for i in range (1,14) :

        print("wave",i+1)

        var_to_select = Na_management(selected, liste[i], "HHIDPN")

        # Attach the outcome of wave i+1 by HHIDPN so rows and outcome stay aligned.
        Y_train_i = Y_train.iloc[:,[0,i+1]]
        X_Y_train = var_to_select.merge(Y_train_i, how = 'left', on = "HHIDPN")

        Y_train_i = X_Y_train[f"tSNE_GHI{i+1}"]
        X_train_i = X_Y_train.drop([f"tSNE_GHI{i+1}","HHIDPN"], axis =1)

        # Individuals without an outcome at this wave get the mean outcome.
        Y_train_i = Y_train_i.fillna(Y_train_i.mean())
        Y_train_i = Y_train_i.values
        Y_train_i = (Y_train_i - np.mean(Y_train_i))/np.std(Y_train_i)

        #HMLasso
        if HMLasso:
            #Standardize X_train
            X_train_i = scaler.fit_transform(X_train_i)

            coefficients = apply_HMLasso(X_train_i, Y_train_i, mu)

            #Variables to keep
            var_to_keep = coefficients > 10**(limit)

        #Common Lasso
        else:
            X_train_i = Na_imputation(X_train_i, method)

            #Standardize X_train
            X_train_i = scaler.fit_transform(X_train_i)

            coefficients = apply_Lasso(X_train_i, Y_train_i, mu)

            #Variables to keep
            var_to_keep = coefficients != 0

        #Selection of variables
        print("Variables kept :", int(np.count_nonzero(var_to_keep)))
        # Position 0 is "HHIDPN", which is always kept.
        var_to_keep = np.insert(var_to_keep,0,True)

        entire_data = var_to_select
        selected = entire_data[entire_data.columns[var_to_keep]]

    return selected

### Within estimator

There are two types of missing values: the "one-time" missing values, when someone didn't answer a question during an interview, and the missing values that arise when someone wasn't interviewed at all during a wave.


For the first type, we impute those missing values with the mean of the column (Nan).
(Possibility to work on another imputation method).

Then for the individuals who weren't interviewed during a wave, we replace the missing value with the temporal mean of the variable over time (NanNan)


Finally, we compute the within-transformed variables $\tilde{X}_{it} = X_{it} - \bar{X}_{i}$, where $\bar{X}_{i}$ is the temporal mean of $X_{it}$ for individual $i$.

In [44]:
def creation_data_within(selected, X=None):
    """Build the within-transformed explanatory data from the selected variables.

    Parameters
    ----------
    selected : DataFrame with "HHIDPN" and the variables kept by the Lasso selection.
    X : optional DataFrame providing "HHIDPN" and the interview flags
        "INW1".."INW14". When omitted, the notebook-level `X_train` is used, as
        before; pass it explicitly when `selected` comes from another split.

    Returns
    -------
    (X_train_within, temporal_variables_within) : the transformed data and the
        table of temporal variables returned by data_set_within.
    """
    interview_source = X_train if X is None else X

    #For the "one-time" missing values: imputation by the column mean
    X_train_within = selected.fillna(selected.mean())

    ###For people who weren't interviewed
    # Add the INWw columns to know if the individual was interviewed during wave w
    interview_flags = ["INW"+str(i) for i in range(1,15)]
    X_train_within = X_train_within.merge(
        interview_source[["HHIDPN"] + interview_flags], how ="left", on="HHIDPN"
    )

    #Put back the missing values for people who weren't interviewed during wave w
    X_train_within = recover_missing(X_train_within)

    # Creation of the data set for within regression.
    (X_train_within, temporal_variables_within) = data_set_within(X_train_within)

    #NaN values can remain in timeless variables
    X_train_within = X_train_within.fillna(X_train_within.mean())

    return (X_train_within, temporal_variables_within)

This function returns a dataset containing only the variables that belong to the given wave.

In [46]:
import re
import pickle

def get_wave(data, wave, non_temporal):
    """
    Return the sub-DataFrame of `data` holding the columns of the given wave.

    A column belongs to wave w when the first run of digits found in its name
    equals str(w). Columns whose name contains no digit at all (for instance
    'HHIDPN') are included only when `non_temporal` is truthy; otherwise they
    are skipped (they used to raise an IndexError).

    NOTE(review): a timeless column whose name contains a number is matched
    against that number like any wave column — confirm this suits the data.
    """

    assert wave in range(1, 15)

    regex = re.compile("[0-9]+")
    wave_label = str(wave)

    wave_columns = []
    for col in data.columns:
        numbers = regex.findall(col)
        if numbers:
            keep = numbers[0] == wave_label
        else:
            # No digit in the name: not tied to any wave.
            keep = bool(non_temporal)
        if keep:
            wave_columns.append(col)

    wave_data = data[wave_columns]

    return wave_data

Function to recover the missing values for people who weren't interviewed during the wave w

In [47]:
def recover_missing(X_train_within):
    """Blank out, wave by wave, the data of individuals who were not interviewed.

    For every wave w, the rows where "INW<w>" equals 0 get NaN in all the
    columns of that wave, and their "INW<w>" flag is then set back to 0. The
    digit-free (timeless) columns are taken together with wave 14.

    Parameters
    ----------
    X_train_within : DataFrame with "HHIDPN", "INW1".."INW14" and the variables.

    Returns
    -------
    DataFrame with "HHIDPN" as a regular column and the per-wave frames merged
    back together.
    """
    X_train_within_index = X_train_within.set_index("HHIDPN")

    Tempo = None
    for i in range(1, 15):
        flag = "INW" + str(i)
        # .copy(): work on an independent frame so the assignments below do not
        # raise SettingWithCopyWarning.
        wave_i = get_wave(X_train_within_index, i, non_temporal=(i == 14)).copy()

        not_interviewed = wave_i[flag] == 0
        wave_i.loc[not_interviewed] = np.nan
        # The blanking above also erased the flag itself: restore it. (The
        # original `fillna(0)` call discarded its result and had no effect.)
        wave_i.loc[not_interviewed, flag] = 0

        if Tempo is None:
            Tempo = wave_i
        else:
            Tempo = Tempo.merge(wave_i, how= "left", on = "HHIDPN")

    return Tempo.reset_index()

Get a dataframe to know which variables are in X_train_within

Function to compute $\tilde{X}_{it} = X_{it} - \bar{X}_{i}$, where $\bar{X}_{i}$ is the temporal mean of $X_{it}$ for individual $i$.

It creates columns containing the temporal mean of each temporal variable, then replaces the NaN values (when people weren't interviewed) by this mean. Finally, it creates the dataset $\tilde{X}_{it}$.

In [48]:
def data_set_within (X_train_within):
    """Apply the within transformation to every temporal variable.

    For each generic variable (e.g. "RwAGE"), a "<variable>_MEAN" column is
    added with the individual's mean over the waves where the variable is
    observed; missing wave values are replaced by that mean and the mean is
    then subtracted from every wave column.

    Parameters
    ----------
    X_train_within : DataFrame holding the wave columns (not modified).

    Returns
    -------
    (X_within, temporal_variables_within) : the transformed copy and the
        variable/wave presence table from dataset_temporal_variables.
    """
    temporal_variables_within = dataset_temporal_variables(X_train_within, False)[0]

    X_within = X_train_within.copy()

    for col in temporal_variables_within.columns:
        index_wave = temporal_variables_within.index[temporal_variables_within[col]==1].tolist()
        # Generic names are <letter>w<suffix>: rebuild the real names by
        # position, so that a 'w' inside the suffix is never replaced.
        names_waves = [col[0] + str(i+1) + col[2:] for i in index_wave]

        mean_name = col+"_MEAN"
        observed = X_within[names_waves]
        # (~observed.isna()).sum(axis=1) = number of non missing values
        X_within[mean_name] = observed.sum(axis=1)/(~observed.isna()).sum(axis=1)

        for x in names_waves:
            # Impute the missing values by the temporal mean, then centre:
            # this is the within-transformed variable.
            X_within[x] = X_within[x].fillna(X_within[mean_name]) - X_within[mean_name]

    return (X_within, temporal_variables_within)

Now we do the same thing to Y_train but no need to impute the Nan values since the only outcome is tSNE_GHI14 (no Nan).

In [49]:
def creation_outcome_within(Y_train):
    """Return a copy of Y_train with the within-transformed wave-14 outcome.

    Two columns are appended: "tSNE_GHIw_MEAN", the row-wise mean of
    "tSNE_GHI1".."tSNE_GHI14" over the non-missing waves, and
    "tSNE_GHI14_within", equal to "tSNE_GHI14" minus that mean.
    """
    outcome_columns = ["tSNE_GHI" + str(wave) for wave in range(1, 15)]

    result = Y_train.copy()
    waves = result[outcome_columns]

    # Sum over the observed waves divided by the number of observed waves.
    observed_count = (~waves.isna()).sum(axis=1)
    result["tSNE_GHIw_MEAN"] = waves.sum(axis=1) / observed_count
    result["tSNE_GHI14_within"] = result["tSNE_GHI14"] - result["tSNE_GHIw_MEAN"]

    return result

We can now proceed to the regression

In [50]:
def get_X_Y_within(X_train_within, Y_train_within, temporal_variables_within):
    """Assemble the regressors and the outcome of the within regression.

    The explanatory data is joined to the "tSNE_GHI14_within" outcome on
    "HHIDPN"; the identifier, the outcome, the "INW1".."INW14" flags and the
    "<variable>_MEAN" helper columns are then removed from the regressors.

    Returns (X_regression, Y_regression).
    """
    outcome = Y_train_within[["HHIDPN", "tSNE_GHI14_within"]]
    data_regression = X_train_within.merge(outcome, on="HHIDPN")

    interview_flags = ["INW" + str(wave) for wave in range(1, 15)]
    mean_columns = [variable + "_MEAN" for variable in temporal_variables_within.columns]
    not_regressors = ["HHIDPN", "tSNE_GHI14_within", *interview_flags, *mean_columns]

    X_regression = data_regression.drop(not_regressors, axis=1)
    Y_regression = data_regression["tSNE_GHI14_within"]

    return (X_regression, Y_regression)
    

In [51]:
from sklearn.linear_model import LinearRegression

def regression(X_regression,Y_regression):
    """Fit an ordinary least-squares model, print its parameters and R², and return R²."""
    modeleReg = LinearRegression()
    modeleReg.fit(X_regression, Y_regression)

    print("intercept :", modeleReg.intercept_)
    print("coefficients :", modeleReg.coef_)

    # R² on the data used for the fit
    r_squared = modeleReg.score(X_regression, Y_regression)
    print("R² :", r_squared)

    return r_squared

In [52]:
def Within_estimates(X, Y, HMLasso, method, mu, limit, reduced):
    """Run the whole pipeline: Lasso selection, within transformation, regression.

    Parameters
    ----------
    X, Y : explanatory and outcome DataFrames of the same individuals.
    HMLasso, method, mu, limit, reduced : forwarded to Lasso_selection.

    Returns
    -------
    The R² returned by regression().

    NOTE(review): creation_data_within, when called with `selected` only, takes
    the interview flags from the notebook-level X_train, so X should be that
    same frame.
    """
    selected = Lasso_selection(X, Y, HMLasso, method, mu, limit, reduced)

    (X_within, temporal_variables_within) = creation_data_within(selected)
    # Use the outcome passed by the caller, not the notebook-level Y_train.
    Y_within = creation_outcome_within(Y)

    (X_regression, Y_regression) = get_X_Y_within(X_within, Y_within, temporal_variables_within)

    return regression(X_regression,Y_regression)

In [54]:
# Plain Lasso (mean imputation) on all the variables; `limit` is only read by the HMLasso branch.
Within_estimates(
    X_train,
    Y_train,
    HMLasso=False,
    method="mean",
    mu=0.05,
    limit=-14,
    reduced=False,
)

Lasso selection
wave 1
Variables kept : 26
wave 2
Variables kept : 28
wave 3
Variables kept : 32
wave 4
Variables kept : 38
wave 5
Variables kept : 40
wave 6
Variables kept : 47
wave 7
Variables kept : 38
wave 8
Variables kept : 42
wave 9
Variables kept : 47
wave 10
Variables kept : 41
wave 11
Variables kept : 41
wave 12
Variables kept : 44
wave 13
Variables kept : 42
wave 14
Variables kept : 49


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wave_1.loc[wave_1["INW1"] == 0] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wave_i.loc[wave_i["INWw".replace('w', str(i))] == 0] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wave_i.loc[wave_i["INWw".replace('w', str(i))] == 0] = np.nan
A value is trying to be set on a copy of

intercept : 5.892084151639099
coefficients : [-2.87321339e+00  2.01235667e+00  1.14928695e+01 -6.93046491e+00
 -1.11540116e+00 -1.84983801e-01 -6.12222352e-01  1.50158938e+00
  4.21884749e-14 -3.17523785e-14  2.06210985e-02 -1.86187899e-03
  1.72869585e+00  8.07676249e-01 -9.96673024e-01  0.00000000e+00
 -1.41399534e+00  0.00000000e+00  1.57840725e+00 -3.37249815e-02
  0.00000000e+00 -2.06210985e-02  0.00000000e+00  1.86187899e-03
  0.00000000e+00  5.20176906e+00  9.19947260e-01  0.00000000e+00
 -2.33228923e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.41399534e+00  0.00000000e+00
 -1.57840725e+00 -1.14591445e+01  0.00000000e+00  0.00000000e+00]
R² : 0.021940641267517824


0.021940641267517824