Import of libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # To standardize the data
import cvxpy as cp

Import of the HMLasso function

In [2]:
### Adapt the path "C:/Users/Kilian/Desktop/ENSAE/STATAPP" to run the cell

import sys
sys.path.insert(1, 'C:/Users/Kilian/Desktop/ENSAE/STATAPP/Projet_Statapp/pretreatment')

import file_04_HMLasso as hml

## Data downloading and separation of the dataset

Dataset containing the types of each column from data_03.csv

In [3]:
columns_types = pd.read_csv("data_03_columns_types.csv", index_col=0)
columns_types.head(3)

Unnamed: 0,Name,Type
0,HHIDPN,Cont
1,HHID,Char
2,PN,Char


Downloading the data with social and genetic variables.

In [4]:
data = pd.read_csv("data_03.csv")

  data = pd.read_csv("data_03.csv")


The column "genetic_Section_A_or_E" have mixed types, so we change its format.

In [5]:
temporary = np.where(data['genetic_Section_A_or_E'] == 'E', 1, np.where(data['genetic_Section_A_or_E'] == 'A', 0, np.nan))

In [6]:
data["genetic_Section_A_or_E"] = temporary

Now we add the health index created by t-SNE

In [7]:
tSNE_GHI = pd.read_csv("data_tSNE_GHI.csv")

We merge the t-SNE health index to the data

In [8]:
data = data.merge(tSNE_GHI, how ='left', on ='HHIDPN')

The final outcome to predict is tSNE_GHI14, so we only keep individuals who were interviewed during the last wave (14th wave)

In [9]:
data_bis = data[data['tSNE_GHI14'].notna()]

Number of individuals present in every waves.

In [10]:
tSNE_GHI[~tSNE_GHI.isnull().any(axis=1)].shape[0]

3396

We select the outcome tSNE_GHI

In [11]:
Y = data_bis[["HHIDPN"]+["tSNE_GHI" + str(i) for i in range (1,15)]]

We drop the previous health index GHIw from the data, which won't be used as outcome.
(list_columns_GHI contains the names of GHIw columns).

We drop the outcome to create the matrix X.

In [12]:
X = data_bis.drop(["GHI" + str(i) for i in range (1,15)], axis = 1)
X.drop(["tSNE_GHI" + str(i) for i in range (1,15)], axis = 1, inplace =True)

Now we split the dataset into training, validation and test sets.

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=18)
X_test, X_valid, Y_test, Y_valid = train_test_split(X_test, Y_test, test_size=0.5, random_state = 6)

Smaller sets while coding

In [14]:
nb_test, nb_train, nb_valid = len(X_test.index)//10, len(X_train.index)//10, len(X_valid.index)//10
X_test, Y_test = X_test.iloc[:nb_test], Y_test.iloc[:nb_test]
X_train, Y_train = X_train.iloc[:nb_train], Y_train.iloc[:nb_train]
X_valid, Y_valid = X_valid.iloc[:nb_valid], Y_valid.iloc[:nb_valid]

## Machine learning

The objective here is to make a dataset where we observe if each variable exists at each wave

In [130]:
temporal_variables = {}
waves_columns = [col for col in X_train.columns if "genetic_" not in col and col[1] in "123456789"]
for col in waves_columns:
  char = col[0] # R or H
  if col[2] in "01234":
    wave = col[1:3]
    suffix = col[3:]
  else:
    wave = col[1]
    suffix = col[2:]
  variable = char + 'w' + suffix
  
  if variable not in temporal_variables.keys():
    temporal_variables[variable] = np.zeros((14), dtype=bool)
  
  temporal_variables[variable][int(wave)-1] = True

temporal_variables = pd.DataFrame(temporal_variables)

# We manually add "tSNE_GHIw":
temporal_variables["tSNE_GHIw"] = np.ones((14), dtype=bool)
waves_columns += [f"tSNE_GHI{w}" for w in range(1,15)]

In [131]:
# Timeless data
non_waves_columns = [col for col in X_train.columns if col not in waves_columns]
To_remove = ["HHIDPN","PN","HHID","RAHHIDPN"]+["INW"+str(i+1) for i in range (14)]
for x in To_remove:
    non_waves_columns.remove(x)

We put the explaining variables by wave in a list of dataset Intemporal variables are put in each one of them

In [132]:
liste = []    # len = 14 
for i in range(14):
    columns_wave_i = ["HHIDPN"]+[col.replace('w', str(i+1)) for col in temporal_variables.T[i].index[temporal_variables.T[i]] if col != "tSNE_GHIw"]
    #Intemporal variables only in the first wave, to avoid duplicated labels issues
    if i == 0:
        liste.append(X_train.loc[X_train["INW"+str(i+1)] == 1, columns_wave_i + non_waves_columns])
    else:
        liste.append(X_train.loc[X_train["INW"+str(i+1)] == 1, columns_wave_i])

In [134]:
#Reduce number of variables to code
import random
temporal_variables_2 = temporal_variables.iloc[:,[i for i in range(1,30)]+[-i for i in range(1,10)]]
non_waves_columns_2 = random.choices(non_waves_columns,k=5)

liste = [] 
for i in range(14):
    columns_wave_i = ["HHIDPN"]+[col.replace('w', str(i+1)) for col in temporal_variables_2.T[i].index[temporal_variables_2.T[i]] if col != "tSNE_GHIw"]
    #Add the intemporal variables only to the last wave, to avoid duplicated labels issues
    if i == 13:
        liste.append(X_train.loc[X_train["INW"+str(i+1)] == 1, columns_wave_i + non_waves_columns_2])
    else:
        liste.append(X_train.loc[X_train["INW"+str(i+1)] == 1, columns_wave_i])

We apply the lasso to select variables

We start to initialize with a first lasso on the first wave.

In [135]:
scaler = StandardScaler()#(with_std=False)
hml.ERRORS_HANDLING = "ignore"

In [136]:
X_train1 = liste[0].drop("HHIDPN",axis=1)
Y_train1 = Y_train.iloc[:,1]
Y_train1.dropna(inplace =True)
Y_train1 = Y_train1.values
Y_train1 = (Y_train1 - np.mean(Y_train1))/np.std(Y_train1)

In [137]:
X_train1 = scaler.fit_transform(X_train1)

In [138]:
lasso = hml.HMLasso(mu = 100)
lasso.fit(X_train1, Y_train1)



In [139]:
coefficients = np.abs(lasso.beta_opt.copy())

var_to_keep = coefficients < 10**(-14)
var_to_keep = np.insert(var_to_keep,0,False)

In [140]:
entire_data = liste[0]
selected = entire_data[entire_data.columns[~var_to_keep]]

function to impute missing data created when merging by mean but without touching Na values already there before the merge.

In [141]:
def Na_management(df1, df2, index):
    
    merged = df1.merge(df2, how='outer', on = index)
    
    df1_index = df1.set_index(index)
    df2_index = df2.set_index(index)
    
    merged = merged.fillna(merged.mean())
    merged = merged.set_index(index)
    
    df1_index = df1_index.fillna("NaN")
    merged.update(df1_index)
    
    df2_index = df2_index.fillna("NaN")
    merged.update(df2_index)
    
    merged = merged.replace("NaN",np.nan)
    
    merged = merged.reset_index()
    
    return merged

In [142]:
scaler = StandardScaler()#(with_std=False)
hml.ERRORS_HANDLING = "ignore"

for i in range (1,14) :
    
    print("wave",i+1)
    
    var_to_select = Na_management(selected, liste[i], "HHIDPN")
    
    Y_train_i = Y_train.iloc[:,[0,i+1]]
    X_Y_train = var_to_select.merge(Y_train_i, how = 'left', on = "HHIDPN")
    
    Y_train_i = X_Y_train[f"tSNE_GHI{i+1}"]
    X_train_i = X_Y_train.drop([f"tSNE_GHI{i+1}","HHIDPN"], axis =1)
    
    Y_train_i = Y_train_i.fillna(Y_train_i.mean())
    Y_train_i = Y_train_i.values
    Y_train_i = (Y_train_i - np.mean(Y_train_i))/np.std(Y_train_i)
    
    X_train_i = scaler.fit_transform(X_train_i)
    
    lasso = hml.HMLasso(mu = 100)
    lasso.fit(X_train_i, Y_train_i)
    
    coefficients = np.abs(lasso.beta_opt.copy())

    var_to_keep = coefficients < 10**(-14)
    var_to_keep = np.insert(var_to_keep,0,False)
    
    entire_data = var_to_select
    selected = entire_data[entire_data.columns[~var_to_keep]]

1
2
3
4
5
6
7
8
9
10
11
12
13
