In [53]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model

## Data import

In [2]:
# Load the main panel dataset from the working directory into `data`.
data = pd.read_csv("data_03.csv", low_memory=False)

In [3]:
# Recode the genetic section flag as a number: 'E' -> 1, 'A' -> 0,
# anything else (including missing values) -> NaN.
section = data["genetic_Section_A_or_E"]
temp = np.select([section == "E", section == "A"], [1, 0], default=np.nan)
data["genetic_Section_A_or_E"] = temp

In [30]:
# Load the t-SNE health index (one column per wave) and remove one respondent
# from both tables so that they stay consistent with each other.
# NOTE(review): the reason this HHIDPN is excluded is not documented here.
excluded_hhidpn = 32570030
tSNE_GHI = pd.read_csv("data_tSNE_GHI.csv")
tSNE_GHI = tSNE_GHI.loc[tSNE_GHI["HHIDPN"].ne(excluded_hhidpn)]
data = data.loc[data["HHIDPN"].ne(excluded_hhidpn)]

In [31]:
# Column names of the index table: the HHIDPN key followed by the index columns.
liste_colonnes_Indice = list(tSNE_GHI.columns)
liste_colonnes_Indice

['HHIDPN',
 'tSNE_GHI1',
 'tSNE_GHI2',
 'tSNE_GHI3',
 'tSNE_GHI4',
 'tSNE_GHI5',
 'tSNE_GHI6',
 'tSNE_GHI7',
 'tSNE_GHI8',
 'tSNE_GHI9',
 'tSNE_GHI10',
 'tSNE_GHI11',
 'tSNE_GHI12',
 'tSNE_GHI13',
 'tSNE_GHI14']

In [6]:
# Preview the first rows of the main dataset.
data.head()

Unnamed: 0,HHIDPN,R1MPART,R2MPART,R3MPART,R4MPART,R5MPART,R6MPART,R7MPART,R8MPART,R9MPART,...,R13IADL5H_2,R13IADL5H_3,R13IADL5H_4,R13IADL5H_5,R14IADL5H_0,R14IADL5H_1,R14IADL5H_2,R14IADL5H_3,R14IADL5H_4,R14IADL5H_5
0,1010,0.0,0.0,,,,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2010,0.0,0.0,0.0,0.0,0.0,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,3010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,10001010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Imputation of missing values

In this section, we impute missing values wave by wave. The variable `INWw` indicates whether the individual responded to wave *w*; we do not impute a wave's variables for individuals who did not respond to that wave.
We mean-impute the socioeconomic variables first.

In [33]:
# Some wave-2 columns contain no observation at all among wave-2 respondents;
# they cannot be mean-imputed, so we remove them from `data`.
wave_2 = data[data["INW2"] == 1].filter(regex="[RH]2[a-zA-Z]|HHIDPN")
cols_wave_2 = wave_2.columns

# A column is "empty" when every value is missing for the wave-2 respondents.
empty_cols = [col for col in cols_wave_2 if wave_2[col].isna().all()]

data = data.drop(columns=empty_cols)

In [37]:
# Remove RAHHIDPN: its name contains "HHIDPN", so the per-wave regex filter
# used below would otherwise pick it up in every wave.
data = data.drop(columns=["RAHHIDPN"])

In [38]:
# Build one mean-imputed table per wave.
#   waves_sep[f"wave_{i}"]  -> wave-i variables (plus HHIDPN) for wave-i respondents
#   temporal_variables      -> names of all wave-specific variables, in wave order
waves_sep = {}
temporal_variables = []
for i in range(1, 15):
    print(f"\n Wave {i}")
    name = f"wave_{i}"

    # Keep only the individuals who responded to wave i, and only the
    # variables of wave i ("R<i>..." / "H<i>..." followed by a letter) plus the key.
    responded = data[f"INW{i}"] == 1
    wave_i = data[responded].filter(regex=f"[RH]{i}[a-zA-Z]|HHIDPN")
    variables_wave_i = wave_i.columns
    temporal_variables.extend(col for col in variables_wave_i if col != "HHIDPN")

    # Replace missing values with the column mean among the wave's respondents.
    # NOTE(review): HHIDPN goes through the imputer as well; confirm it never
    # has missing values, otherwise an identifier would be fabricated.
    # NOTE(review): if a column has no observed value, the imputer output may
    # have fewer columns than `variables_wave_i` and the DataFrame construction
    # below would fail - verify against the SimpleImputer documentation.
    imputer = SimpleImputer(strategy="mean")
    wave_i_imputed = imputer.fit_transform(wave_i)

    waves_sep[name] = pd.DataFrame(wave_i_imputed, columns=variables_wave_i)


 Wave 1

 Wave 2

 Wave 3

 Wave 4

 Wave 5

 Wave 6

 Wave 7

 Wave 8

 Wave 9

 Wave 10

 Wave 11

 Wave 12

 Wave 13

 Wave 14


In [11]:
# Preview the imputed wave-1 table.
waves_sep["wave_1"].head()

Unnamed: 0,HHIDPN,R1MPART,R1MLEN,R1MCURLN,R1MLENM,R1MNEV,R1FAMR,R1FINR,H1HHRESP,H1CPL,...,R1RETLIV_2.0,R1RETLIV_3.0,R1RETLIV_4.0,R1RETLIV_5.0,R1ADLW_0.0,R1ADLW_1.0,R1ADLW_2.0,R1ADLW_3.0,R1ADLW_4.0,R1ADLW_5.0
0,1010.0,0.0,20.2,27.471508,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2010.0,0.0,25.8,27.471508,0.0,0.0,1.0,1.0,1.0,0.0,...,0.070799,0.500858,0.344962,0.069427,1.0,0.0,0.0,0.0,0.0,0.0
2,3010.0,0.0,31.3,31.3,0.0,0.0,0.0,1.0,2.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3020.0,0.0,31.2,31.2,0.0,0.0,1.0,0.0,2.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,10001010.0,0.0,0.0,27.471508,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Time-invariant data: every column of `data` that is not a wave-specific variable.
wave_specific = set(temporal_variables)
non_temporal = [col for col in data.columns if col not in wave_specific]

# Keep the first column and skip the 14 columns that follow it.
# NOTE(review): this relies on column order; presumably position 0 is HHIDPN and
# positions 1-14 are the INW1..INW14 indicators - confirm against `data.columns`.
intertemp_variables = [non_temporal[0]] + non_temporal[15:]

# Remove the element at position 4 (the cell displays the removed name).
# NOTE(review): which variable this is, and why it is removed, is not documented.
intertemp_variables.pop(4)

['HHIDPN', 'RASPCT', 'RAOAHDID', 'RAOHRSID', 'RAGENDER', 'RAHISPAN', 'RABMONTH', 'RABYEAR', 'RABDATE', 'RADMONTH', 'RADYEAR', 'RADDATE', 'RADTIMTDTH', 'RADAGE_M', 'RADAGE_Y', 'RAVETRN', 'RAESTRAT', 'RAEHSAMP', 'RAAHDSMP', 'RAHRSAMP', 'HAOAHDHH', 'RAOVRAYR', 'PN', 'REXITYR', 'REPROXYSP', 'REMPART', 'REIWBEG', 'REIWMID', 'REIWEND', 'RASLEEPEF', 'RAHIBPEF', 'RADIABEF', 'RACANCREF', 'RALUNGEF', 'RAHEARTEF', 'RASTROKEF', 'RAPSYCHEF', 'RAARTHREF', 'RAMEMRYEF', 'RAALZHEEF', 'RADEMENEF', 'RENRSHOM', 'RENRSTIM', 'REDRUGS', 'RENHMLIV', 'REHOSP', 'REHPCSTY', 'REHPCTIM', 'REHPCNIT', 'REOOPMDO', 'REOOPMDOF', 'REOOPMD', 'REOOPMDF', 'RASSRECV', 'RASSAGEM', 'RASSAGEB', 'SASSRECV', 'VERSION', 'REGOVMR', 'REGOVVA', 'REHIGOV', 'RAEVBRN', 'genetic_PC1_5A', 'genetic_PC1_5B', 'genetic_PC1_5C', 'genetic_PC1_5D', 'genetic_PC1_5E', 'genetic_PC6_10A', 'genetic_PC6_10B', 'genetic_PC6_10C', 'genetic_PC6_10D', 'genetic_PC6_10E', 'genetic_4_GENCOG_CHARGE15', 'genetic_4_BMI_GIANT15', 'genetic_4_HEIGHT_GIANT14', 'gen

In [49]:
# Wave participation flags INW1..INW14, followed by the HHIDPN merge key.
participation_indicator = ["INW" + str(wave) for wave in range(1, 15)]
participation_indicator.append("HHIDPN")

In [43]:
# Mean-impute the time-invariant variables over all individuals and store the
# result next to the per-wave tables.
# NOTE(review): HHIDPN is part of `intertemp_variables`, so it goes through the
# imputer too; confirm it has no missing values.
imputer = SimpleImputer(strategy="mean")
intertemp_filled = imputer.fit_transform(data[intertemp_variables])

waves_sep["intertemporal"] = pd.DataFrame(
    intertemp_filled,
    columns=intertemp_variables,
)

In [44]:
# Reassemble a single wide table: start from the time-invariant variables and
# outer-join each wave on HHIDPN. Individuals who did not respond to a wave get
# missing values for that wave's variables.
imputed_data = waves_sep["intertemporal"]
for i in range(1, 15):
    imputed_data = imputed_data.merge(waves_sep[f"wave_{i}"], on="HHIDPN", how="outer")

In [50]:
# Attach the t-SNE index columns and the wave participation flags to the imputed
# data. Both joins are inner joins on HHIDPN, so only individuals present in
# every table are kept.
data_and_index = (
    imputed_data
    .merge(tSNE_GHI, on="HHIDPN")
    .merge(data[participation_indicator], on="HHIDPN")
)

## Dimension reduction

Generate training and test samples

In [51]:
# Split individuals into training and test samples.
# The seed is fixed so that the split - and every result computed from it - is
# identical after "Restart Kernel -> Run All".
# Both X and y keep the HHIDPN column and the row index of `data_and_index`,
# so rows of X_train / y_train (and X_test / y_test) stay aligned by index.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    data_and_index[temporal_variables + intertemp_variables],
    data_and_index[liste_colonnes_Indice],
    random_state=SPLIT_SEED,
)

We use pooled Lasso regressions to reduce the dimension of the dataset. For each wave *w*, we predict that wave's index (`tSNE_GHIw`) from the indexes predicted for the previous waves, the intertemporal variables and the variables of the current wave.

In [None]:
# Pooled Lasso, one regression per wave.
#
# For wave i the regressors are: the time-invariant variables, the predictions
# obtained for waves 1..i-1, and the variables of wave i. Variables that receive
# a non-zero Lasso coefficient in at least one wave are collected in
# `useful_temp_variables` (wave-specific) and `useful_intertemp_variables`
# (time-invariant). The per-wave predictions are stored in `X_train` as
# `prediction_wave_<i>` (NaN for individuals not used in that wave's fit).
LASSO_ALPHA = 0.1

# Work on our own copy so that adding prediction columns never writes into a
# view of `data_and_index`, and make sure features and targets are row-aligned.
X_train = X_train.copy()
assert X_train.index.equals(y_train.index), "X_train and y_train must share the same index"

temporal_set = set(temporal_variables)

# Running list of regressors shared by all waves. It is a NEW list: extending it
# must not modify `intertemp_variables`. The HHIDPN key is not a predictor.
regressors = [
    col for col in intertemp_variables
    if col != "HHIDPN" and col not in temporal_set
]
useful_temp_variables = []
useful_intertemp_variables = []

for i in range(1, 15):
    # Variables of wave i (without the merge key).
    columns_wave = [col for col in waves_sep[f"wave_{i}"].columns if col != "HHIDPN"]
    wave_regressors = regressors + columns_wave

    # Keep only the individuals who responded to wave i and have an index value.
    # The participation flags are not part of X_train, so read them from
    # `data_and_index`, which shares its row index with X_train.
    inw_col = f"INW{i}"
    if inw_col in X_train.columns:
        participation = X_train[inw_col]
    else:
        participation = data_and_index.loc[X_train.index, inw_col]
    target = y_train[f"tSNE_GHI{i}"]
    keep = ((participation == 1) & target.notna()).to_numpy()
    if not keep.any():
        continue

    # Predictors for the retained individuals. Columns without any observed
    # value are removed first so that the imputer output keeps one column per
    # remaining feature name.
    ind_var = X_train.loc[keep, wave_regressors].dropna(axis=1, how="all")
    feature_names = ind_var.columns.tolist()

    # Predictions of earlier waves are missing for individuals who did not
    # respond to those waves: fill them with the column mean.
    imputer = SimpleImputer(strategy="mean")
    features = imputer.fit_transform(ind_var)

    # Standardise the predictors and centre the target.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    output = target[keep]
    y = output - output.mean()

    # Fit the Lasso.
    clf = linear_model.Lasso(alpha=LASSO_ALPHA)
    clf.fit(features, y)

    # Keep the variables whose fitted coefficient is non-zero. Prediction
    # columns of earlier waves are intermediate quantities, not dataset
    # variables, so they are not recorded.
    for var, coef in zip(feature_names, clf.coef_):
        if coef == 0 or str(var).startswith("prediction_wave_"):
            continue
        if var in temporal_set:
            if var not in useful_temp_variables:
                useful_temp_variables.append(str(var))
        elif var not in useful_intertemp_variables:
            useful_intertemp_variables.append(str(var))

    # Store the wave-i prediction and use it as a regressor for later waves.
    # Assigning by row keeps X_train aligned with y_train (no merge, no reindex).
    prediction_col = f"prediction_wave_{i}"
    X_train[prediction_col] = np.nan
    X_train.loc[keep, prediction_col] = clf.predict(features)
    regressors.append(prediction_col)