## Libraries import

In [None]:
pip install fancyimpute

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 KB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29898 sha256=e668b64fa945d1ab9e00611ec260b0ab2c9b3d8781dea0d65e0ea0c35be6b0be
  Stored in directory: /root/.cache/pip/wheels/f9/fc/6a/b0406b906bce293abe23c3b6da5a72637d2d04146ef1125a0b
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Cre

In [None]:
# Basic imports
import pandas as pd
import numpy as np

# Import to find the wave related to each variable
import re

# Import to preprocess the data
from sklearn.preprocessing import StandardScaler

# Imports to impute missing values
from fancyimpute import IterativeImputer
from sklearn.impute import KNNImputer, SimpleImputer
import scipy

# Imports to plot pretty graphs
import matplotlib.pyplot as plt
import seaborn as sns

## Data import

In [None]:
# Location of the dataset on the mounted Google Drive, kept in one place so it is easy to change.
DATA_PATH = "/content/drive/MyDrive/Statapp/data_dummies.csv"
data = pd.read_csv(DATA_PATH)

  data = pd.read_csv("/content/drive/MyDrive/Statapp/data_dummies.csv")


In [None]:
# Summarise the loaded frame. memory_usage="deep" asks pandas to introspect object columns,
# so the reported memory usage is measured rather than estimated.
data.info(memory_usage="deep")

In [None]:
# Set the column aside in `section_A_or_E` before removing it from `data`.
# The guard makes the cell safe to re-run: once the column has been dropped, running the
# cell again leaves both variables untouched instead of raising a KeyError.
if "genetic_Section_A_or_E" in data.columns:
  section_A_or_E = data["genetic_Section_A_or_E"]
  data = data.drop(columns=["genetic_Section_A_or_E"])  # We drop this useless variable.

## Iterative imputation

We start by dividing the dataset into smaller ones, one per wave.
We will then impute the missing values of each wave separately.

In [None]:
def get_wave(data, wave):
  """
  Extract the sub-dataset associated with a single wave.

  Rows are restricted to those whose `INW{wave}` column equals 1.
  A column is kept when the first number appearing in its name equals `wave`,
  or when its name contains no digit at all (for instance, 'HHIDPN').

  `wave` must be an integer between 1 and 14 (checked with an assertion).
  """

  assert wave in range(1, 15)

  first_number = re.compile("[0-9]+")
  wave_label = str(wave)

  def belongs_to_wave(column):
    # Only the first run of digits in the name decides which wave a column belongs to.
    match = first_number.search(column)
    return match is None or match.group() == wave_label

  selected_columns = list(filter(belongs_to_wave, data.columns))
  in_wave = data[f"INW{wave}"] == 1

  return data.loc[in_wave, selected_columns]

In [None]:
# Extract the rows and columns belonging to wave 1 (see get_wave), then check the
# size of the result and preview its first rows.
wave1_data = get_wave(data, 1)
print(wave1_data.shape)
wave1_data.head()

In [None]:
# Standardise every column of the wave-1 data. The fitted scaler is kept so that
# imputed values can later be mapped back to the original units with inverse_transform.
scaler = StandardScaler()
wave1_data_scaled = scaler.fit_transform(wave1_data)

To better understand how much imputation is needed, let us look at the share of missing values in our dataset.

In [None]:
# For each individual (row), we count the number of missing values.
nan_by_rows = wave1_data.isna().sum(axis=1)

# Cumulative curve: for each threshold x, how many rows have at most x missing values.
# Thresholds start at 0 so that fully observed rows get their own point on the curve;
# starting at 1 would merge them with the rows missing exactly one value.
X = range(0, nan_by_rows.max() + 1)
Y = [(nan_by_rows <= x).sum() for x in X]

fig, ax = plt.subplots()
ax.plot(X, Y)

ax.set_xlabel("Number of missing values", fontsize=12)
ax.set_ylabel("Rows", fontsize=12)
ax.set_title("How many rows present less or a given number of missing values")

ax.grid()
plt.show()

In [None]:
# Baseline imputation: every missing value is replaced by the most frequent value of its column.
imp_simple = SimpleImputer(strategy="most_frequent")

# The imputed frame is bound to a name so that it can be compared with the other imputations,
# and only its first rows are displayed rather than the whole frame.
wave1_data_imputed_simple = pd.DataFrame(imp_simple.fit_transform(wave1_data), columns=wave1_data.columns)
wave1_data_imputed_simple.head(10)

In [None]:
# Bounds for the imputed values, expressed in the *standardised* space the imputer is fitted on.
# A global [0, 1] clip in that space would force every imputed value into
# [mean, mean + 1 std] of its column. Instead, each column is clipped to its own observed range;
# for a dummy column observed with both values this maps back to [0, 1] once the scaling is inverted.
scaled_min = np.nanmin(wave1_data_scaled, axis=0)
scaled_max = np.nanmax(wave1_data_scaled, axis=0)

# IterativeImputer requires max_value > min_value for every feature, so constant columns
# (whose observed range is a single point) are left unbounded.
unbounded = ~(scaled_max > scaled_min)
scaled_min = np.where(unbounded, -np.inf, scaled_min)
scaled_max = np.where(unbounded, np.inf, scaled_max)

imp = IterativeImputer(
  imputation_order='random',
  sample_posterior=True,
  min_value=scaled_min,
  max_value=scaled_max,
  random_state=0,  # random feature order + posterior sampling: fix the seed so that runs are reproducible
)

In [None]:
# Fit the iterative imputer on the standardised wave-1 data.
# NOTE(review): the recorded output reports roughly 40 s per imputation round on a
# (12652, 1715) matrix, so this is the expensive cell of the notebook.
imp.fit(wave1_data_scaled)

[IterativeImputer] Completing matrix with shape (12652, 1715)
[IterativeImputer] Ending imputation round 1/20, elapsed time 41.40
[IterativeImputer] Change: 213.92473732947144, scaled tolerance: 0.11247666424641158 
[IterativeImputer] Ending imputation round 2/20, elapsed time 78.65
[IterativeImputer] Change: 235.99570361521506, scaled tolerance: 0.11247666424641158 
[IterativeImputer] Ending imputation round 3/20, elapsed time 114.74
[IterativeImputer] Change: 257.64735167303746, scaled tolerance: 0.11247666424641158 
[IterativeImputer] Ending imputation round 4/20, elapsed time 152.94
[IterativeImputer] Change: 263.2254816218256, scaled tolerance: 0.11247666424641158 
[IterativeImputer] Ending imputation round 5/20, elapsed time 189.40
[IterativeImputer] Change: 259.6217561347557, scaled tolerance: 0.11247666424641158 
[IterativeImputer] Ending imputation round 6/20, elapsed time 225.19
[IterativeImputer] Change: 250.44236330405505, scaled tolerance: 0.11247666424641158 
[IterativeIm



In [None]:
# Impute in the standardised space, then undo the scaling to get back to the original units.
wave1_scaled_imputed = imp.transform(wave1_data_scaled)
wave1_unscaled_imputed = scaler.inverse_transform(wave1_scaled_imputed)

wave1_data_imputed_iterative = pd.DataFrame(wave1_unscaled_imputed, columns=wave1_data.columns)
wave1_data_imputed_iterative.head(10)

[IterativeImputer] Completing matrix with shape (12652, 1715)
[IterativeImputer] Ending imputation round 1/20, elapsed time 1.19
[IterativeImputer] Ending imputation round 2/20, elapsed time 2.41
[IterativeImputer] Ending imputation round 3/20, elapsed time 3.60
[IterativeImputer] Ending imputation round 4/20, elapsed time 4.78
[IterativeImputer] Ending imputation round 5/20, elapsed time 5.98
[IterativeImputer] Ending imputation round 6/20, elapsed time 7.18
[IterativeImputer] Ending imputation round 7/20, elapsed time 8.38
[IterativeImputer] Ending imputation round 8/20, elapsed time 9.59
[IterativeImputer] Ending imputation round 9/20, elapsed time 10.79
[IterativeImputer] Ending imputation round 10/20, elapsed time 12.01
[IterativeImputer] Ending imputation round 11/20, elapsed time 13.32
[IterativeImputer] Ending imputation round 12/20, elapsed time 14.56
[IterativeImputer] Ending imputation round 13/20, elapsed time 15.78
[IterativeImputer] Ending imputation round 14/20, elapsed 

Unnamed: 0,HHIDPN,S1HHIDPN,R1MPART,S1BMONTH,S1BYEAR,S1BDATE,S1BFLAG,S1HRSAMP,S1AHDSMP,S1HISPAN,...,S1ADLW_2.0,S1ADLW_3.0,S1ADLW_4.0,S1ADLW_5.0,R1ADLW_0.0,R1ADLW_1.0,R1ADLW_2.0,R1ADLW_3.0,R1ADLW_4.0,R1ADLW_5.0
0,1010.0,0.0,0.0,6.486992,1937.311702,-7346.144438,0.000596,-0.036304,-0.001549,-0.003114,...,0.023267,0.01626,0.016007,0.009874,1.0,0.0,0.0,0.0,0.0,0.0
1,2010.0,0.0,0.0,6.360719,1932.458299,-9042.863096,0.00079,0.030824,-0.011935,0.038395,...,0.033165,0.022596,0.020769,0.012239,1.0,0.0,0.0,0.0,0.0,0.0
2,3010.0,3020.0,0.0,9.0,1938.0,-7778.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3020.0,3010.0,0.0,1.0,1936.0,-8752.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,10001010.0,0.0,0.0,6.361757,1938.056865,-7635.164689,-0.000121,0.030907,-0.004237,0.032521,...,0.031083,0.017436,0.016789,0.00935,1.0,0.0,0.0,0.0,0.0,0.0
5,10003020.0,10003030.0,0.0,3.0,1956.0,-1387.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,10003030.0,10003020.0,0.0,4.0,1934.0,-9392.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,10004010.0,10004040.0,0.0,4.0,1946.0,-5009.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,10004040.0,10004010.0,0.0,12.0,1939.0,-7322.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,10013010.0,10013040.0,0.0,11.0,1947.0,-4430.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Sanity check on one column after the iterative imputation: inspect the range of its values.
# NOTE(review): presumably a 0/1 dummy column, so values outside [0, 1] would come from imputation — confirm.
wave1_data_imputed_iterative["R1ADLW_0.0"].describe()


count    12652.000000
mean         0.892582
std          0.309664
min         -0.040536
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: R1ADLW_0.0, dtype: float64

In [None]:
# Alternative imputation based on nearest neighbours, run on the standardised data.
knn_neighbors = 10
impute_knn = KNNImputer(n_neighbors = knn_neighbors)

# Impute in the standardised space, then undo the scaling to get back to the original units.
wave1_scaled_knn = impute_knn.fit_transform(wave1_data_scaled)
wave1_unscaled_knn = scaler.inverse_transform(wave1_scaled_knn)

wave1_data_imputed_knn = pd.DataFrame(wave1_unscaled_knn, columns=wave1_data.columns)
wave1_data_imputed_knn.head(10)