Looking for best approach to feature imputation

Constants:
* Not using parquet fields
* Not using PCA
* Non-augmented data
* Feature selection, calculation
* Using single XGBoost model for predictions from imputed labelled data
* StandardScaler

Varying:
* Feature imputation model: mean (control), median (control), KNN, MICE, RandomForest, autoencoder

In [1]:
import numpy as np
import pandas as pd

import os

import polars as pl
from glob import glob
from tqdm.auto import tqdm

import tensorflow as tf
from tensorflow import keras
import random

np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)

from sklearn.impute import KNNImputer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.callbacks import LearningRateScheduler

from sklearn.svm import SVR

import lightgbm as lgb

from sklearn.model_selection import KFold

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

In [2]:
train_data=pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_data = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

In [3]:
INPUT_DIR = "/kaggle/input/child-mind-institute-problematic-internet-use/"

# Import aggregate fields from parquet files
# Modified code from rsakata: https://www.kaggle.com/code/rsakata/cmi-piu-16th-place-solution

files_train = glob(INPUT_DIR + "series_train.parquet/*")
#if IS_SUBMIT:
#    files += glob(INPUT_DIR + "series_test.parquet/*")

list_df_train = []
for file in tqdm(files_train):
    df_series = (
        pl.read_parquet(file)
        .with_columns(
            (
                (pl.col("relative_date_PCIAT") - pl.col("relative_date_PCIAT").min())*24
                + (pl.col("time_of_day") // int(1e9)) / 3600
            ).floor().cast(int).alias("total_hours")
        )
        .filter(pl.col("non-wear_flag") != 1)
        .filter(pl.col("step").count().over("total_hours") == 12 * 60)
        .group_by("total_hours").agg(
            pl.col("enmo").std().alias("enmo_std"),
            pl.col("anglez").std().alias("anglez_std"),
            pl.col("light").std().alias("light_std")
        )
        .with_columns(
            (pl.col("total_hours") % 24).alias("hour"),
            pl.lit(file.split("/")[-1][3:]).alias("id")
        )
    )
    list_df_train.append(df_series.to_pandas())

df_series = pd.concat(list_df_train)
df_series["enmo_std"] = np.log(df_series["enmo_std"] + 0.01)
df_series["anglez_std"] = np.log(df_series["anglez_std"] + 1)
df_series["light_std"] = np.log(df_series["light_std"] + 0.01)

df_agg_train = df_series.groupby("id")[["enmo_std", "anglez_std", "light_std"]].agg(["mean", "std"]).reset_index()
df_agg_train.columns = [cols[0] + "_" + cols[1] if cols[1] != "" else cols[0] for cols in df_agg_train.columns]
df_agg_train

  0%|          | 0/996 [00:00<?, ?it/s]

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std
0,00115b9f,-4.000377,,1.989905,,0.051475,
1,001f3379,-3.514671,0.652348,3.236993,0.678888,0.774591,2.945807
2,00f332d1,-3.071176,0.927238,3.249122,0.463244,1.138379,2.939823
3,01085eb3,-2.902040,0.791255,3.389762,0.315061,1.054698,2.185839
4,012cadd8,-2.806918,1.171675,3.337322,0.388409,0.823770,3.350365
...,...,...,...,...,...,...,...
964,fe9c71d8,-3.116904,0.961804,3.037607,0.943554,-0.394200,2.742634
965,fecc07d6,-3.969482,0.981531,1.332831,1.428363,-0.438018,1.795653
966,ff18b749,-2.820076,0.937540,3.258458,0.417267,1.236652,3.341580
967,ffcd4dbd,-3.271800,0.827489,3.183395,0.629553,0.521227,2.665325


In [4]:
train_data2 = train_data.merge(df_agg_train, how="left", on="id")
train_data2.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,Fall,3.0,2.0,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,64.0,Summer,0.0,0.0,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,54.0,Summer,2.0,0.0,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,45.0,Winter,0.0,1.0,-4.000377,,1.989905,,0.051475,
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [5]:
files_test = glob(INPUT_DIR + "series_test.parquet/*")


list_df_test = []
for file in tqdm(files_test):
    df_series = (
        pl.read_parquet(file)
        .with_columns(
            (
                (pl.col("relative_date_PCIAT") - pl.col("relative_date_PCIAT").min())*24
                + (pl.col("time_of_day") // int(1e9)) / 3600
            ).floor().cast(int).alias("total_hours")
        )
        .filter(pl.col("non-wear_flag") != 1)
        .filter(pl.col("step").count().over("total_hours") == 12 * 60)
        .group_by("total_hours").agg(
            pl.col("enmo").std().alias("enmo_std"),
            pl.col("anglez").std().alias("anglez_std"),
            pl.col("light").std().alias("light_std")
        )
        .with_columns(
            (pl.col("total_hours") % 24).alias("hour"),
            pl.lit(file.split("/")[-1][3:]).alias("id")
        )
    )
    list_df_test.append(df_series.to_pandas())

df_series = pd.concat(list_df_test)
df_series["enmo_std"] = np.log(df_series["enmo_std"] + 0.01)
df_series["anglez_std"] = np.log(df_series["anglez_std"] + 1)
df_series["light_std"] = np.log(df_series["light_std"] + 0.01)

df_agg_test = df_series.groupby("id")[["enmo_std", "anglez_std", "light_std"]].agg(["mean", "std"]).reset_index()
df_agg_test.columns = [cols[0] + "_" + cols[1] if cols[1] != "" else cols[0] for cols in df_agg_test.columns]
df_agg_test

  0%|          | 0/2 [00:00<?, ?it/s]

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std
0,00115b9f,-4.000377,,1.989905,,0.051475,
1,001f3379,-3.514671,0.652348,3.236993,0.678888,0.774591,2.945807


In [6]:
test_data2 = test_data.merge(df_agg_test, how="left", on="id")
test_data2.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,Fall,3.0,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,46.0,64.0,Summer,0.0,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,38.0,54.0,Summer,2.0,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,31.0,45.0,Winter,0.0,-4.000377,,1.989905,,0.051475,
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [26]:
X_train = train_data2[['Basic_Demos-Age',
                      'Basic_Demos-Sex',
                      'CGAS-CGAS_Score',
                      'Physical-BMI',
                      'BIA-BIA_BMI',
                      'Physical-Waist_Circumference',
                      'Physical-Diastolic_BP',
                      'Physical-HeartRate',
                      'Physical-Systolic_BP',
                      'Fitness_Endurance-Max_Stage',
                      'Fitness_Endurance-Time_Mins',
                      'Fitness_Endurance-Time_Sec',
                      'FGC-FGC_CU_Zone',
                      'FGC-FGC_GSND_Zone',
                      'FGC-FGC_GSD_Zone',
                      'FGC-FGC_PU_Zone',
                      'FGC-FGC_SRL_Zone',
                      'FGC-FGC_SRR_Zone',
                      'FGC-FGC_TL_Zone',
                      'BIA-BIA_Activity_Level_num',
                      'BIA-BIA_BMC',
                      'BIA-BIA_BMR',
                      'BIA-BIA_DEE',
                      'BIA-BIA_ECW',
                      'BIA-BIA_FFM',
                      'BIA-BIA_FFMI',
                      'BIA-BIA_FMI',
                      'BIA-BIA_Fat',
                      'BIA-BIA_ICW',
                      'BIA-BIA_LDM',
                      'BIA-BIA_LST',
                      'BIA-BIA_SMM',
                      'BIA-BIA_TBW',
                      'PAQ_A-PAQ_A_Total',
                      'PAQ_C-PAQ_C_Total',
                      'SDS-SDS_Total_T',
                      'PreInt_EduHx-computerinternet_hoursday'
                      # ,
                      #'enmo_std_mean',
                      #'enmo_std_std',
                      #'anglez_std_mean',
                      #'anglez_std_std',
                      #'light_std_mean',
                      #'light_std_std',
                      ]]

y_train = train_data2['PCIAT-PCIAT_Total']

X_test = test_data2[['Basic_Demos-Age',
                      'Basic_Demos-Sex',
                      'CGAS-CGAS_Score',
                      'Physical-BMI',
                      'BIA-BIA_BMI',
                      'Physical-Waist_Circumference',
                      'Physical-Diastolic_BP',
                      'Physical-HeartRate',
                      'Physical-Systolic_BP',
                      'Fitness_Endurance-Max_Stage',
                      'Fitness_Endurance-Time_Mins',
                      'Fitness_Endurance-Time_Sec',
                      'FGC-FGC_CU_Zone',
                      'FGC-FGC_GSND_Zone',
                      'FGC-FGC_GSD_Zone',
                      'FGC-FGC_PU_Zone',
                      'FGC-FGC_SRL_Zone',
                      'FGC-FGC_SRR_Zone',
                      'FGC-FGC_TL_Zone',
                      'BIA-BIA_Activity_Level_num',
                      'BIA-BIA_BMC',
                      'BIA-BIA_BMR',
                      'BIA-BIA_DEE',
                      'BIA-BIA_ECW',
                      'BIA-BIA_FFM',
                      'BIA-BIA_FFMI',
                      'BIA-BIA_FMI',
                      'BIA-BIA_Fat',
                      'BIA-BIA_ICW',
                      'BIA-BIA_LDM',
                      'BIA-BIA_LST',
                      'BIA-BIA_SMM',
                      'BIA-BIA_TBW',
                      'PAQ_A-PAQ_A_Total',
                      'PAQ_C-PAQ_C_Total',
                      'SDS-SDS_Total_T',
                      'PreInt_EduHx-computerinternet_hoursday'
                      #,
                      #'enmo_std_mean',
                      #'enmo_std_std',
                      #'anglez_std_mean',
                      #'anglez_std_std',
                      #'light_std_mean',
                      #'light_std_std',
                   ]]

In [27]:
# Add calculated fields
X_train['Physical-BMI_Calc'] = X_train.apply(lambda row: row['Physical-BMI'] if row['Physical-BMI']==row['Physical-BMI'] else row['BIA-BIA_BMI'],axis=1)
X_train['Fitness_Endurance-Time_Sec_Calc'] = X_train.apply(lambda row: row['Fitness_Endurance-Time_Sec'] + (row['Fitness_Endurance-Time_Mins']*60), axis=1)
X_train['PAQ_Total'] = X_train.apply(lambda row: row['PAQ_A-PAQ_A_Total'] if row['PAQ_A-PAQ_A_Total']==row['PAQ_A-PAQ_A_Total'] else row['PAQ_C-PAQ_C_Total'],axis=1)


# Drop fields no longer needed
X_train = X_train.drop(columns=['PAQ_A-PAQ_A_Total','PAQ_C-PAQ_C_Total',
                     'Physical-BMI','BIA-BIA_BMI',
                     'Fitness_Endurance-Time_Mins','Fitness_Endurance-Time_Sec'])

# Remove outliers - may give warnings due to NaN value comparison
X_train.loc[X_train['CGAS-CGAS_Score']>=100.0,'CGAS-CGAS_Score'] = np.nan
X_train.loc[X_train['Physical-Systolic_BP']>=180.0,'Physical-Systolic_BP'] = np.nan
X_train.loc[X_train['Physical-Diastolic_BP']>=120.0,'Physical-Diastolic_BP'] = np.nan
X_train.loc[X_train['BIA-BIA_DEE']>=6000.0,'BIA-BIA_DEE'] = np.nan
X_train.loc[(X_train['BIA-BIA_BMC']<=0.0) | (X_train['BIA-BIA_BMC']>=16.0),'BIA-BIA_BMC'] = np.nan
X_train.loc[(X_train['BIA-BIA_BMR']<=0.0) | (X_train['BIA-BIA_BMR']>=2400.0),'BIA-BIA_BMR'] = np.nan
X_train.loc[(X_train['BIA-BIA_ECW']<=0.0) | (X_train['BIA-BIA_ECW']>=60.0),'BIA-BIA_ECW'] = np.nan
X_train.loc[(X_train['BIA-BIA_FFM']<=0.0) | (X_train['BIA-BIA_FFM']>=200.0),'BIA-BIA_FFM'] = np.nan
X_train.loc[(X_train['BIA-BIA_FFMI']<=0.0) | (X_train['BIA-BIA_FFMI']>=25.0),'BIA-BIA_FFMI'] = np.nan
X_train.loc[(X_train['BIA-BIA_FMI']<=0.0) | (X_train['BIA-BIA_FMI']>=25.0),'BIA-BIA_FMI'] = np.nan
X_train.loc[(X_train['BIA-BIA_Fat']<=8.0) | (X_train['BIA-BIA_Fat']>=60.0),'BIA-BIA_Fat'] = np.nan
X_train.loc[(X_train['BIA-BIA_ICW']<=0.0) | (X_train['BIA-BIA_ICW']>=80.0),'BIA-BIA_ICW'] = np.nan
X_train.loc[(X_train['BIA-BIA_LDM']<=0.0) | (X_train['BIA-BIA_LDM']>=60.0),'BIA-BIA_LDM'] = np.nan
X_train.loc[(X_train['BIA-BIA_LST']<=0.0) | (X_train['BIA-BIA_LST']>=150.0),'BIA-BIA_LST'] = np.nan
X_train.loc[(X_train['BIA-BIA_SMM']<=0.0) | (X_train['BIA-BIA_SMM']>=100.0),'BIA-BIA_SMM'] = np.nan
X_train.loc[(X_train['BIA-BIA_TBW']<=0.0) | (X_train['BIA-BIA_TBW']>=150.0),'BIA-BIA_TBW'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Physical-BMI_Calc'] = X_train.apply(lambda row: row['Physical-BMI'] if row['Physical-BMI']==row['Physical-BMI'] else row['BIA-BIA_BMI'],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Fitness_Endurance-Time_Sec_Calc'] = X_train.apply(lambda row: row['Fitness_Endurance-Time_Sec'] + (row['Fitness_Endurance-Time_Mins']*60), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:

In [28]:
# Repeat the above for X_test

# Add calculated fields
X_test['Physical-BMI_Calc'] = X_test.apply(lambda row: row['Physical-BMI'] if row['Physical-BMI']==row['Physical-BMI'] else row['BIA-BIA_BMI'],axis=1)
X_test['Fitness_Endurance-Time_Sec_Calc'] = X_test.apply(lambda row: row['Fitness_Endurance-Time_Sec'] + (row['Fitness_Endurance-Time_Mins']*60), axis=1)
X_test['PAQ_Total'] = X_test.apply(lambda row: row['PAQ_A-PAQ_A_Total'] if row['PAQ_A-PAQ_A_Total']==row['PAQ_A-PAQ_A_Total'] else row['PAQ_C-PAQ_C_Total'],axis=1)

# Drop fields no longer needed
X_test = X_test.drop(columns=['PAQ_A-PAQ_A_Total','PAQ_C-PAQ_C_Total',
                     'Physical-BMI','BIA-BIA_BMI',
                     'Fitness_Endurance-Time_Mins','Fitness_Endurance-Time_Sec'])

# Remove outliers
X_test.loc[X_test['CGAS-CGAS_Score']>=100.0,'CGAS-CGAS_Score'] = np.nan
X_test.loc[X_test['Physical-Systolic_BP']>=180.0,'Physical-Systolic_BP'] = np.nan
X_test.loc[X_test['Physical-Diastolic_BP']>=120.0,'Physical-Diastolic_BP'] = np.nan
X_test.loc[X_test['BIA-BIA_DEE']>=6000.0,'BIA-BIA_DEE'] = np.nan
X_test.loc[(X_test['BIA-BIA_BMC']<=0.0) | (X_test['BIA-BIA_BMC']>=16.0),'BIA-BIA_BMC'] = np.nan
X_test.loc[(X_test['BIA-BIA_BMR']<=0.0) | (X_test['BIA-BIA_BMR']>=2400.0),'BIA-BIA_BMR'] = np.nan
X_test.loc[(X_test['BIA-BIA_ECW']<=0.0) | (X_test['BIA-BIA_ECW']>=60.0),'BIA-BIA_ECW'] = np.nan
X_test.loc[(X_test['BIA-BIA_FFM']<=0.0) | (X_test['BIA-BIA_FFM']>=200.0),'BIA-BIA_FFM'] = np.nan
X_test.loc[(X_test['BIA-BIA_FFMI']<=0.0) | (X_test['BIA-BIA_FFMI']>=25.0),'BIA-BIA_FFMI'] = np.nan
X_test.loc[(X_test['BIA-BIA_FMI']<=0.0) | (X_test['BIA-BIA_FMI']>=25.0),'BIA-BIA_FMI'] = np.nan
X_test.loc[(X_test['BIA-BIA_Fat']<=8.0) | (X_test['BIA-BIA_Fat']>=60.0),'BIA-BIA_Fat'] = np.nan
X_test.loc[(X_test['BIA-BIA_ICW']<=0.0) | (X_test['BIA-BIA_ICW']>=80.0),'BIA-BIA_ICW'] = np.nan
X_test.loc[(X_test['BIA-BIA_LDM']<=0.0) | (X_test['BIA-BIA_LDM']>=60.0),'BIA-BIA_LDM'] = np.nan
X_test.loc[(X_test['BIA-BIA_LST']<=0.0) | (X_test['BIA-BIA_LST']>=150.0),'BIA-BIA_LST'] = np.nan
X_test.loc[(X_test['BIA-BIA_SMM']<=0.0) | (X_test['BIA-BIA_SMM']>=100.0),'BIA-BIA_SMM'] = np.nan
X_test.loc[(X_test['BIA-BIA_TBW']<=0.0) | (X_test['BIA-BIA_TBW']>=150.0),'BIA-BIA_TBW'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Physical-BMI_Calc'] = X_test.apply(lambda row: row['Physical-BMI'] if row['Physical-BMI']==row['Physical-BMI'] else row['BIA-BIA_BMI'],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Fitness_Endurance-Time_Sec_Calc'] = X_test.apply(lambda row: row['Fitness_Endurance-Time_Sec'] + (row['Fitness_Endurance-Time_Mins']*60), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

# Feature imputation models

In [29]:
X_train.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,2420.0,898.0,2935.0,2967.0,2938.0,743.0,2282.0,1062.0,...,1978.0,1982.0,1963.0,1974.0,1981.0,2606.0,3301.0,3042.0,739.0,2195.0
mean,10.433586,0.372727,65.069008,27.278508,69.242249,81.597236,116.591899,4.989233,0.476337,1.829567,...,31.316916,18.055748,62.793046,31.229111,50.140029,57.763622,1.060588,19.334696,469.91069,2.500797
std,3.574648,0.483591,11.78731,5.567287,12.631379,13.665196,16.252453,2.014072,0.499549,0.612585,...,9.746636,7.09177,23.638407,13.855726,19.570096,13.196091,1.094875,5.10428,188.716073,0.816265
min,5.0,0.0,25.0,18.0,0.0,27.0,0.0,0.0,0.0,1.0,...,14.489,4.63581,23.6201,4.65573,20.5892,38.0,0.0,0.0,5.0,0.58
25%,8.0,0.0,59.0,23.0,61.0,72.0,107.0,4.0,0.0,1.0,...,24.43685,12.973475,45.04865,21.116075,35.7985,47.0,0.0,15.870652,362.0,1.88
50%,10.0,0.0,65.0,26.0,68.0,81.0,114.0,5.0,0.0,2.0,...,28.79455,16.4086,56.5182,27.27715,44.8198,55.0,1.0,17.942687,476.0,2.46
75%,13.0,1.0,75.0,30.0,76.0,90.5,125.0,6.0,1.0,2.0,...,35.21465,21.983775,75.79605,37.77755,60.0358,64.0,2.0,21.57906,590.5,3.09
max,22.0,1.0,95.0,50.0,119.0,138.0,179.0,28.0,1.0,3.0,...,79.4738,52.5275,149.83,97.9231,146.075,100.0,3.0,59.132048,1200.0,4.79


In [76]:
# Mean

X_train_fimpute_mean = X_train.copy()

for column in X_train_fimpute_mean.columns:
    mean_val = np.mean(X_train[column])
    X_train_fimpute_mean.loc[X_train_fimpute_mean[column].isna(),column] = mean_val

X_train_fimpute_mean.describe()

  X_train_fimpute_mean.loc[X_train_fimpute_mean[column].isna(),column] = mean_val
  X_train_fimpute_mean.loc[X_train_fimpute_mean[column].isna(),column] = mean_val


Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,10.433586,0.372727,65.069008,27.278508,69.242249,81.597236,116.591899,4.989233,0.476337,1.829567,...,31.316916,18.055748,62.793046,31.229111,50.140029,57.763622,1.060588,19.334696,469.91069,2.500797
std,3.574648,0.483591,9.213823,2.650007,10.873968,11.827929,13.998391,0.871935,0.379182,0.317126,...,6.887559,5.016538,16.640829,9.781378,13.839895,10.704254,0.999606,4.473526,81.478743,0.607654
min,5.0,0.0,25.0,18.0,0.0,27.0,0.0,0.0,0.0,1.0,...,14.489,4.63581,23.6201,4.65573,20.5892,38.0,0.0,0.0,5.0,0.58
25%,8.0,0.0,61.0,27.278508,64.0,76.0,109.0,4.989233,0.0,1.829567,...,28.80125,16.394825,56.7964,27.3053,44.819225,51.0,0.0,16.485531,469.91069,2.3575
50%,10.0,0.0,65.069008,27.278508,69.242249,81.597236,116.591899,4.989233,0.476337,1.829567,...,31.316916,18.055748,62.793046,31.229111,50.140029,57.763622,1.060588,19.334696,469.91069,2.500797
75%,13.0,1.0,68.0,27.278508,72.0,87.0,121.0,4.989233,1.0,1.829567,...,31.316916,18.055748,62.793046,31.229111,50.140029,59.0,2.0,20.089901,469.91069,2.57
max,22.0,1.0,95.0,50.0,119.0,138.0,179.0,28.0,1.0,3.0,...,79.4738,52.5275,149.83,97.9231,146.075,100.0,3.0,59.132048,1200.0,4.79


In [77]:
mean_scaler = StandardScaler()                  

X_train_fimpute_mean[X_train_fimpute_mean.columns] = mean_scaler.fit_transform(X_train_fimpute_mean[X_train_fimpute_mean.columns])
X_train_fimpute_mean.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,-1.928872e-16,-5.831474e-17,5.4726150000000007e-17,1.455177e-15,4.458835e-16,-1.247936e-15,6.573866e-16,-9.734077e-16,-8.679925000000001e-17,-8.056406e-16,...,4.037175e-17,-8.182007e-16,-1.915415e-16,2.274275e-16,1.381611e-16,-6.064733e-16,-5.3829e-18,6.997769e-17,6.773482e-16,1.982701e-16
std,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,...,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126
min,-1.520226,-0.7708456,-4.349342,-3.501756,-6.368512,-4.616542,-8.330002,-5.722744,-1.256379,-2.616223,...,-2.443542,-2.675477,-2.354323,-2.717075,-2.135461,-1.846567,-1.06114,-4.322572,-5.706634,-3.161402
25%,-0.6808763,-0.7708456,-0.4416757,1.340812e-15,-0.4821525,-0.4732818,-0.5424093,-1.018757e-15,-1.256379,-7.002669e-16,...,-0.3652939,-0.3311314,-0.3604029,-0.4012018,-0.3845026,-0.6319428,-1.06114,-0.6369751,6.977353e-16,-0.2358501
50%,-0.12131,-0.7708456,0.0,1.340812e-15,0.0,-1.201618e-15,1.015306e-15,-1.018757e-15,0.0,-7.002669e-16,...,0.0,-7.082897e-16,-4.270415e-16,3.632578e-16,0.0,-6.638786e-16,0.0,0.0,6.977353e-16,0.0
75%,0.7180394,1.297277,0.3181483,1.340812e-15,0.2536424,0.4568379,0.3149403,-1.018757e-15,1.381208,-7.002669e-16,...,0.0,-7.082897e-16,-4.270415e-16,3.632578e-16,0.0,0.115518,0.9399011,0.168838,6.977353e-16,0.1138994
max,3.236088,1.297277,3.248898,8.575207,4.576437,4.769211,4.458797,26.39378,1.381208,3.69122,...,6.992748,6.872489,5.230986,6.819326,6.932646,3.946255,1.940421,8.897317,8.96162,3.767753


In [85]:
# Median

X_train_fimpute_median = X_train.copy()

for column in X_train_fimpute_median.columns:
    median_val = np.median(X_train.loc[X_train[column].notna()][column])
    X_train_fimpute_median.loc[X_train_fimpute_median[column].isna(),column] = median_val

X_train_fimpute_median.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,10.433586,0.372727,65.042172,26.289924,68.920707,81.447475,115.92298,4.99798,0.274495,1.954293,...,30.054459,17.233006,59.628686,29.247143,47.481258,56.818687,1.050505,19.012003,474.863636,2.482614
std,3.574648,0.483591,9.213885,2.703558,10.887576,11.830762,14.044273,0.871945,0.446316,0.325992,...,7.002103,5.083709,16.93406,9.979019,14.093285,10.784249,0.999861,4.511937,81.513283,0.607993
min,5.0,0.0,25.0,18.0,0.0,27.0,0.0,0.0,0.0,1.0,...,14.489,4.63581,23.6201,4.65573,20.5892,38.0,0.0,0.0,5.0,0.58
25%,8.0,0.0,61.0,26.0,64.0,76.0,109.0,5.0,0.0,2.0,...,28.79455,16.394825,56.5182,27.27715,44.819225,51.0,0.0,16.485531,476.0,2.3575
50%,10.0,0.0,65.0,26.0,68.0,81.0,114.0,5.0,0.0,2.0,...,28.79455,16.4086,56.5182,27.27715,44.8198,55.0,1.0,17.942687,476.0,2.46
75%,13.0,1.0,68.0,26.0,72.0,87.0,121.0,5.0,1.0,2.0,...,28.79455,16.417725,56.5182,27.27715,44.822725,59.0,2.0,20.089901,476.0,2.57
max,22.0,1.0,95.0,50.0,119.0,138.0,179.0,28.0,1.0,3.0,...,79.4738,52.5275,149.83,97.9231,146.075,100.0,3.0,59.132048,1200.0,4.79


In [86]:
median_scaler = StandardScaler()                  

X_train_fimpute_median[X_train_fimpute_median.columns] = median_scaler.fit_transform(X_train_fimpute_median[X_train_fimpute_median.columns])
X_train_fimpute_median.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,-1.928872e-16,-5.831474e-17,5.768674e-16,-9.895564e-16,3.445056e-16,3.750087e-16,4.077546e-16,2.435762e-16,-9.599504000000001e-17,3.489913e-16,...,-4.108947e-16,6.100619e-17,3.732144e-16,-3.557199e-16,2.9605950000000004e-17,8.792069e-17,-9.330359e-17,4.611351e-16,-1.269467e-16,-4.66518e-16
std,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,...,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126
min,-1.520226,-0.7708456,-4.3464,-3.066689,-6.331016,-4.602776,-8.255153,-5.73271,-0.6151017,-2.927717,...,-2.22325,-2.478267,-2.126669,-2.464623,-1.908388,-1.745236,-1.050784,-4.214244,-5.764986,-3.129731
25%,-0.6808763,-0.7708456,-0.4387598,-0.1072516,-0.4520133,-0.4605082,-0.493002,0.002317183,-0.6151017,0.1402267,...,-0.1799557,-0.1648967,-0.1837054,-0.1974384,-0.1889105,-0.5396223,-1.050784,-0.5600235,0.0139426,-0.2058074
50%,-0.12131,-0.7708456,-0.004577553,-0.1072516,-0.08457561,-0.03782776,-0.13694,0.002317183,-0.6151017,0.1402267,...,-0.1799557,-0.1621867,-0.1837054,-0.1974384,-0.1888697,-0.1686642,-0.05051845,-0.2370269,0.0139426,-0.03719862
75%,0.7180394,1.297277,0.3210591,-0.1072516,0.282862,0.4693887,0.3615468,0.002317183,1.625747,0.1402267,...,-0.1799557,-0.1603916,-0.1837054,-0.1974384,-0.1886621,0.202294,0.9497469,0.2389295,0.0139426,0.1437474
max,3.236088,1.297277,3.251789,8.771061,4.600254,4.780729,4.491866,26.38344,1.625747,3.20817,...,7.058677,6.943542,5.327293,6.882904,6.996679,4.004615,1.950012,8.8931,8.897052,3.795567


In [33]:
#KNN
from sklearn.impute import KNNImputer

knn_imputer = KNNImputer(n_neighbors=30, weights='distance') #30 has worked well in the past
X_train_fimpute_knn = pd.DataFrame(knn_imputer.fit_transform(X_train), columns = X_train.columns)
X_train_fimpute_knn.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,10.433586,0.372727,64.868609,26.892919,69.285364,81.418071,116.339899,4.873853,0.479768,1.818804,...,31.306746,18.019151,63.518488,31.116039,50.167154,58.083483,1.071311,19.468548,455.259767,2.515851
std,3.574648,0.483591,10.052045,4.016683,11.246181,12.216466,14.603324,0.977338,0.411427,0.351487,...,8.856623,6.465503,21.880104,12.498368,17.728622,11.898635,1.031655,4.773115,95.874235,0.667581
min,5.0,0.0,25.0,18.0,0.0,27.0,0.0,0.0,0.0,1.0,...,14.489,4.63581,23.6201,4.65573,20.5892,38.0,0.0,0.0,5.0,0.58
25%,8.0,0.0,60.0,24.0,63.0,74.0,108.0,4.584122,0.0,1.725932,...,25.134004,13.122775,46.6715,21.763,37.298568,50.0,0.0,16.278703,422.627536,2.13
50%,10.0,0.0,65.0,26.0,68.380952,81.0,114.291667,4.88191,0.478196,1.834561,...,28.78611,16.5246,57.392422,27.62615,45.39086,56.5,1.0,18.267153,454.300803,2.539943
75%,13.0,1.0,70.0,28.961712,74.0,88.0,122.925,5.146691,1.0,2.0,...,35.338329,21.84449,77.20575,37.821475,60.198483,62.925,2.0,21.717966,491.91121,2.778608
max,22.0,1.0,95.0,50.0,119.0,138.0,179.0,28.0,1.0,3.0,...,79.4738,52.5275,149.83,97.9231,146.075,100.0,3.0,59.132048,1200.0,4.79


In [45]:
knn_scaler = StandardScaler()                  

X_train_fimpute_knn[X_train_fimpute_knn.columns] = knn_scaler.fit_transform(X_train_fimpute_knn[X_train_fimpute_knn.columns])
X_train_fimpute_knn.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,-1.928872e-16,-5.831474e-17,3.624486e-16,5.436729e-16,-9.124015e-16,9.173358e-16,7.320743e-16,-8.455638e-16,-1.43544e-17,-3.382255e-16,...,-5.652044e-17,1.094523e-16,1.758414e-16,1.7943e-16,1.130409e-16,-3.009938e-16,2.87088e-17,1.901958e-16,3.570657e-16,-5.86736e-16
std,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,...,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126
min,-1.520226,-0.7708456,-3.966719,-2.214275,-6.161568,-4.455048,-7.967679,-4.987497,-1.166254,-2.329836,...,-1.899129,-2.070223,-1.823731,-2.117369,-1.668584,-1.688094,-1.03857,-4.079308,-4.696951,-2.900165
25%,-0.6808763,-0.7708456,-0.4844013,-0.7203169,-0.5589593,-0.6072957,-0.5711681,-0.2964867,-1.166254,-0.2642593,...,-0.6970515,-0.7574035,-0.7700656,-0.7484354,-0.7259567,-0.679448,-1.03857,-0.6683785,-0.3404079,-0.5780568
50%,-0.12131,-0.7708456,0.01307276,-0.2223307,-0.08042959,-0.03422625,-0.1402757,0.008244151,-0.003819653,0.04483639,...,-0.2846405,-0.2311869,-0.2800188,-0.2792629,-0.2694455,-0.1330979,-0.06913122,-0.2517323,-0.01000357,0.03609284
75%,0.7180394,1.297277,0.5105468,0.5151151,0.419274,0.5388432,0.4509886,0.279199,1.264619,0.5155775,...,0.4552629,0.5917285,0.6256365,0.5365727,0.5658983,0.4069482,0.9003074,0.4713276,0.382335,0.3936456
max,3.236088,1.297277,2.997917,5.753504,4.421138,4.632197,4.291353,23.66538,1.264619,3.360991,...,5.439222,5.337977,3.945247,5.345938,5.410458,3.523245,1.869746,8.310821,7.768868,3.40698


In [35]:
#MICE
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

iter_imputer = IterativeImputer(max_iter=10, random_state=42)
X_train_fimpute_mice = pd.DataFrame(iter_imputer.fit_transform(X_train), columns = X_train.columns)
X_train_fimpute_mice.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,10.433586,0.372727,64.818333,26.974426,69.403169,81.519388,116.805724,4.97403,0.480343,1.84934,...,31.36562,18.066786,63.749033,31.477562,50.138601,57.881433,1.073783,19.374281,468.721596,2.537644
std,3.574648,0.483591,9.805847,4.770198,11.011761,12.021297,14.228392,0.935016,0.389401,0.34619,...,7.099089,5.021442,18.068723,10.250418,13.839926,10.73703,1.02287,4.520383,89.856124,0.653839
min,5.0,0.0,-23.164848,4.967968,0.0,27.0,0.0,0.0,-0.172314,-1.772461,...,14.489,4.63581,23.6201,4.65573,20.5892,38.0,-0.278039,0.0,5.0,0.58
25%,8.0,0.0,60.0,24.0,64.0,74.477905,109.0,4.865373,0.0,1.766934,...,28.665877,16.394825,56.7477,27.1903,44.819225,51.0,0.0,16.485531,450.588533,2.15
50%,10.0,0.0,64.765647,26.137767,69.0,81.485263,115.604541,4.973003,0.485209,1.872826,...,31.34785,18.058831,63.734357,30.792761,50.135566,57.770114,1.0,18.589876,469.207062,2.5676
75%,13.0,1.0,70.0,29.035486,73.0,87.0,122.0,5.081351,1.0,2.0,...,32.68998,18.232694,63.917531,33.00796,50.147145,60.0,2.0,21.015855,487.416029,2.886524
max,22.0,1.0,95.0,56.944995,119.0,138.0,179.0,28.0,1.129295,3.0,...,86.587576,52.5275,188.145195,111.83576,146.075,100.0,3.019947,59.132048,2154.275208,4.79


Note that some imputed values were, e.g., <0, so clipping is required.

In [41]:
# Clip imputed values to original max and min
for column in X_train_fimpute_mice.columns:
    max_val = np.max(X_train[column])
    min_val = np.min(X_train[column])
    X_train_fimpute_mice.loc[X_train_fimpute_mice[column]>max_val,column] = max_val
    X_train_fimpute_mice.loc[X_train_fimpute_mice[column]<min_val, column] = min_val

X_train_fimpute_mice.describe()
    

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,10.433586,0.372727,64.830496,26.998653,69.403169,81.519388,116.805724,4.97403,0.480419,1.850041,...,31.363446,18.066786,63.672368,31.469455,50.138601,57.881433,1.073848,19.374281,468.480617,2.537644
std,3.574648,0.483591,9.726236,4.692416,11.011761,12.021297,14.228392,0.935016,0.389164,0.341634,...,7.083396,5.021442,17.644465,10.193244,13.839926,10.73703,1.022777,4.520383,86.553527,0.653839
min,5.0,0.0,25.0,18.0,0.0,27.0,0.0,0.0,0.0,1.0,...,14.489,4.63581,23.6201,4.65573,20.5892,38.0,0.0,0.0,5.0,0.58
25%,8.0,0.0,60.0,24.0,64.0,74.477905,109.0,4.865373,0.0,1.766934,...,28.665877,16.394825,56.7477,27.1903,44.819225,51.0,0.0,16.485531,450.588533,2.15
50%,10.0,0.0,64.765647,26.137767,69.0,81.485263,115.604541,4.973003,0.485209,1.872826,...,31.34785,18.058831,63.734357,30.792761,50.135566,57.770114,1.0,18.589876,469.207062,2.5676
75%,13.0,1.0,70.0,29.035486,73.0,87.0,122.0,5.081351,1.0,2.0,...,32.68998,18.232694,63.917531,33.00796,50.147145,60.0,2.0,21.015855,487.416029,2.886524
max,22.0,1.0,95.0,50.0,119.0,138.0,179.0,28.0,1.0,3.0,...,79.4738,52.5275,149.83,97.9231,146.075,100.0,3.0,59.132048,1200.0,4.79


In [46]:
mice_scaler = StandardScaler()                  

X_train_fimpute_mice[X_train_fimpute_mice.columns] = mice_scaler.fit_transform(X_train_fimpute_mice[X_train_fimpute_mice.columns])
X_train_fimpute_mice.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,-1.928872e-16,-5.831474e-17,8.343494e-16,5.921189000000001e-17,2.866394e-16,4.893953e-16,-3.545985e-16,6.948426e-16,-1.282924e-16,1.058637e-16,...,-3.184882e-16,-1.004808e-16,1.040694e-16,-4.135861e-16,1.964758e-16,-1.821214e-16,-5.741759000000001e-17,9.796877e-16,4.831152e-16,-1.578984e-16
std,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,...,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126
min,-1.520226,-0.7708456,-4.095678,-1.917944,-6.303437,-4.535806,-8.210378,-5.3204,-1.234646,-2.48848,...,-2.382555,-2.675063,-2.270249,-2.630871,-2.135353,-1.851904,-1.050066,-4.286523,-5.35552,-2.994455
25%,-0.6808763,-0.7708456,-0.4967087,-0.6391231,-0.4907346,-0.5858247,-0.5486713,-0.1162227,-1.234646,-0.2432958,...,-0.380878,-0.3330064,-0.3925051,-0.4198561,-0.3843986,-0.6409875,-1.050066,-0.6391307,-0.2067431,-0.5929487
50%,-0.12131,-0.7708456,-0.006668295,-0.1834866,-0.03661725,-0.002839099,-0.08443218,-0.001098129,0.01230976,0.06670247,...,-0.002202046,-0.001584385,0.003513687,-0.06639494,-0.0002193295,-0.01036914,-0.07221268,-0.1735482,0.00839407,0.04582149
75%,0.7180394,1.297277,0.5315681,0.4341239,0.3266766,0.4559661,0.3651103,0.1147947,1.335289,0.4390017,...,0.1872974,0.03304415,0.01389639,0.1509529,0.0006173593,0.197339,0.9056408,0.363195,0.2187987,0.5336544
max,3.236088,1.297277,3.10226,4.902432,4.504556,4.698973,4.371691,24.6294,1.335289,3.366483,...,6.792848,6.86358,4.8836,6.520205,6.932733,3.923235,1.883494,8.796331,8.452709,3.445252


In [44]:
#RF - not used, cannot import fancyimpute
#from fancyimpute import RandomForestImputer

#rf_imputer = RandomForestImputer()
#X_train_fimpute_rf = pd.DataFrame(rf_imputer.fit_transform(X_train), columns = X_train.columns)
#X_train_fimpute_rf.describe()

In [None]:
# Autoencoder - tricky
# How to handle passing missing values, will it regenerate missing values?
# https://www.xyonix.com/blog/filling-in-the-gaps-ai-powered-data-imputation-using-autoencoders

#stacked_encoder = keras.models.Sequential([
#    keras.layers.Flatten(input_shape=[28, 28]),
#    keras.layers.Dense(100, activation="selu"),
#    keras.layers.Dense(30, activation="selu"),
#])

#stacked_decoder = keras.models.Sequential([
#    keras.layers.Dense(100, activation="selu", input_shape=[30]),
#    keras.layers.Dense(28 * 28, activation="sigmoid"),
#    keras.layers.Reshape([28, 28])
#])

#stacked_ae = keras.models.Sequential([stacked_encoder, stacked_decoder])

# Repeat imputation and scaling for test data

In [48]:
X_test.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,8.0,5.0,10.0,12.0,11.0,3.0,13.0,5.0,...,8.0,8.0,8.0,8.0,8.0,10.0,16.0,13.0,3.0,10.0
mean,10.75,0.4,62.5,25.4,65.3,81.666667,117.545455,5.0,0.461538,1.6,...,28.48675,15.457795,56.989275,25.985962,45.167825,52.3,1.4375,19.835939,454.0,2.2391
std,3.725799,0.502625,11.275764,3.130495,7.528465,9.316001,21.262002,1.0,0.518875,0.547723,...,5.099449,4.021153,14.490362,7.479799,11.94,7.02456,1.152895,4.927625,122.503061,1.102044
min,5.0,0.0,50.0,22.0,57.0,70.0,95.0,4.0,0.0,1.0,...,21.0352,8.89536,38.9177,15.4107,27.0552,40.0,0.0,14.03559,332.0,1.04
25%,9.0,0.0,51.0,24.0,60.25,74.5,102.5,4.5,0.0,1.0,...,24.230725,13.8154,44.62725,19.801775,37.245575,47.75,0.0,16.861286,392.5,1.2325
50%,10.0,0.0,63.0,24.0,62.0,80.0,116.0,5.0,0.0,2.0,...,29.4704,16.40245,59.19905,26.33775,46.60885,53.5,2.0,18.292347,453.0,2.255
75%,12.25,1.0,71.0,27.0,69.5,90.25,119.5,5.5,1.0,2.0,...,31.398725,17.674625,65.22205,30.4211,51.860475,55.75,2.0,21.079065,515.0,2.87775
max,19.0,1.0,80.0,30.0,80.0,97.0,163.0,6.0,1.0,2.0,...,36.0572,20.902,79.6982,36.2232,63.1265,64.0,3.0,30.094649,577.0,4.11


In [87]:
# Mean

X_test_fimpute_mean = X_test.copy()

for column in X_test_fimpute_mean.columns:
    mean_val = np.mean(X_train[column])
    X_test_fimpute_mean.loc[X_test_fimpute_mean[column].isna(),column] = mean_val

X_test_fimpute_mean.describe()

  X_test_fimpute_mean.loc[X_test_fimpute_mean[column].isna(),column] = mean_val
  X_test_fimpute_mean.loc[X_test_fimpute_mean[column].isna(),column] = mean_val


Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,10.75,0.4,64.041405,26.808881,67.271124,81.638895,117.116355,4.990848,0.466718,1.772175,...,30.184849,17.016567,60.471538,29.131852,48.151147,55.031811,1.362118,19.660504,467.524087,2.369949
std,3.725799,0.502625,6.964871,1.661213,5.562121,7.0885,15.432754,0.324467,0.412423,0.271218,...,3.406479,2.768095,9.266454,5.249495,7.666107,5.588314,1.035987,3.923753,40.170382,0.770268
min,5.0,0.0,50.0,22.0,57.0,70.0,95.0,4.0,0.0,1.0,...,21.0352,8.89536,38.9177,15.4107,27.0552,40.0,0.0,14.03559,332.0,1.04
25%,9.0,0.0,65.069008,27.278508,62.5,78.25,114.25,4.989233,0.0,1.829567,...,30.771225,17.30175,61.96086,28.19595,49.410297,53.75,0.75,17.182707,469.91069,2.2975
50%,10.0,0.0,65.069008,27.278508,69.242249,81.597236,116.591899,4.989233,0.476337,1.829567,...,31.316916,18.055748,62.793046,31.229111,50.140029,57.763622,1.060588,19.334696,469.91069,2.500797
75%,12.25,1.0,65.069008,27.278508,69.242249,81.947927,116.693924,4.989233,1.0,1.829567,...,31.316916,18.055748,62.793046,31.229111,50.140029,57.763622,2.0,19.71886,469.91069,2.500797
max,19.0,1.0,80.0,30.0,80.0,97.0,163.0,6.0,1.0,2.0,...,36.0572,20.902,79.6982,36.2232,63.1265,64.0,3.0,30.094649,577.0,4.11


In [88]:
X_test_fimpute_mean[X_test_fimpute_mean.columns] = mean_scaler.transform(X_test_fimpute_mean[X_test_fimpute_mean.columns])
X_test_fimpute_mean.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.088527,0.056403,-0.111543,-0.1772396,-0.181293,0.003522468,0.03747014,0.00185252,-0.02537,-0.1809975,...,-0.164385,-0.2071773,-0.1395244,-0.2144406,-0.143725,-0.2552402,0.301687,0.07284,-0.02929482,-0.215361
std,1.042416,1.039489,0.756011,0.6269504,0.511573,0.5993776,1.102605,0.3721696,1.087802,0.8553472,...,0.494647,0.5518635,0.5569208,0.5367503,0.553984,0.5221307,1.036526,0.877216,0.493079,1.267768
min,-1.520226,-0.770846,-1.635685,-1.992136,-1.125973,-0.9806198,-1.542651,-1.134669,-1.256379,-2.616223,...,-1.492984,-1.826268,-1.434926,-1.617401,-1.668202,-1.659701,-1.06114,-1.184697,-1.692811,-2.404297
25%,-0.401093,-0.770846,0.0,1.340812e-15,-0.620114,-0.28303,-0.1673189,-1.018757e-15,-1.256379,-7.002669e-16,...,-0.079238,-0.1503215,-0.05001503,-0.3101346,-0.052733,-0.3750032,-0.310749,-0.481111,6.977353e-16,-0.334603
50%,-0.12131,-0.770846,0.0,1.340812e-15,0.0,-1.201618e-15,1.015306e-15,-1.018757e-15,0.0,-7.002669e-16,...,0.0,-7.082897e-16,-4.270415e-16,3.632578e-16,0.0,-6.638786e-16,0.0,0.0,6.977353e-16,0.0
75%,0.508202,1.297277,0.0,1.340812e-15,0.0,0.02965314,0.007289271,-1.018757e-15,1.381208,-7.002669e-16,...,0.0,-7.082897e-16,-4.270415e-16,3.632578e-16,0.0,-6.638786e-16,0.939901,0.085886,6.977353e-16,0.0
max,2.396738,1.297277,1.620704,1.027105,0.989437,1.302401,3.315664,1.159369,1.381208,0.5374987,...,0.688326,0.5674453,1.016012,0.5106356,0.938454,0.582681,1.940421,2.405555,1.314488,2.648554


In [89]:
# Median

X_test_fimpute_median = X_test.copy()

for column in X_test_fimpute_median.columns:
    median_val = np.median(X_train.loc[X_train[column].notna()][column])
    X_test_fimpute_median.loc[X_test_fimpute_median[column].isna(),column] = median_val

X_test_fimpute_median.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,10.75,0.4,64.0,25.85,66.65,81.4,115.95,5.0,0.3,1.9,...,28.67143,16.028278,56.70663,26.760675,44.95901,53.65,1.35,19.173301,472.7,2.34955
std,3.725799,0.502625,6.958524,1.460894,5.363375,7.09633,15.530869,0.324443,0.470162,0.307794,...,3.099112,2.487093,8.798503,4.586216,7.249416,5.029126,1.03999,4.024182,40.554187,0.766897
min,5.0,0.0,50.0,22.0,57.0,70.0,95.0,4.0,0.0,1.0,...,21.0352,8.89536,38.9177,15.4107,27.0552,40.0,0.0,14.03559,332.0,1.04
25%,9.0,0.0,65.0,26.0,62.5,78.25,112.75,5.0,0.0,2.0,...,28.79455,16.4086,56.5182,27.077813,44.8198,53.75,0.75,17.182707,476.0,2.2975
50%,10.0,0.0,65.0,26.0,68.0,81.0,114.0,5.0,0.0,2.0,...,28.79455,16.4086,56.5182,27.27715,44.8198,55.0,1.0,17.942687,476.0,2.46
75%,12.25,1.0,65.0,26.0,68.0,81.5,116.25,5.0,1.0,2.0,...,28.79455,16.4086,57.1221,27.27715,45.114,55.0,2.0,19.71886,476.0,2.46
max,19.0,1.0,80.0,30.0,80.0,97.0,163.0,6.0,1.0,2.0,...,36.0572,20.902,79.6982,36.2232,63.1265,64.0,3.0,30.094649,577.0,4.11


In [90]:
X_test_fimpute_median[X_test_fimpute_median.columns] = median_scaler.transform(X_test_fimpute_median[X_test_fimpute_median.columns])
X_test_fimpute_median.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.088527,0.056403,-0.113123,-0.162741,-0.208586,-0.004013,0.001924,0.002317,0.057153,-0.166568,...,-0.197541,-0.237008,-0.172577,-0.249201,-0.178991,-0.293863,0.299574,0.035754,-0.026547,-0.218885
std,1.042416,1.039489,0.755317,0.540428,0.492676,0.599896,1.10599,0.372138,1.053563,0.944293,...,0.442653,0.48929,0.51964,0.459644,0.514453,0.466399,1.040266,0.892009,0.497579,1.261518
min,-1.520226,-0.770846,-1.632761,-1.58697,-1.095029,-0.967725,-1.489975,-1.144688,-0.615102,-2.927717,...,-1.288241,-1.640278,-1.223192,-1.386729,-1.44953,-1.559757,-1.050784,-1.103083,-1.752864,-2.373047
25%,-0.401093,-0.770846,-0.004578,-0.107252,-0.589802,-0.270302,-0.225955,0.002317,-0.615102,0.140227,...,-0.179956,-0.162187,-0.183705,-0.217417,-0.18887,-0.284589,-0.300585,-0.405486,0.013943,-0.304505
50%,-0.12131,-0.770846,-0.004578,-0.107252,-0.084576,-0.037828,-0.13694,0.002317,-0.615102,0.140227,...,-0.179956,-0.162187,-0.183705,-0.197438,-0.18887,-0.168664,-0.050518,-0.237027,0.013943,-0.037199
75%,0.508202,1.297277,-0.004578,-0.107252,-0.084576,0.00444,0.023288,0.002317,1.625747,0.140227,...,-0.179956,-0.162187,-0.148039,-0.197438,-0.167992,-0.168664,0.949747,0.156683,0.013943,-0.037199
max,2.396738,1.297277,1.623606,1.372467,1.017737,1.31475,3.352467,1.149323,1.625747,0.140227,...,0.857385,0.721807,1.185306,0.699161,1.110261,0.665992,1.950012,2.456604,1.253161,2.676992


In [47]:
# KNN

X_test_fimpute_knn = pd.DataFrame(knn_imputer.transform(X_test), columns = X_test.columns)
X_test_fimpute_knn.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,10.75,0.4,63.893374,27.23289,68.844066,80.936993,117.704557,4.77447,0.458276,1.774232,...,31.231725,18.052421,64.926761,31.004235,51.039191,56.536945,1.486818,20.450175,444.561572,2.348645
std,3.725799,0.502625,7.516838,3.226048,7.968698,7.446801,15.731184,0.520071,0.444699,0.292443,...,6.166867,4.895591,16.749135,9.795177,13.123497,8.495075,1.094726,4.340366,62.522798,0.777565
min,5.0,0.0,50.0,22.0,57.0,70.0,95.0,3.626199,0.0,1.0,...,21.0352,8.89536,38.9177,15.4107,27.0552,40.0,0.0,14.03559,321.347363,1.04
25%,9.0,0.0,60.75,24.555913,62.5,76.083333,111.040392,4.583752,0.0,1.722296,...,28.240784,15.782166,58.328499,26.278089,45.649286,52.999951,0.575,17.182707,425.606651,2.135671
50%,10.0,0.0,65.716667,27.12978,69.783333,79.283333,116.5,4.783606,0.408974,1.830257,...,30.64885,17.878862,63.40923,29.494703,49.648857,55.752882,2.0,19.924733,448.87506,2.44765
75%,12.25,1.0,68.087937,29.515545,72.923577,84.038228,120.425,5.019058,1.0,2.0,...,34.332187,20.20495,71.102672,35.5911,56.705428,60.325,2.083333,22.607081,483.631601,2.598476
max,19.0,1.0,80.0,34.0,90.421565,97.0,163.0,6.0,1.0,2.033333,...,49.49994,29.466295,104.393153,55.815855,83.330445,75.0,3.0,30.094649,577.0,4.11


In [49]:
X_test_fimpute_knn[X_test_fimpute_knn.columns] = knn_scaler.transform(X_test_fimpute_knn[X_test_fimpute_knn.columns])
X_test_fimpute_knn.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.088527,0.056403,-0.097031,0.08465,-0.039245,-0.039384,0.09346,-0.1017,-0.052244,-0.126827,...,-0.008472,0.005146,0.064371,-0.008947,0.049194,-0.129993,0.402809,0.205683,-0.1116,-0.250498
std,1.042416,1.039489,0.747886,0.803264,0.708659,0.609648,1.077369,0.532198,1.081008,0.832121,...,0.696388,0.757282,0.765593,0.783815,0.740337,0.714044,1.061269,0.909451,0.652216,1.164897
min,-1.520226,-0.770846,-1.479349,-1.218303,-1.092541,-0.934764,-1.461489,-1.276745,-1.166254,-2.329836,...,-1.159905,-1.411328,-1.124487,-1.25675,-1.303817,-1.519987,-1.03857,-1.138385,-1.396927,-2.211023
25%,-0.401093,-0.770846,-0.40978,-0.581898,-0.603424,-0.436739,-0.362943,-0.296866,-1.166254,-0.274606,...,-0.346221,-0.346031,-0.237231,-0.387135,-0.254867,-0.427291,-0.481143,-0.47896,-0.309331,-0.569561
50%,-0.12131,-0.770846,0.084377,0.058977,0.044285,-0.174765,0.010965,-0.092352,-0.172089,0.032588,...,-0.074292,-0.021701,-0.004994,-0.12974,-0.029239,-0.195896,0.900307,0.095586,-0.066603,-0.102175
75%,0.508202,1.297277,0.320306,0.653016,0.323547,0.214505,0.279773,0.14859,1.264619,0.515578,...,0.341645,0.338114,0.346668,0.358097,0.368844,0.188408,0.981094,0.657627,0.295965,0.123782
max,2.396738,1.297277,1.505495,1.769614,1.879649,1.275647,3.195573,1.152405,1.264619,0.610425,...,2.05445,1.770719,1.868356,1.976493,1.870844,1.421899,1.869746,2.226521,1.269951,2.388249


In [51]:
# MICE

X_test_fimpute_mice = pd.DataFrame(iter_imputer.transform(X_test), columns = X_test.columns)
X_test_fimpute_mice.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,10.75,0.4,64.384331,27.189833,68.802755,81.222964,118.182086,4.880596,0.444288,1.781648,...,29.934581,17.000002,61.027601,29.569552,48.151484,55.211312,1.480745,20.096235,456.502406,2.327285
std,3.725799,0.502625,7.793448,4.183181,8.019925,7.577237,15.582534,0.569945,0.414281,0.303869,...,3.557039,2.763017,9.424007,5.537355,7.6662,5.714718,1.05281,4.002297,46.087711,0.794676
min,5.0,0.0,50.0,15.998384,57.0,70.0,95.0,3.120483,0.0,1.0,...,21.0352,8.89536,38.9177,15.4107,27.0552,40.0,0.0,14.03559,332.0,1.04
25%,9.0,0.0,60.956539,24.0,62.5,75.75,112.206733,4.864217,0.0,1.757357,...,28.8616,17.30175,62.397428,28.19595,49.404827,53.75,0.75,17.182707,451.607831,2.018671
50%,10.0,0.0,65.350489,27.875495,69.550418,80.357634,117.0,4.96419,0.392398,1.821179,...,30.141796,18.005161,63.692298,30.880738,50.138271,57.487605,2.0,19.641881,463.273518,2.375526
75%,12.25,1.0,69.13803,29.819804,71.13901,85.722666,121.927123,5.076138,1.0,1.956164,...,32.380737,18.105431,63.812779,33.146024,50.148196,58.349053,2.029086,21.354441,473.176378,2.666661
max,19.0,1.0,80.0,35.572032,92.52466,97.0,163.0,6.0,1.0,2.192637,...,36.0572,20.902,79.6982,36.2232,63.1265,64.0,3.0,30.094649,577.0,4.11


In [52]:
# Clip imputed values to original max and min
for column in X_test_fimpute_mice.columns:
    max_val = np.max(X_train[column])
    min_val = np.min(X_train[column])
    X_test_fimpute_mice.loc[X_test_fimpute_mice[column]>max_val,column] = max_val
    X_test_fimpute_mice.loc[X_test_fimpute_mice[column]<min_val, column] = min_val

X_test_fimpute_mice.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,10.75,0.4,64.384331,27.289913,68.802755,81.222964,118.182086,4.880596,0.444288,1.781648,...,29.934581,17.000002,61.027601,29.569552,48.151484,55.211312,1.480745,20.096235,456.502406,2.327285
std,3.725799,0.502625,7.793448,3.916801,8.019925,7.577237,15.582534,0.569945,0.414281,0.303869,...,3.557039,2.763017,9.424007,5.537355,7.6662,5.714718,1.05281,4.002297,46.087711,0.794676
min,5.0,0.0,50.0,18.0,57.0,70.0,95.0,3.120483,0.0,1.0,...,21.0352,8.89536,38.9177,15.4107,27.0552,40.0,0.0,14.03559,332.0,1.04
25%,9.0,0.0,60.956539,24.0,62.5,75.75,112.206733,4.864217,0.0,1.757357,...,28.8616,17.30175,62.397428,28.19595,49.404827,53.75,0.75,17.182707,451.607831,2.018671
50%,10.0,0.0,65.350489,27.875495,69.550418,80.357634,117.0,4.96419,0.392398,1.821179,...,30.141796,18.005161,63.692298,30.880738,50.138271,57.487605,2.0,19.641881,463.273518,2.375526
75%,12.25,1.0,69.13803,29.819804,71.13901,85.722666,121.927123,5.076138,1.0,1.956164,...,32.380737,18.105431,63.812779,33.146024,50.148196,58.349053,2.029086,21.354441,473.176378,2.666661
max,19.0,1.0,80.0,35.572032,92.52466,97.0,163.0,6.0,1.0,2.192637,...,36.0572,20.902,79.6982,36.2232,63.1265,64.0,3.0,30.094649,577.0,4.11


In [53]:
X_test_fimpute_mice[X_test_fimpute_mice.columns] = mice_scaler.transform(X_test_fimpute_mice[X_test_fimpute_mice.columns])
X_test_fimpute_mice.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.088527,0.056403,-0.045878,0.062078,-0.054532,-0.024661,0.096746,-0.09994,-0.092855,-0.20022,...,-0.201746,-0.212473,-0.149911,-0.186412,-0.143597,-0.248715,0.397885,0.159731,-0.138408,-0.32177
std,1.042416,1.039489,0.801382,0.834814,0.728397,0.630397,1.09531,0.609633,1.064675,0.889571,...,0.502229,0.550313,0.534173,0.543306,0.553989,0.532311,1.029493,0.885501,0.532544,1.215554
min,-1.520226,-0.770846,-1.524985,-1.917944,-1.126499,-0.958369,-1.532744,-1.982619,-1.234646,-2.48848,...,-1.458277,-1.826683,-1.403148,-1.57563,-1.668095,-1.665609,-1.050066,-1.181175,-1.577034,-2.290829
25%,-0.401093,-0.770846,-0.39835,-0.639123,-0.62697,-0.479991,-0.323267,-0.117459,-1.234646,-0.271331,...,-0.353243,-0.152373,-0.072266,-0.321185,-0.053025,-0.384832,-0.316676,-0.484882,-0.194965,-0.793832
50%,-0.12131,-0.770846,0.05347,0.186887,0.013374,-0.096654,0.013656,-0.010525,-0.22621,-0.084494,...,-0.172488,-0.012274,0.00113,-0.057763,-2.4e-05,-0.036684,0.905641,0.059206,-0.060168,-0.24798
75%,0.508202,1.297277,0.442934,0.601291,0.157655,0.349697,0.359988,0.109218,1.335289,0.310673,...,0.143634,0.007697,0.007959,0.164499,0.000693,0.043558,0.934083,0.438107,0.05426,0.197347
max,2.396738,1.297277,1.559845,1.827302,2.099974,1.287928,3.247037,1.097415,1.335289,1.002942,...,0.662725,0.564693,0.908379,0.466421,0.938555,0.569929,1.883494,2.371861,1.253942,2.40511


# Use labelled data only

For each imputation approach, we have:
* X_train, X_test
* Common y_train

1. We extract labelled data from each X_train and common y_train
2. Fit model on X_train_labelled, common y_train_labelled
3. Predict on X_test

In [105]:
y_test = test_data[['id']]

In [106]:
# Constant model which we will use for testing all imputation approaches
xgb_impute_model = XGBRegressor(random_state=42, learning_rate=0.05, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.7)

In [107]:
#Mean
X_train_labelled = X_train_fimpute_mean.loc[y_train.notna()]
y_train_labelled = y_train[y_train.notna()]
print("Size of labelled train data set is: ", (X_train_labelled.shape, y_train_labelled.shape))

Size of labelled train data set is:  ((2736, 34), (2736,))


In [108]:
xgb_impute_model.fit(X_train_labelled, y_train_labelled)

In [109]:
y_test['PCIAT-PCIAT_Total_mean'] = xgb_impute_model.predict(X_test_fimpute_mean)
y_test.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['PCIAT-PCIAT_Total_mean'] = xgb_impute_model.predict(X_test_fimpute_mean)


Unnamed: 0,id,PCIAT-PCIAT_Total_mean
0,00008ff9,28.097628
1,000fd460,13.788895
2,00105258,30.657166
3,00115b9f,20.609028
4,0016bb22,35.410194
5,001f3379,29.594351
6,0038ba98,34.180454
7,0068a485,28.73044
8,0069fbed,41.061615
9,0083e397,34.462418


In [110]:
#Median
X_train_labelled = X_train_fimpute_median.loc[y_train.notna()]
y_train_labelled = y_train[y_train.notna()]
print("Size of labelled train data set is: ", (X_train_labelled.shape, y_train_labelled.shape))

Size of labelled train data set is:  ((2736, 34), (2736,))


In [111]:
xgb_impute_model.fit(X_train_labelled, y_train_labelled)

In [112]:
y_test['PCIAT-PCIAT_Total_median'] = xgb_impute_model.predict(X_test_fimpute_median)
y_test.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['PCIAT-PCIAT_Total_median'] = xgb_impute_model.predict(X_test_fimpute_median)


Unnamed: 0,id,PCIAT-PCIAT_Total_mean,PCIAT-PCIAT_Total_median
0,00008ff9,28.097628,28.710884
1,000fd460,13.788895,16.450972
2,00105258,30.657166,30.761959
3,00115b9f,20.609028,19.592257
4,0016bb22,35.410194,34.400955
5,001f3379,29.594351,31.051321
6,0038ba98,34.180454,33.795845
7,0068a485,28.73044,28.209627
8,0069fbed,41.061615,39.437405
9,0083e397,34.462418,33.939198


In [113]:
#KNN
X_train_labelled = X_train_fimpute_knn.loc[y_train.notna()]
y_train_labelled = y_train[y_train.notna()]
print("Size of labelled train data set is: ", (X_train_labelled.shape, y_train_labelled.shape))

Size of labelled train data set is:  ((2736, 34), (2736,))


In [114]:
xgb_impute_model.fit(X_train_labelled, y_train_labelled)

In [115]:
y_test['PCIAT-PCIAT_Total_knn'] = xgb_impute_model.predict(X_test_fimpute_knn)
y_test.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['PCIAT-PCIAT_Total_knn'] = xgb_impute_model.predict(X_test_fimpute_knn)


Unnamed: 0,id,PCIAT-PCIAT_Total_mean,PCIAT-PCIAT_Total_median,PCIAT-PCIAT_Total_knn
0,00008ff9,28.097628,28.710884,27.825579
1,000fd460,13.788895,16.450972,14.881695
2,00105258,30.657166,30.761959,30.428879
3,00115b9f,20.609028,19.592257,21.287128
4,0016bb22,35.410194,34.400955,43.761642
5,001f3379,29.594351,31.051321,29.157087
6,0038ba98,34.180454,33.795845,33.366413
7,0068a485,28.73044,28.209627,32.692314
8,0069fbed,41.061615,39.437405,43.493179
9,0083e397,34.462418,33.939198,40.677105


In [116]:
#MICE
X_train_labelled = X_train_fimpute_mice.loc[y_train.notna()]
y_train_labelled = y_train[y_train.notna()]
print("Size of labelled train data set is: ", (X_train_labelled.shape, y_train_labelled.shape))

Size of labelled train data set is:  ((2736, 34), (2736,))


In [117]:
xgb_impute_model.fit(X_train_labelled, y_train_labelled)

In [118]:
y_test['PCIAT-PCIAT_Total_mice'] = xgb_impute_model.predict(X_test_fimpute_mice)
y_test.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['PCIAT-PCIAT_Total_mice'] = xgb_impute_model.predict(X_test_fimpute_mice)


Unnamed: 0,id,PCIAT-PCIAT_Total_mean,PCIAT-PCIAT_Total_median,PCIAT-PCIAT_Total_knn,PCIAT-PCIAT_Total_mice
0,00008ff9,28.097628,28.710884,27.825579,26.997581
1,000fd460,13.788895,16.450972,14.881695,16.266987
2,00105258,30.657166,30.761959,30.428879,29.768028
3,00115b9f,20.609028,19.592257,21.287128,19.785748
4,0016bb22,35.410194,34.400955,43.761642,36.03418
5,001f3379,29.594351,31.051321,29.157087,30.798737
6,0038ba98,34.180454,33.795845,33.366413,32.711647
7,0068a485,28.73044,28.209627,32.692314,30.812389
8,0069fbed,41.061615,39.437405,43.493179,42.307667
9,0083e397,34.462418,33.939198,40.677105,36.498135


# Testing and choosing the imputation approach

In [119]:
y_test['PCIAT-PCIAT_Total'] = y_test.apply(
    #lambda row: (row['PCIAT-PCIAT_Total_mean']),
    #lambda row: (row['PCIAT-PCIAT_Total_median']),
    #lambda row: (row['PCIAT-PCIAT_Total_knn']),
    lambda row: (row['PCIAT-PCIAT_Total_mice']),
    
axis=1)

y_test['sii'] = y_test.apply(lambda row: 0 if row['PCIAT-PCIAT_Total']<=30 else 
                             (1 if row['PCIAT-PCIAT_Total']<50 else (
                                2 if row['PCIAT-PCIAT_Total']<80 else (3)
                            )), axis=1)

y_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['PCIAT-PCIAT_Total'] = y_test.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['sii'] = y_test.apply(lambda row: 0 if row['PCIAT-PCIAT_Total']<=30 else


Unnamed: 0,id,PCIAT-PCIAT_Total_mean,PCIAT-PCIAT_Total_median,PCIAT-PCIAT_Total_knn,PCIAT-PCIAT_Total_mice,PCIAT-PCIAT_Total,sii
0,00008ff9,28.097628,28.710884,27.825579,26.997581,28.097628,0
1,000fd460,13.788895,16.450972,14.881695,16.266987,13.788895,0
2,00105258,30.657166,30.761959,30.428879,29.768028,30.657166,1
3,00115b9f,20.609028,19.592257,21.287128,19.785748,20.609028,0
4,0016bb22,35.410194,34.400955,43.761642,36.03418,35.410194,1


In [120]:
solution = y_test[['id','sii']]
solution.to_csv("submission.csv", index=False)

Scoring (QWK on private and public datasets):
* Mean: 0.355, 0.393
* Median: 0.351, 0.390
* KNN: 0.373, 0.382
* MICE: 0.388, 0.377