In [9]:
import numpy as np
import pandas as pd

import os

import polars as pl
from glob import glob
from tqdm.auto import tqdm

import tensorflow as tf
from tensorflow import keras
import random

# Seed NumPy, TensorFlow and the stdlib RNG with one shared value so that
# stochastic steps further down are repeatable on a fresh kernel.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#from sklearn.impute import KNNImputer

#from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import cohen_kappa_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.callbacks import LearningRateScheduler

from sklearn.svm import SVR

import lightgbm as lgb

from sklearn.model_selection import KFold

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

import matplotlib.pyplot as plt

# Data import and preprocessing

In [10]:
# Load the tabular train and test tables shipped with the competition data.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv',
        '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv',
    )
)

In [11]:
INPUT_DIR = "/kaggle/input/child-mind-institute-problematic-internet-use/"

# Aggregate actigraphy features for the training participants.
# Modified code from rsakata: https://www.kaggle.com/code/rsakata/cmi-piu-16th-place-solution
#
# Per participant file:
#   1. bucket every sample into whole hours elapsed since the earliest relative_date_PCIAT,
#   2. discard samples flagged as non-wear,
#   3. keep only hours that still contain 12 * 60 samples,
#   4. take the per-hour standard deviation of enmo, anglez and light.
# The hourly values are then log-transformed and summarised per participant (mean and std).

files_train = glob(INPUT_DIR + "series_train.parquet/*")

# NOTE(review): time_of_day is floor-divided by 1e9 and then by 3600, so it is
# presumably stored in nanoseconds - confirm against the data dictionary.
elapsed_hours = (
    (pl.col("relative_date_PCIAT") - pl.col("relative_date_PCIAT").min()) * 24
    + (pl.col("time_of_day") // int(1e9)) / 3600
).floor().cast(int).alias("total_hours")

list_df_train = []
for series_path in tqdm(files_train):
    # The last path component minus its first three characters is used as the id
    # (presumably stripping an "id=" partition prefix - verify against the directory names).
    participant_id = series_path.split("/")[-1][3:]

    worn_samples = (
        pl.read_parquet(series_path)
        .with_columns(elapsed_hours)
        .filter(pl.col("non-wear_flag") != 1)
    )
    complete_hours = worn_samples.filter(
        pl.col("step").count().over("total_hours") == 12 * 60
    )
    hourly_std = (
        complete_hours
        .group_by("total_hours")
        .agg(
            [
                pl.col(channel).std().alias(channel + "_std")
                for channel in ("enmo", "anglez", "light")
            ]
        )
        .with_columns(
            (pl.col("total_hours") % 24).alias("hour"),
            pl.lit(participant_id).alias("id"),
        )
    )
    list_df_train.append(hourly_std.to_pandas())

df_series = pd.concat(list_df_train)

# Log-transform the hourly standard deviations; the offset keeps log() finite at zero.
for std_column, log_offset in (("enmo_std", 0.01), ("anglez_std", 1), ("light_std", 0.01)):
    df_series[std_column] = np.log(df_series[std_column] + log_offset)

df_agg_train = (
    df_series
    .groupby("id")[["enmo_std", "anglez_std", "light_std"]]
    .agg(["mean", "std"])
    .reset_index()
)
# Flatten the (feature, statistic) column MultiIndex into "feature_statistic" names;
# the "id" column has an empty second level and keeps its plain name.
df_agg_train.columns = [
    feature if stat == "" else feature + "_" + stat
    for feature, stat in df_agg_train.columns
]
df_agg_train

  0%|          | 0/996 [00:00<?, ?it/s]

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std
0,00115b9f,-4.000377,,1.989905,,0.051475,
1,001f3379,-3.514671,0.652348,3.236993,0.678888,0.774591,2.945807
2,00f332d1,-3.071176,0.927238,3.249122,0.463244,1.138379,2.939823
3,01085eb3,-2.902040,0.791255,3.389762,0.315061,1.054698,2.185839
4,012cadd8,-2.806918,1.171675,3.337322,0.388409,0.823770,3.350365
...,...,...,...,...,...,...,...
964,fe9c71d8,-3.116904,0.961804,3.037607,0.943554,-0.394200,2.742634
965,fecc07d6,-3.969482,0.981531,1.332831,1.428363,-0.438018,1.795653
966,ff18b749,-2.820076,0.937540,3.258458,0.417267,1.236652,3.341580
967,ffcd4dbd,-3.271800,0.827489,3.183395,0.629553,0.521227,2.665325


In [12]:
# Left join: every row of train_data is kept; participants without an
# actigraphy file get NaN in the six aggregate columns.
train_data2 = pd.merge(train_data, df_agg_train, how="left", on="id")
train_data2.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,Fall,3.0,2.0,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,64.0,Summer,0.0,0.0,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,54.0,Summer,2.0,0.0,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,45.0,Winter,0.0,1.0,-4.000377,,1.989905,,0.051475,
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [13]:
# Aggregate actigraphy features for the test participants, using the same
# steps as for the training series: hourly buckets, non-wear samples removed,
# only hours with 12 * 60 samples kept, per-hour std of each sensor channel.
files_test = glob(INPUT_DIR + "series_test.parquet/*")

# Whole hours elapsed since the participant's earliest relative_date_PCIAT.
test_elapsed_hours = (
    (pl.col("relative_date_PCIAT") - pl.col("relative_date_PCIAT").min()) * 24
    + (pl.col("time_of_day") // int(1e9)) / 3600
).floor().cast(int).alias("total_hours")

list_df_test = []
for series_path in tqdm(files_test):
    # Last path component minus its first three characters is used as the id
    # (presumably an "id=" prefix - verify against the directory names).
    participant_id = series_path.split("/")[-1][3:]

    worn_samples = (
        pl.read_parquet(series_path)
        .with_columns(test_elapsed_hours)
        .filter(pl.col("non-wear_flag") != 1)
    )
    complete_hours = worn_samples.filter(
        pl.col("step").count().over("total_hours") == 12 * 60
    )
    hourly_std = (
        complete_hours
        .group_by("total_hours")
        .agg(
            [
                pl.col(channel).std().alias(channel + "_std")
                for channel in ("enmo", "anglez", "light")
            ]
        )
        .with_columns(
            (pl.col("total_hours") % 24).alias("hour"),
            pl.lit(participant_id).alias("id"),
        )
    )
    list_df_test.append(hourly_std.to_pandas())

df_series = pd.concat(list_df_test)

# Log-transform the hourly standard deviations; the offset keeps log() finite at zero.
for std_column, log_offset in (("enmo_std", 0.01), ("anglez_std", 1), ("light_std", 0.01)):
    df_series[std_column] = np.log(df_series[std_column] + log_offset)

df_agg_test = (
    df_series
    .groupby("id")[["enmo_std", "anglez_std", "light_std"]]
    .agg(["mean", "std"])
    .reset_index()
)
# Flatten the (feature, statistic) column MultiIndex into "feature_statistic" names;
# the "id" column has an empty second level and keeps its plain name.
df_agg_test.columns = [
    feature if stat == "" else feature + "_" + stat
    for feature, stat in df_agg_test.columns
]
df_agg_test

  0%|          | 0/2 [00:00<?, ?it/s]

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std
0,00115b9f,-4.000377,,1.989905,,0.051475,
1,001f3379,-3.514671,0.652348,3.236993,0.678888,0.774591,2.945807


In [14]:
# Left join: every row of test_data is kept; participants without an
# actigraphy file get NaN in the six aggregate columns.
test_data2 = pd.merge(test_data, df_agg_test, how="left", on="id")
test_data2.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,Fall,3.0,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,46.0,64.0,Summer,0.0,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,38.0,54.0,Summer,2.0,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,31.0,45.0,Winter,0.0,-4.000377,,1.989905,,0.051475,
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [39]:
# Model inputs: the same feature columns are taken from the train and test
# tables, so the list is written once and shared to keep the two in sync.
FEATURE_COLUMNS = [
    # Demographics / clinician rating
    'Basic_Demos-Age',
    'Basic_Demos-Sex',
    'CGAS-CGAS_Score',
    # Physical measurements
    'Physical-BMI',
    'BIA-BIA_BMI',
    'Physical-Waist_Circumference',
    'Physical-Diastolic_BP',
    'Physical-HeartRate',
    'Physical-Systolic_BP',
    # Fitness tests
    'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins',
    'Fitness_Endurance-Time_Sec',
    'FGC-FGC_CU_Zone',
    'FGC-FGC_GSND_Zone',
    'FGC-FGC_GSD_Zone',
    'FGC-FGC_PU_Zone',
    'FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR_Zone',
    'FGC-FGC_TL_Zone',
    # Bio-electric impedance analysis
    'BIA-BIA_Activity_Level_num',
    'BIA-BIA_BMC',
    'BIA-BIA_BMR',
    'BIA-BIA_DEE',
    'BIA-BIA_ECW',
    'BIA-BIA_FFM',
    'BIA-BIA_FFMI',
    'BIA-BIA_FMI',
    'BIA-BIA_Fat',
    'BIA-BIA_ICW',
    'BIA-BIA_LDM',
    'BIA-BIA_LST',
    'BIA-BIA_SMM',
    'BIA-BIA_TBW',
    # Questionnaires
    'PAQ_A-PAQ_A_Total',
    'PAQ_C-PAQ_C_Total',
    'SDS-SDS_Total_T',
    'PreInt_EduHx-computerinternet_hoursday',
    # Actigraphy aggregates merged in from the parquet series
    'enmo_std_mean',
    'enmo_std_std',
    'anglez_std_mean',
    'anglez_std_std',
    'light_std_mean',
    'light_std_std',
]

# .copy() gives independent frames, so adding columns to X_train / X_test later
# does not raise pandas' SettingWithCopyWarning.
X_train = train_data2[FEATURE_COLUMNS].copy()

# Regression target; rows where it is NaN are unlabelled.
y_train = train_data2['PCIAT-PCIAT_Total']

X_test = test_data2[FEATURE_COLUMNS].copy()

In [40]:
# Run this cell only if we are not interested in the actigraph data.
# NOTE(review): the recorded outputs further down still list these six columns
# (40 features in total), so they appear to come from a run in which this cell
# was disabled - re-run the notebook to bring outputs and code back in sync.
actigraphy_columns = [
    'enmo_std_mean',
    'enmo_std_std',
    'anglez_std_mean',
    'anglez_std_std',
    'light_std_mean',
    'light_std_std',
]

X_train = X_train.drop(columns=actigraphy_columns)

X_test = X_test.drop(columns=actigraphy_columns)


"\nX_train = X_train.drop(columns=['enmo_std_mean',\n                      'enmo_std_std',\n                      'anglez_std_mean',\n                      'anglez_std_std',\n                      'light_std_mean',\n                      'light_std_std'])\n\nX_test = X_test.drop(columns=['enmo_std_mean',\n                      'enmo_std_std',\n                      'anglez_std_mean',\n                      'anglez_std_std',\n                      'light_std_mean',\n                      'light_std_std'])\n"

In [41]:
# Add calculated fields
# Add calculated fields (vectorised; assign() returns a new frame, so no
# SettingWithCopyWarning is raised even when X_train is a slice of train_data2):
#   Physical-BMI_Calc               - Physical-BMI, falling back to BIA-BIA_BMI when missing
#   Fitness_Endurance-Time_Sec_Calc - total endurance time in seconds (NaN if either part is missing)
#   PAQ_Total                       - PAQ_A total, falling back to PAQ_C total when missing
X_train = X_train.assign(**{
    'Physical-BMI_Calc': X_train['Physical-BMI'].fillna(X_train['BIA-BIA_BMI']),
    'Fitness_Endurance-Time_Sec_Calc': (
        X_train['Fitness_Endurance-Time_Sec'] + X_train['Fitness_Endurance-Time_Mins'] * 60
    ),
    'PAQ_Total': X_train['PAQ_A-PAQ_A_Total'].fillna(X_train['PAQ_C-PAQ_C_Total']),
})

# Drop fields no longer needed
X_train = X_train.drop(columns=['PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total',
                                'Physical-BMI', 'BIA-BIA_BMI',
                                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec'])

# Remove outliers: a value is blanked (set to NaN, to be imputed later) when it is
# <= lower or >= upper. A lower bound of None means only the upper bound is checked.
# NOTE(review): blood pressure has no lower bound here, and the describe() output
# recorded below still shows a minimum of 0 for both BP columns - confirm that is intended.
OUTLIER_BOUNDS = {
    'CGAS-CGAS_Score': (None, 100.0),
    'Physical-Systolic_BP': (None, 180.0),
    'Physical-Diastolic_BP': (None, 120.0),
    'BIA-BIA_DEE': (None, 6000.0),
    'BIA-BIA_BMC': (0.0, 16.0),
    'BIA-BIA_BMR': (0.0, 2400.0),
    'BIA-BIA_ECW': (0.0, 60.0),
    'BIA-BIA_FFM': (0.0, 200.0),
    'BIA-BIA_FFMI': (0.0, 25.0),
    'BIA-BIA_FMI': (0.0, 25.0),
    'BIA-BIA_Fat': (8.0, 60.0),
    'BIA-BIA_ICW': (0.0, 80.0),
    'BIA-BIA_LDM': (0.0, 60.0),
    'BIA-BIA_LST': (0.0, 150.0),
    'BIA-BIA_SMM': (0.0, 100.0),
    'BIA-BIA_TBW': (0.0, 150.0),
}

for column, (lower, upper) in OUTLIER_BOUNDS.items():
    # Comparisons against NaN are False, so already-missing values stay untouched.
    out_of_range = X_train[column] >= upper
    if lower is not None:
        out_of_range = out_of_range | (X_train[column] <= lower)
    X_train.loc[out_of_range, column] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Physical-BMI_Calc'] = X_train.apply(lambda row: row['Physical-BMI'] if row['Physical-BMI']==row['Physical-BMI'] else row['BIA-BIA_BMI'],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Fitness_Endurance-Time_Sec_Calc'] = X_train.apply(lambda row: row['Fitness_Endurance-Time_Sec'] + (row['Fitness_Endurance-Time_Mins']*60), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:

# Missing feature imputation

In [42]:
# Sample weights for the labelled rows, based on how many features are missing
# (counted after outliers were blanked, before imputation). Three variants:
#   weights_labelled  - linear:      1 - fraction_missing
#   weights_labelled2 - geometric:   0.95 ** count_missing
#   weights_labelled3 - exponential: exp(-2 * fraction_missing)
labelled_rows = y_train.notna()

features_missing_labelled2 = X_train.loc[labelled_rows].isnull().sum(axis=1)   # count per row
features_missing_labelled = features_missing_labelled2 / X_train.shape[1]      # fraction per row

weights_labelled = 1 - features_missing_labelled
weights_labelled2 = 1 * (0.95 ** features_missing_labelled2)
weights_labelled3 = np.exp(-2 * features_missing_labelled)

In [43]:
# MICE-style imputation: fit the iterative imputer on every training row
# (labelled or not) and fill the missing feature values.
iter_imputer = IterativeImputer(max_iter=10, random_state=42)

# fit_transform returns a bare array; restore both the column names and the
# original row index so that later label-based selection with a mask built
# from y_train stays aligned with X_train.
X_train_fimpute = pd.DataFrame(
    iter_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_train_fimpute.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,PreInt_EduHx-computerinternet_hoursday,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,10.433586,0.372727,64.837456,26.912663,69.401624,81.516962,116.809269,4.985771,0.478708,1.846919,...,1.073881,-2.918259,0.777887,2.969361,0.611719,1.130111,2.436779,19.373676,469.018248,2.541896
std,3.574648,0.483591,9.546987,4.762029,11.014283,12.026349,14.232763,0.967232,0.39541,0.347531,...,1.024101,0.354486,0.141125,0.335805,0.21692,1.262072,0.391125,4.519958,90.920902,0.660278
min,5.0,0.0,5.252031,-2.558782,0.0,27.0,0.0,0.0,-0.948836,-1.944131,...,-0.994345,-4.579415,0.00127,0.060442,-0.145349,-12.356457,-1.334368,0.0,5.0,-0.19909
25%,8.0,0.0,60.0,24.0,64.0,74.46036,109.0,4.857421,0.0,1.760748,...,0.0,-3.061473,0.707342,2.901092,0.561666,0.838311,2.3234,16.485531,450.0,2.15
50%,10.0,0.0,64.776148,26.240435,69.0,81.465654,115.755877,4.987385,0.484129,1.871204,...,1.0,-2.921262,0.792877,2.985504,0.608728,1.126844,2.444058,18.589876,469.295053,2.57
75%,13.0,1.0,70.0,29.0,73.0,87.0,122.0,5.130851,1.0,2.0,...,2.0,-2.794685,0.849432,3.12071,0.642094,1.344541,2.539335,21.008162,489.396477,2.903745
max,22.0,1.0,95.0,55.36337,119.0,138.0,179.0,28.0,1.180041,3.0,...,3.029967,-1.101896,1.679633,5.87466,1.726049,14.028859,5.291563,59.132048,2127.420735,4.79


In [44]:
# Clip imputed values to the range actually observed in each column:
# the imputer can extrapolate beyond the original min / max (e.g. negative
# values for non-negative measurements).
observed_min = X_train[X_train_fimpute.columns].min()
observed_max = X_train[X_train_fimpute.columns].max()

# axis=1 aligns the per-column bounds with the frame's columns.
X_train_fimpute = X_train_fimpute.clip(lower=observed_min, upper=observed_max, axis=1)

X_train_fimpute.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,PreInt_EduHx-computerinternet_hoursday,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,10.433586,0.372727,64.842443,26.943874,69.401624,81.516962,116.809269,4.985771,0.48014,1.847663,...,1.074125,-2.918293,0.777887,2.966826,0.611762,1.13013,2.437116,19.373676,468.784051,2.542092
std,3.574648,0.483591,9.520977,4.66726,11.014283,12.026349,14.232763,0.967232,0.392526,0.342532,...,1.023701,0.354321,0.141125,0.327594,0.216788,1.22899,0.388441,4.519958,87.788537,0.659577
min,5.0,0.0,25.0,18.0,0.0,27.0,0.0,0.0,0.0,1.0,...,0.0,-4.579415,0.00127,0.060442,0.022668,-4.60517,0.0,0.0,5.0,0.58
25%,8.0,0.0,60.0,24.0,64.0,74.46036,109.0,4.857421,0.0,1.760748,...,0.0,-3.061473,0.707342,2.901092,0.561666,0.838311,2.3234,16.485531,450.0,2.15
50%,10.0,0.0,64.776148,26.240435,69.0,81.465654,115.755877,4.987385,0.484129,1.871204,...,1.0,-2.921262,0.792877,2.985504,0.608728,1.126844,2.444058,18.589876,469.295053,2.57
75%,13.0,1.0,70.0,29.0,73.0,87.0,122.0,5.130851,1.0,2.0,...,2.0,-2.794685,0.849432,3.12071,0.642094,1.344541,2.539335,21.008162,489.396477,2.903745
max,22.0,1.0,95.0,50.0,119.0,138.0,179.0,28.0,1.0,3.0,...,3.0,-1.233944,1.679633,3.558768,1.726049,6.692563,5.291563,59.132048,1200.0,4.79


In [45]:
# Standardise every feature to zero mean and unit variance.
# The fitted scaler is kept so the same transformation can be applied to the test set.
scaler = StandardScaler()

X_train_fimpute = pd.DataFrame(
    scaler.fit_transform(X_train_fimpute),
    columns=X_train_fimpute.columns,
    index=X_train_fimpute.index,
)
X_train_fimpute.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,PreInt_EduHx-computerinternet_hoursday,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,...,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0,3960.0
mean,-1.928872e-16,-5.831474e-17,2.242875e-16,4.754895e-16,-3.238711e-16,-9.263073e-16,8.130421e-16,2.610706e-16,-2.242875e-18,-6.315935e-16,...,8.971499000000001e-18,-1.463252e-15,2.258575e-16,4.180719e-16,-2.233903e-16,7.356629000000001e-17,-5.329071e-16,-8.146121e-16,-5.436729e-16,1.43544e-17
std,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,...,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126,1.000126
min,-1.520226,-0.7708456,-4.18523,-1.916543,-6.301853,-4.533699,-8.208106,-5.155331,-1.22336,-2.475007,...,-1.049389,-4.688778,-5.503725,-8.873018,-2.717714,-4.667268,-6.274889,-4.286793,-5.283635,-2.975149
25%,-0.6808763,-0.7708456,-0.508672,-0.6308296,-0.4904819,-0.5868359,-0.5487518,-0.1327148,-1.22336,-0.2537742,...,-1.049389,-0.4041484,-0.4999376,-0.2006837,-0.2311091,-0.2374763,-0.2927873,-0.6390569,-0.2139963,-0.5945354
50%,-0.12131,-0.7708456,-0.006963866,-0.1507368,-0.03646854,-0.004266888,-0.07402106,0.001669477,0.01016305,0.06873711,...,-0.07241796,-0.008382024,0.1062321,0.05702355,-0.01399343,-0.002673648,0.01787417,-0.1734306,0.005821567,0.04231659
75%,0.7180394,1.297277,0.541773,0.4405981,0.3267421,0.4559763,0.3647491,0.1500143,1.324562,0.4447948,...,0.904553,0.348902,0.5070287,0.4697974,0.1399347,0.1744835,0.2631858,0.3616609,0.234826,0.5483784
max,3.236088,1.297277,3.167886,4.940595,4.503665,4.6972,4.370099,23.79692,1.324562,3.364597,...,1.881524,4.754337,6.390497,1.807162,5.14063,4.526593,7.349398,8.797292,8.330339,3.408534


In [46]:
# Repeat the feature engineering for X_test, using the same calculated fields
# and the same outlier thresholds as for X_train.

# Add calculated fields (vectorised; assign() returns a new frame, so no
# SettingWithCopyWarning is raised even when X_test is a slice of test_data2):
#   Physical-BMI_Calc               - Physical-BMI, falling back to BIA-BIA_BMI when missing
#   Fitness_Endurance-Time_Sec_Calc - total endurance time in seconds (NaN if either part is missing)
#   PAQ_Total                       - PAQ_A total, falling back to PAQ_C total when missing
X_test = X_test.assign(**{
    'Physical-BMI_Calc': X_test['Physical-BMI'].fillna(X_test['BIA-BIA_BMI']),
    'Fitness_Endurance-Time_Sec_Calc': (
        X_test['Fitness_Endurance-Time_Sec'] + X_test['Fitness_Endurance-Time_Mins'] * 60
    ),
    'PAQ_Total': X_test['PAQ_A-PAQ_A_Total'].fillna(X_test['PAQ_C-PAQ_C_Total']),
})

# Drop fields no longer needed
X_test = X_test.drop(columns=['PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total',
                              'Physical-BMI', 'BIA-BIA_BMI',
                              'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec'])

# Remove outliers: a value is blanked (set to NaN, to be imputed later) when it is
# <= lower or >= upper. A lower bound of None means only the upper bound is checked.
OUTLIER_BOUNDS = {
    'CGAS-CGAS_Score': (None, 100.0),
    'Physical-Systolic_BP': (None, 180.0),
    'Physical-Diastolic_BP': (None, 120.0),
    'BIA-BIA_DEE': (None, 6000.0),
    'BIA-BIA_BMC': (0.0, 16.0),
    'BIA-BIA_BMR': (0.0, 2400.0),
    'BIA-BIA_ECW': (0.0, 60.0),
    'BIA-BIA_FFM': (0.0, 200.0),
    'BIA-BIA_FFMI': (0.0, 25.0),
    'BIA-BIA_FMI': (0.0, 25.0),
    'BIA-BIA_Fat': (8.0, 60.0),
    'BIA-BIA_ICW': (0.0, 80.0),
    'BIA-BIA_LDM': (0.0, 60.0),
    'BIA-BIA_LST': (0.0, 150.0),
    'BIA-BIA_SMM': (0.0, 100.0),
    'BIA-BIA_TBW': (0.0, 150.0),
}

for column, (lower, upper) in OUTLIER_BOUNDS.items():
    # Comparisons against NaN are False, so already-missing values stay untouched.
    out_of_range = X_test[column] >= upper
    if lower is not None:
        out_of_range = out_of_range | (X_test[column] <= lower)
    X_test.loc[out_of_range, column] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Physical-BMI_Calc'] = X_test.apply(lambda row: row['Physical-BMI'] if row['Physical-BMI']==row['Physical-BMI'] else row['BIA-BIA_BMI'],axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Fitness_Endurance-Time_Sec_Calc'] = X_test.apply(lambda row: row['Fitness_Endurance-Time_Sec'] + (row['Fitness_Endurance-Time_Mins']*60), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

In [47]:
# Imputation: reuse the imputer fitted on the training features (no refit on test).
# The row index of X_test is preserved so rows can still be matched back by label.
X_test_fimpute = pd.DataFrame(
    iter_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Clipping: bound every column by the min / max observed in the *training* data,
# so test values are limited to the range the scaler and models were fitted on.
train_min = X_train[X_test_fimpute.columns].min()
train_max = X_train[X_test_fimpute.columns].max()
X_test_fimpute = X_test_fimpute.clip(lower=train_min, upper=train_max, axis=1)

# Scaling: apply the scaler fitted on the training data.
X_test_fimpute[X_test_fimpute.columns] = scaler.transform(X_test_fimpute[X_test_fimpute.columns])

X_test_fimpute.describe()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,FGC-FGC_CU_Zone,FGC-FGC_GSND_Zone,...,PreInt_EduHx-computerinternet_hoursday,enmo_std_mean,enmo_std_std,anglez_std_mean,anglez_std_std,light_std_mean,light_std_std,Physical-BMI_Calc,Fitness_Endurance-Time_Sec_Calc,PAQ_Total
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.088527,0.056403,-0.073113,0.117495,-0.054213,-0.025475,0.098069,-0.12508,-0.090134,-0.197021,...,0.397971,-0.358459,-0.113335,-0.179885,0.059682,-0.097972,0.125619,0.159203,-0.156574,-0.33223
std,1.042416,1.039489,0.787192,0.641688,0.728653,0.631242,1.095616,0.637155,1.055286,0.886713,...,1.030059,0.876167,0.633084,0.864986,0.618307,0.455405,0.508537,0.885214,0.555119,1.213472
min,-1.520226,-0.770846,-1.559117,-1.059401,-1.126101,-0.957765,-1.532522,-2.214272,-1.22336,-2.475007,...,-1.049389,-3.054352,-1.268242,-2.982483,-0.701638,-0.877787,-0.79516,-1.181153,-1.558305,-2.277644
25%,-0.401093,-0.770846,-0.390917,-0.265123,-0.626686,-0.479588,-0.322393,-0.065823,-1.22336,-0.277619,...,-0.316661,-0.725867,-0.583296,-0.238515,-0.184769,-0.269215,-0.103873,-0.484793,-0.258884,-0.885024
50%,-0.12131,-0.770846,0.046863,0.067879,0.01143,-0.09781,0.013403,-0.015109,-0.211477,-0.076272,...,0.904553,-0.228331,-0.031201,0.026315,-0.02586,-0.086737,0.03966,0.058498,-0.061002,-0.257326
75%,0.508202,1.297277,0.299965,0.557029,0.156283,0.35582,0.367314,0.114281,1.324562,0.301641,...,0.938755,0.091348,0.34647,0.214958,0.219982,0.00531,0.207726,0.436898,0.068412,0.25651
max,2.396738,1.297277,1.592218,1.484931,2.103444,1.287589,3.24579,1.048722,1.324562,1.025246,...,1.881524,0.971376,1.074214,0.824804,2.401594,1.186473,1.309735,2.372218,1.232845,2.377441


# Prepare datasets and functions for models
* Labelled data set
* Augmented data sets - X, y, weights
* PCA
* sii target

## Labelled data set

In [48]:
# Keep only the rows that have a target value.
has_label = y_train.notna()
X_train_labelled = X_train_fimpute.loc[has_label]
y_train_labelled = y_train[has_label]
print("Size of labelled train data set is: ", (X_train_labelled.shape, y_train_labelled.shape))

Size of labelled train data set is:  ((2736, 40), (2736,))


In [49]:
# The unlabelled rows are not used in this notebook, so the original row labels
# can be discarded. With a fresh 0..n-1 index, positional and label-based
# selection coincide, which avoids loc/iloc mix-ups when cross-validation
# splits the data by index.
X_train_labelled, y_train_labelled = (
    labelled.reset_index(drop=True)
    for labelled in (X_train_labelled, y_train_labelled)
)

In [50]:
# Give the three weight series the same fresh 0..n-1 index as the labelled data.
weights_labelled, weights_labelled2, weights_labelled3 = (
    weights.reset_index(drop=True)
    for weights in (weights_labelled, weights_labelled2, weights_labelled3)
)

## Augmented data

In [51]:
# Augmented data
#
# X datasets produced here (each has a matching y_train_labelled_* series):
#   X_train_labelled_aug1a, X_train_labelled_aug2a - noise multiplier 0.1
#   X_train_labelled_aug1b, X_train_labelled_aug2b - noise multiplier 0.15
#   X_train_labelled_aug1c, X_train_labelled_aug2c - noise multiplier 0.2
# aug1 = original rows + one noisy copy; aug2 = aug1 + a second, independent noisy copy.
#
# Noise comes from NumPy's global RNG, so the exact values depend on the seed
# and on how many random draws were made before this cell runs.


def make_noisy_copy(X, y, X_std, y_std, X_multiplier, y_multiplier):
    """Return noisy copies of features ``X`` and target ``y``.

    Zero-mean Gaussian noise is added to every value. Its standard deviation
    is the matching entry of ``X_std`` (per column) or ``y_std``, scaled by
    ``X_multiplier`` / ``y_multiplier``. Feature noise is drawn before target noise.

    Returns:
        (X_noisy, y_noisy) with the same shape and index as the inputs.
    """
    X_noisy = X + (X_multiplier * np.random.normal(0, X_std, X.shape))
    y_noisy = y + (y_multiplier * np.random.normal(0, y_std, y.shape))
    return X_noisy, y_noisy


def append_rows(X, y, X_extra, y_extra):
    """Stack ``X_extra`` / ``y_extra`` below ``X`` / ``y`` and renumber the index from 0."""
    return (
        pd.concat([X, X_extra], ignore_index=True),
        pd.concat([y, y_extra], ignore_index=True),
    )


# Get standard deviations of each column in X and y
std_X = np.std(X_train_labelled, axis=0)
std_y = np.std(y_train_labelled, axis=0)

# Create augmented datasets
# Deliberately written out call by call rather than in a loop: we never augment
# more than twice, the datasets being created stay easy to follow, and each
# augmentation can be given its own noise level if desired.

# --- a: 0.1 noise multiplier ---
X_noise_multiplier = 0.1
y_noise_multiplier = 0.1

X_train_labelled_noisy, y_train_labelled_noisy = make_noisy_copy(
    X_train_labelled, y_train_labelled, std_X, std_y, X_noise_multiplier, y_noise_multiplier
)
X_train_labelled_aug1a, y_train_labelled_aug1a = append_rows(
    X_train_labelled, y_train_labelled, X_train_labelled_noisy, y_train_labelled_noisy
)
print(X_train_labelled_aug1a.shape, y_train_labelled_aug1a.shape)

# repeat
X_train_labelled_noisy, y_train_labelled_noisy = make_noisy_copy(
    X_train_labelled, y_train_labelled, std_X, std_y, X_noise_multiplier, y_noise_multiplier
)
X_train_labelled_aug2a, y_train_labelled_aug2a = append_rows(
    X_train_labelled_aug1a, y_train_labelled_aug1a, X_train_labelled_noisy, y_train_labelled_noisy
)
print(X_train_labelled_aug2a.shape, y_train_labelled_aug2a.shape)


# --- b: increased noise multiplier (0.15) ---
X_noise_multiplier = 0.15
y_noise_multiplier = 0.15

X_train_labelled_noisy, y_train_labelled_noisy = make_noisy_copy(
    X_train_labelled, y_train_labelled, std_X, std_y, X_noise_multiplier, y_noise_multiplier
)
X_train_labelled_aug1b, y_train_labelled_aug1b = append_rows(
    X_train_labelled, y_train_labelled, X_train_labelled_noisy, y_train_labelled_noisy
)
print(X_train_labelled_aug1b.shape, y_train_labelled_aug1b.shape)

# repeat
X_train_labelled_noisy, y_train_labelled_noisy = make_noisy_copy(
    X_train_labelled, y_train_labelled, std_X, std_y, X_noise_multiplier, y_noise_multiplier
)
X_train_labelled_aug2b, y_train_labelled_aug2b = append_rows(
    X_train_labelled_aug1b, y_train_labelled_aug1b, X_train_labelled_noisy, y_train_labelled_noisy
)
print(X_train_labelled_aug2b.shape, y_train_labelled_aug2b.shape)


# --- c: 0.2 noise multiplier ---
X_noise_multiplier = 0.2
y_noise_multiplier = 0.2

X_train_labelled_noisy, y_train_labelled_noisy = make_noisy_copy(
    X_train_labelled, y_train_labelled, std_X, std_y, X_noise_multiplier, y_noise_multiplier
)
X_train_labelled_aug1c, y_train_labelled_aug1c = append_rows(
    X_train_labelled, y_train_labelled, X_train_labelled_noisy, y_train_labelled_noisy
)
print(X_train_labelled_aug1c.shape, y_train_labelled_aug1c.shape)

# repeat
X_train_labelled_noisy, y_train_labelled_noisy = make_noisy_copy(
    X_train_labelled, y_train_labelled, std_X, std_y, X_noise_multiplier, y_noise_multiplier
)
X_train_labelled_aug2c, y_train_labelled_aug2c = append_rows(
    X_train_labelled_aug1c, y_train_labelled_aug1c, X_train_labelled_noisy, y_train_labelled_noisy
)
print(X_train_labelled_aug2c.shape, y_train_labelled_aug2c.shape)

(5472, 40) (5472,)
(8208, 40) (8208,)
(5472, 40) (5472,)
(8208, 40) (8208,)
(5472, 40) (5472,)
(8208, 40) (8208,)


In [52]:
# Sample weights for the augmented datasets.
# weights_labelled, weights_labelled2 and weights_labelled3 are already defined;
# each noisy copy simply reuses the weights of the rows it was derived from,
# so "aug1" is the weight vector stacked twice and "aug2" stacked three times.

weights_labelled_aug1 = pd.concat([weights_labelled] * 2, ignore_index=True)
weights_labelled_aug2 = pd.concat([weights_labelled] * 3, ignore_index=True)
print(weights_labelled_aug1.shape, weights_labelled_aug2.shape)

weights_labelled2_aug1 = pd.concat([weights_labelled2] * 2, ignore_index=True)
weights_labelled2_aug2 = pd.concat([weights_labelled2] * 3, ignore_index=True)
print(weights_labelled2_aug1.shape, weights_labelled2_aug2.shape)

weights_labelled3_aug1 = pd.concat([weights_labelled3] * 2, ignore_index=True)
weights_labelled3_aug2 = pd.concat([weights_labelled3] * 3, ignore_index=True)
print(weights_labelled3_aug1.shape, weights_labelled3_aug2.shape)

(5472,) (8208,)
(5472,) (8208,)
(5472,) (8208,)


## PCA

In [53]:
# Reduce the feature space to 25 principal components.
pca = PCA(n_components=25)

# The projection is fitted on the un-augmented labelled training data only ...
X_train_labelled_pca = pd.DataFrame(pca.fit_transform(X_train_labelled))

# ... and the fitted projection is then applied unchanged to the test set and
# to the augmented training sets.
(
    X_test_pca,
    X_train_labelled_pca_aug1a,
    X_train_labelled_pca_aug2a,
    X_train_labelled_pca_aug1b,
    X_train_labelled_pca_aug2b,
) = (
    pd.DataFrame(pca.transform(features))
    for features in (
        X_test_fimpute,
        X_train_labelled_aug1a,
        X_train_labelled_aug2a,
        X_train_labelled_aug1b,
        X_train_labelled_aug2b,
    )
)

print(X_train_labelled_pca.shape, X_test_pca.shape, X_train_labelled_pca_aug1b.shape, X_train_labelled_pca_aug2b.shape)

(2736, 25) (20, 25) (5472, 25) (8208, 25)


## sii target data

In [54]:
# Convert the continuous PCIAT total into the ordinal sii class:
#   total <= 30 -> 0,  30 < total < 50 -> 1,  50 <= total < 80 -> 2,  otherwise -> 3
# np.select takes the first matching condition, which mirrors an if / elif chain
# without a Python-level call per row. Anything matching no condition
# (including NaN) falls through to class 3.
y_train_labelled_sii = pd.Series(
    np.select(
        [y_train_labelled <= 30, y_train_labelled < 50, y_train_labelled < 80],
        [0, 1, 2],
        default=3,
    ),
    index=y_train_labelled.index,
    name='sii',
)

# Augmented targets: the sii labels of the original rows repeated once (aug1) or
# twice (aug2), lining up row-for-row with the augmented feature sets.
y_train_labelled_sii_aug1 = pd.concat([y_train_labelled_sii, y_train_labelled_sii], ignore_index=True)
y_train_labelled_sii_aug2 = pd.concat([y_train_labelled_sii_aug1, y_train_labelled_sii], ignore_index=True)

# Model

In [55]:
# Search space for the randomized SVR hyperparameter search.
# Note: scipy's uniform(loc, scale) samples from [loc, loc + scale], so
# C is drawn from [0.1, 10.1] and epsilon from [0.01, 0.21].
svr_param_grid = dict(
    C=uniform(0.1, 10),
    epsilon=uniform(0.01, 0.2),
    gamma=['scale', 'auto', 0.1],
    kernel=['rbf', 'linear', 'poly'],
)

In [39]:
#random_search = RandomizedSearchCV(SVR(), svr_param_grid, cv=5, scoring='neg_mean_squared_error', n_iter=20)

In [40]:
#random_search.fit(X_train_labelled, y_train_labelled)

In [41]:
# Get the best parameters and the best score
#print(f"Best Parameters: {random_search.best_params_}")
#print(f"Best Score (MSE): {-random_search.best_score_}")

Best Parameters: {'C': 0.10778765841014329, 'epsilon': 0.20844231185824352, 'gamma': 'scale', 'kernel': 'linear'}
Best Score (MSE): 302.5792595597036


In [56]:
# Hyperparameters of the final SVR regressor (original note: "From GridSearchCV").
# NOTE(review): C=20 lies outside the uniform(0.1, 10) range of svr_param_grid,
# so these values did not come from the randomized search in this notebook - confirm origin.
# Best parameters reported by the randomized search, kept for reference:
#   SVR(C=0.10778765841014329, epsilon=0.20844231185824352, gamma='scale', kernel='linear')
svr_params = {'C': 20, 'epsilon': 0.2, 'gamma': 'scale', 'kernel': 'rbf'}
svr_model = SVR(**svr_params)

In [57]:
# Train on the original rows plus one noisy copy (noise multiplier 0.1),
# weighting samples with the second weighting scheme.
# Alternative kept for reference: fit directly on the sii classes with
#   svr_model.fit(X_train_labelled, y_train_labelled_sii)
svr_model.fit(
    X=X_train_labelled_aug1a,
    y=y_train_labelled_aug1a,
    sample_weight=weights_labelled2_aug1,
)

In [58]:
# Start the prediction frame from the test ids.
# .copy() makes y_test an independent DataFrame rather than a slice of test_data,
# so the prediction columns added in later cells do not trigger pandas'
# SettingWithCopyWarning.
y_test = test_data[['id']].copy()
y_test.head()

Unnamed: 0,id
0,00008ff9
1,000fd460
2,00105258
3,00115b9f
4,0016bb22


In [59]:
# Predict the continuous PCIAT total for the test set with the fitted SVR.
# (If the model was trained on PCA features, predict on X_test_pca instead.)
svr_test_predictions = svr_model.predict(X_test_fimpute)
y_test['PCIAT-PCIAT_Total_svr'] = svr_test_predictions
y_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['PCIAT-PCIAT_Total_svr'] = svr_model.predict(X_test_fimpute)


Unnamed: 0,id,PCIAT-PCIAT_Total_svr
0,00008ff9,19.353013
1,000fd460,12.631451
2,00105258,37.006033
3,00115b9f,32.241179
4,0016bb22,40.386918


In [60]:
# Final PCIAT total prediction. Only the SVR prediction feeds into it, so the
# column is copied directly instead of going through a row-wise apply.
# (Presumably this is the place to blend several model columns if more are added - confirm.)
y_test['PCIAT-PCIAT_Total'] = y_test['PCIAT-PCIAT_Total_svr']

y_test.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['PCIAT-PCIAT_Total'] = y_test.apply(


Unnamed: 0,id,PCIAT-PCIAT_Total_svr,PCIAT-PCIAT_Total
0,00008ff9,19.353013,19.353013
1,000fd460,12.631451,12.631451
2,00105258,37.006033,37.006033
3,00115b9f,32.241179,32.241179
4,0016bb22,40.386918,40.386918
5,001f3379,34.200215,34.200215
6,0038ba98,21.026758,21.026758
7,0068a485,30.260999,30.260999
8,0069fbed,51.166064,51.166064
9,0083e397,40.020334,40.020334


In [61]:
# Bin the predicted PCIAT total into the sii class, with the same thresholds
# used for the training target:
#   total <= 30 -> 0,  30 < total < 50 -> 1,  50 <= total < 80 -> 2,  otherwise -> 3
# np.select takes the first matching condition (like an if / elif chain) and is
# evaluated column-wise instead of calling a Python lambda for every row.
pciat_total = y_test['PCIAT-PCIAT_Total']
y_test['sii'] = np.select(
    [pciat_total <= 30, pciat_total < 50, pciat_total < 80],
    [0, 1, 2],
    default=3,
)
# Alternative when the model predicts sii directly: round the prediction, e.g.
#y_test['sii'] = np.round(y_test['PCIAT-PCIAT_Total'])
y_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['sii'] = y_test.apply(lambda row: 0 if row['PCIAT-PCIAT_Total']<=30 else


Unnamed: 0,id,PCIAT-PCIAT_Total_svr,PCIAT-PCIAT_Total,sii
0,00008ff9,19.353013,19.353013,0
1,000fd460,12.631451,12.631451,0
2,00105258,37.006033,37.006033,1
3,00115b9f,32.241179,32.241179,1
4,0016bb22,40.386918,40.386918,1


In [62]:
# Keep only the two columns that go into the submission file: id and predicted sii.
submission_columns = ['id', 'sii']
solution = y_test.loc[:, submission_columns]
solution.head()

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,1
3,00115b9f,1
4,0016bb22,1


In [60]:
# Write the submission file (without the DataFrame index) to the working directory.
submission_path = "submission.csv"
solution.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
