# Import neccessary libraries

In [2]:
import numpy as np
import pandas as pd
import os

from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from sklearn.model_selection import RandomizedSearchCV

from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

import torch

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.impute import KNNImputer

warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

In [3]:
import random

SEED = 42
n_splits = 5

#Ensure the reproducbility of results
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
seed_everything(2024)
n_splits = 5

# Data processing

## Time-series data statistics extraction

In [4]:
def dictionary_of_statistics(data, time = None):
    #Handle empty dataframe
    if (data.empty):
        return {}
    
    if len(data.columns) == 0:
        return {}
    
    #Aggreate statistics for the dictionary
    stats_summary = data.agg(['mean', 'median', 'max', 'std']).to_dict()
    
    flattened_stats = {}
    for col, stats in stats_summary.items():
        for stat_name, value in stats.items():
            key = f"{stat_name}_{col}"
            if (time is not None):
                key = f"{stat_name}_{col}_{time}"
            flattened_stats[key] = value
    
    return flattened_stats

#Feature engineering

def compute_time_features(data, day_start_hour=6, day_end_hour=18, expected_diff=5):
    
    """
    Compute and add time-related features to the DataFrame.

    Parameters:
    - data (pd.DataFrame): The input DataFrame containing 'time_of_day' in nanosecond and 'relative_date_PCIAT'.
    - day_start_hour (int): Hour to start the day period. Default is 8.
    - day_end_hour (int): Hour to end the day period. Default is 21.
    - expected_diff (int): Expected time difference between steps in seconds. Default is 5.
    
    """
    
    #From nanosecond to hour in a day
    data['time_of_day_hours'] = data['time_of_day'] / 1e9 / 3600
    data['day_time'] = data['relative_date_PCIAT'] + data['time_of_day_hours'] / 24
    
    #Categorize the day and night based on time data
    
    data['day_period'] = np.where(
        (data['time_of_day_hours'] >= day_start_hour) &
        (data['time_of_day_hours'] < day_end_hour),
        'day', 'night'
    )
    
    #Time difference beween steps
    #As the description, the time_of_day should represent the start of a 5s window over which the data was sampled
    #Calculate the time difference between each step
    data['time_diff'] = (data['day_time'].diff() * 86400).round(0) # seconds in a day
    data['measurement_after_gap'] = data['time_diff'] > expected_diff
    
def no_motion_periods(worn_data):
    """
    Find periods of no motion and give analytical insights in the data.

    Parameters:
    - data (pd.DataFrame): The input DataFrame containing 'time_of_day' and 'relative_date_PCIAT'.

    Returns:
    - pd.DataFrame: DataFrame with new features: 
    + total duration of no motion periods per day
    + the number of no motion periods per day.
    """
    
    #Calculate no motion periods
    no_motion = worn_data['enmo'] == 0
    motion_group = (
        (no_motion != no_motion.shift()) |
        (worn_data['measurement_after_gap'])
    ).cumsum()

    no_motion_periods = worn_data[no_motion].groupby(
        motion_group
    )['day_time'].agg(['min', 'max'])

    no_motion_periods['duration_sec'] = (
        (no_motion_periods['max'] - no_motion_periods['min']) * 86400
    ).round(0).astype(int)
    
    no_motion_periods['duration_sec'] += 5
    no_motion_periods['day'] = no_motion_periods['min'].astype(int)
    
    # Calculate daily statistics on no motion periods
    daily_stats = no_motion_periods.groupby(no_motion_periods['day']) \
        .agg(no_motion_duration=('duration_sec', 'sum'),
            no_motion_count=('duration_sec', 'size'))
    
    #Aggreate statistics for the dictionary
    return dictionary_of_statistics(daily_stats)

def circadian_rhythm_analysis(worn_data):
    
    """
    Make features capturing the variation in activity across the 24-hour cycle, 
    separately for day and night times (or wakefulness and sleep periods).
    
    Parameters:
    - data (pd.DataFrame): The input DataFrame containing 'time_of_day_hours', 'day_period' and 'relative_date_PCIAT'.
    
    Returns:
    - pd.DataFrame: 2 DataFrame, corresponding to day and night times,  with new features capturing the circadian rhythm of the wearer,
    + Standard deviation across hourly means per day
    + Peak hour of activity per day
    + Entropy of activity distribution per day
    """
    if (worn_data.empty):
        return {}
    
    hourly_activity = worn_data.groupby(
        [worn_data['relative_date_PCIAT'].astype(int),
        worn_data['time_of_day_hours'].astype(int),
        worn_data['day_period']]
    )['enmo'].agg(['mean', 'max'])

    features = hourly_activity['mean'].groupby(
        ['relative_date_PCIAT', 'day_period']
    ).agg(
        std_across_hours='std',
        peak_hour=lambda x: x.idxmax()[1],
        entropy=lambda x: -(x / x.sum() * np.log(x / x.sum() + 1e-9)).sum()
    )
    
    day_features = features.xs('day', level='day_period')
    night_features = features.xs('night', level='day_period')
    
    return dictionary_of_statistics(day_features, time="day") | dictionary_of_statistics(night_features, time='night')
    # return features

def physical_activity_analysis(worn_data):
        
    """
    Analyze the Moderate to Vigorous Physical Activity (MVPA) based on a threshold of ENMO values, 
    and calculate the duration of the detected MVPA activity bouts
    
    Parameters:
    - data (pd.DataFrame): The input DataFrame containing 'enmo', 'time_diff' and 'day_time'.
    
    Returns:
    - pd.DataFrame: DataFrame with new features capturing the physical activity level of the wearer,
    including:
    + Total duration of MVPA per day
    + Number of MVPA periods per day
    """
    # In order to classify physical activity as MVPA, we only retained activities that lasted at least 1 minute and met the criteria for the 100 mg (= 0.1g) threshold
    mvpa_threshold = 0.1
    merge_gap = 60
    
    def merge_mvpa_groups(df, allowed_gap=60, merge_gap=60):
        last_mvpa_time = df['day_time'].where(df['is_mvpa']).ffill().shift()
        
        mvpa_time_diff = (
            (df['day_time'] - last_mvpa_time) * 86400
        ).round(0)
        
        mvpa_group = (
            (df['is_mvpa'] != df['is_mvpa'].shift()) |
            (df['time_diff'] >= allowed_gap)
        ).cumsum()
        
        is_mvpa_start = (
            (mvpa_group != mvpa_group.shift()) &
            df['is_mvpa']
        )
        
        group_increment = is_mvpa_start & (
            (mvpa_time_diff >= merge_gap) | last_mvpa_time.isnull()
        )
        
        merged_group = group_increment.cumsum()
        merged_group.loc[~df['is_mvpa']] = np.nan
        
        return merged_group
    
    worn_data['is_mvpa'] = worn_data['enmo'] > mvpa_threshold
    worn_data['mvpa_merged_group'] = merge_mvpa_groups(worn_data)

    mvpa_periods = worn_data[
        worn_data['is_mvpa']
    ].groupby('mvpa_merged_group')['day_time'].agg(['min', 'max'])

    mvpa_periods['duration_sec'] = (
        mvpa_periods['max'] - mvpa_periods['min']
    ) * 86400  # days to seconds

    mvpa_periods = mvpa_periods[mvpa_periods['duration_sec'] >= 60]
    mvpa_periods['duration_min'] = mvpa_periods['duration_sec'] / 60
    
    mvpa_periods['day'] = mvpa_periods['min'].astype(int)

    daily_stats = mvpa_periods.groupby(mvpa_periods['day']) \
        .agg(mvpa_total_duration=('duration_sec', 'sum'),
            mvpa_count_periods=('duration_sec', 'size'))
        
    return dictionary_of_statistics(daily_stats)
    
def activity_transition_analysis(worn_data):
        
    """
    The analysis to look at transitions between low, moderate and vigorous activity. To smooth out sudden, 
    short bursts of different activities, we filter out segments with a duration below a 1 minute threshold.
    
    Parameters:
    - data (pd.DataFrame): The input DataFrame containing 'enmo', 'time_diff' and 'day_time'.
    
    Returns:
    - pd.DataFrame: DataFrame with new features capturing the activity transitions of the wearer,
    + Total duration of different of activity per day
    + Number of different activity periods per day
    """
    mvpa_threshold = 0.1
    vig_threshold = 0.5
    worn_data['activity_type'] = pd.cut(
        worn_data['enmo'],
        bins=[-np.inf, mvpa_threshold, vig_threshold, np.inf],
        labels=['low', 'moderate', 'vigorous']
    )
    activity_group = (
        (worn_data['activity_type'] != worn_data['activity_type'].shift()) |
        (worn_data['measurement_after_gap'])
    ).cumsum()
    
    activity_periods = worn_data.groupby(activity_group).agg(
        min=('day_time', 'min'),
        max=('day_time', 'max'),
        activity_type=('activity_type', 'first')
    )
    activity_periods['duration_sec'] = (
        activity_periods['max'] - activity_periods['min']
    ) * 86400 + 5 

    activity_periods = activity_periods[activity_periods['duration_sec'] >= 60]
    activity_periods['duration_min'] = activity_periods['duration_sec'] / 60
    
    activity_periods['day'] = activity_periods['min'].astype(int)
    activity_periods['transition_num'] = (
        activity_periods.groupby('day')['activity_type']
        .apply(lambda x: (x != x.shift()).cumsum())
        .reset_index(level=0, drop=True)
    )
    
    low_activity = activity_periods[activity_periods['activity_type'] == 'low'].groupby('day').agg(
        low_act_total_duration=('duration_sec', 'sum'),
        low_act_count_periods=('duration_sec', 'size')
    )
    
    moderate_activity = activity_periods[activity_periods['activity_type'] == 'moderate'].groupby('day').agg(
            moderate_act_total_duration=('duration_sec', 'sum'),
            moderate_act_count_periods=('duration_sec', 'size')
    )

        
    daily_transitions = activity_periods.groupby('day').agg(
        count_transitions=('transition_num', 'max')
    )
    
    return dictionary_of_statistics(low_activity) | dictionary_of_statistics(moderate_activity) | dictionary_of_statistics(daily_transitions)

def activity_light_exposure(worn_data):
        
    """
    Analyze the correlation between light exposure and physical activity level.
    
    Parameters:
    - data (pd.DataFrame): The input DataFrame containing 'light' and 'enmo'.
    
    Returns:
    - float: The correlation between light exposure and physical activity level.
    """
    correlation_light_enmo = worn_data[['light', 'enmo']].corr().iloc[0, 1]
    return {'correlation_light_enmo': correlation_light_enmo}

def process_file(file_path, participant_id):
    try:
        data = pd.read_parquet(file_path)
        
        if data.empty:
            return {'id': participant_id}
        
        required_columns = ['time_of_day', 'relative_date_PCIAT', 'enmo', 'non-wear_flag', 'light']
        if not all(col in data.columns for col in required_columns):
            return {'id': participant_id}

        # Compute time features
        compute_time_features(data)
            
        # Calculate the percentage of non-worn time
        non_wear_percentage = (data['non-wear_flag'].sum() / len(data)) * 100
        
        # Filter out the worn data
        worn_data = data[data['non-wear_flag'] == 0]
        
        if (worn_data.empty):
            return {'id': participant_id}
        
        # recalculate time difference between rows and measurement_after_gap flag in the worn data
        expected_diff = 5
        worn_data['time_diff'] = (worn_data['day_time'].diff() * 86400).round(0)
        worn_data['measurement_after_gap'] = worn_data['time_diff'] > expected_diff

        # Compute no motion periods
        no_motion_stats = no_motion_periods(worn_data)
        # Circadian rhythm analysis
        circadian_rhythm_stats = circadian_rhythm_analysis(worn_data)
        # Physical activity analysis
        physical_activity_stats = physical_activity_analysis(worn_data)
        # Activity transition analysis
        activity_transition_stats = activity_transition_analysis(worn_data)
        # Compute activity light correlation
        activity_light_stats = activity_light_exposure(worn_data)

        return {
            'id': participant_id,
        } | no_motion_stats | circadian_rhythm_stats | physical_activity_stats | activity_transition_stats | activity_light_stats
    
    except Exception as e:
        return {'id': participant_id}

def load_time_series(dir_name):
    
    participant_ids = os.listdir(dir_name)
    
    with ThreadPoolExecutor() as executor:
        #tqdm: Wraps the executor.map iterable with tqdm -> Show a progress bar indicating the processing status
        results = list(tqdm(executor.map(lambda x: process_file(os.path.join(dir_name, x, 'part-0.parquet'), x), participant_ids), total=len(participant_ids)))

    df = pd.DataFrame(results)
    # Replace inf and -inf with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Replace NaN with 0
    df.fillna(0, inplace=True)
    
    return df

## Load data

**Loading tabular data**

In [5]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

**Few SII scores are still derived from the sum of NAN values in PICAT questions, leading to potentially invalid SII values. The below code tries to estimate the severity of internet usage SII based on current SII and the maximum possible SII.**

In [6]:
#Generate a list of column names in the format PCIAT-PCIAT_XX
PCIAT_cols = [f'PCIAT-PCIAT_{i+1:02d}' for i in range(20)]

#Recalculates the SII value based on the the current PCIAT values and the possible maximum PCIAT values
def recalculate_sii(row):
    value = 0
    if (not pd.isna(row['PCIAT-PCIAT_Total'])):
        value = row['PCIAT-PCIAT_Total']
        
    max_possible = value + row[PCIAT_cols].isna().sum() * 5
    
    if value <= 30 and max_possible <= 30:
        return 0
    elif 31 <= value <= 49 and max_possible <= 49:
        return 1
    elif 50 <= value <= 79 and max_possible <= 79:
        return 2
    elif value >= 80 and max_possible >= 80:
        return 3
    
    return np.nan

train['recalc_sii'] = train.apply(recalculate_sii, axis=1)
train['sii'] = train['recalc_sii']
train.drop(columns='recalc_sii', inplace=True)

**Loading time-series data**

In [7]:
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

100%|██████████| 996/996 [05:45<00:00,  2.89it/s]
100%|██████████| 2/2 [00:00<00:00,  4.42it/s]


In [8]:
df_train = train_ts.copy()
df_test = test_ts.copy()

df_train['id'] = df_train['id'].str.replace('id=', '')
df_test['id'] = df_test['id'].str.replace('id=', '')

In [9]:
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove('id')

## Data merging

In [10]:
train = pd.merge(train, df_train, how="left", on='id')
test = pd.merge(test, df_test, how="left", on='id')

**Take a look at the test set after grafting encoded time-series data. As shown below, the participants who didn't wear device have features related to time-series values are NaN. To handle, we will fill all NaN values via KNNImputer trained on data in training set from those who worn devices.**

In [11]:
test.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,mean_no_motion_duration,median_no_motion_duration,max_no_motion_duration,std_no_motion_duration,mean_no_motion_count,median_no_motion_count,max_no_motion_count,std_no_motion_count,mean_std_across_hours_day,median_std_across_hours_day,max_std_across_hours_day,std_std_across_hours_day,mean_peak_hour_day,median_peak_hour_day,max_peak_hour_day,std_peak_hour_day,mean_entropy_day,median_entropy_day,max_entropy_day,std_entropy_day,mean_std_across_hours_night,median_std_across_hours_night,max_std_across_hours_night,std_std_across_hours_night,mean_peak_hour_night,median_peak_hour_night,max_peak_hour_night,std_peak_hour_night,mean_entropy_night,median_entropy_night,max_entropy_night,std_entropy_night,mean_mvpa_total_duration,median_mvpa_total_duration,max_mvpa_total_duration,std_mvpa_total_duration,mean_mvpa_count_periods,median_mvpa_count_periods,max_mvpa_count_periods,std_mvpa_count_periods,mean_low_act_total_duration,median_low_act_total_duration,max_low_act_total_duration,std_low_act_total_duration,mean_low_act_count_periods,median_low_act_count_periods,max_low_act_count_periods,std_low_act_count_periods,mean_moderate_act_total_duration,median_moderate_act_total_duration,max_moderate_act_total_duration,std_moderate_act_total_duration,mean_moderate_act_count_periods,median_moderate_act_count_periods,max_moderate_act_count_periods,std_moderate_act_count_periods,mean_count_transitions,median_count_transitions,max_count_transitions,std_count_transitions,correlation_light_enmo
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,,,,Fall,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0,389.545455,105.0,2300.0,557.549554,41.060606,13.0,272.0,60.863032,0.016498,0.011422,0.057418,0.015671,12.589744,13.0,17.0,3.084067,0.916029,0.887219,2.243301,0.731855,0.017414,0.015299,0.054692,0.01382,15.8,19.0,23.0,7.549378,0.804196,0.583313,2.061804,0.795553,2410.666667,1470.0,9930.0,3103.940391,11.866667,7.0,48.0,13.798896,7246.578947,7115.0,23505.0,6183.344351,32.421053,21.0,126.0,31.514222,95.833333,70.0,220.0,61.352805,1.166667,1.0,2.0,0.408248,1.5,1.0,3.0,0.888523,0.129729
4,0016bb22,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Handle missing in predictors

In [12]:
train_cols = set(train.columns)
test_cols = set(test.columns)
columns_not_in_test = sorted(list(train_cols - test_cols))
columns_not_in_test

['PCIAT-PCIAT_01',
 'PCIAT-PCIAT_02',
 'PCIAT-PCIAT_03',
 'PCIAT-PCIAT_04',
 'PCIAT-PCIAT_05',
 'PCIAT-PCIAT_06',
 'PCIAT-PCIAT_07',
 'PCIAT-PCIAT_08',
 'PCIAT-PCIAT_09',
 'PCIAT-PCIAT_10',
 'PCIAT-PCIAT_11',
 'PCIAT-PCIAT_12',
 'PCIAT-PCIAT_13',
 'PCIAT-PCIAT_14',
 'PCIAT-PCIAT_15',
 'PCIAT-PCIAT_16',
 'PCIAT-PCIAT_17',
 'PCIAT-PCIAT_18',
 'PCIAT-PCIAT_19',
 'PCIAT-PCIAT_20',
 'PCIAT-PCIAT_Total',
 'PCIAT-Season',
 'sii']

**Initialize a KNNImputer for handling missing valuee. Statistically, KNNImputer outweights SimpleImputer in this context. However, it encounters the computationally expensive cost, and the requirement of hyperparameter K (K value should be considered to change later). The imputation should be applied on both train and test set. However, to avoid data leakage, the imputer has to be trained on data from training set, then appied to both train and test set**

In [13]:
def numeric_imputation(train_df, test_df):
    numeric_cols = train_df.select_dtypes(include=['float64', 'int64', 'int32', 'float32']).columns.tolist()
    
    numeric_cols = [col for col in numeric_cols if col not in columns_not_in_test]

    imputer = KNNImputer(n_neighbors=5)

    #Fit on training set
    imputer.fit(train_df[numeric_cols])

    #Transform both
    train_df[numeric_cols] = imputer.transform(train_df[numeric_cols])
    test_df[numeric_cols] = imputer.transform(test_df[numeric_cols])

    return train_df, test_df

In [14]:
train, test = numeric_imputation(train, test)

In [15]:
train.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,mean_no_motion_duration,median_no_motion_duration,max_no_motion_duration,std_no_motion_duration,mean_no_motion_count,median_no_motion_count,max_no_motion_count,std_no_motion_count,mean_std_across_hours_day,median_std_across_hours_day,max_std_across_hours_day,std_std_across_hours_day,mean_peak_hour_day,median_peak_hour_day,max_peak_hour_day,std_peak_hour_day,mean_entropy_day,median_entropy_day,max_entropy_day,std_entropy_day,mean_std_across_hours_night,median_std_across_hours_night,max_std_across_hours_night,std_std_across_hours_night,mean_peak_hour_night,median_peak_hour_night,max_peak_hour_night,std_peak_hour_night,mean_entropy_night,median_entropy_night,max_entropy_night,std_entropy_night,mean_mvpa_total_duration,median_mvpa_total_duration,max_mvpa_total_duration,std_mvpa_total_duration,mean_mvpa_count_periods,median_mvpa_count_periods,max_mvpa_count_periods,std_mvpa_count_periods,mean_low_act_total_duration,median_low_act_total_duration,max_low_act_total_duration,std_low_act_total_duration,mean_low_act_count_periods,median_low_act_count_periods,max_low_act_count_periods,std_low_act_count_periods,mean_moderate_act_total_duration,median_moderate_act_total_duration,max_moderate_act_total_duration,std_moderate_act_total_duration,mean_moderate_act_count_periods,median_moderate_act_count_periods,max_moderate_act_count_periods,std_moderate_act_count_periods,mean_count_transitions,median_count_transitions,max_count_transitions,std_count_transitions,correlation_light_enmo
0,00008ff9,Fall,5.0,0.0,Winter,51.0,Fall,16.877316,46.0,50.8,23.2,59.2,82.0,106.6,,3.8,4.6,36.2,Fall,0.0,0.0,13.78,1.6,12.28,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,2.748,,2.136,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,36.4,51.8,Fall,3.0,2.0,6559.472754,5748.5,13618.0,3965.053609,164.481627,148.4,399.8,102.468018,0.044604,0.041098,0.104451,0.027647,12.868128,12.8,17.0,3.029144,1.930638,2.003336,2.405488,0.522328,0.027366,0.023468,0.067415,0.015811,17.716365,18.9,21.2,4.331692,1.576391,1.612343,2.281391,0.477174,14346.805865,14619.0,25211.0,6615.337689,51.091027,54.2,81.6,21.012055,42532.694658,44339.0,66539.0,15292.031102,116.681657,128.0,192.6,54.579048,513.779287,399.5,1488.0,386.547341,5.975914,4.8,17.0,4.263757,8.6552,7.4,22.0,5.767433,0.132322
1,000fd460,Summer,9.0,0.0,,68.8,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,5.2,7.8,25.4,Fall,3.0,0.0,16.22,1.6,14.64,1.4,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,2.626,Fall,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,6665.907673,6082.5,14720.0,4128.360193,172.254162,167.0,340.2,81.772337,0.035358,0.036146,0.086503,0.020255,12.698615,13.4,16.8,3.121248,1.693202,1.813241,2.241675,0.56626,0.020397,0.01541,0.079083,0.017736,17.839251,18.6,21.6,3.844107,1.438604,1.469691,2.041372,0.471304,11275.960566,11433.0,19438.0,4026.253279,45.142282,47.2,64.8,13.435897,41427.507073,42303.0,62325.0,14122.561898,100.86455,113.7,157.6,44.180932,697.180674,583.5,1500.0,309.145707,6.837425,5.6,13.6,2.756057,6.126791,5.0,17.6,4.862391,0.178144
2,00105258,Summer,10.0,1.0,Fall,71.0,Fall,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,2.4,2.666232,18.20692,977.2406,1542.044,12.351336,46.3519,14.19112,4.015798,14.08811,2.6,23.7575,10.243036,43.68564,19.61508,36.10882,,2.3238,Summer,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,2531.930754,1598.0,8450.0,2359.737591,85.380847,64.6,231.6,62.626601,0.019546,0.011361,0.081623,0.021959,12.517853,13.4,17.0,3.317508,1.126886,0.959884,2.289095,0.60945,0.018414,0.013064,0.05232,0.015785,17.257585,18.8,22.0,5.626243,0.961518,0.970654,1.773141,0.599527,7360.603182,6974.0,17008.0,5333.300078,29.331578,27.8,57.6,17.924631,22293.493047,20998.0,42821.0,11367.667389,64.802605,51.0,147.8,42.459857,397.676812,340.5,900.0,255.959519,4.193237,3.7,9.0,2.397566,4.494649,3.4,14.6,3.774495,0.210945
3,00115b9f,Winter,9.0,0.0,Fall,71.0,Summer,18.292347,56.0,81.6,26.0,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,18.88,1.6,21.4,2.2,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,2.424,Winter,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,389.545455,105.0,2300.0,557.549554,41.060606,13.0,272.0,60.863032,0.016498,0.011422,0.057418,0.015671,12.589744,13.0,17.0,3.084067,0.916029,0.887219,2.243301,0.731855,0.017414,0.015299,0.054692,0.01382,15.8,19.0,23.0,7.549378,0.804196,0.583313,2.061804,0.795553,2410.666667,1470.0,9930.0,3103.940391,11.866667,7.0,48.0,13.798896,7246.578947,7115.0,23505.0,6183.344351,32.421053,21.0,126.0,31.514222,95.833333,70.0,220.0,61.352805,1.166667,1.0,2.0,0.408248,1.5,1.0,3.0,0.888523,0.129729
4,0016bb22,Spring,18.0,1.0,Summer,67.8,,26.713639,64.14,125.0,33.6,70.6,74.2,125.8,,4.4,8.4,17.8,,12.8,0.2,28.48,2.0,28.8,2.0,1.4,0.0,10.1,0.6,9.5,0.6,10.7,0.8,,2.4,4.382366,26.06698,1394.988,2144.724,29.72234,90.84782,16.01834,10.048682,56.67218,2.4,35.37708,25.74848,86.4656,47.54038,65.0994,Summer,1.04,,2.0724,,,,,,,,,,,,,,,,,,,,,,,,42.0,58.8,,2.8,,3172.544286,2636.5,8401.0,1938.672118,180.88619,160.4,454.6,112.344754,0.013944,0.013636,0.030328,0.007314,12.753497,13.1,17.0,2.859426,1.67775,1.85181,2.286355,0.587331,0.013005,0.011961,0.039652,0.007519,16.319573,18.8,22.8,6.581137,1.581126,1.689396,2.247904,0.667945,4031.257974,4099.0,7799.0,1904.495212,21.610999,22.3,37.6,9.139527,32301.158571,32221.0,52536.0,13661.636835,99.05,103.0,161.4,45.620557,679.983333,537.5,1885.0,486.388753,5.023333,4.2,11.6,2.954711,6.803429,6.4,16.2,4.19439,0.149184


In [16]:
#Get categorical columns
cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

#File the nan values in categorical collumns with 'Missing'
def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

train = update(train)
test = update(test)

#Convert from categorical to numeric 
def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    mapping = create_mapping(col, train)
    mappingTe = create_mapping(col, test)
    
    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mappingTe).astype(int)

## Feature engineering

In [17]:
def feature_engineering(df):
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    df['BMI_PHR'] = df['Physical-BMI'] * df['Physical-HeartRate']

    # Replace any remaining inf values with NaN
    df = df.replace([np.inf, -np.inf], np.nan)
    
    return df

In [18]:
train = feature_engineering(train)
#Remove rows which has got fewer than 10 non-null values
train = train.dropna(thresh=10, axis=0)
test = feature_engineering(test)

train = train.drop('id', axis=1)
test  = test .drop('id', axis=1)

In [19]:
featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii',
                'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW','BMI_PHR']

featuresCols += time_series_cols

train = train[featuresCols]

featuresCols.remove('sii')
test = test[featuresCols]

In [20]:
#Check and replace any INF values with NaN
if np.any(np.isinf(train)):
    train = train.replace([np.inf, -np.inf], np.nan)

In [21]:
df = train
# Get columns with NaN values
nan_columns = df.columns[df.isna().any()].tolist()

# Calculate number of NaN values per column
nan_counts = df[nan_columns].isna().sum()

# Sort by number of NaN values (descending)
nan_counts_sorted = nan_counts.sort_values(ascending=False)

# Display results
print("\nColumns with NaN values:")
print("-" * 50)
for col, count in nan_counts_sorted.items():
    total = len(df)
    percentage = (count/total * 100)
    print(f"{col:<30} {count:>7} NaN values ({percentage:>6.2f}%)")

print(f"\nTotal columns with NaN values: {len(nan_columns)}")


Columns with NaN values:
--------------------------------------------------
sii                               1241 NaN values ( 31.34%)
BMR_Weight                          63 NaN values (  1.59%)
DEE_Weight                          63 NaN values (  1.59%)
Hydration_Status                    63 NaN values (  1.59%)

Total columns with NaN values: 4


In [22]:
train = train.dropna(subset=['BMR_Weight', 'DEE_Weight', 'Hydration_Status'])

In [23]:
train, test = numeric_imputation(train, test)

In [24]:
# train.to_csv("train1.csv", index=False)
print(train.shape, test.shape)

(3897, 136) (20, 135)


In [25]:
# train.to_csv("train1.csv", index=False)
train.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW,BMI_PHR,mean_no_motion_duration,median_no_motion_duration,max_no_motion_duration,std_no_motion_duration,mean_no_motion_count,median_no_motion_count,max_no_motion_count,std_no_motion_count,mean_std_across_hours_day,median_std_across_hours_day,max_std_across_hours_day,std_std_across_hours_day,mean_peak_hour_day,median_peak_hour_day,max_peak_hour_day,std_peak_hour_day,mean_entropy_day,median_entropy_day,max_entropy_day,std_entropy_day,mean_std_across_hours_night,median_std_across_hours_night,max_std_across_hours_night,std_std_across_hours_night,mean_peak_hour_night,median_peak_hour_night,max_peak_hour_night,std_peak_hour_night,mean_entropy_night,median_entropy_night,max_entropy_night,std_entropy_night,mean_mvpa_total_duration,median_mvpa_total_duration,max_mvpa_total_duration,std_mvpa_total_duration,mean_mvpa_count_periods,median_mvpa_count_periods,max_mvpa_count_periods,std_mvpa_count_periods,mean_low_act_total_duration,median_low_act_total_duration,max_low_act_total_duration,std_low_act_total_duration,mean_low_act_count_periods,median_low_act_count_periods,max_low_act_count_periods,std_low_act_count_periods,mean_moderate_act_total_duration,median_moderate_act_total_duration,max_moderate_act_total_duration,std_moderate_act_total_duration,mean_moderate_act_count_periods,median_moderate_act_count_periods,max_moderate_act_count_periods,std_moderate_act_count_periods,mean_count_transitions,median_count_transitions,max_count_transitions,std_count_transitions,correlation_light_enmo
0,0.0,5.0,0.0,0.0,51.0,0.0,16.877316,46.0,50.8,23.2,59.2,82.0,106.6,0.0,3.8,4.6,36.2,0.0,0.0,0.0,13.78,1.6,12.28,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,0.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,0.0,2.748,0.0,2.136,0.0,36.4,51.8,0.0,3.0,2.0,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453,1383.939887,6559.472754,5748.5,13618.0,3965.053609,164.481627,148.4,399.8,102.468018,0.044604,0.041098,0.104451,0.027647,12.868128,12.8,17.0,3.029144,1.930638,2.003336,2.405488,0.522328,0.027366,0.023468,0.067415,0.015811,17.716365,18.9,21.2,4.331692,1.576391,1.612343,2.281391,0.477174,14346.805865,14619.0,25211.0,6615.337689,51.091027,54.2,81.6,21.012055,42532.694658,44339.0,66539.0,15292.031102,116.681657,128.0,192.6,54.579048,513.779287,399.5,1488.0,386.547341,5.975914,4.8,17.0,4.263757,8.6552,7.4,22.0,5.767433,0.132322
1,1.0,9.0,0.0,1.0,68.8,0.0,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,0.0,5.2,7.8,25.4,0.0,3.0,0.0,16.22,1.6,14.64,1.4,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,1.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,0.0,2.626,1.0,2.34,1.0,46.0,64.0,1.0,0.0,0.0,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492,982.49132,6665.907673,6082.5,14720.0,4128.360193,172.254162,167.0,340.2,81.772337,0.035358,0.036146,0.086503,0.020255,12.698615,13.4,16.8,3.121248,1.693202,1.813241,2.241675,0.56626,0.020397,0.01541,0.079083,0.017736,17.839251,18.6,21.6,3.844107,1.438604,1.469691,2.041372,0.471304,11275.960566,11433.0,19438.0,4026.253279,45.142282,47.2,64.8,13.435897,41427.507073,42303.0,62325.0,14122.561898,100.86455,113.7,157.6,44.180932,697.180674,583.5,1500.0,309.145707,6.837425,5.6,13.6,2.756057,6.126791,5.0,17.6,4.862391,0.178144
2,1.0,10.0,1.0,2.0,71.0,0.0,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,1.0,5.0,7.0,33.0,0.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.0,2.4,2.666232,18.20692,977.2406,1542.044,12.351336,46.3519,14.19112,4.015798,14.08811,2.6,23.7575,10.243036,43.68564,19.61508,36.10882,0.0,2.3238,2.0,2.17,1.0,38.0,54.0,1.0,2.0,0.0,166.486961,20.0,33.297392,0.773778,1.007312,0.285049,1.209833,13767.473069,21724.485497,12.926463,20.397407,0.34717,4.884479,0.47763,0.657942,1564.97743,2531.930754,1598.0,8450.0,2359.737591,85.380847,64.6,231.6,62.626601,0.019546,0.011361,0.081623,0.021959,12.517853,13.4,17.0,3.317508,1.126886,0.959884,2.289095,0.60945,0.018414,0.013064,0.05232,0.015785,17.257585,18.8,22.0,5.626243,0.961518,0.970654,1.773141,0.599527,7360.603182,6974.0,17008.0,5333.300078,29.331578,27.8,57.6,17.924631,22293.493047,20998.0,42821.0,11367.667389,64.802605,51.0,147.8,42.459857,397.676812,340.5,900.0,255.959519,4.193237,3.7,9.0,2.397566,4.494649,3.4,14.6,3.774495,0.210945
3,2.0,9.0,0.0,2.0,71.0,1.0,18.292347,56.0,81.6,26.0,60.0,97.0,117.0,2.0,6.0,9.0,37.0,1.0,18.0,1.0,18.88,1.6,21.4,2.2,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,0.0,2.424,3.0,2.451,2.0,31.0,45.0,2.0,0.0,1.0,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008,1774.357653,389.545455,105.0,2300.0,557.549554,41.060606,13.0,272.0,60.863032,0.016498,0.011422,0.057418,0.015671,12.589744,13.0,17.0,3.084067,0.916029,0.887219,2.243301,0.731855,0.017414,0.015299,0.054692,0.01382,15.8,19.0,23.0,7.549378,0.804196,0.583313,2.061804,0.795553,2410.666667,1470.0,9930.0,3103.940391,11.866667,7.0,48.0,13.798896,7246.578947,7115.0,23505.0,6183.344351,32.421053,21.0,126.0,31.514222,95.833333,70.0,220.0,61.352805,1.166667,1.0,2.0,0.408248,1.5,1.0,3.0,0.888523,0.129729
4,3.0,18.0,1.0,3.0,67.8,2.0,26.713639,64.14,125.0,33.6,70.6,74.2,125.8,0.0,4.4,8.4,17.8,2.0,12.8,0.2,28.48,2.0,28.8,2.0,1.4,0.0,10.1,0.6,9.5,0.6,10.7,0.8,2.0,2.4,4.382366,26.06698,1394.988,2144.724,29.72234,90.84782,16.01834,10.048682,56.67218,2.4,35.37708,25.74848,86.4656,47.54038,65.0994,1.0,1.04,0.0,2.0724,0.0,42.0,58.8,3.0,2.8,,480.845494,50.4,74.798188,2.174098,0.282649,0.177312,1.328209,79057.011034,121546.184578,11.159904,17.157792,0.741197,4.731007,0.520795,0.543432,1982.15198,3172.544286,2636.5,8401.0,1938.672118,180.88619,160.4,454.6,112.344754,0.013944,0.013636,0.030328,0.007314,12.753497,13.1,17.0,2.859426,1.67775,1.85181,2.286355,0.587331,0.013005,0.011961,0.039652,0.007519,16.319573,18.8,22.8,6.581137,1.581126,1.689396,2.247904,0.667945,4031.257974,4099.0,7799.0,1904.495212,21.610999,22.3,37.6,9.139527,32301.158571,32221.0,52536.0,13661.636835,99.05,103.0,161.4,45.620557,679.983333,537.5,1885.0,486.388753,5.023333,4.2,11.6,2.954711,6.803429,6.4,16.2,4.19439,0.149184


# VIME Definition

## Mask & Pretext generator

**The below cell defines severals utilities for building VIME, including mask_generator & pretext_generator**

In [26]:
def mask_generator (p_m, x):
  """Generate mask vector.
  
  Args:
    - p_m: corruption probability
    - x: feature matrix
    
  Returns:
    - mask: binary mask matrix 
  """
  mask = np.random.binomial(1, p_m, x.shape)
  return mask

def pretext_generator (m, x):  
  """Generate corrupted samples.
  
  Args:
    m: mask matrix
    x: feature matrix
    
  Returns:
    m_new: final mask matrix after corruption
    x_tilde: corrupted feature matrix
  """
  
  # Parameters
  no, dim = x.shape  
  # Randomly (and column-wise) shuffle data
  x_bar = np.zeros([no, dim])
  for i in range(dim):
    idx = np.random.permutation(no)
    x_bar[:, i] = x[idx, i]
    
  # Corrupt samples
  x_tilde = x * (1-m) + x_bar * m  
  # Define new mask matrix
  m_new = 1 * (x != x_tilde)

  return m_new, x_tilde

def convert_matrix_to_vector(matrix):
  """Convert two dimensional matrix into one dimensional vector
  
  Args:
    - matrix: two dimensional matrix
    
  Returns:
    - vector: one dimensional vector
  """
  # Parameters
  no, dim = matrix.shape
  # Define output  
  vector = np.zeros([no,])
  
  # Convert matrix to vector
  for i in range(dim):
    idx = np.where(matrix[:, i] == 1)
    vector[idx] = i
    
  return vector

def convert_vector_to_matrix(vector):
  """Convert one dimensional vector into two dimensional matrix
  
  Args:
    - vector: one dimensional vector
    
  Returns:
    - matrix: two dimensional matrix
  """
  # Parameters
  no = len(vector)
  dim = len(np.unique(vector))
  # Define output
  matrix = np.zeros([no,dim])
  
  # Convert vector to matrix
  for i in range(dim):
    idx = np.where(vector == i)
    matrix[idx, i] = 1
    
  return matrix

## VIME Self-supervised framework

In [27]:
# Necessary packages
from tensorflow import keras
from tensorflow.keras import Model, Input, layers

def vime_self (x_unlab, p_m, alpha, parameters):
  """Self-supervised learning part in VIME.
  
  Args:
    x_unlab: unlabeled feature
    p_m: corruption probability
    alpha: hyper-parameter to control the weights of feature and mask losses
    parameters: epochs, batch_size
    
  Returns:
    encoder: Representation learning block
  """
    
  # Parameters
  _, dim = x_unlab.shape
  epochs = parameters['epochs']
  batch_size = parameters['batch_size']
  
  # Build model using Functional API
  inputs = Input(shape=(dim,))
  # Encoder
  h = layers.Dense(dim, activation='relu')(inputs)
  # Mask estimator
  mask_output = layers.Dense(dim, activation='sigmoid', name='mask')(h)
  # Feature estimator
  feature_output = layers.Dense(dim, activation='sigmoid', name='feature')(h)
  
  #Create model
  model = Model(inputs = inputs, outputs = [mask_output, feature_output])
  
  model.compile(optimizer='rmsprop',
                loss={'mask': 'binary_crossentropy', 
                      'feature': 'mean_squared_error'},
                loss_weights={'mask':1.0
                              , 'feature':float(alpha)})
  
  # Generate corrupted samples
  m_unlab = mask_generator(p_m, x_unlab)
  m_label, x_tilde = pretext_generator(m_unlab, x_unlab)
  
  # Fit model on unlabeled data
  model.fit(x_tilde, {'mask': m_label, 'feature': x_unlab}, 
            epochs = epochs, batch_size= batch_size)
      
  # Extract encoder
  encoder = Model(
      inputs=model.input,
      outputs=model.layers[1].output,
      name='encoder'
  )
  
  return encoder

## VIME Semi-supervised framework

In [28]:
import numpy as np
import tensorflow as tf
import numpy as np
from tensorflow import keras
 
"""
Expected flow:
- Load & process data
- Create network architecture
- Train the model using:
+ Supervised loss from labeled data
+ Unsupervised loss from unlabeled data
"""

class Predictor(keras.Model):
    def __init__(self, hidden_dim, label_dim):
        super(Predictor, self).__init__()
        #Define layers
        self.dense1 = keras.layers.Dense(hidden_dim, activation='relu')
        self.dense2 = keras.layers.Dense(hidden_dim, activation='relu')
        self.output_layer = keras.layers.Dense(label_dim)
        
    def call(self, x_input):
        #Forward pass
        inter_layer = self.dense1(x_input)
        inter_layer = self.dense2(inter_layer)
        y_hat_logit = self.output_layer(inter_layer)
        y_hat = tf.nn.softmax(y_hat_logit)
        return y_hat_logit, y_hat
 
def vime_semi(x_train, y_train, x_unlab, x_test, parameters, 
              p_m, K, beta, file_name):
    """Semi-supervied learning part in VIME.
    
    Args:
        - x_train, y_train: training dataset
        - x_unlab: unlabeled dataset
        - x_test: testing features
        - parameters: network parameters (hidden_dim, batch_size, iterations)
        - p_m: corruption probability
        - K: number of augmented samples
        - beta: hyperparameter to control supervised and unsupervised loss
        - file_name: saved filed name for the encoder function
        
    Returns:
        - y_test_hat: prediction on x_test
    """
    class WeightedKappaLoss(tf.keras.losses.Loss):
        def __init__(self, num_classes, name="weighted_kappa_loss"):
            super().__init__(name=name)
            self.num_classes = num_classes
            # Create weight matrix
            weights = tf.cast(tf.range(num_classes), tf.float32)
            weights = tf.expand_dims(weights, 0) - tf.expand_dims(weights, 1)
            self.weights = tf.square(weights)
            
        @tf.function
        def call(self, y_true, y_pred):
            # Convert types
            y_true = tf.cast(y_true, tf.float32)
            y_pred = tf.cast(y_pred, tf.float32)
            
            # Apply softmax
            y_pred = tf.nn.softmax(y_pred)
            
            # Calculate confusion matrix
            batch_size = tf.cast(tf.shape(y_true)[0], tf.float32)
            confusion = tf.matmul(y_true, y_pred, transpose_a=True)
            confusion = confusion / batch_size
            
            # Calculate agreements
            observed = tf.reduce_sum(confusion * (1.0 - self.weights))
            expected_rows = tf.reduce_sum(confusion, axis=1)
            expected_cols = tf.reduce_sum(confusion, axis=0)
            expected = tf.reduce_sum(
                tf.matmul(tf.expand_dims(expected_rows, 1),
                         tf.expand_dims(expected_cols, 0)) * (1.0 - self.weights)
            )
            
            # Calculate kappa
            kappa = (observed - expected) / (1.0 - expected + 1e-8)
            return 1.0 - kappa
    
    # Network parameters
    hidden_dim = parameters['hidden_dim']
    batch_size = parameters['batch_size']
    iterations = parameters['iterations']
      
    # Basic parameters
    data_dim = x_train.shape[1]
    label_dim = y_train.shape[1]

    # Divide training and validation sets (9:1)
    idx = np.random.permutation(len(x_train))
    train_idx = idx[:int(len(idx)*0.9)]
    valid_idx = idx[int(len(idx)*0.9):]
    
    x_valid, y_valid = x_train[valid_idx], y_train[valid_idx]
    x_train, y_train = x_train[train_idx], y_train[train_idx]
    
    # Load encoder from self-supervised model
    encoder = keras.models.load_model(file_name)
    
    # Create predictor model
    predictor = Predictor(hidden_dim, label_dim)
    optimizer = keras.optimizers.Adam()
    
    # Encode validation and testing features
    x_valid = encoder.predict(x_valid)
    x_test = encoder.predict(x_test)
    
    # Setup checkpointing
    checkpoint = tf.train.Checkpoint(model=predictor)
    manager = tf.train.CheckpointManager(
        checkpoint, './save_model', max_to_keep=1)
    
    # Early stopping variables
    best_valid_loss = float('inf')
    patience = -1
    
    # Initialize loss
    loss_fn = WeightedKappaLoss(num_classes=label_dim)
    
    @tf.function
    def train_step(x_batch, y_batch, xu_batch):
        with tf.GradientTape() as tape:
            #Forward pass
            y_logits, _ = predictor(x_batch)
            yv_logits, _ = predictor(xu_batch)
            
            # Calculate supervised loss
            supervised_loss = loss_fn(y_batch, y_logits)

            # Unsupervised loss using variance
            unsupervised_loss = tf.reduce_mean(
                tf.math.reduce_variance(yv_logits, axis=0))
            
            total_loss = supervised_loss + beta * unsupervised_loss
        
        # Compute gradients and update weights
        gradients = tape.gradient(total_loss, predictor.trainable_variables)
        optimizer.apply_gradients(zip(gradients, predictor.trainable_variables))
        
        return total_loss
        
    for it in range(iterations):
        # Select a batch of labeled data
        batch_idx = np.random.permutation(len(x_train))[:batch_size]
        x_batch = x_train[batch_idx]
        y_batch = y_train[batch_idx]
        
        # Encode labeled data
        x_batch = encoder.predict(x_batch)
        
        # Select and augment unlabeled data
        batch_u_idx = np.random.permutation(len(x_unlab))[:batch_size]
        xu_batch_ori = x_unlab[batch_u_idx]
        
        xu_batch = []
        for _ in range(K):
            # Mask vector generation
            m_batch = mask_generator(p_m, xu_batch_ori)
            # Pretext generator
            _, xu_batch_temp = pretext_generator(m_batch, xu_batch_ori)
            
            # Encode corrupted samples
            xu_batch_temp = encoder.predict(xu_batch_temp)
            xu_batch.append(xu_batch_temp)
        
        #Convert list to matrix
        xu_batch = tf.convert_to_tensor(np.array(xu_batch))
        
        # Training step
        loss = train_step(x_batch, y_batch, xu_batch)
        
        # Validation step
        val_logits, _ = predictor(x_valid)
        
        val_loss = loss_fn(y_valid, val_logits)
        
        if it % 100 == 0:
            print(f'Iteration: {it}/{iterations}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}')
            
        # Early stopping
        if val_loss < best_valid_loss:
            best_valid_loss = val_loss
            manager.save()
            patience = 0
        else:
            patience += 1
            if patience >= 100:
                break
            
        
    # Restore best model
    checkpoint.restore(manager.latest_checkpoint)
    
    # Generate predictions
    _, y_test_hat = predictor(x_test)
    
    return y_test_hat.numpy()

# Labeling 

## Split training to label & unlabel

In [29]:
from sklearn.preprocessing import MinMaxScaler

def load_data(train):
    df = train.copy()
    
    cols = df.columns
    categorial_cols = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

    normalized_cols = [col for col in cols if col not in categorial_cols]
    normalized_cols.remove('sii')
    
    scaler = MinMaxScaler()
    df[normalized_cols] = scaler.fit_transform(df[normalized_cols])
    
    # Split into labeled and unlabeled sets
    labeled_df = df[df['sii'].notna()]
    unlabeled_df = df[df['sii'].isna()]
    
    x_label = labeled_df.drop(columns=['sii'])
    y_label = labeled_df['sii']
    y_label = np.asarray(pd.get_dummies(y_label))
    x_unlabel = unlabeled_df.drop(columns=['sii'])
    
    return x_label, y_label, x_unlabel, normalized_cols, scaler

**Load labeled & unlabeled data, and convert them to numpy.narray type**

In [30]:
x_label, y_label, x_unlabel, normalized_cols, scaler = load_data(train)

x_label = x_label.to_numpy().astype(np.float32)
x_unlabel = x_unlabel.to_numpy().astype(np.float32)

**Apply min-max scaler on test set**

In [31]:
test[normalized_cols] = scaler.transform(test[normalized_cols])

## VIME Usage

**Define hyperparameters for VIME**

In [32]:
#Define hyperparameters
p_m = 0.3 #Corruption probability for self-supervised learning
alpha = 2.0 #Control the weights of feature and mask losses
K = 3 #number of augmented samples
beta = 1.0 #Control the weights of the supervised and unsupervised losses

**Train VIME-Self**

In [33]:
#Re-define unlabeled data
x_unlabel_combination = np.concatenate([x_label, x_unlabel], axis=0)

In [34]:
pd.set_option('display.max_rows', 20)  # Show only 20 rows
pd.set_option('display.min_rows', 10)  # Minimum rows to show
# For Jupyter notebook display
from IPython.display import display_html

vime_self_parameters = dict()
vime_self_parameters['batch_size'] = 128
vime_self_parameters['epochs'] = 50
vime_self_encoder = vime_self(x_unlabel_combination, p_m, alpha, vime_self_parameters)

#Save encoder
if not os.path.exists('save_model'):
  os.makedirs('save_model')

file_name = './save_model/encoder_model.h5'
  
vime_self_encoder.save(file_name) 

Epoch 1/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.1848
Epoch 2/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9892
Epoch 3/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9773
Epoch 4/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9760
Epoch 5/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9756
Epoch 6/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9691
Epoch 7/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9722
Epoch 8/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9673
Epoch 9/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9653
Epoch 10/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.9616
Epoch 11/

**Train VIME_Semi using encoder from VIME_Self**

In [35]:
# Train VIME-Semi
vime_semi_parameters = dict()
vime_semi_parameters['hidden_dim'] = 50
vime_semi_parameters['batch_size'] = 128
vime_semi_parameters['iterations'] = 700

file_name = './save_model/encoder_model.h5'

y_test_hat = vime_semi(x_label, y_label, x_unlabel, x_unlabel, 
                       vime_semi_parameters, p_m, K, beta, file_name)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Iteration: 0/700, Loss: 1.0132, Val Loss: 0.9849
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [

In [36]:
y_test_hat = np.argmax(y_test_hat, axis=1)

In [37]:
# Count unique values in y_test_hat
unique_values, value_counts = np.unique(y_test_hat, return_counts=True)

# Display results
for value, count in zip(unique_values, value_counts):
    print(f"Value {value}: {count} occurrences")

# Optional: verify total
print(f"\nTotal elements: {len(y_test_hat)}")
print(f"Sum of counts: {np.sum(value_counts)}")

Value 0: 743 occurrences
Value 1: 407 occurrences
Value 3: 80 occurrences

Total elements: 1230
Sum of counts: 1230


## Data consolidation 

In [38]:
x_train = np.concatenate([x_label, x_unlabel], axis=0)
print(x_train.shape)
x_train

(3897, 135)


array([[0.        , 0.        , 0.        , ..., 0.40384614, 0.35927713,
        0.47250053],
       [1.        , 0.23529412, 0.        , ..., 0.31923077, 0.30289838,
        0.52149564],
       [1.        , 0.29411766, 1.        , ..., 0.26153848, 0.23512886,
        0.5565684 ],
       ...,
       [3.        , 0.23529412, 0.        , ..., 0.2769231 , 0.27038237,
        0.49884948],
       [2.        , 0.29411766, 0.        , ..., 0.43076923, 0.3873909 ,
        0.5516674 ],
       [3.        , 0.3529412 , 0.        , ..., 0.31538463, 0.31113008,
        0.48622155]], dtype=float32)

In [39]:
y_label_converted = np.argmax(y_label, axis=1).reshape(-1, 1)
y_train = np.concatenate([y_label_converted, y_test_hat.reshape(-1, 1)],axis=0)
print(y_train.shape)
y_train

(3897, 1)


array([[2],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [40]:
train.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW,BMI_PHR,mean_no_motion_duration,median_no_motion_duration,max_no_motion_duration,std_no_motion_duration,mean_no_motion_count,median_no_motion_count,max_no_motion_count,std_no_motion_count,mean_std_across_hours_day,median_std_across_hours_day,max_std_across_hours_day,std_std_across_hours_day,mean_peak_hour_day,median_peak_hour_day,max_peak_hour_day,std_peak_hour_day,mean_entropy_day,median_entropy_day,max_entropy_day,std_entropy_day,mean_std_across_hours_night,median_std_across_hours_night,max_std_across_hours_night,std_std_across_hours_night,mean_peak_hour_night,median_peak_hour_night,max_peak_hour_night,std_peak_hour_night,mean_entropy_night,median_entropy_night,max_entropy_night,std_entropy_night,mean_mvpa_total_duration,median_mvpa_total_duration,max_mvpa_total_duration,std_mvpa_total_duration,mean_mvpa_count_periods,median_mvpa_count_periods,max_mvpa_count_periods,std_mvpa_count_periods,mean_low_act_total_duration,median_low_act_total_duration,max_low_act_total_duration,std_low_act_total_duration,mean_low_act_count_periods,median_low_act_count_periods,max_low_act_count_periods,std_low_act_count_periods,mean_moderate_act_total_duration,median_moderate_act_total_duration,max_moderate_act_total_duration,std_moderate_act_total_duration,mean_moderate_act_count_periods,median_moderate_act_count_periods,max_moderate_act_count_periods,std_moderate_act_count_periods,mean_count_transitions,median_count_transitions,max_count_transitions,std_count_transitions,correlation_light_enmo
0,0.0,5.0,0.0,0.0,51.0,0.0,16.877316,46.0,50.8,23.2,59.2,82.0,106.6,0.0,3.8,4.6,36.2,0.0,0.0,0.0,13.78,1.6,12.28,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,0.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,0.0,2.748,0.0,2.136,0.0,36.4,51.8,0.0,3.0,2.0,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453,1383.939887,6559.472754,5748.5,13618.0,3965.053609,164.481627,148.4,399.8,102.468018,0.044604,0.041098,0.104451,0.027647,12.868128,12.8,17.0,3.029144,1.930638,2.003336,2.405488,0.522328,0.027366,0.023468,0.067415,0.015811,17.716365,18.9,21.2,4.331692,1.576391,1.612343,2.281391,0.477174,14346.805865,14619.0,25211.0,6615.337689,51.091027,54.2,81.6,21.012055,42532.694658,44339.0,66539.0,15292.031102,116.681657,128.0,192.6,54.579048,513.779287,399.5,1488.0,386.547341,5.975914,4.8,17.0,4.263757,8.6552,7.4,22.0,5.767433,0.132322
1,1.0,9.0,0.0,1.0,68.8,0.0,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,0.0,5.2,7.8,25.4,0.0,3.0,0.0,16.22,1.6,14.64,1.4,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,1.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,0.0,2.626,1.0,2.34,1.0,46.0,64.0,1.0,0.0,0.0,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492,982.49132,6665.907673,6082.5,14720.0,4128.360193,172.254162,167.0,340.2,81.772337,0.035358,0.036146,0.086503,0.020255,12.698615,13.4,16.8,3.121248,1.693202,1.813241,2.241675,0.56626,0.020397,0.01541,0.079083,0.017736,17.839251,18.6,21.6,3.844107,1.438604,1.469691,2.041372,0.471304,11275.960566,11433.0,19438.0,4026.253279,45.142282,47.2,64.8,13.435897,41427.507073,42303.0,62325.0,14122.561898,100.86455,113.7,157.6,44.180932,697.180674,583.5,1500.0,309.145707,6.837425,5.6,13.6,2.756057,6.126791,5.0,17.6,4.862391,0.178144
2,1.0,10.0,1.0,2.0,71.0,0.0,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,1.0,5.0,7.0,33.0,0.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.0,2.4,2.666232,18.20692,977.2406,1542.044,12.351336,46.3519,14.19112,4.015798,14.08811,2.6,23.7575,10.243036,43.68564,19.61508,36.10882,0.0,2.3238,2.0,2.17,1.0,38.0,54.0,1.0,2.0,0.0,166.486961,20.0,33.297392,0.773778,1.007312,0.285049,1.209833,13767.473069,21724.485497,12.926463,20.397407,0.34717,4.884479,0.47763,0.657942,1564.97743,2531.930754,1598.0,8450.0,2359.737591,85.380847,64.6,231.6,62.626601,0.019546,0.011361,0.081623,0.021959,12.517853,13.4,17.0,3.317508,1.126886,0.959884,2.289095,0.60945,0.018414,0.013064,0.05232,0.015785,17.257585,18.8,22.0,5.626243,0.961518,0.970654,1.773141,0.599527,7360.603182,6974.0,17008.0,5333.300078,29.331578,27.8,57.6,17.924631,22293.493047,20998.0,42821.0,11367.667389,64.802605,51.0,147.8,42.459857,397.676812,340.5,900.0,255.959519,4.193237,3.7,9.0,2.397566,4.494649,3.4,14.6,3.774495,0.210945
3,2.0,9.0,0.0,2.0,71.0,1.0,18.292347,56.0,81.6,26.0,60.0,97.0,117.0,2.0,6.0,9.0,37.0,1.0,18.0,1.0,18.88,1.6,21.4,2.2,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,0.0,2.424,3.0,2.451,2.0,31.0,45.0,2.0,0.0,1.0,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008,1774.357653,389.545455,105.0,2300.0,557.549554,41.060606,13.0,272.0,60.863032,0.016498,0.011422,0.057418,0.015671,12.589744,13.0,17.0,3.084067,0.916029,0.887219,2.243301,0.731855,0.017414,0.015299,0.054692,0.01382,15.8,19.0,23.0,7.549378,0.804196,0.583313,2.061804,0.795553,2410.666667,1470.0,9930.0,3103.940391,11.866667,7.0,48.0,13.798896,7246.578947,7115.0,23505.0,6183.344351,32.421053,21.0,126.0,31.514222,95.833333,70.0,220.0,61.352805,1.166667,1.0,2.0,0.408248,1.5,1.0,3.0,0.888523,0.129729
4,3.0,18.0,1.0,3.0,67.8,2.0,26.713639,64.14,125.0,33.6,70.6,74.2,125.8,0.0,4.4,8.4,17.8,2.0,12.8,0.2,28.48,2.0,28.8,2.0,1.4,0.0,10.1,0.6,9.5,0.6,10.7,0.8,2.0,2.4,4.382366,26.06698,1394.988,2144.724,29.72234,90.84782,16.01834,10.048682,56.67218,2.4,35.37708,25.74848,86.4656,47.54038,65.0994,1.0,1.04,0.0,2.0724,0.0,42.0,58.8,3.0,2.8,,480.845494,50.4,74.798188,2.174098,0.282649,0.177312,1.328209,79057.011034,121546.184578,11.159904,17.157792,0.741197,4.731007,0.520795,0.543432,1982.15198,3172.544286,2636.5,8401.0,1938.672118,180.88619,160.4,454.6,112.344754,0.013944,0.013636,0.030328,0.007314,12.753497,13.1,17.0,2.859426,1.67775,1.85181,2.286355,0.587331,0.013005,0.011961,0.039652,0.007519,16.319573,18.8,22.8,6.581137,1.581126,1.689396,2.247904,0.667945,4031.257974,4099.0,7799.0,1904.495212,21.610999,22.3,37.6,9.139527,32301.158571,32221.0,52536.0,13661.636835,99.05,103.0,161.4,45.620557,679.983333,537.5,1885.0,486.388753,5.023333,4.2,11.6,2.954711,6.803429,6.4,16.2,4.19439,0.149184


In [41]:
TRAIN = pd.DataFrame(x_train)
TRAIN.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134
0,0.0,0.0,0.0,0.0,0.026694,0.0,0.165085,0.285714,0.108396,0.1625,0.330726,0.495495,0.525123,0.0,0.135714,0.23,0.613559,0.0,0.0,0.0,0.111129,0.3,0.099192,0.2,0.0,0.0,0.322581,0.0,0.285714,0.0,0.272727,1.0,0.0,0.25,0.002536,0.312401,0.001446,0.003385,0.002001,0.001446,0.02836,0.886743,0.98375,0.0,0.00407,0.001372,0.003283,0.004131,0.002134,0.0,0.515556,0.0,0.369596,0.0,0.24557,0.222581,0.0,1.0,0.026206,0.227273,0.313016,0.999985,0.831784,0.580108,0.308812,0.999605,0.999579,0.011544,0.011721,0.004809,0.526263,0.005195,0.655781,0.177046,0.211682,0.15904,0.154803,0.158995,0.243241,0.187848,0.14248,0.147826,0.354845,0.341267,0.065589,0.087521,0.527199,0.618182,1.0,0.389441,0.800796,0.821121,0.957594,0.443837,0.155801,0.284307,0.048856,0.048281,0.755894,0.859459,0.64,0.278452,0.658257,0.6602,0.919397,0.386908,0.539799,0.536822,0.585349,0.571354,0.616793,0.64142,0.555102,0.550234,0.470331,0.525623,0.174257,0.20942,0.61419,0.638191,0.547701,0.551133,0.145396,0.114964,0.150989,0.074736,0.208462,0.16,0.265625,0.195635,0.361777,0.278261,0.403846,0.359277,0.472501
1,1.0,0.235294,0.0,1.0,0.044969,0.0,0.108935,0.32967,0.092198,0.125,0.418994,0.387387,0.600985,0.0,0.185714,0.39,0.430508,0.0,0.026087,0.0,0.130806,0.3,0.118255,0.2,0.098039,0.0,0.506912,1.0,0.52381,1.0,0.136364,0.0,1.0,0.25,0.002515,0.259649,0.001497,0.003439,0.001309,0.001497,0.023632,0.878426,0.983161,0.0,0.002679,0.003331,0.003397,0.002985,0.00114,0.0,0.485432,1.0,0.418052,1.0,0.367089,0.419355,1.0,0.0,0.07082,0.0,0.0,0.999983,0.841076,0.533001,0.533739,0.999599,0.999572,0.01325,0.013542,0.003253,0.529755,0.004508,0.718209,0.102148,0.215117,0.168281,0.16733,0.165544,0.254735,0.211392,0.12124,0.117969,0.280668,0.300151,0.054093,0.064121,0.507802,0.672727,0.966667,0.401283,0.693078,0.743205,0.869999,0.481168,0.116124,0.186686,0.057311,0.05416,0.763823,0.843243,0.72,0.247109,0.600721,0.601789,0.82267,0.382148,0.424259,0.419829,0.451312,0.34774,0.544977,0.55858,0.440816,0.35184,0.45782,0.501446,0.162974,0.193404,0.529774,0.566332,0.447126,0.446134,0.197297,0.167914,0.152207,0.059771,0.238515,0.186667,0.2125,0.126457,0.242287,0.173913,0.319231,0.302898,0.521496
2,1.0,0.294118,1.0,2.0,0.047228,0.0,0.160568,0.516484,0.19209,0.21875,0.363128,0.603604,0.576355,1.0,0.178571,0.35,0.559322,0.0,0.173913,1.0,0.082258,0.0,0.11874,0.5,0.137255,1.0,0.460829,1.0,0.47619,1.0,0.227273,0.0,2.0,0.35,0.002536,0.337045,0.00199,0.00379,0.003269,0.00199,0.030139,0.891034,0.984298,0.8,0.003793,0.001807,0.004306,0.004152,0.002737,0.0,0.410815,2.0,0.377672,1.0,0.265823,0.258065,1.0,0.666667,0.113554,0.30303,0.20585,0.999986,0.82914,0.498069,0.32508,0.999612,0.999586,0.006929,0.006632,0.003645,0.525437,0.003137,0.469759,0.210822,0.081708,0.044211,0.096055,0.094623,0.126264,0.081772,0.082537,0.090349,0.153814,0.09433,0.050967,0.069517,0.487118,0.672727,1.0,0.426515,0.436156,0.393434,0.895356,0.517868,0.104836,0.158265,0.037917,0.048202,0.726296,0.854054,0.8,0.361669,0.401503,0.39745,0.714573,0.486115,0.276943,0.256091,0.394892,0.460627,0.354104,0.328994,0.391837,0.469385,0.241225,0.248455,0.110749,0.155677,0.337312,0.251256,0.418966,0.428755,0.112539,0.097986,0.091324,0.049488,0.146276,0.123333,0.140625,0.110008,0.165154,0.104348,0.261538,0.235129,0.556568
3,2.0,0.235294,0.0,2.0,0.047228,1.0,0.193045,0.505495,0.212338,0.25,0.335196,0.630631,0.576355,2.0,0.214286,0.45,0.627119,1.0,0.156522,1.0,0.152258,0.3,0.172859,0.6,0.098039,0.0,0.322581,0.0,0.333333,0.0,0.318182,1.0,3.0,0.5,0.002821,0.338667,0.003862,0.006874,0.004272,0.003863,0.029581,0.891953,0.98483,0.5,0.006513,0.003913,0.007578,0.006057,0.004481,0.0,0.435556,3.0,0.444418,2.0,0.177215,0.112903,2.0,0.0,0.111579,0.0,0.0,0.999987,0.827746,0.392341,0.385111,0.999623,0.9996,0.007727,0.008432,0.005529,0.526203,0.004204,0.47613,0.249886,0.012571,0.002905,0.026145,0.022357,0.060722,0.016456,0.096935,0.087804,0.129364,0.094833,0.035463,0.049609,0.495344,0.636364,1.0,0.396503,0.340496,0.363651,0.870869,0.621878,0.099142,0.185335,0.039635,0.042201,0.632258,0.864865,1.0,0.485292,0.33581,0.238847,0.830904,0.645059,0.090701,0.05398,0.230555,0.268082,0.14326,0.08284,0.326531,0.361346,0.070895,0.083598,0.059028,0.084679,0.164492,0.100503,0.356322,0.318227,0.02712,0.020144,0.022324,0.011862,0.040698,0.033333,0.03125,0.018732,0.023629,0.0,0.038462,0.05535,0.469728
4,3.0,0.470588,1.0,0.0,0.025667,1.0,0.271836,0.582418,0.315605,0.4,0.335196,0.414414,0.502463,0.0,0.15,0.25,0.528814,1.0,0.104348,0.0,0.133065,0.5,0.144588,0.5,0.117647,0.0,0.460829,1.0,0.52381,1.0,0.363636,0.0,3.0,0.25,0.002939,0.5594,0.006286,0.007464,0.008796,0.006286,0.042032,0.93367,0.990353,0.5,0.007541,0.005241,0.012034,0.008527,0.007502,0.0,0.422667,4.0,0.83848,2.0,0.291139,0.290323,4.0,0.0,0.244578,0.0,0.0,0.999994,0.82505,0.347861,0.369355,0.999718,0.999691,0.006024,0.005155,0.007355,0.52419,0.004191,0.185995,0.222288,0.164637,0.104994,0.167671,0.173869,0.305297,0.217722,0.164647,0.216136,0.080172,0.005332,0.071584,0.078385,0.448666,0.590909,1.0,0.470518,0.497604,0.467203,0.956664,0.609693,0.035499,0.050755,0.022562,0.02486,0.83871,0.918919,0.8,0.241425,0.46264,0.406645,0.92042,0.646888,0.129148,0.111907,0.139192,0.149159,0.209759,0.218935,0.197279,0.215002,0.318492,0.09482,0.287912,0.478169,0.127173,0.005025,0.238506,0.338807,0.205452,0.152518,0.143075,0.084617,0.230233,0.2,0.171875,0.123969,0.067513,0.0,0.192308,0.185393,0.589786


In [42]:
test.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW,BMI_PHR,mean_no_motion_duration,median_no_motion_duration,max_no_motion_duration,std_no_motion_duration,mean_no_motion_count,median_no_motion_count,max_no_motion_count,std_no_motion_count,mean_std_across_hours_day,median_std_across_hours_day,max_std_across_hours_day,std_std_across_hours_day,mean_peak_hour_day,median_peak_hour_day,max_peak_hour_day,std_peak_hour_day,mean_entropy_day,median_entropy_day,max_entropy_day,std_entropy_day,mean_std_across_hours_night,median_std_across_hours_night,max_std_across_hours_night,std_std_across_hours_night,mean_peak_hour_night,median_peak_hour_night,max_peak_hour_night,std_peak_hour_night,mean_entropy_night,median_entropy_night,max_entropy_night,std_entropy_night,mean_mvpa_total_duration,median_mvpa_total_duration,max_mvpa_total_duration,std_mvpa_total_duration,mean_mvpa_count_periods,median_mvpa_count_periods,max_mvpa_count_periods,std_mvpa_count_periods,mean_low_act_total_duration,median_low_act_total_duration,max_low_act_total_duration,std_low_act_total_duration,mean_low_act_count_periods,median_low_act_count_periods,max_low_act_count_periods,std_low_act_count_periods,mean_moderate_act_total_duration,median_moderate_act_total_duration,max_moderate_act_total_duration,std_moderate_act_total_duration,mean_moderate_act_count_periods,median_moderate_act_count_periods,max_moderate_act_count_periods,std_moderate_act_count_periods,mean_count_transitions,median_count_transitions,max_count_transitions,std_count_transitions,correlation_light_enmo
0,0.0,0.0,0.0,0.0,0.026694,0.0,0.165085,0.285714,0.108396,0.1625,0.330726,0.495495,0.525123,0.0,0.135714,0.23,0.613559,0.0,0.0,0.0,0.111129,0.3,0.099192,0.2,0.0,0.0,0.322581,0.0,0.285714,0.0,0.272727,1.0,0.0,0.25,0.002536,0.312401,0.001446,0.003385,0.002001,0.001446,0.02836,0.886743,0.98375,0.0,0.00407,0.001372,0.003283,0.004131,0.002134,0.0,0.515556,0.0,0.369596,0.0,0.24557,0.222581,0.0,1.0,0.026206,0.227273,0.313016,0.999985,0.831784,0.580108,0.308812,0.999605,0.999579,0.011544,0.011721,0.004809,0.526263,0.005195,0.655781,0.177046,0.211682,0.15904,0.154803,0.158995,0.243241,0.187848,0.14248,0.147826,0.354845,0.341267,0.065589,0.087521,0.527199,0.618182,1.0,0.389441,0.800796,0.821121,0.957594,0.443837,0.155801,0.284307,0.048856,0.048281,0.755894,0.859459,0.64,0.278452,0.658257,0.6602,0.919397,0.386908,0.539799,0.536822,0.585349,0.571354,0.616793,0.64142,0.555102,0.550234,0.470331,0.525623,0.174257,0.20942,0.61419,0.638191,0.547701,0.551133,0.145396,0.114964,0.150989,0.074736,0.208462,0.16,0.265625,0.195635,0.361777,0.278261,0.403846,0.359277,0.472501
1,1.0,0.235294,0.0,1.0,0.044969,0.0,0.108935,0.32967,0.092198,0.125,0.418994,0.387387,0.600985,0.0,0.185714,0.39,0.430508,0.0,0.026087,0.0,0.130806,0.3,0.118255,0.2,0.098039,0.0,0.506912,1.0,0.52381,1.0,0.136364,0.0,1.0,0.25,0.002515,0.259649,0.001497,0.003439,0.001309,0.001497,0.023632,0.878426,0.983161,0.0,0.002679,0.003331,0.003397,0.002985,0.00114,0.0,0.485432,1.0,0.418052,1.0,0.367089,0.419355,1.0,0.0,0.07082,0.0,0.0,0.999983,0.841076,0.533001,0.533739,0.999599,0.999572,0.01325,0.013542,0.003253,0.529755,0.004508,0.718209,0.102148,0.215117,0.168281,0.16733,0.165544,0.254735,0.211392,0.12124,0.117969,0.280668,0.300151,0.054093,0.064121,0.507802,0.672727,0.966667,0.401283,0.693078,0.743206,0.869999,0.481168,0.116124,0.186686,0.057311,0.05416,0.763823,0.843243,0.72,0.247109,0.600721,0.601789,0.82267,0.382148,0.424259,0.419829,0.451312,0.34774,0.544977,0.55858,0.440816,0.35184,0.45782,0.501446,0.162974,0.193404,0.529774,0.566332,0.447126,0.446134,0.197297,0.167914,0.152207,0.059771,0.238515,0.186667,0.2125,0.126457,0.242287,0.173913,0.319231,0.302898,0.521496
2,1.0,0.294118,1.0,2.0,0.047228,0.0,0.160568,0.516484,0.19209,0.21875,0.363128,0.603604,0.576355,1.0,0.178571,0.35,0.559322,0.0,0.173913,1.0,0.082258,0.0,0.11874,0.5,0.137255,1.0,0.460829,1.0,0.47619,1.0,0.227273,0.0,2.0,0.35,0.002536,0.337045,0.00199,0.00379,0.003269,0.00199,0.030139,0.891034,0.984298,0.8,0.003793,0.001807,0.004306,0.004152,0.002737,0.0,0.410815,2.0,0.377672,1.0,0.265823,0.258065,1.0,0.666667,0.113554,0.30303,0.20585,0.999986,0.82914,0.498069,0.32508,0.999612,0.999586,0.006929,0.006632,0.003645,0.525437,0.003137,0.469759,0.210822,0.081708,0.044211,0.096055,0.094623,0.126264,0.081772,0.082537,0.090349,0.153814,0.09433,0.050967,0.069517,0.487118,0.672727,1.0,0.426515,0.436156,0.393434,0.895356,0.517868,0.104836,0.158265,0.037917,0.048202,0.726296,0.854054,0.8,0.361669,0.401503,0.39745,0.714573,0.486115,0.276943,0.256091,0.394892,0.460627,0.354104,0.328994,0.391837,0.469385,0.241225,0.248455,0.110749,0.155677,0.337312,0.251256,0.418966,0.428755,0.112539,0.097986,0.091324,0.049488,0.146276,0.123333,0.140625,0.110008,0.165154,0.104348,0.261538,0.235129,0.556568
3,2.0,0.235294,0.0,2.0,0.047228,1.0,0.193045,0.505495,0.212338,0.25,0.335196,0.630631,0.576355,2.0,0.214286,0.45,0.627119,1.0,0.156522,1.0,0.152258,0.3,0.172859,0.6,0.098039,0.0,0.322581,0.0,0.333333,0.0,0.318182,1.0,3.0,0.5,0.002821,0.338667,0.003862,0.006874,0.004272,0.003863,0.029581,0.891953,0.98483,0.5,0.006513,0.003913,0.007578,0.006057,0.004481,0.0,0.435556,3.0,0.444418,2.0,0.177215,0.112903,2.0,0.0,0.111579,0.0,0.0,0.999987,0.827746,0.392341,0.385111,0.999623,0.9996,0.007727,0.008432,0.005529,0.526203,0.004204,0.47613,0.249886,0.012571,0.002905,0.026145,0.022357,0.060722,0.016456,0.096935,0.087804,0.129364,0.094833,0.035463,0.049609,0.495344,0.636364,1.0,0.396503,0.340496,0.363651,0.870868,0.621878,0.099142,0.185335,0.039635,0.042201,0.632258,0.864865,1.0,0.485292,0.33581,0.238847,0.830904,0.645059,0.090701,0.05398,0.230555,0.268082,0.14326,0.08284,0.326531,0.361346,0.070895,0.083598,0.059028,0.084679,0.164492,0.100503,0.356322,0.318227,0.02712,0.020144,0.022324,0.011862,0.040698,0.033333,0.03125,0.018732,0.023629,0.0,0.038462,0.05535,0.469728
4,3.0,0.764706,1.0,3.0,0.043943,2.0,0.359442,0.684396,0.358801,0.4875,0.394413,0.425225,0.619704,0.0,0.157143,0.42,0.301695,2.0,0.111304,0.2,0.229677,0.5,0.232633,0.5,0.027451,0.0,0.465438,0.6,0.452381,0.6,0.486364,0.8,2.0,0.35,0.002952,0.482937,0.007063,0.008663,0.008645,0.007063,0.038844,0.918158,0.989083,0.7,0.008549,0.006803,0.013486,0.011902,0.00785,1.0,0.093827,0.0,0.354489,0.0,0.316456,0.335484,3.0,0.933333,0.448005,0.763636,0.462416,0.999994,0.825249,0.310884,0.424563,0.999702,0.999678,0.005427,0.004794,0.009552,0.525353,0.003672,0.231784,0.288653,0.102382,0.072942,0.095498,0.077739,0.267501,0.203038,0.16201,0.162075,0.108876,0.113218,0.018112,0.023155,0.514082,0.645455,1.0,0.367622,0.686068,0.759014,0.893891,0.499072,0.07404,0.144905,0.028736,0.02296,0.665779,0.854054,0.96,0.423052,0.660234,0.691751,0.905902,0.541591,0.151676,0.150519,0.181077,0.164488,0.260897,0.263905,0.255782,0.239333,0.354511,0.381725,0.136762,0.187092,0.52009,0.512563,0.458046,0.460671,0.19243,0.154676,0.191273,0.094039,0.175233,0.14,0.18125,0.135571,0.274264,0.234783,0.292308,0.261286,0.49053


In [43]:
selected_columns = test.columns

In [44]:
selected_columns.shape[0] == x_train.shape[1]

True

# Supevised learning phase - Voting regressors

## Convert Numpy arrays to DataFrame

In [45]:
x_train_pd = pd.DataFrame(x_train)
y_train_pd = pd.DataFrame(y_train)
x_train_pd.columns = selected_columns

global cat_c
for col in cat_c:
    mapping = create_mapping(col, train)
    mappingTe = create_mapping(col, test)
    
    x_train_pd[col] = x_train_pd[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mappingTe).astype(int)


## Randomized CV Parameters Search

In [46]:
lightgbm_param_grid = {
    'learning_rate': [0.04, 0.046, 0.05],
    'max_depth': [10, 12, 14],
    'num_leaves': [450, 478, 500],
    'min_data_in_leaf': [12, 13, 14],
    'feature_fraction': [0.88, 0.893, 0.9],
    'bagging_fraction': [0.78, 0.784, 0.79],
    'bagging_freq': [3, 4, 5],
    'lambda_l1': [8, 10, 12],
    'lambda_l2': [0.005, 0.01, 0.02],
    'n_estimators': [200, 250, 300]
}


xgboost_param_grid = {
    'learning_rate': [0.045, 0.05, 0.055],
    'max_depth': [5, 6, 7],
    'n_estimators': [150, 200, 250],
    'subsample': [0.75, 0.8, 0.85],
    'colsample_bytree': [0.75, 0.8, 0.85],
    'reg_alpha': [0.8, 1, 1.2],
    'reg_lambda': [4, 5, 6],
    'random_state': [SEED]
}


catboost_param_grid = {
    'learning_rate': [0.045, 0.05, 0.055],  # Around 0.05
    'depth': [5, 6, 7],  # Around 6
    'iterations': [150, 200, 250],  # Around 200
    'l2_leaf_reg': [8, 10, 12],  # Around 10
    'random_seed': [SEED],
    'verbose': [0]
}


lgbm_model = LGBMRegressor(verbosity=-1)
xgb_model = XGBRegressor(verbosity=0)
catboost_model = CatBoostRegressor(cat_features=cat_c, verbose=0)

lgbm_random = RandomizedSearchCV(estimator=lgbm_model, param_distributions=lightgbm_param_grid, 
                                  scoring='accuracy', cv=3, n_iter=15, verbose=2, random_state=SEED)

xgb_random = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgboost_param_grid, 
                                 scoring='accuracy', cv=3, n_iter=15, verbose=2, random_state=SEED)

catboost_random = RandomizedSearchCV(estimator=catboost_model, param_distributions=catboost_param_grid, 
                                      scoring='accuracy', cv=3, n_iter=15, verbose=2, random_state=SEED)

lgbm_random.fit(x_train_pd, y_train_pd)
xgb_random.fit(x_train_pd, y_train_pd)
catboost_random.fit(x_train_pd, y_train_pd)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] END bagging_fraction=0.79, bagging_freq=5, feature_fraction=0.893, lambda_l1=12, lambda_l2=0.01, learning_rate=0.04, max_depth=12, min_data_in_leaf=14, n_estimators=200, num_leaves=478; total time=   2.0s
[CV] END bagging_fraction=0.79, bagging_freq=5, feature_fraction=0.893, lambda_l1=12, lambda_l2=0.01, learning_rate=0.04, max_depth=12, min_data_in_leaf=14, n_estimators=200, num_leaves=478; total time=   1.1s
[CV] END bagging_fraction=0.79, bagging_freq=5, feature_fraction=0.893, lambda_l1=12, lambda_l2=0.01, learning_rate=0.04, max_depth=12, min_data_in_leaf=14, n_estimators=200, num_leaves=478; total time=   1.3s
[CV] END bagging_fraction=0.78, bagging_freq=5, feature_fraction=0.893, lambda_l1=8, lambda_l2=0.02, learning_rate=0.04, max_depth=10, min_data_in_leaf=12, n_estimators=200, num_leaves=450; total time=   1.5s
[CV] END bagging_fraction=0.78, bagging_freq=5, feature_fraction=0.893, lambda_l1=8, lambda_l2=0.02,

In [47]:
print("Best LightGBM Params:", lgbm_random.best_params_)
print("Best LightGBM Score:", lgbm_random.best_score_)

print("Best XGBoost Params:", xgb_random.best_params_)
print("Best XGBoost Score:", xgb_random.best_score_)

print("Best CatBoost Params:", catboost_random.best_params_)
print("Best CatBoost Score:", catboost_random.best_score_)

Best LightGBM Params: {'num_leaves': 478, 'n_estimators': 200, 'min_data_in_leaf': 14, 'max_depth': 12, 'learning_rate': 0.04, 'lambda_l2': 0.01, 'lambda_l1': 12, 'feature_fraction': 0.893, 'bagging_freq': 5, 'bagging_fraction': 0.79}
Best LightGBM Score: nan
Best XGBoost Params: {'subsample': 0.85, 'reg_lambda': 5, 'reg_alpha': 1.2, 'random_state': 42, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.045, 'colsample_bytree': 0.8}
Best XGBoost Score: nan
Best CatBoost Params: {'verbose': 0, 'random_seed': 42, 'learning_rate': 0.045, 'l2_leaf_reg': 10, 'iterations': 150, 'depth': 6}
Best CatBoost Score: nan


## Training

In [48]:
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)


def modelTraining(model_class, train_feat, train_obj, test_data, n_splits=5, selected_columns=selected_columns):
    X = (train_feat)
    y = (train_obj)
    
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.49, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    thresholds = KappaOPtimizer.x
    
    oof_tuned = threshold_Rounder(oof_non_rounded, thresholds)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    fold_weights = [1.25, 1.0, 1.0, 1.0, 1.0]
    tpm = test_preds.dot(fold_weights) / np.sum(fold_weights)
    tpTuned = threshold_Rounder(tpm, thresholds)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

In [49]:
Light = LGBMRegressor(**lgbm_random.best_params_, verbose=-1)
XGB_Model = XGBRegressor(**xgb_random.best_params_)
CatBoost_Model = CatBoostRegressor(**catboost_random.best_params_)

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])

## Submission

In [50]:
submission = modelTraining(voting_model, x_train_pd, y_train_pd, test)

Training Folds: 100%|██████████| 5/5 [00:39<00:00,  7.89s/it]

Mean Train QWK --> 0.8174
Mean Validation QWK ---> 0.5371





----> || Optimized QWK SCORE :: [36m[1m 0.591[0m


In [51]:
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,2
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,3
9,0083e397,1
