In [1]:
import numpy as np
import pandas as pd
import os
from colorama import Fore, Style

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.optimize import minimize
from sklearn.ensemble import VotingRegressor

from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


In [2]:
df = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/train.csv")
df = df.dropna(subset=['sii']).reset_index() # keeping labeled values only
test = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/test.csv")
season_mapping = {
    'Winter': -1,
    'Spring': -0.5,
    'Summer': 0.5,
    'Fall': 1
}
# mapping non-string values
df = df.replace(season_mapping)
test = test.replace(season_mapping)

# dropping questions not in test dataset
test_missing_columns = set(df.columns) - set(test.columns)
for col in test_missing_columns:
    if col != 'sii':  # Retain the target column for training
        df.drop(columns=col, inplace=True)
# for later use
train_ids = df['id']
test_ids = test['id']
train_labels = df['sii']

In [3]:
df['sii'].value_counts()

sii
0.0    1594
1.0     730
2.0     378
3.0      34
Name: count, dtype: int64

In [4]:
featureCols = sorted(list(set(df.columns) - set(['sii', 'id'])))

In [5]:
dropCols = []
for column in featureCols:
    if (df[column].isnull().sum() > 1300):
        dropCols.append(column)
dropCols
df = df.drop(dropCols, axis=1)

In [6]:
featureCols = sorted(list(set(df.columns) - set(['sii', 'id'])))
train = pd.DataFrame(df, columns=featureCols)

In [7]:
imputer = KNNImputer(n_neighbors=5) 
data = pd.concat([train, test], axis=0, ignore_index=True)
imputed_data = imputer.fit_transform(data[featureCols])
data = pd.DataFrame(imputed_data, columns=featureCols)
data

Unnamed: 0,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,...,Physical-HeartRate,Physical-Height,Physical-Season,Physical-Systolic_BP,Physical-Weight,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,SDS-SDS_Total_Raw,SDS-SDS_Total_T,SDS-Season
0,2.0,2.668550,16.87920,932.4980,1492.000,8.255980,41.58620,13.81770,3.061430,9.213770,...,79.8,46.00,1.0,110.2,50.80,1.0,3.0,36.8,52.2,0.3
1,2.0,2.579490,14.03710,936.6560,1498.650,6.019930,42.02910,12.82540,1.211720,3.970850,...,70.0,48.00,1.0,122.0,46.00,0.5,0.0,46.0,64.0,1.0
2,2.6,4.031068,17.82080,1110.5880,1798.300,16.271700,60.55550,13.98800,3.832798,16.404500,...,94.0,56.50,1.0,117.0,75.60,0.5,2.0,38.0,54.0,1.0
3,3.0,3.841910,18.29430,1131.4300,1923.440,15.592500,62.77570,14.07400,4.220330,18.824300,...,97.0,56.00,0.5,117.0,81.60,-1.0,0.0,31.0,45.0,0.5
4,2.0,4.330360,30.18650,1330.9700,1996.450,30.212400,84.02850,16.68770,13.498800,67.971500,...,73.0,59.50,0.5,102.0,112.20,-0.5,0.0,40.0,56.0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2751,3.0,4.247042,19.72338,1528.2464,2635.706,33.002410,105.04130,24.55786,-4.834472,-19.761292,...,75.0,54.00,-0.5,99.0,121.60,-1.0,2.0,35.0,50.0,-0.5
2752,2.0,2.392498,15.68960,906.9566,1363.528,7.177296,38.86582,13.37056,2.319032,6.694184,...,76.0,44.00,-0.5,109.0,47.60,-0.5,0.0,37.0,53.0,-0.5
2753,3.4,3.761358,19.27118,1143.7360,2104.080,16.165060,64.08602,14.41104,4.860098,21.673980,...,81.0,55.00,1.0,133.2,85.60,1.0,1.0,37.6,53.4,0.2
2754,2.0,2.750350,17.27380,1003.0700,1504.610,15.145600,49.10340,14.08980,3.184070,11.096600,...,91.0,37.50,-1.0,95.0,60.20,-1.0,3.0,39.0,55.0,-1.0


In [8]:
train.shape[0]

2736

In [9]:
train = data.head(train.shape[0])
test = data.drop(train.index, axis=0).reset_index()

In [10]:
train

Unnamed: 0,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,...,Physical-HeartRate,Physical-Height,Physical-Season,Physical-Systolic_BP,Physical-Weight,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,SDS-SDS_Total_Raw,SDS-SDS_Total_T,SDS-Season
0,2.0,2.668550,16.8792,932.498,1492.00,8.25598,41.5862,13.8177,3.061430,9.21377,...,79.8,46.0,1.0,110.2,50.8,1.0,3.0,36.8,52.2,0.3
1,2.0,2.579490,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.211720,3.97085,...,70.0,48.0,1.0,122.0,46.0,0.5,0.0,46.0,64.0,1.0
2,2.6,4.031068,17.8208,1110.588,1798.30,16.27170,60.5555,13.9880,3.832798,16.40450,...,94.0,56.5,1.0,117.0,75.6,0.5,2.0,38.0,54.0,1.0
3,3.0,3.841910,18.2943,1131.430,1923.44,15.59250,62.7757,14.0740,4.220330,18.82430,...,97.0,56.0,0.5,117.0,81.6,-1.0,0.0,31.0,45.0,0.5
4,2.0,4.330360,30.1865,1330.970,1996.45,30.21240,84.0285,16.6877,13.498800,67.97150,...,73.0,59.5,0.5,102.0,112.2,-0.5,0.0,40.0,56.0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731,3.0,3.203030,17.1417,1035.270,1759.96,11.00630,52.5331,13.4004,3.741300,14.66690,...,65.0,52.5,1.0,112.0,67.2,1.0,2.0,41.0,58.0,1.0
2732,1.0,2.366800,13.6457,966.287,1256.17,9.98802,45.1853,13.2315,0.414263,1.41470,...,75.0,48.5,0.5,105.0,46.6,0.5,0.0,48.0,67.0,0.5
2733,3.0,4.522770,16.3642,1206.880,2051.70,19.46110,70.8117,14.0629,2.301380,11.58830,...,70.0,59.5,1.0,104.0,82.4,1.0,1.0,35.0,50.0,-1.0
2734,2.0,4.413050,21.4438,1253.740,2005.99,20.48250,75.8033,14.8043,6.639520,33.99670,...,99.0,60.0,-1.0,116.0,109.8,1.0,0.0,56.0,77.0,-1.0


In [11]:
test

Unnamed: 0,index,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,...,Physical-HeartRate,Physical-Height,Physical-Season,Physical-Systolic_BP,Physical-Weight,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,SDS-SDS_Total_Raw,SDS-SDS_Total_T,SDS-Season
0,2736,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,...,79.8,46.0,1.0,110.2,50.8,1.0,3.0,36.8,52.2,0.3
1,2737,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,...,70.0,48.0,1.0,122.0,46.0,0.5,0.0,46.0,64.0,1.0
2,2738,2.6,4.031068,17.8208,1110.588,1798.3,16.2717,60.5555,13.988,3.832798,...,94.0,56.5,1.0,117.0,75.6,0.5,2.0,38.0,54.0,1.0
3,2739,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,...,97.0,56.0,0.5,117.0,81.6,-1.0,0.0,31.0,45.0,0.5
4,2740,2.8,4.60516,25.22772,1434.06,2331.776,32.86892,95.00908,15.82544,9.402284,...,75.4,65.15,0.0,128.4,144.36,-0.6,2.6,42.0,56.8,-0.2
5,2741,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,...,73.0,59.5,0.5,102.0,112.2,-0.5,0.0,40.0,56.0,0.5
6,2742,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,...,83.0,55.0,1.0,163.0,84.6,1.0,3.0,27.0,40.0,-1.0
7,2743,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,...,90.0,59.25,1.0,116.0,84.2,1.0,2.0,35.4,50.6,0.7
8,2744,3.0,6.10857,25.2143,1594.936,2793.236,34.93312,112.14454,17.40602,7.808244,...,77.4,67.29,-0.5,129.8,157.2,0.5,2.0,53.0,72.8,-0.1
9,2745,2.0,4.671482,30.64844,1584.146,2358.146,38.95434,110.99542,18.0133,12.635126,...,82.4,65.12,0.6,138.0,187.52,0.7,3.0,43.2,60.6,0.1


In [None]:
train['sii'] = df['sii']

In [13]:
test

Unnamed: 0,index,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,...,Physical-HeartRate,Physical-Height,Physical-Season,Physical-Systolic_BP,Physical-Weight,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,SDS-SDS_Total_Raw,SDS-SDS_Total_T,SDS-Season
0,2736,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,...,79.8,46.0,1.0,110.2,50.8,1.0,3.0,36.8,52.2,0.3
1,2737,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,...,70.0,48.0,1.0,122.0,46.0,0.5,0.0,46.0,64.0,1.0
2,2738,2.6,4.031068,17.8208,1110.588,1798.3,16.2717,60.5555,13.988,3.832798,...,94.0,56.5,1.0,117.0,75.6,0.5,2.0,38.0,54.0,1.0
3,2739,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,...,97.0,56.0,0.5,117.0,81.6,-1.0,0.0,31.0,45.0,0.5
4,2740,2.8,4.60516,25.22772,1434.06,2331.776,32.86892,95.00908,15.82544,9.402284,...,75.4,65.15,0.0,128.4,144.36,-0.6,2.6,42.0,56.8,-0.2
5,2741,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,...,73.0,59.5,0.5,102.0,112.2,-0.5,0.0,40.0,56.0,0.5
6,2742,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,...,83.0,55.0,1.0,163.0,84.6,1.0,3.0,27.0,40.0,-1.0
7,2743,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,...,90.0,59.25,1.0,116.0,84.2,1.0,2.0,35.4,50.6,0.7
8,2744,3.0,6.10857,25.2143,1594.936,2793.236,34.93312,112.14454,17.40602,7.808244,...,77.4,67.29,-0.5,129.8,157.2,0.5,2.0,53.0,72.8,-0.1
9,2745,2.0,4.671482,30.64844,1584.146,2358.146,38.95434,110.99542,18.0133,12.635126,...,82.4,65.12,0.6,138.0,187.52,0.7,3.0,43.2,60.6,0.1


In [None]:
def extract_features(df):
    df["Feat_0"] = df["Physical-Height"] * df["PAQ_C-PAQ_C_Total"]
    df["Feat_1"] = df["FGC-FGC_TL_Zone"] * df["Physical-Height"]
    df["Feat_2"] = df["PreInt_EduHx-computerinternet_hoursday"] * df["BIA-BIA_Activity_Level_num"]
    # df["Feat_3"] = df["Fitness_Endurance-Time_Sec"] / df["PreInt_EduHx-computerinternet_hoursday"]
    df["Feat_4"] = df["CGAS-CGAS_Score"] / df["FGC-FGC_CU_Zone"]
    df["Feat_5"] = df["Basic_Demos-Age"] / df["FGC-FGC_SRR_Zone"]
    df["Feat_7"] = df["PAQ_C-PAQ_C_Total"] * df["BIA-BIA_Frame_num"]
    # df["Feat_9"] = df["FGC-FGC_GSD"] / df["SDS-SDS_Total_Raw"]
    # df["Feat_10"] = df["PAQ_A-PAQ_A_Total"] / df["PreInt_EduHx-computerinternet_hoursday"]
    df["Feat_11"] = df["BIA-BIA_LDM"] / df["PreInt_EduHx-computerinternet_hoursday"]
    df["Feat_14"] = df["BIA-BIA_BMI"] / df["SDS-SDS_Total_Raw"]
    df["Feat_15"] = df["Physical-Height"] * df["SDS-SDS_Total_T"]
    df["Feat_16"] = df["Physical-Height"] * df["Physical-Height"]
    df["Feat_17"] = df["FGC-FGC_SRL_Zone"] / df["Physical-Weight"]
    df["Feat_18"] = df["Basic_Demos-Sex"] * df["Basic_Demos-Sex"]
    # df["Feat_19"] = df["FGC-FGC_GSND_Zone"] / df["BIA-BIA_Fat"]

    return df

train = extract_features(train)
test = extract_features(test)

In [15]:
# qwk score
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# threshold rounder
def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))
# prediction evaluation using qwk function
def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, train, test_data, featureCols) -> list[float]:
    X = train[featureCols]
    y = train['sii']

    # Identify rows with NaN values in X

    # Fill NaN and infinite values in X and test_data
    X = X.fillna(0)
    X = X.replace([np.inf, -np.inf], 0)
    # test_data = test_data.fillna(0)
    test_data = test_data.replace([np.inf, -np.inf], 0)

    scaler = StandardScaler()
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X), columns=X.columns)
    test_data = test_data[featureCols]
    test_data = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)
    n_splits = 5
    random_state = 42
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_non_rounded = np.zeros(len(y), dtype=float)
    oof_rounded = np.zeros(len(y), dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        test_preds[:, fold] = model.predict(test_data)

    KappaOptimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOptimizer.success, "Optimization did not converge."
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOptimizer.x)

    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, KappaOptimizer.x)

    predictions = np.array(tp_rounded.tolist())
    print(predictions)
    # for idx in nan_indices:
    #     predictions[idx] = np.nan

    return predictions.tolist()

In [16]:
LGBM_params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  
    'lambda_l2': 0.01
}

XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 400,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  
    'reg_lambda': 5,  
    'random_state': 42,
    'tree_method': 'exact'
}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 400,
    'random_seed': 42,
    'verbose': 0,
    'l2_leaf_reg': 10  
}


In [17]:
Light = LGBMRegressor(**LGBM_params, random_state=42, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)],
     weights=[0.3, 0.5, 0.2]
)


In [18]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [19]:
# def TrainML(model_class, train, test_data, featureCols) -> list[int]:
nots_featureCols = sorted(list(set(train.columns) - set(['sii', 'id'])))
# featureCols
nots_preds = TrainML(voting_model, train, test, nots_featureCols)

Training Folds: 100%|██████████| 5/5 [00:30<00:00,  6.01s/it]


[2 0 0 0 2 1 0 0 2 2 1 0 1 1 2 2 0 0 0 2]


In [20]:
nots_preds

[2, 0, 0, 0, 2, 1, 0, 0, 2, 2, 1, 0, 1, 1, 2, 2, 0, 0, 0, 2]

In [21]:
# vote_preds = TrainML(model_class=voting_model, test_data=test)
final_sub = pd.DataFrame({
    
    'id'   : test_ids,
    'sii': nots_preds
})


final_sub.to_csv('submission.csv', index=False)
final_sub

Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,2
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,2
9,0083e397,2
