<a href="https://www.kaggle.com/code/uwu1234/modular-data-load-class-for-starter-notebook-gpu?scriptVersionId=205240132" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Modular Data Load Class for Starter Notebook

I improved the [starter notebook](https://www.kaggle.com/code/onodera/starter-notebook-with-polars-gpu) by moving the data loading into a class. This helped me get up to speed faster, and will hopefully make it easier for others to jump in as well!



In [None]:
# Print the status of the GPU(s) visible to this session.
!nvidia-smi

In [None]:
# Install the GPU Polars stack (cupy, rmm, cudf, polars, cudf-polars) from wheel
# files stored under /kaggle/input instead of downloading from a package index.
# NOTE(review): the order presumably puts dependencies before their dependents — confirm before reordering.
!pip install /kaggle/input/polars-gpu-1-7-1/cupy_cuda12x-13.3.0-cp310-cp310-manylinux2014_x86_64.whl
!pip install /kaggle/input/polars-gpu-1-7-1/rmm_cu12-24.8.2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
!pip install /kaggle/input/polars-gpu-1-7-1/cudf_cu12-24.8.3-cp310-cp310-manylinux_2_28_x86_64.whl
!pip install /kaggle/input/polars-gpu-1-7-1/polars-1.7.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install /kaggle/input/polars-gpu-1-7-1/cudf_polars_cu12-24.8.3-py3-none-any.whl

In [None]:
# List the contents of /kaggle/usr/lib.
# NOTE(review): presumably a check for which utility scripts are attached — confirm.
!ls /kaggle/usr/lib


In [None]:
import warnings
from functools import partial
from pathlib import Path

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import optuna
import polars as pl
import polars.selectors as cs
from catboost import CatBoostRegressor, MultiTargetCustomMetric
from numpy.typing import ArrayLike, NDArray
from polars.testing import assert_frame_equal
from sklearn.base import BaseEstimator
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings("ignore", message="Failed to optimize method")

In [None]:
class Load_Child_Mind_Data:
    """Load and assemble the train/test data used by the notebook.

    The loader reads ``train.csv`` and ``test.csv``, aggregates every parquet
    series into per-id summary statistics, joins both sources on ``id`` and
    exposes the resulting model inputs as attributes (``X``, ``X_test``, ``y``,
    ``y_sii``, ``cat_features``). Call :meth:`load_data` to run the whole
    pipeline; all data attributes are ``None`` until then.

    Args:
        data_dir: Directory containing ``train.csv``, ``test.csv`` and the
            ``series_train.parquet`` / ``series_test.parquet`` folders.
            ``None`` (the default) selects the Kaggle competition input
            directory.
    """

    _DEFAULT_DATA_DIR = "/kaggle/input/child-mind-institute-problematic-internet-use"

    # Columns read from each parquet file and the statistics computed for each
    # of them. FEATURE_COLS and agg_parquets() are both derived from these two
    # tuples so the aggregated column names cannot drift apart.
    _SENSOR_COLS = ("X", "Y", "Z", "enmo", "anglez", "light", "battery_voltage")
    _STAT_NAMES = ("min", "mean", "max", "std")

    def __init__(self, data_dir: "Path | str | None" = None):
        self.IS_TEST = None
        self.train = None
        self.test = None
        self.train_test = None
        self.train_agg = None
        self.test_agg = None
        self.X = None
        self.X_test = None
        self.y = None
        self.y_sii = None
        self.cat_features = None
        self.DATA_DIR = Path(self._DEFAULT_DATA_DIR if data_dir is None else data_dir)

        # PCIAT-PCIAT_01 ... PCIAT-PCIAT_20, followed by the total and sii.
        self.TARGET_COLS = [f"PCIAT-PCIAT_{i:02d}" for i in range(1, 21)] + [
            "PCIAT-PCIAT_Total",
            "sii",
        ]

        csv_features = [
            "Basic_Demos-Enroll_Season",
            "Basic_Demos-Age",
            "Basic_Demos-Sex",
            "CGAS-Season",
            "CGAS-CGAS_Score",
            "Physical-Season",
            "Physical-BMI",
            "Physical-Height",
            "Physical-Weight",
            "Physical-Waist_Circumference",
            "Physical-Diastolic_BP",
            "Physical-HeartRate",
            "Physical-Systolic_BP",
            "Fitness_Endurance-Season",
            "Fitness_Endurance-Max_Stage",
            "Fitness_Endurance-Time_Mins",
            "Fitness_Endurance-Time_Sec",
            "FGC-Season",
            "FGC-FGC_CU",
            "FGC-FGC_CU_Zone",
            "FGC-FGC_GSND",
            "FGC-FGC_GSND_Zone",
            "FGC-FGC_GSD",
            "FGC-FGC_GSD_Zone",
            "FGC-FGC_PU",
            "FGC-FGC_PU_Zone",
            "FGC-FGC_SRL",
            "FGC-FGC_SRL_Zone",
            "FGC-FGC_SRR",
            "FGC-FGC_SRR_Zone",
            "FGC-FGC_TL",
            "FGC-FGC_TL_Zone",
            "BIA-Season",
            "BIA-BIA_Activity_Level_num",
            "BIA-BIA_BMC",
            "BIA-BIA_BMI",
            "BIA-BIA_BMR",
            "BIA-BIA_DEE",
            "BIA-BIA_ECW",
            "BIA-BIA_FFM",
            "BIA-BIA_FFMI",
            "BIA-BIA_FMI",
            "BIA-BIA_Fat",
            "BIA-BIA_Frame_num",
            "BIA-BIA_ICW",
            "BIA-BIA_LDM",
            "BIA-BIA_LST",
            "BIA-BIA_SMM",
            "BIA-BIA_TBW",
            "PAQ_A-Season",
            "PAQ_A-PAQ_A_Total",
            "PAQ_C-Season",
            "PAQ_C-PAQ_C_Total",
            "SDS-Season",
            "SDS-SDS_Total_Raw",
            "SDS-SDS_Total_T",
            "PreInt_EduHx-Season",
            "PreInt_EduHx-computerinternet_hoursday",
        ]
        # Statistics features from the parquet series: all *_min columns first,
        # then *_mean, *_max and *_std.
        parquet_features = [
            f"{col}_{stat}" for stat in self._STAT_NAMES for col in self._SENSOR_COLS
        ]
        self.FEATURE_COLS = csv_features + parquet_features

    def load_csv_data(self):
        """Read ``train.csv`` and ``test.csv`` and stack them diagonally.

        Returns:
            The tuple ``(train, test, train_test)``; the same frames are stored
            on the instance.
        """
        self.train = pl.read_csv(self.DATA_DIR / "train.csv")
        self.test = pl.read_csv(self.DATA_DIR / "test.csv")
        self.train_test = pl.concat([self.train, self.test], how="diagonal")
        return self.train, self.test, self.train_test

    def validate_csv_data(self):
        """Set ``IS_TEST`` and check that ``train_test`` was stacked correctly.

        ``IS_TEST`` becomes True when the test frame has at most 100 rows.
        The leading ``train.height`` rows of ``train_test`` must equal
        ``train`` and the remaining rows must equal ``test`` (restricted to the
        respective columns); ``assert_frame_equal`` raises otherwise.
        """
        self.IS_TEST = self.test.height <= 100
        n_train = self.train.height
        assert_frame_equal(self.train, self.train_test[:n_train].select(self.train.columns))
        assert_frame_equal(self.test, self.train_test[n_train:].select(self.test.columns))

    def fill_nans_csv_data(self):
        """Cast string columns to Categorical and fill their nulls with ``"NAN"``.

        The cast is applied to the stacked ``train_test`` frame, which is then
        split back into ``train`` and ``test`` at the original train height.

        Returns:
            The tuple ``(train, test, train_test)`` after the conversion.
        """
        self.train_test = self.train_test.with_columns(cs.string().cast(pl.Categorical).fill_null("NAN"))
        n_train = self.train.height
        self.train = self.train_test[:n_train]
        self.test = self.train_test[n_train:]
        return self.train, self.test, self.train_test

    def get_globals(self):
        """Return ``(IS_TEST, DATA_DIR, TARGET_COLS, FEATURE_COLS)``.

        ``IS_TEST`` is ``None`` until :meth:`validate_csv_data` (or
        :meth:`load_data`) has run.
        """
        return self.IS_TEST, self.DATA_DIR, self.TARGET_COLS, self.FEATURE_COLS

    def split_array(self, ar, n_group):
        """Yield ``n_group`` consecutive slices of ``ar`` that together cover it.

        Slice boundaries are ``i * len(ar) // n_group``, so chunk sizes differ
        by at most one and chunks may be empty when ``len(ar) < n_group``.
        """
        size = len(ar)
        for i_chunk in range(n_group):
            yield ar[i_chunk * size // n_group : (i_chunk + 1) * size // n_group]

    def agg_parquets(self, files):
        """Aggregate parquet series into one row of statistics per id.

        Each path's last component is expected to look like ``<key>=<id>``; the
        part after ``=`` becomes the ``id`` column. Files are processed in 10
        chunks, and within each chunk min/mean/max/std of every sensor column
        (cast to Float32) are computed per id on the GPU engine.

        Args:
            files: Sequence of ``Path`` objects, one per series.

        Returns:
            A DataFrame with ``id`` plus ``<col>_<stat>`` columns, built by
            concatenating the per-chunk results.
        """
        # NOTE(review): with no input files every chunk is skipped and the
        # final concat receives an empty list — presumably an error; confirm.
        stat_exprs = [
            getattr(pl.col(col).cast(pl.Float32), stat)().alias(f"{col}_{stat}")
            for stat in self._STAT_NAMES
            for col in self._SENSOR_COLS
        ]
        aggs = []
        for files_tmp in tqdm(list(self.split_array(files, 10))):
            if len(files_tmp) == 0:
                continue
            lazy_frames = [
                pl.scan_parquet(file).with_columns(pl.lit(file.parts[-1].split("=")[1]).alias("id"))
                for file in files_tmp
            ]
            agg = pl.concat(lazy_frames).group_by("id").agg(stat_exprs).collect(engine="gpu")
            aggs.append(agg)
        return pl.concat(aggs)

    def train_test_parquet_agg(self):
        """Run :meth:`agg_parquets` on the train and test series folders.

        The folders are resolved relative to ``DATA_DIR`` so that a custom data
        directory applies to the parquet data as well as the CSV files.

        Returns:
            The tuple ``(train_agg, test_agg)``.
        """
        self.train_agg = self.agg_parquets(sorted((self.DATA_DIR / "series_train.parquet").glob("*")))
        self.test_agg = self.agg_parquets(sorted((self.DATA_DIR / "series_test.parquet").glob("*")))
        return self.train_agg, self.test_agg

    def combine_datasets(self):
        """Left-join the parquet aggregates onto the CSV frames by ``id``.

        The aggregate ``id`` column is cast to Categorical before joining.
        NOTE(review): this presumably mirrors the Categorical cast that
        fill_nans_csv_data() applies to the CSV ``id`` column — confirm.

        Returns:
            The tuple ``(train, test)`` with the statistics columns attached.
        """
        self.train = self.train.join(self.train_agg.with_columns(pl.col("id").cast(pl.Categorical)), on='id', how='left')
        self.test = self.test.join(self.test_agg.with_columns(pl.col("id").cast(pl.Categorical)), on='id', how='left')
        return self.train, self.test

    def final_datasets(self):
        """Build ``X``, ``X_test``, ``y``, ``y_sii`` and ``cat_features``.

        Training rows with a null in any target column are dropped. ``y_sii``
        is the ``sii`` column as a NumPy array (the ground truth), and
        ``cat_features`` lists the Categorical columns of ``X``.
        """
        train_without_null = self.train.drop_nulls(subset=self.TARGET_COLS)
        self.X = train_without_null.select(self.FEATURE_COLS)
        self.X_test = self.test.select(self.FEATURE_COLS)
        self.y = train_without_null.select(self.TARGET_COLS)
        self.y_sii = self.y.get_column("sii").to_numpy()  # ground truth
        self.cat_features = self.X.select(cs.categorical()).columns

    def load_data(self):
        """Run the full pipeline and populate the instance attributes.

        Steps, in order: read the CSVs, validate them (which sets ``IS_TEST``),
        convert string columns, aggregate the parquet series, join both
        sources, and build the final datasets. Returns ``None``.
        """
        self.load_csv_data()
        self.validate_csv_data()
        self.fill_nans_csv_data()
        self.train_test_parquet_agg()
        self.combine_datasets()
        self.final_datasets()

    def get_final_datasets(self):
        """Return ``(X, X_test, y, y_sii, cat_features, test)``.

        All six values are created by :meth:`load_data` and are ``None``
        before it has been called.
        """
        return self.X, self.X_test, self.y, self.y_sii, self.cat_features, self.test
        

In [None]:

# NOTE(review): `data_class` is not referenced in the visible notebook; the class
# used below is the one defined in the cell above. Confirm whether this import
# is still required.
import load_child_mind_institute_data as data_class

data_obj = Load_Child_Mind_Data()
# Load before reading the globals: IS_TEST is only assigned while load_data()
# runs (inside validate_csv_data), so fetching it earlier yields None and the
# `iterations=1 if IS_TEST else ...` switch further down would never trigger.
data_obj.load_data()
IS_TEST, DATA_DIR, TARGET_COLS, FEATURE_COLS = data_obj.get_globals()
X, X_test, y, y_sii, cat_features, test = data_obj.get_final_datasets()

In [None]:
# Preview the first rows of the test feature frame.
X_test.head()

In [None]:
class MultiTargetQWK(MultiTargetCustomMetric):
    """Custom CatBoost multi-target metric: quadratic weighted kappa of the last target.

    Only the final row of predictions/targets is scored; predictions are
    clipped to [0, 3] and rounded to integers before the kappa is computed.
    """

    def get_custom_metric_name(self):
        """Return the name under which the metric is reported."""
        return "MultiTargetQWK"

    def is_max_optimal(self):
        """Return True: larger values of this metric are better."""
        return True

    def evaluate(self, approxes, targets, weight):
        """Score the last target and return ``(qwk, 1)``.

        Args:
            approxes: Predicted values (shape: [number of targets, number of samples]).
            targets: Actual values (shape: [number of targets, number of samples]).
            weight: Per-sample weights (may be None); not used here.
        """
        true_labels = targets[-1]
        clipped = np.clip(approxes[-1], 0, 3)
        predicted_labels = np.round(clipped).astype(int)

        score = cohen_kappa_score(true_labels, predicted_labels, weights="quadratic")
        # The second element is the weight paired with the score.
        return score, 1

    def get_final_error(self, error, weight):
        """Return the summed error; it is deliberately not divided by the weight."""
        return np.sum(error)


class OptimizedRounder:
    """Learn thresholds that turn continuous predictions into class labels.

    The thresholds are searched with Optuna so that the Quadratic Weighted
    Kappa between the ground truth and the thresholded predictions is
    maximised. Predictions are rescaled to ``[0, n_classes - 1]`` using the
    minimum and maximum seen in :meth:`fit`; :meth:`predict` reuses that same
    range so the learned thresholds stay on the scale they were fitted on.

    Args:
        n_classes: Number of discrete class labels.
        n_trials: Number of Optuna trials. Defaults to 100.
        seed: Optional seed for the Optuna sampler. ``None`` (the default)
            keeps Optuna's default, unseeded sampler.

    Attributes:
        n_classes: Number of discrete class labels.
        labels: Array of class labels ``0 .. n_classes - 1``.
        n_trials: Number of optimisation trials.
        seed: Sampler seed, or ``None``.
        metric: ``cohen_kappa_score`` with quadratic weights.
        thresholds: Optimised thresholds; only present after :meth:`fit`.
    """

    def __init__(self, n_classes: int, n_trials: int = 100, seed: "int | None" = None):
        self.n_classes = n_classes
        self.labels = np.arange(n_classes)
        self.n_trials = n_trials
        self.seed = seed
        self.metric = partial(cohen_kappa_score, weights="quadratic")
        # (min, max) of the predictions passed to fit(); None until fitted.
        self._pred_range = None

    def fit(self, y_pred: NDArray[np.float64], y_true: NDArray[np.int_]) -> None:
        """Fit the thresholds on continuous predictions and true labels.

        Args:
            y_pred: Continuous predictions to be thresholded.
            y_true: Ground-truth class labels.
        """
        y_pred = np.asarray(y_pred)
        # Remember the fitting range first so _normalize() (here and later in
        # predict()) rescales with it instead of with each batch's own range.
        self._pred_range = (y_pred.min(), y_pred.max())
        y_pred = self._normalize(y_pred)

        def objective(trial: optuna.Trial) -> float:
            thresholds = []
            for i in range(self.n_classes - 1):
                # Each threshold is sampled at or above the previous one so the
                # bin edges stay ordered.
                low = max(thresholds) if i > 0 else min(self.labels)
                high = max(self.labels)
                th = trial.suggest_float(f"threshold_{i}", low, high)
                thresholds.append(th)
            try:
                y_pred_rounded = np.digitize(y_pred, thresholds)
            except ValueError:
                # Unusable bin edges: return a value that can never be the best.
                return -100
            return self.metric(y_true, y_pred_rounded)

        optuna.logging.disable_default_handler()
        sampler = None if self.seed is None else optuna.samplers.TPESampler(seed=self.seed)
        study = optuna.create_study(direction="maximize", sampler=sampler)
        study.optimize(
            objective,
            n_trials=self.n_trials,
        )
        self.thresholds = [study.best_params[f"threshold_{i}"] for i in range(self.n_classes - 1)]

    def predict(self, y_pred: NDArray[np.float64]) -> NDArray[np.int_]:
        """Convert continuous predictions to labels with the fitted thresholds.

        ``fit()`` must be called first.

        Args:
            y_pred: Continuous predictions to be thresholded.

        Returns:
            Predicted class labels.
        """
        assert hasattr(self, "thresholds"), "fit() must be called before predict()"
        y_pred = self._normalize(y_pred)
        return np.digitize(y_pred, self.thresholds)

    def _normalize(self, y: NDArray[np.float64]) -> NDArray[np.float64]:
        """Rescale ``y`` linearly so the reference range maps to ``[0, n_classes - 1]``.

        The reference range is the one stored by :meth:`fit`; when none is
        stored, the range of ``y`` itself is used. A zero-width range maps
        every value of that range to 0 instead of dividing by zero.
        """
        y = np.asarray(y)
        stored_range = getattr(self, "_pred_range", None)
        low, high = stored_range if stored_range is not None else (y.min(), y.max())
        span = high - low
        if span == 0:
            span = 1
        return (y - low) / span * (self.n_classes - 1)

In [None]:
# CatBoost configuration: a single iteration when IS_TEST is truthy, otherwise a
# large cap that early stopping is expected to cut short.
params = {
    "loss_function": "MultiRMSE",
    "eval_metric": MultiTargetQWK(),
    "iterations": 1 if IS_TEST else 100000,
    "learning_rate": 0.1,
    "depth": 5,
    "early_stopping_rounds": 50,
}

# 5-fold cross-validation stratified on sii; y_pred collects the out-of-fold
# predictions for every target column.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=52)
models: list[CatBoostRegressor] = []
y_pred = np.full((X.height, len(TARGET_COLS)), fill_value=np.nan)
for train_idx, val_idx in skf.split(X, y_sii):
    X_train: pl.DataFrame
    X_val: pl.DataFrame
    y_train: pl.DataFrame
    y_val: pl.DataFrame
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # Fit one model per fold, using the held-out part as the evaluation set.
    model = CatBoostRegressor(**params)
    model.fit(
        X_train.to_pandas(),
        y_train.to_pandas(),
        eval_set=(X_val.to_pandas(), y_val.to_pandas()),
        cat_features=cat_features,
        verbose=False,
    )
    models.append(model)

    # Store the out-of-fold predictions for the held-out rows.
    y_pred[val_idx] = model.predict(X_val.to_pandas())

# Every row must have been predicted by exactly one fold's model.
assert np.isnan(y_pred).sum() == 0

# Learn the rounding thresholds on the out-of-fold PCIAT total predictions.
optimizer = OptimizedRounder(n_classes=4, n_trials=300)
total_col_idx = TARGET_COLS.index("PCIAT-PCIAT_Total")
y_pred_total = y_pred[:, total_col_idx]
optimizer.fit(y_pred_total, y_sii)
y_pred_rounded = optimizer.predict(y_pred_total)

# Report the quadratic weighted kappa of the thresholded predictions.
qwk = cohen_kappa_score(y_sii, y_pred_rounded, weights="quadratic")
print(f"Cross-Validated QWK Score: {qwk}")

In [None]:
# Average the per-fold feature importances and plot the 30 largest,
# smallest at the bottom of the chart.
feature_importance = np.mean([model.get_feature_importance() for model in models], axis=0)
sorted_idx = np.argsort(feature_importance)[-30:]
fig = plt.figure(figsize=(12, 10))
bar_positions = range(len(sorted_idx))
plt.barh(bar_positions, feature_importance[sorted_idx], align="center")
plt.yticks(bar_positions, np.array(X_test.columns)[sorted_idx])
plt.title("Feature Importance")

In [None]:
class AvgModel:
    """Ensemble that averages the predictions of several fitted models.

    Args:
        models: Objects exposing ``predict(X)``; their outputs must share one shape.
    """

    def __init__(self, models: "list[BaseEstimator]"):
        self.models = models

    def predict(self, X: ArrayLike) -> NDArray[np.float64]:
        """Return the element-wise mean of every model's prediction for ``X``.

        Raises:
            ValueError: If the ensemble contains no models, since there is
                nothing to average.
        """
        if len(self.models) == 0:
            raise ValueError("AvgModel needs at least one model to predict")
        preds = [model.predict(X) for model in self.models]
        # Averaging over axis 0 collapses the model dimension and keeps the
        # shape of a single model's prediction.
        return np.mean(preds, axis=0)

In [None]:
# Average the fold models on the test features, keep the PCIAT total column,
# convert it to labels with the fitted thresholds and write the submission file.
avg_model = AvgModel(models)
total_idx = TARGET_COLS.index("PCIAT-PCIAT_Total")
test_pred = avg_model.predict(X_test.to_pandas())[:, total_idx]
test_pred_rounded = optimizer.predict(test_pred)
submission = test.select("id").with_columns(pl.Series("sii", test_pred_rounded))
submission.write_csv("submission.csv")