# Catbooster

## Meta preparation

In [None]:
# Environment settings
# When True, later cells read inputs from absolute /kaggle/input/... paths;
# when False they read from paths relative to the notebook (./../data/input/...).
is_kaggle = False

# Kaggle competition slug; the Colab set-up cell uses it for the download
# command, the zip file name and the extraction directory.
dataset_name = "hms-harmful-brain-activity-classification"

In [None]:
# Set-up for Google Colab.
import sys

is_colab = True if "google.colab" in sys.modules else False

if is_colab:
    # Mount Google Drive if it is running on Google Colab.
    from google.colab import drive
    drive.mount("/content/drive")

    # Get my kaggle API roken.
    !mkdir -p /root/.kaggle
    !cp /content/drive/MyDrive/.kaggle/kaggle.json /root/.kaggle/
    !chmod 600 /root/.kaggle/kaggle.json

    # Prepare for directory.
    !pip install kaggle
    !apt install unzip
    !mkdir input output
    
    !kaggle competitions download -c {dataset_name}
    !unzip -o {dataset_name}.zip -d input/{dataset_name}

In [None]:
# Load libraries.
import os
# Restrict which CUDA devices are visible to libraries loaded afterwards.
# NOTE(review): the value contains a space after the comma ("0, 1"); confirm
# the CUDA runtime parses this as devices 0 and 1 rather than stopping at "0".
os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1"
import pandas as pd, numpy as np
import matplotlib.pyplot as pyplot

# Notebook/experiment version number. It is not referenced by any other cell in
# this section — presumably used for naming saved artifacts; verify.
VER = 2

## Train data preparation

In [None]:
# Pick the location of train.csv for the current environment, then load it.
if is_kaggle:
    train_csv_path = "/kaggle/input/hms-harmful-brain-activity-classification/train.csv"
else:
    train_csv_path = "./../data/input/train.csv"

train_df = pd.read_csv(train_csv_path)

# The last six columns of the CSV are treated as the target columns.
targets = train_df.columns[-6:]

print("Train shape: ", train_df.shape)
print("Targets: ", list(targets))

train_df.head()

### Create Non-Overlapping Eeg Id Train Data

The competition data description says that the test data does not contain multiple crops from the same eeg_id. Therefore we train and validate using only one crop per eeg_id. See this [discussion](https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/467021) for details.

In [None]:
# Collapse train_df to one row per eeg_id with a single named aggregation:
#   spec_id    - first spectrogram_id seen for the eeg_id
#   min / max  - smallest / largest spectrogram_label_offset_seconds
#   patient_id - first patient_id seen for the eeg_id
#   <targets>  - each target column summed over the eeg_id's rows
#   target     - first expert_consensus label seen for the eeg_id
# Keyword order fixes the output column order.
train_df_grouped_by_eeg_id = train_df.groupby("eeg_id")

train_df_agg = train_df_grouped_by_eeg_id.agg(
    spec_id=("spectrogram_id", "first"),
    min=("spectrogram_label_offset_seconds", "min"),
    max=("spectrogram_label_offset_seconds", "max"),
    patient_id=("patient_id", "first"),
    **{vote_col: (vote_col, "sum") for vote_col in targets},
    target=("expert_consensus", "first"),
)

# Turn the eeg_id group key back into an ordinary column.
train_df_agg = train_df_agg.reset_index()
print("Train non-overlapped eeg_id shape: ", train_df_agg.shape)

train_df_agg.head()

### Feature Engineer
First we need to read in all 11k train spectrogram files. Reading thousands of individual files takes about 11 minutes with Pandas. Instead, we can load a single file from the [Brain-Spectrograms dataset](https://www.kaggle.com/datasets/cdeotte/brain-spectrograms), which contains all 11k spectrograms and loads in less than 1 minute. To use this dataset, set the variable `READ_SPEC_FILES = False`.

Next we need to engineer features for our CatBoost model. In this notebook, we just take the mean (over time) of each of the 400 spectrogram frequencies (using middle 10 minutes). This produces 400 features (per each unique eeg id). We can improve CV and LB score by engineering new features (and/or tuning CatBoost).

Also here we create features from means and mins and use 10 minute windows and 20 second windows.

In [None]:
# True: read every spectrogram parquet file individually.
# False: load all spectrograms at once from the pre-built specs.npy file.
READ_SPEC_FILES = False
# True: compute the spectrogram features in this notebook.
# False: load an already-prepared training frame from train.pqt instead.
FEATURE_ENGINEER = True

In [None]:
%%time
# Read all spectrograms.
spectrograms_dir_path = "./../data/input/train_spectrograms"
spectrogram_files = os.listdir(spectrograms_dir_path)

print(f"There are {(len(spectrogram_files))} spectrogram parquets")

if READ_SPEC_FILES:
    spectrograms = {}
    for index, spectrogram_file in enumerate(spectrogram_files):
        if index % 100 == 0:
            print(f"{index}, ", end="")
        tmp = pd.read_parquet(f"{spectrograms_dir_path}{spectrogram_file}")
        name = int(spectrogram_file.split(".")[0])
        spectrograms[name] = tmp.iloc[:, 1:].values
else:
    spectrograms_npy_path = "./../data/input/brain-spectrograms/specs.npy" if not is_kaggle else "kaggle/input/brain-spectrograms/specs.npy"
    spectrograms = np.load(spectrograms_npy_path, allow_pickle=True).item()

In [None]:
%time
# Engineer features.
import warnings
warnings.filterwarnings("ignore")

spec_cols = pd.read_parquet(os.path.join(spectrograms_dir_path, "1000086677.parquet")).columns[1:]
features = [f"{col}_mean_10m" for col in spec_cols]
features += [f"{col}_min_10m" for col in spec_cols]
features += [f"{col}_mean_20m" for col in spec_cols]
features += [f"{col}_min_20m" for col in spec_cols]

print(f"We are creating {len(features)} features for {len(train_df_agg)} rows...", end="")

if FEATURE_ENGINEER:
    data = np.zeros((len(train_df_agg), len(features)))
    for k in range(len(train_df_agg)):
        if k % 100 == 0:
            print(f"{k}, ", end="")
        
        row = train_df_agg.iloc[k]
        r = int((row['min'] + row['max']) // 4)
        
        # 10 minute window features (means and mins)
        x = np.nanmean(spectrograms[row.spec_id][r:r+300, :], axis=0)
        data[k, :400] = x
        x = np.nanmin(spectrograms[row.spec_id][r:r+300, :], axis=0)
        data[k, 400:800] = x
        
        # 20 second window features (means and mins)
        x = np.nanmean(spectrograms[row.spec_id][r+145:r+155, :], axis=0)
        data[k, 800:1200] = x
        x = np.nanmin(spectrograms[row.spec_id][r+145:r+155, :], axis=0)
        data[k, 1200:1600] = x
        
    train_df_agg[features] = data
else:
    train_pqt_path = "./../data/input/brain-spectrograms/train.pqt" if not is_kaggle else "kaggle/input/brain-spectrograms/train.pqt"
    train_df_agg = pd.read_parquet(train_pqt_path)

print()
print("New train shape: ", train_df_agg.shape)

### Train CatBoost
We use the default settings for CatBoost, which are pretty good. We can tune CatBoost manually to improve CV and LB score. Note that CatBoost will automatically use both Kaggle T4 GPUs when we pass the parameter `task_type='GPU'`, which makes training much faster; the cell below currently trains with `task_type="CPU"`.

In [None]:
# CatBoost is imported under the alias `cat` for the version print below;
# `gc` is used by the training loop to force garbage collection between folds.
import catboost as cat, gc
from catboost import CatBoostClassifier, Pool
print("CatBoost version", cat.__version__)

In [None]:
from sklearn.model_selection import KFold, GroupKFold

# Out-of-fold class probabilities and the matching rows of the target columns,
# collected per fold and concatenated after the loop (both in fold order).
all_oof = []
all_true = []

# Maps the consensus label string in `target` to the integer class id used as
# the CatBoost training label.
tars = {"Seizure": 0, "LPD": 1, "GPD": 2, "LRDA": 3, "GRDA": 4, "Other": 5}

# Group by patient_id so the same patient never appears in both the training
# and the validation part of a fold.
group_k_fold = GroupKFold(n_splits=5)
fold_splits = group_k_fold.split(train_df_agg, train_df_agg.target, train_df_agg.patient_id)

for index, (train_index, valid_index) in enumerate(fold_splits):
    print("#" * 25)
    print(f"### Fold {index+1}")
    print(f"### train size {len(train_index)}, valid size {len(valid_index)}")
    print("#" * 25)

    # model = CatBoostClassifier(task_type="GPU", loss_function="MultiClass")
    model = CatBoostClassifier(task_type="CPU", loss_function="MultiClass")

    # split() yields positional row numbers, so select rows with .iloc.
    # Label-based .loc is only equivalent while the frame has a default
    # RangeIndex, which is not guaranteed when train_df_agg is loaded from disk.
    train_fold = train_df_agg.iloc[train_index]
    valid_fold = train_df_agg.iloc[valid_index]

    train_pool = Pool(
        data=train_fold[features],
        label=train_fold["target"].map(tars),
    )

    valid_pool = Pool(
        data=valid_fold[features],
        label=valid_fold["target"].map(tars),
    )

    model.fit(train_pool, verbose=100, eval_set=valid_pool)

    oof = model.predict_proba(valid_pool)
    all_oof.append(oof)
    # NOTE(review): these are the per-eeg_id summed target columns, not rows
    # normalised to sum to 1 — confirm the downstream metric expects that.
    all_true.append(valid_fold[targets].values)

    # Release per-fold objects before the next fold.
    del train_pool, valid_pool, oof, train_fold, valid_fold
    gc.collect()

all_oof = np.concatenate(all_oof)
all_true = np.concatenate(all_true)

## References
- [CatBoost Starter](https://www.kaggle.com/code/cdeotte/catboost-starter-lb-0-67)