# Phase 3-1: HP Tuning for CatBoost at $t=2$

## Setup

First, let's import a few common modules and seed the random number generators so that this notebook's output is stable across runs.

In [None]:
# Python ≥3.6 is required: the data-loading cell builds its file path with an f-string
import sys
assert sys.version_info >= (3, 6), "This notebook requires Python 3.6 or newer"

# Common imports
import numpy as np
import pandas as pd
import os

# Seed both Python's and NumPy's global random number generators with one
# shared value so that this notebook's output is stable across runs
import random

seed_val = 32
random.seed(seed_val)
np.random.seed(seed_val)


## Data Loading

In [2]:
# `seed` only selects which training file is read (it is part of the file name).
# NOTE(review): it differs from `seed_val` (32) used to seed the RNGs in the
# setup cell -- presumably intentional; confirm.
seed = 42
train_path = f"datasets/train_set_{seed}_t_2.csv"
df = pd.read_csv(train_path)

In [3]:
# Separate the target column from the features. `y` is an independent copy of
# the target; `X` is a new frame holding every other column of `df`.
target_col = 'is_drop'
y = df[target_col].copy()
X = df.drop(columns=[target_col])

## Data Transformation

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns that get standardised.
num_attrs = [
    'grade',
    'gpa_last_seme',
    'credits_last_seme',
    'credits_tot',
    'n_seme',
    'years_since',
]
# Columns that are forwarded untouched (no encoding is applied here).
cat_attrs = [
    'semester',
    'sex',
    'adm_unit',
    'nation',
    'in_capa',
    'college',
    'leave',
]

# Standard-scale the numeric columns and pass the categorical ones through.
# Columns of X that appear in neither list are not forwarded by this transformer.
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_attrs),
        ('cat', 'passthrough', cat_attrs),
    ]
)

# Fit on X, transform it, and wrap the result in a DataFrame whose column
# names are the ones reported by the fitted transformer.
X_t = pd.DataFrame(
    full_pipeline.fit_transform(X),
    columns=full_pipeline.get_feature_names_out(),
)

## Hyper-parameter Tuning

5-fold grid search over `iterations` and `depth`, scored by average precision.

In [11]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# NOTE(review): `cat_clf` and `cat_indices` are not defined in any visible cell
# of this notebook (execution counts jump from 4 to 11), so this cell appears to
# rely on leftover kernel state and would fail after Restart & Run All.
# Restore the cell(s) that build the classifier and the categorical column
# indices before this one.

# Candidate values: 2 settings of `iterations` x 5 settings of `depth`.
param_grid_cat = [
    {
        'iterations': [800, 1000],
        'depth': [6, 7, 8, 9, 10],
    }
]

# 5-fold cross-validated search, ranked by average precision; training-fold
# scores are recorded alongside the test-fold scores.
grid_search_cat = GridSearchCV(
    estimator=cat_clf,
    param_grid=param_grid_cat,
    cv=5,
    scoring='average_precision',
    return_train_score=True,
)

# `cat_features` is forwarded to the estimator's own fit() on every fit.
grid_search_cat.fit(X_t, y, cat_features=cat_indices)

Learning rate set to 0.045743
0:	learn: 0.6185146	total: 44.6ms	remaining: 35.7s
1:	learn: 0.5495106	total: 86.9ms	remaining: 34.7s
2:	learn: 0.4988487	total: 126ms	remaining: 33.5s
3:	learn: 0.4528797	total: 161ms	remaining: 32.1s
4:	learn: 0.4144897	total: 199ms	remaining: 31.7s
5:	learn: 0.3824345	total: 238ms	remaining: 31.5s
6:	learn: 0.3589259	total: 262ms	remaining: 29.7s
7:	learn: 0.3350529	total: 304ms	remaining: 30.1s
8:	learn: 0.3135857	total: 343ms	remaining: 30.2s
9:	learn: 0.2921087	total: 382ms	remaining: 30.1s
10:	learn: 0.2755393	total: 425ms	remaining: 30.5s
11:	learn: 0.2631575	total: 465ms	remaining: 30.5s
12:	learn: 0.2522337	total: 510ms	remaining: 30.9s
13:	learn: 0.2388307	total: 550ms	remaining: 30.9s
14:	learn: 0.2303063	total: 586ms	remaining: 30.7s
15:	learn: 0.2197370	total: 621ms	remaining: 30.4s
16:	learn: 0.2115351	total: 655ms	remaining: 30.2s
17:	learn: 0.2030106	total: 693ms	remaining: 30.1s
18:	learn: 0.1981979	total: 736ms	remaining: 30.3s
19:	learn

In [12]:
# Print the mean cross-validated test score next to each parameter combination.
cvres = grid_search_cat.cv_results_
mean_scores, candidates = cvres['mean_test_score'], cvres['params']
for ap_score, params in zip(mean_scores, candidates):
    print(ap_score, params)

0.4845455596200555 {'depth': 6, 'iterations': 800}
0.48824030880399255 {'depth': 6, 'iterations': 1000}
0.48433423690108 {'depth': 7, 'iterations': 800}
0.48081949955245157 {'depth': 7, 'iterations': 1000}
0.478373903655552 {'depth': 8, 'iterations': 800}
0.4818731765013161 {'depth': 8, 'iterations': 1000}
0.4747980751900453 {'depth': 9, 'iterations': 800}
0.4747024762948119 {'depth': 9, 'iterations': 1000}
0.46369136467040395 {'depth': 10, 'iterations': 800}
0.4617870280568611 {'depth': 10, 'iterations': 1000}


In [13]:
# Show the parameter combination selected by the grid search.
grid_search_cat.best_params_

{'depth': 6, 'iterations': 1000}

In [14]:
# Take the estimator selected by the grid search and score the same X_t the
# search was fitted on, so these are in-sample predictions.
cat_final = grid_search_cat.best_estimator_
train_proba = cat_final.predict_proba(X_t)
# Keep column 1 only -- presumably the probability of is_drop == 1; confirm
# against cat_final.classes_.
scores_cat = train_proba[:, 1]