<a href="https://colab.research.google.com/github/jagnathan/crosstalk-q1-2025/blob/jag/notebooks/3_1_train_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

⚙️ Step 1: Set your notebook to GPU

The next two cells take ~2 min.... start running them now while we talk! 👇👇

In [2]:
# get workshop code
import os
import sys
IN_COLAB = os.getenv("COLAB_RELEASE_TAG")
if IN_COLAB:
    !git clone https://github.com/rajaonsonella/crosstalk-q2-2025
    sys.path.append('./crosstalk-q2-2025')
else:
    sys.path.append('..')
!pip install -r crosstalk-q2-2025/requirements.txt

Cloning into 'crosstalk-q2-2025'...
remote: Enumerating objects: 365, done.[K
remote: Counting objects: 100% (117/117), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 365 (delta 72), reused 41 (delta 27), pack-reused 248 (from 1)[K
Receiving objects: 100% (365/365), 37.10 MiB | 30.29 MiB/s, done.
Resolving deltas: 100% (196/196), done.
Collecting catboost (from -r crosstalk-q2-2025/requirements.txt (line 5))
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting pympler (from -r crosstalk-q2-2025/requirements.txt (line 6))
  Downloading Pympler-1.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdkit (from -r crosstalk-q2-2025/requirements.txt (line 8))
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00

In [3]:
# Download data from google drive
import gdown
import os

file_ids = {'test_inputs' : '1Gyv_ldUTi0Ymy6wVMfruAO0UraCQ70CR',
            'train': '11S5p0QgP1X9rOFiIjNSLydLenJwm7hle'}

for name, file_id in file_ids.items():
    filename = f'crosstalk_{name}.parquet'
    if not os.path.exists(filename):
        gdown.download(id=file_id, output=filename, quiet=False)

Or, if you have the file located in your drive

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Load the train datasets

See the bonus content from last notebook to get a peek under the hood of the data loaders

Or check it out in the files you downloaded to colab on the left 👈

In [4]:
import pandas as pd
import numpy as np
from dataset import basic_dataloader

In [7]:
X_train, y_train = basic_dataloader('/content/crosstalk_train.parquet', x_col="AVALON", y_col = 'DELLabel', max_to_load=1000) # fingerprints available: 'ATOMPAIR', 'MACCS', 'ECFP6', 'ECFP4', 'FCFP4', 'FCFP6', 'TOPTOR', 'RDK', 'AVALON'

Loading chunks:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
X_train.shape

(1000, 2048)

In [9]:
y_train.shape

(1000,)

In [10]:
print(y_train)

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 

# Let's train catboost classifier and see how well it fits the training data

🐞 do you see a CUDA error? raise your hand now and brag about it

In [11]:
%%time
import catboost as cb
from eval import BinaryEvaluator
params = {
                'random_strength': 2, # only non-default hyperparam, default is 1
                'random_seed': 1234,
                'verbose': 0,
                'loss_function': 'Logloss',
                'task_type': 'GPU',
                'devices': '0'
            }
model = cb.CatBoostClassifier(**params)
model.fit(X_train, y_train)
yp = model.predict_proba(X_train)[:, 1] # or validation

CPU times: user 35.5 s, sys: 3.82 s, total: 39.3 s
Wall time: 35.6 s


In [12]:
eval = BinaryEvaluator(X_train.toarray(), y_train)
metric_dict = eval.compute_metrics(yt=y_train, yp=yp) # or validation

In [13]:
for metric_name, metric_value in metric_dict.items():
    print(f'{metric_name:20s}: {metric_value:.2f}')

accuracy            : 1.00
balanced_accuracy   : 1.00
roc_auc             : 1.00
precision           : 1.00
recall              : 1.00
mean_reciprocal_rank: 0.08
positives           : 59.00
predicted_positives : 59.00
hits_at_5           : 0.08
precision_at_5      : 1.00
hits_at_10          : 0.17
precision_at_10     : 1.00
hits_at_30          : 0.51
precision_at_30     : 1.00
hits_at_59          : 1.00
precision_at_59     : 1.00


# How well does it generalize though? Let's try 5-fold cross-validation

In [14]:
%%time
model_cv = cb.CatBoostClassifier(**params)
metric_dict_cv = eval.CV_model(model_cv)

CPU times: user 3min 5s, sys: 18.7 s, total: 3min 24s
Wall time: 2min 58s


In [15]:
for metric_name, metric_value in metric_dict_cv['mean'].items():
    print(f'{metric_name:20s}: {metric_value:.2f}')

accuracy            : 0.95
balanced_accuracy   : 0.60
roc_auc             : 0.79
precision           : 0.82
recall              : 0.20
mrr                 : 0.16
precision_at_k_5    : 0.56
hits_at_k_5         : 0.24
precision_at_k_10   : 0.46
hits_at_k_10        : 0.39
precision_at_k_30   : 0.23
hits_at_k_30        : 0.59


# Submit predictions

Update the next cell with your team name

In [16]:
team_name = 'demo'

In [17]:
%%time
X_test = basic_dataloader('/content/crosstalk_test_inputs.parquet', x_col="AVALON", y_col = None, max_to_load = None, chunk_size = 20000)

Loading chunks:   0%|          | 0/17 [00:00<?, ?it/s]

CPU times: user 3min 13s, sys: 9.38 s, total: 3min 22s
Wall time: 3min 20s


In [18]:
X_test.shape

(339258, 2048)

In [26]:
yp = model.predict_proba(X_test)[:,1]

Upload this baseline to kaggle and check out the leaderboard!

In [27]:
import pyarrow as pa
from pyarrow import parquet as pq

In [36]:
pf = pq.ParquetFile('/content/crosstalk_test_inputs.parquet')

In [37]:
pf.metadata

<pyarrow._parquet.FileMetaData object at 0x7fe6cc5f5300>
  created_by: parquet-cpp-arrow version 14.0.2
  num_columns: 12
  num_rows: 339258
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 59684

In [38]:
preds = pf.read(columns = ['RandomID']).to_pandas()
preds['DELLabel'] = yp
display(preds)

Unnamed: 0,RandomID,DELLabel
0,ID_0,0.004303
1,ID_1,0.007036
2,ID_2,0.000366
3,ID_3,0.002992
4,ID_4,0.006079
...,...,...
339253,ID_339253,0.000963
339254,ID_339254,0.001228
339255,ID_339255,0.007625
339256,ID_339256,0.005785


In [29]:
preds.to_csv(f'{team_name}.csv', index=False)

# Let's compare it against some sklearn baselines

⚠️ these next cells are slow to run! Start them now and come back in 5 minutes

In [39]:
%%time
from eval import get_baseline_models

eval = BinaryEvaluator(X_train, y_train)
baselines = get_baseline_models()
baselines_res = {}

for m in baselines:
    baselines_res[m] = eval.CV_model(baselines[m])

TypeError: X must be a numpy array or pandas DataFrame

In [None]:
# display all the models results
baselines_res.update({'catboost': metric_dict_cv})
pd.DataFrame({model: metrics['mean'] for model, metrics in baselines_res.items()}).T.round(2)