In [1]:
import warnings
warnings.simplefilter('ignore')

import gc

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
# Use fully-qualified option keys: short patterns such as 'max_columns' are
# regex-matched against every registered option and raise OptionError once
# more than one option matches (e.g. alongside the styler.render.* options).
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
# Render floats with six decimals in DataFrame/Series output.
pd.set_option('display.float_format', '{:.6f}'.format)

from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

from catboost import CatBoostClassifier

In [2]:
# Load the pre-built train/test feature frames.
# NOTE(review): pickle files execute arbitrary code when loaded — only read
# files produced by a trusted pipeline.
train, test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train_04j20p_fe_v3.pickle', 'test_04j20p_fe_v3.pickle')
)

In [3]:
# Raw label ids (21, 1, 4, 5) -> contiguous class indices 0..3.
mapping_dict = dict(zip((21, 1, 4, 5), range(4)))
# Reverse lookup: class index -> raw label id, used when writing predictions.
mapping_dict_inv = {class_index: raw_id for raw_id, class_index in mapping_dict.items()}

In [4]:
# Encode the raw label ids as class indices 0..3 via mapping_dict.
# Series.map yields NaN for values that are not keys of mapping_dict, so
# re-running this cell on already-encoded labels (0, 2, 3 are not keys) would
# silently corrupt the target. Fail loudly instead of writing NaN labels.
encoded_label = train['label'].map(mapping_dict)
if encoded_label.isna().any():
    unmapped_values = train.loc[encoded_label.isna(), 'label'].unique().tolist()
    raise ValueError(
        f'Cannot encode label values {unmapped_values}: expected only '
        f'{list(mapping_dict)}. Was this cell already executed?'
    )
train['label'] = encoded_label

In [6]:
# Model inputs: every train column except the event identifier and the target.
non_feature_cols = ('event_id', 'label')
use_features = [name for name in train.columns if name not in non_feature_cols]

In [7]:
def run_cat(df_train, df_test, use_features):
    """Train a 5-fold, event-grouped CatBoost multiclass model.

    Folds come from ``GroupKFold`` keyed on ``df_train['event_id']``, so rows
    that share an event id never land on both sides of a split. One model is
    fitted per fold with early stopping on that fold's validation part.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Must contain ``use_features``, the ``'label'`` column
        (class indices 0..3) and the ``'event_id'`` column.
    df_test : pandas.DataFrame
        Rows to predict. Must contain ``use_features``.
    use_features : list of str
        Names of the feature columns fed to the model.

    Returns
    -------
    y_pred : numpy.ndarray, shape (len(df_test), 4)
        Class probabilities for ``df_test``, averaged over the fold models.
    oof_pred : numpy.ndarray, shape (len(df_train), 4)
        Out-of-fold class probabilities for ``df_train``.
    """
    target = 'label'
    n_classes = 4
    class_ids = np.arange(n_classes)

    oof_pred = np.zeros((len(df_train), n_classes))
    y_pred = np.zeros((len(df_test), n_classes))

    # Hyper-parameters are identical for every fold, so build them once.
    params = {
        'task_type': 'GPU',  # NOTE(review): presumably needs a CUDA device — confirm before running on CPU-only hosts
        'learning_rate': 0.1,
        'eval_metric': 'MultiClass',
        'loss_function': 'MultiClass',
        'classes_count': n_classes,
        'iterations': 10000,
        'random_seed': 1029,
        'max_depth': 8,
        'max_leaves': 64,
        'reg_lambda': 0.5,
        'early_stopping_rounds': 100
    }

    folds = GroupKFold(n_splits=5)
    # Split the frame that was passed in (not the notebook-global `train`), so
    # the indices always refer to df_train's own rows.
    fold_indices = folds.split(df_train, df_train[target], df_train['event_id'])
    for fold, (tr_ind, val_ind) in enumerate(fold_indices):
        print(f'Fold {fold + 1}')
        x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind]
        y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]

        model = CatBoostClassifier(**params)

        model.fit(x_train,
                  y_train,
                  eval_set=(x_val, y_val),
                  verbose=100)
        oof_pred[val_ind] = model.predict_proba(x_val)
        # Each fold model contributes an equal share of the test prediction.
        y_pred += model.predict_proba(df_test[use_features]) / folds.n_splits

        # The fold score is the AUC between one-hot true labels and one-hot
        # *hard* (argmax) predictions, not the predicted probabilities.
        # `classes` must be passed by keyword: it is keyword-only in current
        # scikit-learn releases.
        y_one_hot = label_binarize(y_val, classes=class_ids)
        oof_one_hot = label_binarize(oof_pred[val_ind].argmax(axis=1), classes=class_ids)
        score = roc_auc_score(y_one_hot, oof_one_hot)
        print('auc: ', score)

        # Release the per-fold slices before the next fold is materialised.
        del x_train, x_val, y_train, y_val
        gc.collect()

    return y_pred, oof_pred

In [8]:
# Fit the cross-validated model; keep the averaged test probabilities and the
# out-of-fold train probabilities for scoring and for saving to disk.
y_pred, oof_pred = run_cat(df_train=train, df_test=test, use_features=use_features)

Fold 1
0:	learn: -1.3387009	test: -1.3392112	best: -1.3392112 (0)	total: 83.2ms	remaining: 13m 51s
100:	learn: -0.9138571	test: -0.9192387	best: -0.9192387 (100)	total: 5.57s	remaining: 9m 5s
200:	learn: -0.8647482	test: -0.8749943	best: -0.8749943 (200)	total: 10.8s	remaining: 8m 48s
300:	learn: -0.8358561	test: -0.8513758	best: -0.8513758 (300)	total: 16.1s	remaining: 8m 39s
400:	learn: -0.8123537	test: -0.8332126	best: -0.8332126 (400)	total: 21.4s	remaining: 8m 33s
500:	learn: -0.7961787	test: -0.8222627	best: -0.8222627 (500)	total: 26.6s	remaining: 8m 25s
600:	learn: -0.7826632	test: -0.8145351	best: -0.8145351 (600)	total: 31.9s	remaining: 8m 18s
700:	learn: -0.7701508	test: -0.8075624	best: -0.8075624 (700)	total: 37s	remaining: 8m 11s
800:	learn: -0.7598803	test: -0.8028268	best: -0.8028268 (800)	total: 42.1s	remaining: 8m 3s
900:	learn: -0.7492385	test: -0.7975059	best: -0.7975059 (900)	total: 47.2s	remaining: 7m 56s
1000:	learn: -0.7406135	test: -0.7942950	best: -0.7942950 (

1200:	learn: -0.7223182	test: -0.7855482	best: -0.7855463 (1199)	total: 1m 1s	remaining: 7m 29s
1300:	learn: -0.7149451	test: -0.7834298	best: -0.7834298 (1300)	total: 1m 6s	remaining: 7m 22s
1400:	learn: -0.7069607	test: -0.7805632	best: -0.7805632 (1400)	total: 1m 11s	remaining: 7m 16s
1500:	learn: -0.7005104	test: -0.7794867	best: -0.7794867 (1500)	total: 1m 16s	remaining: 7m 10s
1600:	learn: -0.6939338	test: -0.7776323	best: -0.7776155 (1598)	total: 1m 20s	remaining: 7m 4s
1700:	learn: -0.6875503	test: -0.7761503	best: -0.7761503 (1700)	total: 1m 25s	remaining: 6m 58s
1800:	learn: -0.6815684	test: -0.7751161	best: -0.7751154 (1799)	total: 1m 30s	remaining: 6m 52s
1900:	learn: -0.6753235	test: -0.7737024	best: -0.7737024 (1900)	total: 1m 35s	remaining: 6m 47s
2000:	learn: -0.6691664	test: -0.7723104	best: -0.7723104 (2000)	total: 1m 40s	remaining: 6m 41s
2100:	learn: -0.6633433	test: -0.7712012	best: -0.7711981 (2099)	total: 1m 45s	remaining: 6m 36s
2200:	learn: -0.6580928	test: -0.

2200:	learn: -0.6574360	test: -0.7682258	best: -0.7682169 (2199)	total: 1m 49s	remaining: 6m 29s
2300:	learn: -0.6518908	test: -0.7672591	best: -0.7672591 (2300)	total: 1m 54s	remaining: 6m 23s
2400:	learn: -0.6467321	test: -0.7664753	best: -0.7664753 (2400)	total: 1m 59s	remaining: 6m 18s
2500:	learn: -0.6416691	test: -0.7656695	best: -0.7656695 (2500)	total: 2m 4s	remaining: 6m 12s
2600:	learn: -0.6363703	test: -0.7646070	best: -0.7646070 (2600)	total: 2m 9s	remaining: 6m 7s
2700:	learn: -0.6308888	test: -0.7634849	best: -0.7634849 (2700)	total: 2m 14s	remaining: 6m 2s
2800:	learn: -0.6256113	test: -0.7626861	best: -0.7626672 (2793)	total: 2m 18s	remaining: 5m 56s
2900:	learn: -0.6203705	test: -0.7619407	best: -0.7619407 (2900)	total: 2m 23s	remaining: 5m 52s
3000:	learn: -0.6154145	test: -0.7613700	best: -0.7613643 (2999)	total: 2m 28s	remaining: 5m 47s
3100:	learn: -0.6106351	test: -0.7604568	best: -0.7604557 (3099)	total: 2m 33s	remaining: 5m 42s
3200:	learn: -0.6056487	test: -0.7

2500:	learn: -0.6408979	test: -0.7685210	best: -0.7685210 (2500)	total: 2m 4s	remaining: 6m 13s
2600:	learn: -0.6357059	test: -0.7677434	best: -0.7677374 (2598)	total: 2m 9s	remaining: 6m 8s
2700:	learn: -0.6304468	test: -0.7670177	best: -0.7670177 (2700)	total: 2m 14s	remaining: 6m 3s
2800:	learn: -0.6251735	test: -0.7662106	best: -0.7661979 (2798)	total: 2m 19s	remaining: 5m 58s
2900:	learn: -0.6201649	test: -0.7653516	best: -0.7653516 (2900)	total: 2m 24s	remaining: 5m 53s
3000:	learn: -0.6152366	test: -0.7647030	best: -0.7647030 (3000)	total: 2m 29s	remaining: 5m 48s
3100:	learn: -0.6102738	test: -0.7638594	best: -0.7638517 (3090)	total: 2m 34s	remaining: 5m 42s
3200:	learn: -0.6056880	test: -0.7635096	best: -0.7635048 (3193)	total: 2m 39s	remaining: 5m 37s
3300:	learn: -0.6011248	test: -0.7630575	best: -0.7630542 (3296)	total: 2m 43s	remaining: 5m 32s
3400:	learn: -0.5961263	test: -0.7621543	best: -0.7621543 (3400)	total: 2m 48s	remaining: 5m 27s
3500:	learn: -0.5912663	test: -0.7

2200:	learn: -0.6570665	test: -0.7731875	best: -0.7731875 (2200)	total: 1m 50s	remaining: 6m 30s
2300:	learn: -0.6514122	test: -0.7719129	best: -0.7719102 (2299)	total: 1m 55s	remaining: 6m 24s
2400:	learn: -0.6458749	test: -0.7705354	best: -0.7705351 (2399)	total: 1m 59s	remaining: 6m 19s
2500:	learn: -0.6401952	test: -0.7691820	best: -0.7691820 (2500)	total: 2m 4s	remaining: 6m 14s
2600:	learn: -0.6349869	test: -0.7683511	best: -0.7683288 (2596)	total: 2m 9s	remaining: 6m 9s
2700:	learn: -0.6296842	test: -0.7674680	best: -0.7674453 (2691)	total: 2m 14s	remaining: 6m 3s
2800:	learn: -0.6243687	test: -0.7665114	best: -0.7665114 (2800)	total: 2m 19s	remaining: 5m 58s
2900:	learn: -0.6193181	test: -0.7657344	best: -0.7657344 (2900)	total: 2m 24s	remaining: 5m 53s
3000:	learn: -0.6141029	test: -0.7648664	best: -0.7648650 (2999)	total: 2m 29s	remaining: 5m 48s
3100:	learn: -0.6088191	test: -0.7636356	best: -0.7636272 (3098)	total: 2m 34s	remaining: 5m 43s
3200:	learn: -0.6040906	test: -0.7

In [9]:
# Overall out-of-fold score: AUC between one-hot true labels and one-hot hard
# (argmax) predictions. `classes` is passed by keyword because it is
# keyword-only in current scikit-learn releases.
y_one_hot = label_binarize(train['label'], classes=np.arange(4))
oof_one_hot = label_binarize(oof_pred.argmax(axis=1), classes=np.arange(4))
score = roc_auc_score(y_one_hot, oof_one_hot)
print('auc: ', score)

auc:  0.7990429584295026


In [10]:
# Re-read only the event ids of the test set; .copy() makes the frame an
# independent object so adding a column does not write into a slice.
# NOTE(review): this rebinds `test` to a frame without feature columns, so the
# training cell cannot be re-run afterwards without reloading the data.
test = pd.read_pickle('test_04j20p_fe_v3.pickle')[['event_id']].copy()
# Most probable class index per row ...
test["label"] = y_pred.argmax(axis=1)
# ... translated back to the raw label ids with the shared inverse mapping
# rather than a second hand-written copy of it.
test["label"] = test["label"].map(mapping_dict_inv)
test.head()

Unnamed: 0,event_id,label
0,fffecb31f40847b98e2175f689d535fb,5
1,fffe4696d3414a6288287e1b100c96c7,4
2,fffdee97573e44b2a51302dd74f78d78,5
3,fffdcc670e37433bb6227ea7d56dd29e,4
4,fffdbce9055843a5ae34ee9774e77454,1


In [11]:
# Attach the predicted label to every jet through its event id (default inner
# join), then keep only the jet id, renamed to `id`, next to the label.
submission = (
    pd.read_csv('./jet_complex_data/complex_test_R04_jet.csv')[['jet_id', 'event_id']]
    .merge(test, on=['event_id'])
    .drop(['event_id'], axis=1)
    .rename(columns={'jet_id': 'id'})
)
submission.head()

Unnamed: 0,id,label
0,cb3b5c6ea5e441e9b425fc1e6a4a00e6,5
1,ab8fc3fd74e643e68e070a391b8285c2,5
2,838bf5c3f9bc4a2194f51b3c57f403cd,21
3,f250685e3eee4e07ae31d851e72f4864,21
4,133b8f4676214e8191aa3a1da77439d4,21


In [12]:
# Tag the file with the OOF score held in `score` instead of a pasted literal,
# so the name cannot go stale when the notebook is re-run.
submission.to_csv(f'submission_cat_{score}.csv', index=False)

In [13]:
# Persist the raw probabilities, tagged with the OOF score held in `score`
# rather than a pasted literal (np.save appends the .npy extension).
np.save(f'y_pred_cat_{score}', y_pred)
np.save(f'oof_pred_cat_{score}', oof_pred)