In [1]:
import warnings
warnings.simplefilter('ignore')

import gc

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
# Display settings. The fully qualified ``display.*`` keys are used because
# pandas resolves option names by pattern: the short forms 'max_columns' and
# 'max_rows' also match ``styler.render.max_columns`` / ``styler.render.max_rows``
# on newer pandas and then raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
# Render floats with six decimals in DataFrame/Series output.
pd.set_option('display.float_format', lambda x: '%.6f' % x)

from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

from catboost import CatBoostClassifier

In [2]:
# Load the pre-built train/test tables.
# NOTE: unpickling executes arbitrary code, so only load pickle files that
# come from a trusted source.
train_path = 'train_04j16p.float16.pickle'
test_path = 'test_04j16p.float16.pickle'
train = pd.read_pickle(train_path)
test = pd.read_pickle(test_path)

In [3]:
# Raw class codes used in the `label` column -> contiguous class indices 0..3.
mapping_dict = dict(zip((21, 1, 4, 5), range(4)))
# Reverse lookup (class index -> raw code), derived from the forward table so
# the two can never disagree.
mapping_dict_inv = {class_index: raw_code for raw_code, class_index in mapping_dict.items()}

In [4]:
# Encode the raw class codes in `label` as class indices via mapping_dict.
# Series.map turns every value that is not a key of mapping_dict into NaN, so
# running this cell a second time (labels are already 0..3) or meeting an
# unexpected code would silently corrupt the target. Fail loudly instead.
encoded_label = train['label'].map(mapping_dict)
if encoded_label.isna().any():
    unmapped = sorted(train.loc[encoded_label.isna(), 'label'].dropna().unique().tolist())
    raise ValueError(
        f'label values without an entry in mapping_dict: {unmapped} '
        '(has this cell already been run on `train`?)'
    )
train['label'] = encoded_label

In [5]:
# Every column except the identifier and the target is used as a model input;
# the original column order of `train` is kept.
non_feature_cols = {'event_id', 'label'}
use_features = [name for name in train.columns if name not in non_feature_cols]

In [6]:
def run_cat(df_train, df_test, use_features):
    """Cross-validate a CatBoost multiclass model and predict the test set.

    A 5-fold ``GroupKFold`` grouped by ``event_id`` is run over ``df_train``.
    Each fold fits a fresh ``CatBoostClassifier`` with the fold's held-out rows
    as ``eval_set``, stores the held-out class probabilities, and adds its
    share of the test-set probabilities to a running average. A per-fold AUC is
    printed; it is computed from the one-hot encoded *argmax* predictions, not
    from the probabilities.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain the ``use_features`` columns plus
        ``'label'`` (class indices 0..3) and ``'event_id'``.
    df_test : pandas.DataFrame
        Rows to predict; must contain the ``use_features`` columns.
    use_features : list of str
        Names of the feature columns fed to the model.

    Returns
    -------
    y_pred : numpy.ndarray, shape (len(df_test), 4)
        Test-set class probabilities averaged over the folds.
    oof_pred : numpy.ndarray, shape (len(df_train), 4)
        Out-of-fold class probabilities, aligned positionally with ``df_train``.
    """
    target = 'label'
    group_col = 'event_id'
    n_classes = 4
    class_ids = np.arange(n_classes)

    oof_pred = np.zeros((len(df_train), n_classes))
    y_pred = np.zeros((len(df_test), n_classes))

    # Loop-invariant: the hyper-parameters are identical for every fold.
    # NOTE(review): CatBoost documents `max_leaves` as applying to the
    # Lossguide grow policy only -- confirm it has any effect here.
    params = {
        'task_type': 'GPU',
        'learning_rate': 0.1,
        'eval_metric': 'MultiClass',
        'loss_function': 'MultiClass',
        'classes_count': n_classes,
        'iterations': 6000,
        'random_seed': 1029,
        'max_depth': 8,
        'max_leaves': 64,
        'reg_lambda': 0.5,
        'early_stopping_rounds': 100
    }

    # Loop-invariant: the test feature matrix does not change between folds.
    x_test = df_test[use_features]

    folds = GroupKFold(n_splits=5)
    # Split the frame that was passed in. The previous version split the
    # notebook-global `train`, which only worked because the caller happened
    # to pass that same object as `df_train`.
    splits = folds.split(df_train, df_train[target], df_train[group_col])
    for fold, (tr_ind, val_ind) in enumerate(splits):
        print(f'Fold {fold + 1}')
        x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind]
        y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]

        model = CatBoostClassifier(**params)

        # NOTE(review): the validation fold drives early stopping and is also
        # the fold that gets scored, so the reported AUC may be optimistic.
        model.fit(x_train,
                  y_train,
                  eval_set=(x_val, y_val),
                  verbose=100)
        oof_pred[val_ind] = model.predict_proba(x_val)
        y_pred += model.predict_proba(x_test) / folds.n_splits

        # `classes` must be passed by keyword: it is keyword-only in current
        # scikit-learn releases (the keyword form also works on older ones).
        y_one_hot = label_binarize(y_val, classes=class_ids)
        oof_one_hot = label_binarize(oof_pred[val_ind].argmax(axis=1), classes=class_ids)
        score = roc_auc_score(y_one_hot, oof_one_hot)
        print('auc: ', score)

        # Release the fold's slices before the next fold allocates its own.
        del x_train, x_val, y_train, y_val
        gc.collect()

    return y_pred, oof_pred

In [7]:
# Run the cross-validated training: `y_pred` holds the fold-averaged test
# probabilities, `oof_pred` the out-of-fold probabilities for `train`.
y_pred, oof_pred = run_cat(df_train=train, df_test=test, use_features=use_features)

Fold 1
0:	learn: -1.3398677	test: -1.3402443	best: -1.3402443 (0)	total: 151ms	remaining: 15m 5s
100:	learn: -0.9229606	test: -0.9287754	best: -0.9287754 (100)	total: 10.7s	remaining: 10m 22s
200:	learn: -0.8734323	test: -0.8844398	best: -0.8844398 (200)	total: 21s	remaining: 10m 4s
300:	learn: -0.8423594	test: -0.8585373	best: -0.8585373 (300)	total: 31.1s	remaining: 9m 48s
400:	learn: -0.8217837	test: -0.8433223	best: -0.8433223 (400)	total: 41.1s	remaining: 9m 34s
500:	learn: -0.8051491	test: -0.8320491	best: -0.8320491 (500)	total: 51s	remaining: 9m 20s
600:	learn: -0.7907867	test: -0.8231499	best: -0.8231499 (600)	total: 1m 1s	remaining: 9m 10s
700:	learn: -0.7797184	test: -0.8176188	best: -0.8176188 (700)	total: 1m 10s	remaining: 8m 56s
800:	learn: -0.7685218	test: -0.8118996	best: -0.8118996 (800)	total: 1m 20s	remaining: 8m 44s
900:	learn: -0.7584074	test: -0.8067992	best: -0.8067966 (899)	total: 1m 30s	remaining: 8m 32s
1000:	learn: -0.7495852	test: -0.8029700	best: -0.8029700

2400:	learn: -0.6548808	test: -0.7740093	best: -0.7740019 (2399)	total: 3m 49s	remaining: 5m 43s
2500:	learn: -0.6494910	test: -0.7728984	best: -0.7728984 (2500)	total: 3m 58s	remaining: 5m 34s
2600:	learn: -0.6439729	test: -0.7715618	best: -0.7715618 (2600)	total: 4m 8s	remaining: 5m 24s
2700:	learn: -0.6389059	test: -0.7707569	best: -0.7707535 (2694)	total: 4m 17s	remaining: 5m 14s
2800:	learn: -0.6337441	test: -0.7700647	best: -0.7700647 (2800)	total: 4m 27s	remaining: 5m 5s
2900:	learn: -0.6286979	test: -0.7690824	best: -0.7690824 (2900)	total: 4m 36s	remaining: 4m 55s
3000:	learn: -0.6238224	test: -0.7684429	best: -0.7684351 (2999)	total: 4m 46s	remaining: 4m 46s
3100:	learn: -0.6192199	test: -0.7678754	best: -0.7678743 (3099)	total: 4m 55s	remaining: 4m 36s
3200:	learn: -0.6138870	test: -0.7663417	best: -0.7663349 (3199)	total: 5m 4s	remaining: 4m 26s
3300:	learn: -0.6095222	test: -0.7657533	best: -0.7657533 (3300)	total: 5m 13s	remaining: 4m 16s
3400:	learn: -0.6048890	test: -0.

4800:	learn: -0.5475966	test: -0.7589151	best: -0.7589102 (4799)	total: 7m 36s	remaining: 1m 54s
4900:	learn: -0.5436251	test: -0.7584151	best: -0.7584146 (4897)	total: 7m 46s	remaining: 1m 44s
5000:	learn: -0.5398850	test: -0.7579899	best: -0.7579820 (4999)	total: 7m 55s	remaining: 1m 34s
5100:	learn: -0.5362132	test: -0.7577225	best: -0.7577157 (5090)	total: 8m 4s	remaining: 1m 25s
5200:	learn: -0.5324558	test: -0.7574014	best: -0.7574014 (5192)	total: 8m 14s	remaining: 1m 15s
5300:	learn: -0.5288299	test: -0.7571670	best: -0.7571547 (5297)	total: 8m 23s	remaining: 1m 6s
5400:	learn: -0.5250779	test: -0.7568949	best: -0.7568949 (5400)	total: 8m 33s	remaining: 57s
5500:	learn: -0.5214042	test: -0.7566357	best: -0.7566357 (5500)	total: 8m 43s	remaining: 47.5s
5600:	learn: -0.5177563	test: -0.7565325	best: -0.7565039 (5594)	total: 8m 53s	remaining: 38s
5700:	learn: -0.5139428	test: -0.7561512	best: -0.7561512 (5700)	total: 9m 2s	remaining: 28.5s
5800:	learn: -0.5104405	test: -0.7560517	

1000:	learn: -0.7485612	test: -0.8041282	best: -0.8041282 (1000)	total: 1m 37s	remaining: 8m 9s
1100:	learn: -0.7393154	test: -0.7999992	best: -0.7999990 (1099)	total: 1m 47s	remaining: 7m 58s
1200:	learn: -0.7314165	test: -0.7972351	best: -0.7972326 (1199)	total: 1m 56s	remaining: 7m 46s
1300:	learn: -0.7244399	test: -0.7953575	best: -0.7953437 (1299)	total: 2m 5s	remaining: 7m 34s
1400:	learn: -0.7161801	test: -0.7919597	best: -0.7919597 (1400)	total: 2m 15s	remaining: 7m 25s
1500:	learn: -0.7094662	test: -0.7900570	best: -0.7900558 (1498)	total: 2m 24s	remaining: 7m 14s
1600:	learn: -0.7023476	test: -0.7877199	best: -0.7877183 (1599)	total: 2m 34s	remaining: 7m 4s
1700:	learn: -0.6961917	test: -0.7862479	best: -0.7862250 (1693)	total: 2m 43s	remaining: 6m 53s
1800:	learn: -0.6900963	test: -0.7852145	best: -0.7852145 (1800)	total: 2m 53s	remaining: 6m 43s
1900:	learn: -0.6837134	test: -0.7836347	best: -0.7836330 (1899)	total: 3m 2s	remaining: 6m 33s
2000:	learn: -0.6775931	test: -0.7

In [8]:
# Overall out-of-fold AUC, computed from the one-hot encoded argmax predictions
# (hard labels), not from the predicted probabilities.
class_ids = np.arange(4)
# `classes` is keyword-only in current scikit-learn releases; passing it
# positionally raises TypeError there.
y_one_hot = label_binarize(train['label'], classes=class_ids)
oof_one_hot = label_binarize(oof_pred.argmax(axis=1), classes=class_ids)
score = roc_auc_score(y_one_hot, oof_one_hot)
print('auc: ', score)

auc:  0.795702682352037


In [9]:
# Build the per-row prediction table: the most probable class index for each
# test row, decoded back to the raw class code.
# NOTE: this rebinds `test` to an `event_id`-only frame, so the training cell
# cannot be re-run afterwards without reloading the full test table.
test = pd.read_pickle('test_04j16p.float16.pickle')[['event_id']]
test['label'] = y_pred.argmax(axis=1)
# Decode with the shared inverse table instead of a second hand-written copy,
# so encoding and decoding cannot drift apart.
test['label'] = test['label'].map(mapping_dict_inv)
test.head()

Unnamed: 0,event_id,label
0,fffecb31f40847b98e2175f689d535fb,5
1,fffe4696d3414a6288287e1b100c96c7,1
2,fffdee97573e44b2a51302dd74f78d78,5
3,fffdcc670e37433bb6227ea7d56dd29e,4
4,fffdbce9055843a5ae34ee9774e77454,1


In [10]:
# Attach the predicted label to every jet through its `event_id`, then keep
# only the jet identifier (renamed to `id`) and the label.
# The merge is pandas' default inner join, so jets whose `event_id` is missing
# from `test` are dropped.
jet_table = pd.read_csv('./jet_complex_data/complex_test_R04_jet.csv')
submission = (
    jet_table[['jet_id', 'event_id']]
    .merge(test, on=['event_id'])
    .drop(['event_id'], axis=1)
    .rename(columns={'jet_id': 'id'})
)
submission.head()

Unnamed: 0,id,label
0,cb3b5c6ea5e441e9b425fc1e6a4a00e6,5
1,ab8fc3fd74e643e68e070a391b8285c2,5
2,838bf5c3f9bc4a2194f51b3c57f403cd,21
3,f250685e3eee4e07ae31d851e72f4864,21
4,133b8f4676214e8191aa3a1da77439d4,21


In [11]:
# Tag the file with the out-of-fold AUC computed above (`score`) rather than a
# hand-copied number, so a re-run cannot write new predictions under a stale
# score.
submission.to_csv(f'submission_cat_{score}.csv', index=False)

In [12]:
# Persist the raw probability arrays, tagged with the out-of-fold AUC computed
# above (`score`) instead of a hand-copied number that goes stale on re-run.
# np.save appends the '.npy' extension itself.
np.save(f'y_pred_cat_{score}', y_pred)
np.save(f'oof_pred_cat_{score}', oof_pred)