In [1]:
import pandas as pd
import catboost
from catboost import *
from catboost import datasets
from sklearn.metrics import log_loss

In [2]:
# Load the training and validation splits from the working directory.
train, validation = map(
    pd.read_csv,
    ('flight_delays_train.csv', 'flight_delays_valid.csv'),
)

In [3]:
# Separate the 'target' label column from the feature columns of each split.
y_train, y_validation = train['target'], validation['target']
X_train, X_validation = (
    frame.drop('target', axis=1) for frame in (train, validation)
)

In [4]:
# Positional indices of the feature columns passed to CatBoost as categorical.
cat_features = [
    0, 1,
    3, 4, 5,
    8, 9, 10,
]

In [5]:
# Wrap each split in a CatBoost Pool, sharing the same categorical column list.
train_pool, valid_pool = (
    Pool(data=features, label=labels, cat_features=cat_features)
    for features, labels in (
        (X_train, y_train),
        (X_validation, y_validation),
    )
)

In [6]:
# Fit a Logloss classifier on the GPU, evaluating on the validation pool while
# training. use_best_model=False keeps every tree that was built rather than
# cutting the model back to the best validation iteration.
model = CatBoostClassifier(
    # Boosting setup
    loss_function='Logloss',
    iterations=10000,
    learning_rate=0.03,
    depth=4,
    l2_leaf_reg=5,
    leaf_estimation_method='Gradient',
    # Randomization
    random_seed=1234,
    random_strength=3,
    bagging_temperature=0,
    # Feature handling
    one_hot_max_size=0,
    max_ctr_complexity=4,
    border_count=255,
    feature_border_type='GreedyLogSum',
    # Overfitting detector
    od_type='IncToDec',
    od_pval=0.2,
    use_best_model=False,
    # Hardware
    task_type='GPU',
    devices='0',
    gpu_ram_part=0.99,
)
model.fit(train_pool, eval_set=valid_pool, verbose=1000, plot=False)


0:	learn: 0.6758829	test: 0.6787720	best: 0.6787720 (0)	total: 17.1ms	remaining: 2m 51s
1000:	learn: 0.3938404	test: 0.3991677	best: 0.3991677 (1000)	total: 15.4s	remaining: 2m 18s
2000:	learn: 0.3755514	test: 0.3918347	best: 0.3918314 (1989)	total: 30.4s	remaining: 2m 1s
3000:	learn: 0.3653346	test: 0.3889134	best: 0.3889134 (3000)	total: 45.5s	remaining: 1m 46s
4000:	learn: 0.3569312	test: 0.3856805	best: 0.3856805 (4000)	total: 1m	remaining: 1m 30s
5000:	learn: 0.3509050	test: 0.3842066	best: 0.3842043 (4998)	total: 1m 15s	remaining: 1m 15s
6000:	learn: 0.3448954	test: 0.3826773	best: 0.3826589 (5949)	total: 1m 30s	remaining: 1m
7000:	learn: 0.3399561	test: 0.3817894	best: 0.3817894 (7000)	total: 1m 46s	remaining: 45.7s
8000:	learn: 0.3349211	test: 0.3807782	best: 0.3807349 (7823)	total: 2m 3s	remaining: 30.7s
bestTest = 0.3807348828
bestIteration = 7823


<catboost.core.CatBoostClassifier at 0x7fbcdad0c898>

In [7]:
# Number of trees in the fitted model.
model.tree_count_

8006

In [8]:
# Final Logloss recorded during training, as (learn, validation).
tuple(
    model.get_evals_result()[split]['Logloss'][-1]
    for split in ('learn', 'validation')
)

(0.33490703125, 0.3807861328125)

In [9]:
# Logloss recomputed after training via eval_metrics, taking the value at the
# last iteration, as (train, validation).
tuple(
    model.eval_metrics(pool, metrics=['Logloss'])['Logloss'][-1]
    for pool in (train_pool, valid_pool)
)

(0.34002411199275573, 0.4240669927488715)

In [10]:
# Independent cross-check with scikit-learn: log loss of the predicted
# probabilities in column 1 of predict_proba against each pool's labels,
# as (train, validation). log_loss is imported in the notebook's first cell.
tuple(
    log_loss(pool.get_label(), model.predict_proba(pool)[:, 1])
    for pool in (train_pool, valid_pool)
)

(0.33876961096881647, 0.42406699274887194)