In [64]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from catboost import Pool, cv,CatBoostClassifier

In [65]:
# Read the three CSV inputs; the relative paths resolve against the notebook's working directory.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df.csv', 'test_df.csv', 'sample_submission.csv')
)

In [66]:
# Preview the first five training rows (5 is DataFrame.head's default).
train_df.head(n=5)

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,pickup_date,pickup_year,pickup_hour,pickup_minute,drop_date,drop_year,drop_hour,drop_minute,pick_cluster,drop_cluster,label
0,10.5,834.0,56.0,0.0,64.0,270.32,1,2019,0,20,1,2019,0,34,0,1,1
1,10.5,791.0,47.0,0.0,134.0,197.85,1,2019,0,56,1,2019,1,9,0,1,1
2,10.5,1087.0,80.0,0.0,61.0,301.64,1,2019,1,8,1,2019,1,26,0,1,1
3,10.5,598.0,271.0,15.6638,68.0,82.3,1,2019,2,27,1,2019,2,37,0,1,1
4,10.5,3407.0,182.0,0.0,112.0,1065.02,1,2019,5,38,1,2019,6,35,2,1,1


In [67]:
# Columns handed to CatBoost as categorical features: the pickup/drop time
# components and the pickup/drop cluster ids.
cat_features = [
    'pickup_date', 'pickup_year', 'pickup_hour', 'pickup_minute',
    'drop_date', 'drop_year', 'drop_hour', 'drop_minute',
    'pick_cluster', 'drop_cluster',
]

# Split the target off the feature frame. The guard keeps this cell safe to
# re-run: once 'label' has been dropped, a second run would otherwise raise
# KeyError on train_df['label'].
if 'label' in train_df.columns:
    labels = train_df['label'].values
    train_df = train_df.drop(columns=['label'])

# NOTE(review): every remaining column is passed to the model as a feature,
# including 'Unnamed: 0' (visible in the head() output) — confirm that is intended.

In [68]:
# Three stratified folds. shuffle=False is the default and is spelled out here:
# rows are not shuffled before splitting, so the splits are deterministic.
skf = StratifiedKFold(n_splits=3, shuffle=False)

In [69]:
# Keyword arguments forwarded to CatBoostClassifier(**params) for every fold.
params = dict(
    loss_function='Logloss',     # training objective
    random_state=0,              # fixed seed
    early_stopping_rounds=50,    # stop after 50 rounds without eval-metric improvement
    eval_metric='F1',            # metric reported on the eval set during fitting
)
# NOTE(review): the CV loop scores folds with sklearn's macro-averaged F1, which
# is not necessarily the same quantity as this eval metric — confirm which one
# should drive early stopping.

In [70]:
# Accumulator for the per-fold validation scores produced by the CV loop.
validation_scores = list()

In [71]:
# Start from an empty list every time this cell runs; otherwise re-running the
# cell appends three more scores and any later mean/std mixes stale folds in.
validation_scores = []

for train_index, test_index in skf.split(train_df, labels):
    # Slice features and labels for this fold using the splitter's positional indices.
    X_train, X_test = train_df.iloc[train_index, :], train_df.iloc[test_index, :]
    y_train, y_test = labels[train_index], labels[test_index]

    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    # A fresh model per fold. The held-out pool is also the eval set, so it
    # drives early stopping as well as the score computed below.
    model = CatBoostClassifier(**params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)  # log every 10th iteration

    # Score the fold with macro-averaged F1 and keep it for the summary.
    pred = model.predict(test_pool)
    validation_score = f1_score(y_test, pred, average='macro')
    print('Validation f1', validation_score)
    validation_scores.append(validation_score)

Learning rate set to 0.057693
0:	learn: 0.9529276	test: 0.9528835	best: 0.9528835 (0)	total: 10.8ms	remaining: 10.8s
10:	learn: 0.9580928	test: 0.9584575	best: 0.9589935 (9)	total: 191ms	remaining: 17.2s
20:	learn: 0.9571123	test: 0.9580114	best: 0.9589935 (9)	total: 420ms	remaining: 19.6s
30:	learn: 0.9592836	test: 0.9549583	best: 0.9589935 (9)	total: 652ms	remaining: 20.4s
40:	learn: 0.9640160	test: 0.9605499	best: 0.9607529 (39)	total: 887ms	remaining: 20.8s
50:	learn: 0.9649922	test: 0.9617528	best: 0.9627112 (47)	total: 1.1s	remaining: 20.5s
60:	learn: 0.9662054	test: 0.9563315	best: 0.9627112 (47)	total: 1.32s	remaining: 20.4s
70:	learn: 0.9670547	test: 0.9480158	best: 0.9627112 (47)	total: 1.56s	remaining: 20.4s
80:	learn: 0.9675928	test: 0.9447391	best: 0.9627112 (47)	total: 1.76s	remaining: 19.9s
90:	learn: 0.9683266	test: 0.9439027	best: 0.9627112 (47)	total: 1.97s	remaining: 19.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9627112244
bestIteration = 47

In [72]:
# Mean and standard deviation of the per-fold validation scores, shown as a tuple.
tuple(stat(validation_scores) for stat in (np.mean, np.std))

(0.7056476783366913, 0.02943485629049361)