In [131]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from catboost import Pool, cv,CatBoostClassifier

In [132]:
# Read the three CSV inputs from the working directory, in this order:
# training data, test data, and the sample submission file.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df.csv', 'test_df.csv', 'sample_submission.csv')
)

In [133]:
# Preview the first five rows of the training frame.
train_df.head(5)

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,pickup_date,pickup_year,pickup_hour,pickup_minute,drop_date,drop_year,drop_hour,drop_minute,pick_cluster,label
0,10.5,834.0,56.0,0.0,64.0,270.32,1,2019,0,20,1,2019,0,34,3,1
1,10.5,791.0,47.0,0.0,134.0,197.85,1,2019,0,56,1,2019,1,9,3,1
2,10.5,1087.0,80.0,0.0,61.0,301.64,1,2019,1,8,1,2019,1,26,3,1
3,10.5,598.0,271.0,15.6638,68.0,82.3,1,2019,2,27,1,2019,2,37,3,1
4,10.5,3407.0,182.0,0.0,112.0,1065.02,1,2019,5,38,1,2019,6,35,0,1


In [134]:
# Columns passed to CatBoost as categorical features (see the Pool(...) calls
# in the cross-validation loop).
cat_features = [
    # pickup_* columns
    'pickup_date', 'pickup_year', 'pickup_hour', 'pickup_minute',
    # drop_* columns
    'drop_date', 'drop_year', 'drop_hour', 'drop_minute',
    # pickup cluster id
    'pick_cluster',
]

# Split the target off from the features: `labels` is the raw label array and
# `train_df` keeps every other column.
# NOTE: this cell is not re-runnable as-is -- once 'label' has been dropped,
# running it again raises KeyError; reload the data first.
labels = train_df['label'].values
train_df = train_df.drop(columns=['label'])

In [135]:
# 3-fold stratified splitter; rows are not shuffled before splitting.
skf = StratifiedKFold(n_splits=3, shuffle=False)

In [136]:
# Keyword arguments forwarded to CatBoostClassifier(**params).
# NOTE(review): eval_metric='F1' is what CatBoost tracks on the eval set during
# training, while the cross-validation loop reports sklearn's macro-averaged
# F1 -- confirm which of the two is the intended target metric.
params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
)

In [137]:
# Collects one macro-F1 value per cross-validation fold.
validation_scores = list()

In [138]:
# Cross-validation: fit one CatBoost model per fold produced by `skf` and
# record the macro-averaged F1 score on the held-out fold.
#
# Start from an empty score list on every run of this cell. Without this,
# re-executing the cell appends to the scores left over from a previous run,
# and the mean/std computed afterwards mix several runs together.
validation_scores = []

for train_index, test_index in skf.split(train_df, labels):
    # `test_index` selects the held-out fold of the training data (not `test_df`).
    X_train, X_test = train_df.iloc[train_index, :], train_df.iloc[test_index, :]
    y_train, y_test = labels[train_index], labels[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    # A fresh model per fold, so nothing learned on one fold carries over.
    model = CatBoostClassifier(**params)
    # NOTE(review): the held-out fold is used both as the eval set passed to
    # fit() and for the score below, so the reported score may be optimistic.
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    pred = model.predict(test_pool)
    validation_score = f1_score(y_test, pred, average='macro')
    print('Validation f1', validation_score)
    validation_scores.append(validation_score)

Learning rate set to 0.057693
0:	learn: 0.9578992	test: 0.9586792	best: 0.9586792 (0)	total: 19.2ms	remaining: 19.2s
10:	learn: 0.9577806	test: 0.9594481	best: 0.9610390 (2)	total: 209ms	remaining: 18.8s
20:	learn: 0.9579996	test: 0.9593435	best: 0.9610390 (2)	total: 458ms	remaining: 21.3s
30:	learn: 0.9607440	test: 0.9601650	best: 0.9611823 (27)	total: 711ms	remaining: 22.2s
40:	learn: 0.9639707	test: 0.9602568	best: 0.9611823 (27)	total: 996ms	remaining: 23.3s
50:	learn: 0.9653163	test: 0.9586495	best: 0.9612769 (43)	total: 1.27s	remaining: 23.7s
60:	learn: 0.9662668	test: 0.9562238	best: 0.9612769 (43)	total: 1.54s	remaining: 23.8s
70:	learn: 0.9672100	test: 0.9515326	best: 0.9612769 (43)	total: 1.83s	remaining: 23.9s
80:	learn: 0.9676629	test: 0.9460709	best: 0.9612769 (43)	total: 2.09s	remaining: 23.7s
90:	learn: 0.9687131	test: 0.9462013	best: 0.9612769 (43)	total: 2.38s	remaining: 23.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9612769173
bestIteration 

In [139]:
# Mean and (population) standard deviation of the per-fold validation scores.
fold_scores = np.asarray(validation_scores)
fold_scores.mean(), fold_scores.std()

(0.7080453978876703, 0.02469850158496203)