In [33]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from catboost import Pool, cv,CatBoostClassifier

In [8]:
# Read the three CSV inputs from the working directory into DataFrames:
# the training data, the test data and the sample submission file.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df.csv', 'test_df.csv', 'sample_submission.csv')
)

In [9]:
# Preview the first five rows of the training frame (5 is the head() default).
train_df.head(n=5)

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,pickup_date,pickup_year,pickup_hour,pickup_minute,drop_date,drop_year,drop_hour,drop_minute,label
0,10.5,834.0,56.0,0.0,64.0,6.86252,79.8993,6.9033,79.8783,270.32,1,2019,0,20,1,2019,0,34,0
1,10.5,791.0,47.0,0.0,134.0,6.88589,79.8984,6.91373,79.8923,197.85,1,2019,0,56,1,2019,1,9,0
2,10.5,1087.0,80.0,0.0,61.0,6.90839,79.8651,6.93669,79.9146,301.64,1,2019,1,8,1,2019,1,26,0
3,10.5,598.0,271.0,15.6638,68.0,6.9257,79.8895,6.92748,79.8971,82.3,1,2019,2,27,1,2019,2,37,0
4,10.5,3407.0,182.0,0.0,112.0,7.13402,79.8969,6.91865,79.8649,1065.02,1,2019,5,38,1,2019,6,35,0


In [12]:
# Predictor column names: every column shown in the train_df.head() preview
# except 'Unnamed: 0' and the target 'label'.
features = [
    'additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
    'meter_waiting_till_pickup', 'pick_lat', 'pick_lon', 'drop_lat',
    'drop_lon', 'fare', 'pickup_date', 'pickup_year', 'pickup_hour',
    'pickup_minute', 'drop_date', 'drop_year', 'drop_hour', 'drop_minute',
]

# Subset of the columns above that is handed to CatBoost as categorical.
cat_features = [
    'pickup_date', 'pickup_year', 'pickup_hour', 'pickup_minute',
    'drop_date', 'drop_year', 'drop_hour', 'drop_minute',
]

# Split the target off the training frame.
# The split is guarded so that re-running this cell after 'label' has already
# been removed is a no-op; unguarded, the second run raised a KeyError because
# `train_df` had been rebound to the frame without the column.
if 'label' in train_df.columns:
    labels = train_df['label'].values
    train_df = train_df.drop(columns=['label'])
elif 'labels' not in globals():
    # Neither a 'label' column nor previously extracted labels: fail loudly
    # here rather than with a NameError in a later cell.
    raise KeyError("'label' column not found in train_df and no labels were extracted earlier")

In [71]:
# Stratified 5-fold splitter used for cross-validation.
# shuffle=False is the scikit-learn default and is only spelled out here.
skf = StratifiedKFold(n_splits=5, shuffle=False)

In [72]:
# Keyword arguments unpacked into CatBoostClassifier(**params) for every fold.
params = dict(
    loss_function='Logloss',        # training objective
    random_state=0,                 # fixed seed
    early_stopping_rounds=50,       # early-stopping window, in iterations
    cat_features=cat_features,      # categorical columns listed in an earlier cell
    eval_metric='F1',               # metric reported for learn/test in the training log
)

In [73]:
# Accumulator for the per-fold validation F1 scores.
validation_scores = list()

In [74]:
# Cross-validate a CatBoost classifier over the folds produced by `skf`.
#
# The score list is (re)initialised here so that re-running this cell on its
# own starts from an empty list instead of appending to scores left over from
# an earlier run, which would distort the mean/std computed from it.
validation_scores = []

# NOTE(review): `train_df` is passed with every remaining column, including the
# 'Unnamed: 0' column visible in the head() preview; the `features` list is not
# applied here. Confirm whether training on that column is intended.
# NOTE(review): each fold's held-out part serves both as the early-stopping
# eval_set and as the data for the reported score, and the 'F1' eval_metric in
# `params` is a different call from the macro-averaged f1_score used below.
# Confirm both choices are intended.
for train_index, test_index in skf.split(train_df, labels):
    # Positional row selection for this fold.
    X_train, X_test = train_df.iloc[train_index], train_df.iloc[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    # Fresh model per fold; training progress is logged every 10 iterations.
    model = CatBoostClassifier(**params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)

    # Score the held-out part of the fold with macro-averaged F1.
    pred = model.predict(test_pool)
    validation_score = f1_score(y_test, pred, average='macro')
    print('Validation f1',validation_score)
    validation_scores.append(validation_score)

Learning rate set to 0.06035
0:	learn: 0.3092917	test: 0.2936709	best: 0.2936709 (0)	total: 22.6ms	remaining: 22.6s
10:	learn: 0.2565556	test: 0.2808989	best: 0.3038674 (4)	total: 167ms	remaining: 15s
20:	learn: 0.3055747	test: 0.3215259	best: 0.3215259 (20)	total: 317ms	remaining: 14.8s
30:	learn: 0.3678312	test: 0.4183673	best: 0.4263959 (29)	total: 445ms	remaining: 13.9s
40:	learn: 0.4135290	test: 0.4668305	best: 0.4679803 (36)	total: 585ms	remaining: 13.7s
50:	learn: 0.4480398	test: 0.4854369	best: 0.5000000 (45)	total: 714ms	remaining: 13.3s
60:	learn: 0.4858006	test: 0.5270588	best: 0.5270588 (59)	total: 854ms	remaining: 13.1s
70:	learn: 0.5138807	test: 0.5429234	best: 0.5441860 (67)	total: 974ms	remaining: 12.7s
80:	learn: 0.5273574	test: 0.5714286	best: 0.5746606 (78)	total: 1.11s	remaining: 12.7s
90:	learn: 0.5377957	test: 0.5778781	best: 0.5778781 (83)	total: 1.24s	remaining: 12.4s
100:	learn: 0.5503432	test: 0.5874439	best: 0.5874439 (98)	total: 1.39s	remaining: 12.4s
110:	l

In [76]:
# Mean and standard deviation of the per-fold validation scores, as a tuple.
tuple(stat(validation_scores) for stat in (np.mean, np.std))

(0.7667179255603418, 0.050398366389929075)