In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from catboost import Pool, cv,CatBoostClassifier

In [None]:
# Read the training frame, the test frame and the submission template from the
# working directory, in that order.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df_reg.csv', 'test_df_reg.csv', 'sample_submission.csv')
)

In [None]:
# Display the column labels of the freshly loaded training frame.
train_df.columns

In [None]:
# Columns passed to CatBoost as categorical. None are enabled; the candidates
# currently left out are pickup_date, pickup_hour, pickup_minute, drop_date,
# drop_hour, drop_minute, pick_cluster, is_more_than_one_day, pickup_timeslot,
# day_of_week and is_weekday.
cat_features = []

# Columns fed to the model. pick_cluster, is_more_than_one_day,
# pickup_timeslot, day_of_week and is_weekday are deliberately not included.
features = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'fare',
    'distance_km',
    'fare_per_km',
    'cal_time_difference',
    # Three predicted_* columns for each of fare, duration and meter_waiting:
    # predicted_<t>, predicted_<t>_difference, predicted_<t>_difference_per_<t>.
    *(
        column
        for target in ('fare', 'duration', 'meter_waiting')
        for column in (
            f'predicted_{target}',
            f'predicted_{target}_difference',
            f'predicted_{target}_difference_per_{target}',
        )
    ),
]

In [None]:
# Separate the target from the predictors: `labels` becomes the raw array of
# the 'label' column, and `train_df` is narrowed to the selected feature
# columns (which do not contain 'label').
# NOTE: this rebinds `train_df`, so the cell cannot be re-run without
# reloading the data first.
labels = train_df.loc[:, 'label'].values
train_df = train_df.loc[:, features]

In [None]:
# Stratified 3-fold splitter consumed by the manual cross-validation loop.
# No shuffle flag or seed is passed, so the splitter's defaults apply.
skf = StratifiedKFold(n_splits=3)

In [None]:
# Keyword arguments for every per-fold CatBoostClassifier: log-loss objective,
# fixed seed, F1 as the evaluation metric, and early stopping after 50 rounds.
params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
)

In [None]:
# Wrap the test-set feature columns in a CatBoost Pool once, so that every
# fold model can predict on the same object.
submission_pool = Pool(data=test_df[features], cat_features=cat_features)

In [None]:
# Manual stratified cross-validation: fit one CatBoost model per fold, record
# its best validation F1, and add its predictions on the test set to a running
# per-row total.
validation_scores = []  # best validation F1 of each fold, in fold order
submission_preds = np.zeros(submission_df.shape[0])  # per-row sum of fold predictions
train_pools = []  # training Pool of each fold, in fold order
models = []  # fitted model of each fold, in fold order
for train_index, test_index in skf.split(train_df, labels):
    X_train, X_test = train_df.iloc[train_index, :], train_df.iloc[test_index, :]
    y_train, y_test = labels[train_index], labels[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**params)
    # The held-out fold is also the eval set that drives early stopping, so the
    # score recorded below is the best F1 observed on that same fold.
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    # The fold score is read from `best_score_`; no separate predict() pass over
    # the held-out fold is needed.
    validation_score = model.best_score_['validation']['F1']
    print('Validation f1', validation_score)
    validation_scores.append(validation_score)
    models.append(model)
    train_pools.append(train_pool)
    submission_preds += model.predict(submission_pool)

In [None]:
# Mean and standard deviation of the per-fold validation F1 scores.
np.mean(validation_scores), np.std(validation_scores)

In [None]:
# Majority vote over the fold models. `submission_preds` holds the per-row sum
# of every model's predictions (assumed to be 0/1 class labels), so a row is
# labelled 1 when strictly more than half of the models voted 1. The cut-off is
# derived from the number of models instead of being hard-coded: a fixed `> 2`
# would demand a unanimous vote with three folds and silently change meaning
# if the fold count changed. With an even number of models a tie yields 0.
n_models = len(models)
submission_df['prediction'] = np.where(submission_preds > n_models / 2, 1, 0)
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Keep the fold model with the highest validation F1 for inspection.
best_model = models[np.argmax(validation_scores)]

In [None]:
# Column labels of the feature frame the models were trained on.
train_df.columns

In [None]:
# Feature importances of the best fold model, requested in prettified form.
best_model.get_feature_importance(prettified=True)

In [None]:
# Plot the tree at index 0 of the best fold model, passing the training Pool of
# that same fold (train_pools and validation_scores share fold order).
best_model.plot_tree(0,train_pools[np.argmax(validation_scores)])

In [None]:
# Parameters for CatBoost's built-in cross-validation: the same settings as
# the per-fold classifier, with per-iteration logging switched off.
cv_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    verbose=False,
)

In [None]:
# Pool over the full feature frame and its labels, used as input to CatBoost's
# built-in cross-validation.
cv_pool = Pool(data=train_df,label=labels,cat_features=cat_features)

In [None]:
# Run CatBoost's built-in cross-validation with 3 folds and plotting enabled;
# the returned per-iteration results are kept in `scores`.
scores = cv(cv_pool,cv_params,plot=True, fold_count=3)

In [None]:
# Report the best mean test F1 together with the fold-to-fold standard
# deviation at that same iteration. Taking the maximum of the std column on its
# own would pair the best mean with the largest spread seen at any iteration,
# which need not be the iteration where the mean peaks.
best_iteration = scores['test-F1-mean'].idxmax()
scores.loc[best_iteration, 'test-F1-mean'], scores.loc[best_iteration, 'test-F1-std']