In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

from catboost import Pool, cv,CatBoostRegressor

In [20]:
# Read the three input CSV files from the working directory, in this order:
# training table, test table, sample submission.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df.csv', 'test_df.csv', 'sample_submission.csv')
)

In [21]:
# Model-fitting subset: rows whose `label` equals 1, without the `label` column,
# and without any row that has a missing value in any remaining column.
data = (
    train_df
    .loc[train_df['label'] == 1]
    .drop(columns=['label'])
    .dropna()
)

In [22]:
# Display the column index of the filtered frame to see which fields are available.
data.columns

Index(['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup', 'fare', 'pickup_date', 'pickup_hour',
       'pickup_minute', 'drop_date', 'drop_hour', 'drop_minute',
       'pick_cluster', 'is_more_than_one_day', 'distance_km', 'fare_per_km'],
      dtype='object')

In [23]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    'pickup_date', 'pickup_hour', 'pickup_minute',
    'drop_date', 'drop_hour', 'drop_minute',
    'pick_cluster', 'is_more_than_one_day',
]

# Full list of model input columns, in the order they are fed to the model:
# five numeric meter/duration fields, then every categorical column, then the
# trip distance. `fare` (the regression target) and `fare_per_km` are not included.
features = [
    'additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
    'meter_waiting_till_pickup',
    *cat_features,
    'distance_km',
]

In [24]:
# Design matrix: the selected feature columns of the filtered training rows.
X = data.loc[:, features]
# Regression target as a NumPy array, so it can be indexed with the positional
# indices produced by the cross-validation splitter.
y = data['fare'].to_numpy()

In [30]:
# Feature-only frames for the complete (unfiltered) train and test tables.
train_df_X, test_df_X = train_df[features], test_df[features]

# Unlabelled CatBoost pools over those frames; the fold models predict on these.
train_df_pool = Pool(train_df_X, cat_features=cat_features)
test_df_pool = Pool(test_df_X, cat_features=cat_features)

In [31]:
# Running sums of the per-fold predictions: one zero-initialised slot per row
# of the full train table and of the test table.
train_df_preds = np.zeros(len(train_df))
test_df_preds = np.zeros(len(test_df))

In [26]:
# Keyword arguments forwarded to every CatBoostRegressor built in the CV loop:
# RMSE is both the optimised loss and the reported metric, the seed is fixed,
# and early stopping is configured with a 50-round patience.
params = dict(
    loss_function='RMSE',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='RMSE',
)

In [27]:
# 3-fold cross-validation splitter. `shuffle` is left at its default, so the rows
# are not shuffled before splitting.
# NOTE(review): if the row order of `data` is not meaningful, consider
# KFold(n_splits=3, shuffle=True, random_state=0) — confirm before changing.
kf = KFold(n_splits=3)

In [32]:
# Per-fold training: fit one CatBoost regressor per KFold split of (X, y), record
# its best validation RMSE, and add its predictions for the full train and test
# tables to the running sums.

# The accumulators are (re)created here, in the same cell as the `+=` below, so
# that re-running this cell starts from zero instead of stacking a second round
# of predictions on top of the previous run.
train_df_preds = np.zeros(train_df.shape[0])
test_df_preds = np.zeros(test_df.shape[0])

validation_scores = []  # best validation RMSE of each fold
models = []             # fitted model of each fold, in split order
for train_index, test_index in kf.split(X, y):
    # The splitter yields positional indices, hence .iloc for the frame and
    # plain array indexing for the target.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    model = CatBoostRegressor(**params)
    # The held-out fold is the eval set; progress is logged every 10 iterations.
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)

    validation_score = model.best_score_['validation']['RMSE']
    print('Validation RMSE', validation_score)
    validation_scores.append(validation_score)
    models.append(model)

    # Sum (not average) the fold predictions; the sums are turned into a mean
    # when the `predicted_fare` column is written.
    train_df_preds += model.predict(train_df_pool)
    test_df_preds += model.predict(test_df_pool)

Learning rate set to 0.071592
0:	learn: 354.3756166	test: 267.7168743	best: 267.7168743 (0)	total: 14.3ms	remaining: 14.3s
10:	learn: 271.6204979	test: 189.3425811	best: 189.3425811 (10)	total: 142ms	remaining: 12.8s
20:	learn: 222.9650976	test: 157.6134756	best: 157.6134756 (20)	total: 269ms	remaining: 12.5s
30:	learn: 188.5990500	test: 140.2556710	best: 140.2556710 (30)	total: 388ms	remaining: 12.1s
40:	learn: 161.0421151	test: 128.8638824	best: 128.8638824 (40)	total: 511ms	remaining: 11.9s
50:	learn: 141.5152878	test: 118.4732542	best: 118.4732542 (50)	total: 626ms	remaining: 11.7s
60:	learn: 127.0218551	test: 115.3758193	best: 115.2246592 (59)	total: 743ms	remaining: 11.4s
70:	learn: 115.2843666	test: 112.7276744	best: 112.7062121 (67)	total: 884ms	remaining: 11.6s
80:	learn: 103.7126075	test: 111.4448930	best: 111.3673950 (78)	total: 1.08s	remaining: 12.3s
90:	learn: 96.4449062	test: 110.8679082	best: 110.7994282 (89)	total: 1.21s	remaining: 12.1s
100:	learn: 89.8418424	test: 109

450:	learn: 43.4069813	test: 308.8067475	best: 308.7764279 (446)	total: 6.47s	remaining: 7.87s
460:	learn: 43.2148225	test: 308.7920701	best: 308.7764279 (446)	total: 6.61s	remaining: 7.73s
470:	learn: 42.9523793	test: 308.7320597	best: 308.7320597 (470)	total: 6.76s	remaining: 7.59s
480:	learn: 42.7110330	test: 308.6813259	best: 308.6783896 (477)	total: 6.89s	remaining: 7.43s
490:	learn: 42.5069397	test: 308.6399250	best: 308.6399250 (490)	total: 7.04s	remaining: 7.29s
500:	learn: 42.3066194	test: 308.6177618	best: 308.5882643 (497)	total: 7.17s	remaining: 7.14s
510:	learn: 42.1021842	test: 308.6775520	best: 308.5882643 (497)	total: 7.34s	remaining: 7.02s
520:	learn: 41.8506397	test: 308.6231362	best: 308.5882643 (497)	total: 7.48s	remaining: 6.88s
530:	learn: 41.6606749	test: 308.5960330	best: 308.5882643 (497)	total: 7.63s	remaining: 6.74s
540:	learn: 41.4562550	test: 308.5802348	best: 308.5695906 (538)	total: 7.76s	remaining: 6.58s
550:	learn: 41.2959388	test: 308.5708627	best: 308

In [33]:
# Turn the summed fold predictions into a mean. The divisor is the number of
# fitted fold models rather than a hard-coded 3, so the average stays correct
# if the number of CV splits is changed.
n_fold_models = len(models)
train_df['predicted_fare'] = train_df_preds / n_fold_models
test_df['predicted_fare'] = test_df_preds / n_fold_models

In [34]:
# Persist both tables, now carrying the `predicted_fare` column, without the
# index. The targets are the same paths the tables were loaded from, so the
# input files are overwritten in place.
for frame, csv_path in ((train_df, 'train_df.csv'), (test_df, 'test_df.csv')):
    frame.to_csv(csv_path, index=False)