In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

from catboost import Pool, cv,CatBoostRegressor

In [None]:
# Input files, read relative to the notebook's working directory.
TRAIN_CSV = 'train_df.csv'
TEST_CSV = 'test_df.csv'
SAMPLE_SUBMISSION_CSV = 'sample_submission.csv'

# Load the training frame, the test frame and the sample submission.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
submission_df = pd.read_csv(SAMPLE_SUBMISSION_CSV)

In [None]:
# Regression training set: only rows whose label equals 1, with the label
# column removed and any row containing a missing value dropped.
label_is_one = train_df['label'] == 1
data = train_df.loc[label_is_one].drop(columns=['label']).dropna()

In [None]:
# Show the columns available in the filtered training frame.
data.columns

In [None]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'pick_cluster',
    'is_more_than_one_day',
    'pickup_timeslot',
    'day_of_week',
    'is_weekday',
]

# Model inputs, in the column order passed to the model.
# Note: 'meter_waiting' and 'fare' are not part of the feature set
# (they were commented out of this list).
features = [
    'additional_fare',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'pick_cluster',
    'is_more_than_one_day',
    'distance_km',
    'pickup_timeslot',
    'day_of_week',
    'is_weekday',
]

In [None]:
# Design matrix (selected feature columns) and regression target
# ('duration' as a plain NumPy array, so it can be indexed by fold position).
X = data.loc[:, features]
y = data['duration'].to_numpy()

In [None]:
# Feature views of the *full* train and test frames (all labels, no dropna);
# these are the rows every fold model is asked to predict.
train_df_X, test_df_X = train_df[features], test_df[features]

# Unlabelled CatBoost pools used only for prediction.
train_df_pool = Pool(data=train_df_X, cat_features=cat_features)
test_df_pool = Pool(data=test_df_X, cat_features=cat_features)

In [None]:
# Running sums of per-fold predictions, one slot per row of each frame.
train_df_preds = np.zeros(len(train_df))
test_df_preds = np.zeros(len(test_df))

In [None]:
# Keyword arguments forwarded to CatBoostRegressor for every fold:
# RMSE is both the optimised loss and the evaluation metric.
params = dict(
    loss_function='RMSE',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='RMSE',
)

In [None]:
# Number of cross-validation folds (one model is fitted per fold).
N_SPLITS = 3
kf = KFold(n_splits=N_SPLITS)

In [None]:
# Cross-validated training: fit one CatBoostRegressor per fold, record its best
# validation RMSE, and add its predictions for the full train/test frames to
# the running sums.
#
# The accumulators are (re)initialised here so that re-running this cell does
# not add a second round of predictions on top of the first one.
train_df_preds = np.zeros(train_df.shape[0])
test_df_preds = np.zeros(test_df.shape[0])

validation_scores = []
models = []
for train_index, test_index in kf.split(X, y):
    # Positional split of the label == 1 training data into fit / validation parts.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    # The held-out fold is the eval set, so early stopping and best_score_
    # are both driven by it.
    model = CatBoostRegressor(**params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)

    validation_score = model.best_score_['validation']['RMSE']
    print('Validation RMSE', validation_score)
    validation_scores.append(validation_score)
    models.append(model)

    # NOTE(review): rows of train_df that belong to `data` were used to fit all
    # but one of the fold models, so their summed predictions are mostly in-sample.
    train_df_preds += model.predict(train_df_pool)
    test_df_preds += model.predict(test_df_pool)

In [None]:
# Mean and standard deviation of the per-fold validation RMSE.
np.mean(validation_scores), np.std(validation_scores)

In [None]:
# Pick the fold model with the best validation score. The scores are RMSE
# values, where lower is better, so the best model is at the *minimum*
# (argmax would select the worst-performing fold model).
best_model = models[np.argmin(validation_scores)]

In [None]:
# Feature importances of the selected fold model, as a prettified table.
best_model.get_feature_importance(prettified=True)

In [None]:
# Turn the summed per-fold predictions into an average. The divisor is the
# number of fitted models rather than a hard-coded 3, so the average stays
# correct if the number of CV folds is changed.
n_models = len(models)
train_df['predicted_duration'] = train_df_preds / n_models
test_df['predicted_duration'] = test_df_preds / n_models

In [None]:
# Residual of the duration model (actual minus predicted), added to both frames.
for frame in (train_df, test_df):
    frame['predicted_duration_difference'] = (
        frame['duration'] - frame['predicted_duration']
    )

In [None]:
# Residual relative to the actual duration, added to both frames.
# The denominator is duration + 1 (presumably to keep the ratio finite when
# duration is 0).
for frame in (train_df, test_df):
    denominator = frame['duration'] + 1
    frame['predicted_duration_difference_per_duration'] = (
        frame['predicted_duration_difference'] / denominator
    )

In [None]:
# Compare the residual distribution of the duration model between the two
# label groups of the training frame.
# sns.distplot is deprecated in current seaborn; kdeplot draws the same
# density curve that distplot(hist=False) produced.
fig, ax = plt.subplots()
for label_value in (1, 0):
    residuals = train_df.loc[
        train_df['label'] == label_value, 'predicted_duration_difference'
    ].dropna()  # a KDE cannot be estimated from missing values
    sns.kdeplot(residuals, ax=ax, label=f'label = {label_value}')
ax.set(
    title='Duration residuals by label (train)',
    xlabel='duration - predicted_duration',
    ylabel='Density',
)
ax.legend();

In [None]:
# Persist both frames, now carrying the duration-model columns, without the index.
outputs = (
    (train_df, 'train_df_reg_duration.csv'),
    (test_df, 'test_df_reg_duration.csv'),
)
for frame, out_path in outputs:
    frame.to_csv(out_path, index=False)