In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

from catboost import Pool, cv,CatBoostRegressor

In [None]:
# Read the three input tables (training rows, rows to score, submission template)
# from the current working directory.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df.csv', 'test_df.csv', 'sample_submission.csv')
)

# Check if noisy

In [None]:
# Keep only the rows whose label is 1, discard the label column itself and
# drop every row that still contains a missing value.
# (Alternative that was tried: keep all labels -> train_df.drop(['label'], axis=1).dropna())
is_label_one = train_df['label'] == 1
data = train_df.loc[is_label_one].drop(columns=['label']).dropna()

In [None]:
# Preview the first rows of the filtered training frame.
data.head()

In [None]:
# Columns that are handed to CatBoost as categorical features.
cat_features = [
    'pick_cluster',
    'is_more_than_one_day',
    'pickup_timeslot',
    'day_of_week',
    'is_weekday',
]

# Model inputs, in the column order used for training and prediction.
# 'duration', 'meter_waiting_fare' and 'fare' are deliberately left out.
features = [
    'additional_fare',
    'meter_waiting_till_pickup',
    'pick_cluster',
    'is_more_than_one_day',
    'distance_km',
    'pickup_timeslot',
    'day_of_week',
    'is_weekday',
]

In [None]:
# Design matrix (selected feature columns) and regression target as a NumPy array.
target_series = data['meter_waiting_fare']
X = data.loc[:, features]
y = target_series.values

In [None]:
# Unlabelled CatBoost pools over the complete train and test frames; they are
# only used for prediction, never for fitting.
train_df_X, test_df_X = train_df[features], test_df[features]
train_df_pool = Pool(data=train_df_X, cat_features=cat_features)
test_df_pool = Pool(data=test_df_X, cat_features=cat_features)

In [None]:
# One zero-initialised slot per row; fold predictions are added onto these.
train_df_preds = np.zeros(len(train_df))
test_df_preds = np.zeros(len(test_df))

In [None]:
# Keyword arguments forwarded to CatBoostRegressor: RMSE is both the training
# objective and the evaluation metric, the seed is fixed, and early stopping
# uses a 50-round window.
params = dict(
    loss_function='RMSE',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='RMSE',
)

In [None]:
# 3-fold splitter created without shuffle/random_state arguments.
kf = KFold(n_splits=3)

In [None]:
# The accumulators are (re)created in this cell so that running the cell again
# does not keep adding onto the sums left over from a previous run.
train_df_preds = np.zeros(train_df.shape[0])
test_df_preds = np.zeros(test_df.shape[0])

validation_scores = []
models = []
for train_index, test_index in kf.split(X, y):
    # Positional split of the feature frame and the target array for this fold.
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    # The held-out fold is the eval set, so it also drives early stopping.
    model = CatBoostRegressor(**params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)

    validation_score = model.best_score_['validation']['RMSE']
    print('Validation RMSE', validation_score)
    validation_scores.append(validation_score)
    models.append(model)

    # NOTE: these hold the SUM of the per-fold predictions; divide by the
    # number of folds to obtain the averaged prediction.
    train_df_preds += model.predict(train_df_pool)
    test_df_preds += model.predict(test_df_pool)

In [None]:
# Mean and standard deviation of the per-fold validation RMSE values.
np.mean(validation_scores), np.std(validation_scores)

In [None]:
# NOTE(review): presumably the (mean, std) output of the previous cell, pasted
# in from an earlier run; confirm. Evaluating this tuple has no effect.
(418.17657240002376, 118.91645261628634)

In [None]:
# RMSE is an error metric, so the best fold model is the one with the LOWEST
# validation score; np.argmax would pick the worst-performing model instead.
best_model = models[np.argmin(validation_scores)]

In [None]:
# Show the feature importances of the selected fold model in prettified form.
best_model.get_feature_importance(prettified=True)

# Predict noise

In [None]:
# Categorical inputs for the second set of models.
cat_features = ['pick_cluster', 'is_more_than_one_day', 'pickup_timeslot',
                'day_of_week', 'is_weekday']

# Model inputs for the second set of models, in training column order.
# 'meter_waiting_fare' and 'meter_waiting_till_pickup' are deliberately left out.
features = ['pick_cluster', 'is_more_than_one_day', 'distance_km',
            'pickup_timeslot', 'day_of_week', 'is_weekday',
            'additional_fare']

In [None]:
# Use every training row regardless of label: remove the label column and
# drop rows that contain missing values.
# (Alternative that was tried: restrict to label == 1 rows first.)
data = train_df.drop(columns=['label']).dropna()

In [None]:
# Columns that each get their own regression model further down.
target_cols = ['duration','meter_waiting','fare']

In [None]:
# CatBoostRegressor settings for the per-target models: RMSE objective and
# metric, fixed seed, early stopping with a 50-round window.
params = dict(
    loss_function='RMSE',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='RMSE',
)

In [None]:
def get_cv_scores_and_pred(target_col, train_set=data, train_df=train_df, test_df=test_df, n_splits=3):
    """Cross-validate a CatBoost regressor for ``target_col`` and store its predictions.

    One ``CatBoostRegressor(**params)`` is fitted per fold of ``train_set``; the
    held-out fold serves as the eval set. Every fold model predicts on all rows
    of ``train_df`` and ``test_df``, and the fold predictions are averaged.

    Both frames are modified IN PLACE; three columns are added to each:

    * ``predicted_<target>`` - mean prediction over the fold models
    * ``predicted_<target>_difference`` - actual minus predicted
    * ``predicted_<target>_difference_per_<target>`` - difference / (actual + 1)

    The per-fold validation RMSE and best iteration are printed, followed by the
    mean and standard deviation of the RMSE across folds.

    Reads the module-level ``features``, ``cat_features`` and ``params`` at call
    time.

    Parameters
    ----------
    target_col : str
        Column to predict. It must be present in ``train_set``, ``train_df``
        and ``test_df`` (the difference columns need the actual values).
    train_set : DataFrame
        Rows used for fitting and validation. Previously this argument was
        ignored in favour of the global ``data``; it is now honoured.
    train_df, test_df : DataFrame
        Frames that receive the prediction columns.
    n_splits : int, default 3
        Number of KFold folds, also used as the averaging divisor.

    Returns
    -------
    None

    Notes
    -----
    The predictions written to ``train_df`` are not out-of-fold: whenever
    ``train_set`` is derived from ``train_df`` (as the default is), each row is
    also scored by models that saw it during training.
    """
    # Never feed the target to the model as one of its own inputs.
    cols = [col for col in features if col != target_col]
    # Only declare categorical columns that are actually part of ``cols``.
    cat_cols = [col for col in cat_features if col in cols]

    X = train_set[cols]
    y = train_set[target_col].values

    train_df_pool = Pool(data=train_df[cols], cat_features=cat_cols)
    test_df_pool = Pool(data=test_df[cols], cat_features=cat_cols)

    # Running sums of the fold predictions; averaged after the loop.
    train_df_preds = np.zeros(train_df.shape[0])
    test_df_preds = np.zeros(test_df.shape[0])

    kf = KFold(n_splits=n_splits)
    validation_scores = []
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        test_pool = Pool(data=X_test, label=y_test, cat_features=cat_cols)

        model = CatBoostRegressor(**params)
        model.fit(X=train_pool, eval_set=test_pool, verbose=False)

        validation_score = model.best_score_['validation']['RMSE']
        print('Validation RMSE:',validation_score,' best iteration:',model.get_best_iteration())
        validation_scores.append(validation_score)

        train_df_preds += model.predict(train_df_pool)
        test_df_preds += model.predict(test_df_pool)
    print(target_col,np.mean(validation_scores), np.std(validation_scores))

    predicted_col = f'predicted_{target_col}'
    difference_col = f'predicted_{target_col}_difference'
    relative_col = f'predicted_{target_col}_difference_per_{target_col}'
    for frame, summed_preds in ((train_df, train_df_preds), (test_df, test_df_preds)):
        frame[predicted_col] = summed_preds / n_splits
        frame[difference_col] = frame[target_col] - frame[predicted_col]
        # The +1 keeps the denominator away from zero when the actual value is 0
        # (assumes the target is non-negative - TODO confirm).
        frame[relative_col] = frame[difference_col] / (frame[target_col] + 1)
    

In [None]:
# Fit the cross-validated models for every target; each call adds the
# predicted_* columns to train_df and test_df in place.
for col in target_cols:
    get_cv_scores_and_pred(col)

In [None]:
# KDE of the relative fare residual for rows with label == 1.
# kdeplot replaces the deprecated distplot(..., hist=False); NaNs are dropped
# explicitly, as distplot did internally.
sns.kdeplot(train_df.loc[train_df['label'] == 1, 'predicted_fare_difference_per_fare'].dropna())

In [None]:
# KDE of the relative fare residual for rows with label == 0.
# kdeplot replaces the deprecated distplot(..., hist=False); NaNs are dropped
# explicitly, as distplot did internally.
sns.kdeplot(train_df.loc[train_df['label'] == 0, 'predicted_fare_difference_per_fare'].dropna())

In [None]:
# Predicted fare per kilometre; the distance is offset by 1 in the denominator.
distance_plus_one = train_df['distance_km'].add(1)
train_df['predicted_fare_per_distance'] = train_df['predicted_fare'].div(distance_plus_one)

In [None]:
# KDE of the predicted fare per distance for rows with label == 1.
# kdeplot replaces the deprecated distplot(..., hist=False); NaNs are dropped
# explicitly, as distplot did internally.
sns.kdeplot(train_df.loc[train_df['label'] == 1, 'predicted_fare_per_distance'].dropna())

In [None]:
# KDE of the predicted fare per distance for rows with label == 0.
# kdeplot replaces the deprecated distplot(..., hist=False); NaNs are dropped
# explicitly, as distplot did internally.
sns.kdeplot(train_df.loc[train_df['label'] == 0, 'predicted_fare_per_distance'].dropna())

In [None]:
# Gap between the predicted per-distance fare and the recorded 'fare_per_km' column.
train_df['predicted_fare_per_distance_difference'] = (
    train_df['predicted_fare_per_distance'].sub(train_df['fare_per_km'])
)

In [None]:
# KDE of the per-distance fare gap for rows with label == 1.
# kdeplot replaces the deprecated distplot(..., hist=False); NaNs are dropped
# explicitly, as distplot did internally.
sns.kdeplot(train_df.loc[train_df['label'] == 1, 'predicted_fare_per_distance_difference'].dropna())

In [None]:
# KDE of the per-distance fare gap for rows with label == 0.
# kdeplot replaces the deprecated distplot(..., hist=False); NaNs are dropped
# explicitly, as distplot did internally.
sns.kdeplot(train_df.loc[train_df['label'] == 0, 'predicted_fare_per_distance_difference'].dropna())

In [None]:
# Preview the training frame including the newly added prediction columns.
train_df.head()

In [None]:
# Persist both enriched frames as CSV files, without writing the index column.
train_df.to_csv(path_or_buf='train_df_reg.csv', index=False)
test_df.to_csv(path_or_buf='test_df_reg.csv', index=False)