In [None]:
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor,ExtraTreesClassifier,RandomForestClassifier
from sklearn.preprocessing import StandardScaler,KBinsDiscretizer,LabelEncoder,MinMaxScaler
from sklearn.metrics import mean_squared_error,f1_score,confusion_matrix,log_loss
from sklearn.kernel_approximation import Nystroem
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA,NMF

from catboost import Pool, cv,CatBoostClassifier,CatBoostRegressor

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, concatenate,Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import regularizers
import tensorflow_addons as tfa

from tqdm import tqdm

import lightgbm as lgb

import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor,DMatrix,plot_tree

from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN

import shap
shap.initjs()

In [None]:
# Read the three modelling frames and replace every missing value with 0;
# the sample submission is read unchanged.
train_df = pd.read_csv('train_df_final.csv').fillna(0)
train_df_org = pd.read_csv('train_df_final_blanced.csv').fillna(0)
test_df = pd.read_csv('test_df_final.csv').fillna(0)
submission_df = pd.read_csv('sample_submission.csv')

In [None]:
# Target vectors as plain NumPy arrays. `y` is rebound to a pandas Series in a
# later cell; `y_org` stays an array and is used to score on train_df_org.
y = train_df['label'].to_numpy()
y_org = train_df_org['label'].to_numpy()

In [None]:
# Base feature set for the first experiment. Each name is used verbatim to
# select columns from train_df, test_df and train_df_org, so the spelling
# (including 'addtional') must stay exactly as written here.
features = [
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare',    
]

# Columns to hand to CatBoost as categorical; empty, so every feature is
# treated as numeric and the integer-cast loop in the next cell is a no-op.
cat_features = []

In [None]:
# Feature matrices for the three data sets plus the training target.
# Explicit copies: the loop below assigns columns into these frames, and writing
# into the result of `df[cols]` triggers pandas' chained-assignment warning.
train = train_df[features].copy()
test = test_df[features].copy()
train_org = train_df_org[features].copy()
y = train_df['label']
# Categorical columns are cast to int before being passed to CatBoost pools.
for each in cat_features:
    train[each] = train[each].values.astype(int)
    test[each] = test[each].values.astype(int)
    train_org[each] = train_org[each].values.astype(int)

In [None]:
# Keyword arguments shared by every CatBoostClassifier built in this notebook.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Prediction-only pools (no labels) reused by every fold of the CV loop below.
submission_pool = Pool(
    data=test_df.loc[:, features],
    cat_features=cat_features,
)
org_pool = Pool(
    data=train_org.loc[:, features],
    cat_features=cat_features,
)

In [None]:
# Stratified k-fold cross-validation of a CatBoost classifier on `train` / `y`.
# Per fold: fit on the training split with the held-out split as eval_set,
# store out-of-fold probabilities and classes, accumulate predictions for the
# submission pool, and score the fold on the held-out split and on `org_pool`.
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
validation_scores = []
org_scores = []
models = []
train_pools = []  # kept so the best fold's pool can be passed to plot_tree later
for train_index, test_index in skf.split(train, y):
    # split() yields positional indices, so X and y are both sliced with .iloc;
    # plain y[...] would be a label lookup on the Series index.
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    train_pools.append(train_pool)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    # Average probabilities and count positive votes over the actual fold count.
    test_preds += model.predict_proba(submission_pool)[:, 1] / skf.n_splits
    test_class += model.predict(submission_pool)
    # NOTE(review): for a single-label target, average='micro' makes f1_score
    # equal to accuracy; confirm this is the intended metric.
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
    models.append(model)
# A test row becomes 1 only when every fold model voted 1 (identical to the
# previous `> 2` with three folds).
# NOTE(review): confirm a unanimous vote, not a majority vote, is intended.
test_class = np.where(test_class > skf.n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
best_model = models[np.argmax(validation_scores)]
best_model.get_feature_importance(prettified=True)

In [None]:
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

In [None]:
best_model.plot_tree(0,train_pools[np.argmax(validation_scores)])

In [None]:
sns.scatterplot(x='meter_waiting_till_pickup_per_distance',y='meter_waiting_after_pickup_per_meter_waiting_fare',data=train_df,hue='label')

In [None]:
train_df[['meter_waiting_till_pickup_per_distance','meter_waiting_after_pickup_per_meter_waiting_fare']].describe()

In [None]:
sns.scatterplot(x='fare_per_duration',y='fare_per_distance',data=train_df,hue='label')

In [None]:
sns.scatterplot(x='pickup_hour',y='drop_hour',data=train_df,hue='label')

In [None]:
sns.violinplot(x='label',y='fare_per_duration',data=train_df)

In [None]:
sns.violinplot(x='label',y='fare_per_distance',data=train_df)

In [None]:
sns.violinplot(x='label',y='fare-additional_fare_per_distance',data=train_df)

# Column products (pairwise feature interactions)

In [None]:
# Pairwise interaction features: one '<col1>@<col2>' column per unordered pair
# of current features, holding the element-wise product of the two columns.
feature_pairs = list(itertools.combinations(features, 2))
prod_cols = [f'{col1}@{col2}' for col1, col2 in feature_pairs]


def _product_frame(frame):
    """Return a frame with one `col1 * col2` column for every pair in `feature_pairs`."""
    return pd.DataFrame(
        {f'{col1}@{col2}': frame[col1] * frame[col2] for col1, col2 in feature_pairs},
        index=frame.index,
    )


# All product columns are attached in a single concat instead of one insert per
# column, which avoids pandas' "highly fragmented DataFrame" PerformanceWarning.
# Dropping existing product columns first keeps the cell safe to re-run.
train_df = pd.concat(
    [train_df.drop(columns=prod_cols, errors='ignore'), _product_frame(train_df)], axis=1
)
test_df = pd.concat(
    [test_df.drop(columns=prod_cols, errors='ignore'), _product_frame(test_df)], axis=1
)
train_df_org = pd.concat(
    [train_df_org.drop(columns=prod_cols, errors='ignore'), _product_frame(train_df_org)], axis=1
)

In [None]:
features = features + prod_cols

In [None]:
train = train_df[features]
test = test_df[features]
train_org = train_df_org[features]
y = train_df['label']

In [None]:
submission_pool = Pool(data=test_df[features], cat_features=cat_features)
org_pool = Pool(data=train_org[features], cat_features=cat_features)

In [None]:
# Stratified k-fold cross-validation of a CatBoost classifier on `train` / `y`.
# Per fold: fit on the training split with the held-out split as eval_set,
# store out-of-fold probabilities and classes, accumulate predictions for the
# submission pool, and score the fold on the held-out split and on `org_pool`.
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    # split() yields positional indices, so X and y are both sliced with .iloc;
    # plain y[...] would be a label lookup on the Series index.
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    # Average probabilities and count positive votes over the actual fold count.
    test_preds += model.predict_proba(submission_pool)[:, 1] / skf.n_splits
    test_class += model.predict(submission_pool)
    # NOTE(review): for a single-label target, average='micro' makes f1_score
    # equal to accuracy; confirm this is the intended metric.
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
    models.append(model)
# A test row becomes 1 only when every fold model voted 1 (identical to the
# previous `> 2` with three folds).
# NOTE(review): confirm a unanimous vote, not a majority vote, is intended.
test_class = np.where(test_class > skf.n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
best_model = models[np.argmax(validation_scores)]
best_model.get_feature_importance(prettified=True)

In [None]:
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

In [None]:
# Store the ensembled class votes from the preceding CV loop and write the file.
submission_df['prediction'] = test_class
submission_df.to_csv(path_or_buf='submission.csv', index=False)

# Fraction of submission rows predicted as 1.
submission_df['prediction'].sum() / len(submission_df)

## With 0 importance columns removed

In [None]:
# Features whose importance is exactly 0 in all three fold models.
model_1, model_2, model_3 = models[0], models[1], models[2]

df_1, df_2, df_3 = (
    fold_model.get_feature_importance(prettified=True)
    for fold_model in (model_1, model_2, model_3)
)

set_1, set_2, set_3 = (
    set(table.loc[table['Importances'] == 0, 'Feature Id'].values)
    for table in (df_1, df_2, df_3)
)

# Only drop a feature when no fold model used it at all.
ignore = set_1 & set_2 & set_3
ignore

In [None]:
features = [each for each in features if each not in ignore]

In [None]:
train = train_df[features]
test = test_df[features]
train_org = train_df_org[features]
y = train_df['label']

In [None]:
submission_pool = Pool(data=test_df[features], cat_features=cat_features)
org_pool = Pool(data=train_org[features], cat_features=cat_features)

In [None]:
# Stratified k-fold cross-validation of a CatBoost classifier on `train` / `y`.
# Per fold: fit on the training split with the held-out split as eval_set,
# store out-of-fold probabilities and classes, accumulate predictions for the
# submission pool, and score the fold on the held-out split and on `org_pool`.
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    # split() yields positional indices, so X and y are both sliced with .iloc;
    # plain y[...] would be a label lookup on the Series index.
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    # Average probabilities and count positive votes over the actual fold count.
    test_preds += model.predict_proba(submission_pool)[:, 1] / skf.n_splits
    test_class += model.predict(submission_pool)
    # NOTE(review): for a single-label target, average='micro' makes f1_score
    # equal to accuracy; confirm this is the intended metric.
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
    models.append(model)
# A test row becomes 1 only when every fold model voted 1 (identical to the
# previous `> 2` with three folds).
# NOTE(review): confirm a unanimous vote, not a majority vote, is intended.
test_class = np.where(test_class > skf.n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
best_model = models[np.argmax(validation_scores)]
best_model.get_feature_importance(prettified=True)

In [None]:
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

# Log scale added

In [None]:
# Log-scaled copy of every current feature, stored as 'log_<feature>'.
log_cols = [f'log_{col}' for col in features]


def _log1p_frame(frame):
    """Return log(1 + x) of the current feature columns, renamed to `log_cols`."""
    # log1p is the numerically accurate form of log(x + 1) for small |x|.
    # NOTE(review): values <= -1 still yield NaN/-inf, as with np.log(x + 1);
    # verify the features cannot go that low before these columns reach PCA.
    logged = np.log1p(frame[features])
    logged.columns = log_cols
    return logged


# One concat per frame instead of one column insert per feature (avoids the
# "highly fragmented DataFrame" PerformanceWarning); dropping existing log
# columns first keeps the cell safe to re-run.
train_df = pd.concat(
    [train_df.drop(columns=log_cols, errors='ignore'), _log1p_frame(train_df)], axis=1
)
test_df = pd.concat(
    [test_df.drop(columns=log_cols, errors='ignore'), _log1p_frame(test_df)], axis=1
)
train_df_org = pd.concat(
    [train_df_org.drop(columns=log_cols, errors='ignore'), _log1p_frame(train_df_org)], axis=1
)

In [None]:
features = features + log_cols

In [None]:
train = train_df[features]
test = test_df[features]
train_org = train_df_org[features]
y = train_df['label']

In [None]:
submission_pool = Pool(data=test_df[features], cat_features=cat_features)
org_pool = Pool(data=train_org[features], cat_features=cat_features)

In [None]:
# Stratified k-fold cross-validation of a CatBoost classifier on `train` / `y`.
# Per fold: fit on the training split with the held-out split as eval_set,
# store out-of-fold probabilities and classes, accumulate predictions for the
# submission pool, and score the fold on the held-out split and on `org_pool`.
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    # split() yields positional indices, so X and y are both sliced with .iloc;
    # plain y[...] would be a label lookup on the Series index.
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    # Average probabilities and count positive votes over the actual fold count.
    test_preds += model.predict_proba(submission_pool)[:, 1] / skf.n_splits
    test_class += model.predict(submission_pool)
    # NOTE(review): for a single-label target, average='micro' makes f1_score
    # equal to accuracy; confirm this is the intended metric.
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
    models.append(model)
# A test row becomes 1 only when every fold model voted 1 (identical to the
# previous `> 2` with three folds).
# NOTE(review): confirm a unanimous vote, not a majority vote, is intended.
test_class = np.where(test_class > skf.n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
best_model = models[np.argmax(validation_scores)]
best_model.get_feature_importance(prettified=True)

In [None]:
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

## With 0 importance columns removed

In [None]:
# Features whose importance is exactly 0 in all three fold models.
model_1, model_2, model_3 = models[0], models[1], models[2]

df_1, df_2, df_3 = (
    fold_model.get_feature_importance(prettified=True)
    for fold_model in (model_1, model_2, model_3)
)

set_1, set_2, set_3 = (
    set(table.loc[table['Importances'] == 0, 'Feature Id'].values)
    for table in (df_1, df_2, df_3)
)

# Only drop a feature when no fold model used it at all.
ignore = set_1 & set_2 & set_3
ignore

In [None]:
features = [each for each in features if each not in ignore]

In [None]:
train = train_df[features]
test = test_df[features]
train_org = train_df_org[features]
y = train_df['label']

In [None]:
submission_pool = Pool(data=test_df[features], cat_features=cat_features)
org_pool = Pool(data=train_org[features], cat_features=cat_features)

In [None]:
# Stratified k-fold cross-validation of a CatBoost classifier on `train` / `y`.
# Per fold: fit on the training split with the held-out split as eval_set,
# store out-of-fold probabilities and classes, accumulate predictions for the
# submission pool, and score the fold on the held-out split and on `org_pool`.
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    # split() yields positional indices, so X and y are both sliced with .iloc;
    # plain y[...] would be a label lookup on the Series index.
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    # Average probabilities and count positive votes over the actual fold count.
    test_preds += model.predict_proba(submission_pool)[:, 1] / skf.n_splits
    test_class += model.predict(submission_pool)
    # NOTE(review): for a single-label target, average='micro' makes f1_score
    # equal to accuracy; confirm this is the intended metric.
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
    models.append(model)
# A test row becomes 1 only when every fold model voted 1 (identical to the
# previous `> 2` with three folds).
# NOTE(review): confirm a unanimous vote, not a majority vote, is intended.
test_class = np.where(test_class > skf.n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
best_model = models[np.argmax(validation_scores)]
best_model.get_feature_importance(prettified=True)

In [None]:
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

# PCA

In [None]:
n_components=40

In [None]:
X_train = train_df[features].values
X_train_org = train_df_org[features].values
X_test = test_df[features].values

In [None]:
pca = PCA(n_components=n_components)
X_train_transformed = pca.fit_transform(X_train)
X_test_transformed = pca.transform(X_test)
X_train_org_transformed = pca.transform(X_train_org)

In [None]:
# Wrap the PCA projections in DataFrames with columns col_0 .. col_{n_components-1}.
# The two training frames carry the label as their first column.
pca_names = [f'col_{i}' for i in range(n_components)]

pca_train = pd.DataFrame(
    X_train_transformed[:, :n_components], columns=pca_names, index=train_df.index
)
pca_train.insert(0, 'label', train_df['label'])

pca_train_org = pd.DataFrame(X_train_org_transformed[:, :n_components], columns=pca_names)
pca_train_org.insert(0, 'label', y_org)

pca_test = pd.DataFrame(X_test_transformed[:, :n_components], columns=pca_names)
    

In [None]:
sns.scatterplot(x='col_0',y='col_1',data=pca_train,hue='label')

In [None]:
pca_cols = list(pca_test.columns)
for each in pca_cols:
    train_df[each] = pca_train[each]
    test_df[each] = pca_test[each]
    train_df_org[each] = pca_train_org[each]

In [None]:
features = features + pca_cols

In [None]:
train = train_df[features]
test = test_df[features]
train_org = train_df_org[features]
y = train_df['label']

In [None]:
submission_pool = Pool(data=test_df[features], cat_features=cat_features)
org_pool = Pool(data=train_org[features], cat_features=cat_features)

In [None]:
# Stratified k-fold cross-validation of a CatBoost classifier on `train` / `y`.
# Per fold: fit on the training split with the held-out split as eval_set,
# store out-of-fold probabilities and classes, accumulate predictions for the
# submission pool, and score the fold on the held-out split and on `org_pool`.
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    # split() yields positional indices, so X and y are both sliced with .iloc;
    # plain y[...] would be a label lookup on the Series index.
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    # Average probabilities and count positive votes over the actual fold count.
    test_preds += model.predict_proba(submission_pool)[:, 1] / skf.n_splits
    test_class += model.predict(submission_pool)
    # NOTE(review): for a single-label target, average='micro' makes f1_score
    # equal to accuracy; confirm this is the intended metric.
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
    models.append(model)
# A test row becomes 1 only when every fold model voted 1 (identical to the
# previous `> 2` with three folds).
# NOTE(review): confirm a unanimous vote, not a majority vote, is intended.
test_class = np.where(test_class > skf.n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
best_model = models[np.argmax(validation_scores)]
best_model.get_feature_importance(prettified=True)

In [None]:
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

In [None]:
best_model.get_feature_importance(prettified=True)[:30]

In [None]:
sns.violinplot(x='label',y='col_5',data=train_df)

In [None]:
sns.violinplot(x='label',y='col_4',data=train_df)

In [None]:
# Write the three frames, including every engineered column, to CSV without the index.
for frame, csv_path in (
    (train_df, 'train_df_unit.csv'),
    (test_df, 'test_df_unit.csv'),
    (train_df_org, 'train_df_org_unit.csv'),
):
    frame.to_csv(csv_path, index=False)

# cal_time_difference_per_duration added

In [None]:
train_df = pd.read_csv('train_df_unit.csv')
test_df = pd.read_csv('test_df_unit.csv')
train_df_org = pd.read_csv('train_df_org_unit.csv')

In [None]:
train_df['cal_time_difference_per_duration'] = train_df['cal_time_difference'] / (train_df['duration']+1)
test_df['cal_time_difference_per_duration'] = test_df['cal_time_difference'] / (test_df['duration']+1)
train_df_org['cal_time_difference_per_duration'] = train_df_org['cal_time_difference'] / (train_df_org['duration']+1)

In [None]:
sns.violinplot(x='label',y='cal_time_difference_per_duration',data=train_df)

In [None]:
features = [
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare', 
    'cal_time_difference_per_duration',    
#     'fare_per_distance@fare_per_duration',    
#     'fare_per_distance@avg_speed',
#     'fare_per_distance@meter_waiting_per_duration',
#     'fare_per_distance@meter_waiting_fare_per_meter_waiting',
#     'fare_per_distance@meter_waiting_fare_per_duration',
#     'fare_per_distance@addtional_fare_per_fare',
#     'fare_per_distance@addtional_fare_per_distance',
#     'fare_per_distance@addtional_fare_per_duration',
#     'fare_per_distance@fare-additional_fare_per_distance',
#     'fare_per_distance@fare-additional_fare_per_duration',
#     'fare_per_distance@fare-additional_fare-meter_waiting_fare_per_distance',
#     'fare_per_distance@fare-additional_fare-meter_waiting_fare_per_duration',
#     'fare_per_distance@meter_waiting_till_pickup_per_meter_waiting',
#     'fare_per_distance@meter_waiting_after_pickup_per_duration',
#     'fare_per_distance@meter_waiting_till_pickup_per_duration',
#     'fare_per_distance@meter_waiting_till_pickup_per_distance',
#     'fare_per_distance@meter_waiting_after_pickup_per_distance',
#     'fare_per_distance@meter_waiting_till_pickup_per_fare',
#     'fare_per_distance@meter_waiting_after_pickup_per_fare',
#     'fare_per_distance@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'fare_per_distance@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'fare_per_duration@avg_speed',
#     'fare_per_duration@meter_waiting_per_duration',
#     'fare_per_duration@meter_waiting_fare_per_meter_waiting',
#     'fare_per_duration@meter_waiting_fare_per_duration',
#     'fare_per_duration@addtional_fare_per_fare',
#     'fare_per_duration@addtional_fare_per_distance',
#     'fare_per_duration@addtional_fare_per_duration',
#     'fare_per_duration@fare-additional_fare_per_distance',
#     'fare_per_duration@fare-additional_fare_per_duration',
#     'fare_per_duration@fare-additional_fare-meter_waiting_fare_per_distance',
#     'fare_per_duration@fare-additional_fare-meter_waiting_fare_per_duration',
#     'fare_per_duration@meter_waiting_till_pickup_per_meter_waiting',
#     'fare_per_duration@meter_waiting_after_pickup_per_duration',
#     'fare_per_duration@meter_waiting_till_pickup_per_duration',
#     'fare_per_duration@meter_waiting_till_pickup_per_distance',
#     'fare_per_duration@meter_waiting_after_pickup_per_distance',
#     'fare_per_duration@meter_waiting_till_pickup_per_fare',
#     'fare_per_duration@meter_waiting_after_pickup_per_fare',
#     'fare_per_duration@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'fare_per_duration@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'avg_speed@meter_waiting_per_duration',
#     'avg_speed@meter_waiting_fare_per_meter_waiting',
#     'avg_speed@meter_waiting_fare_per_duration',
#     'avg_speed@addtional_fare_per_fare',
#     'avg_speed@addtional_fare_per_distance',
#     'avg_speed@addtional_fare_per_duration',
#     'avg_speed@fare-additional_fare_per_distance',
#     'avg_speed@fare-additional_fare_per_duration',
#     'avg_speed@fare-additional_fare-meter_waiting_fare_per_distance',
#     'avg_speed@fare-additional_fare-meter_waiting_fare_per_duration',
#     'avg_speed@meter_waiting_till_pickup_per_meter_waiting',
#     'avg_speed@meter_waiting_after_pickup_per_duration',
#     'avg_speed@meter_waiting_till_pickup_per_duration',
#     'avg_speed@meter_waiting_till_pickup_per_distance',
#     'avg_speed@meter_waiting_after_pickup_per_distance',
#     'avg_speed@meter_waiting_till_pickup_per_fare',
#     'avg_speed@meter_waiting_after_pickup_per_fare',
#     'avg_speed@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'avg_speed@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_per_duration@meter_waiting_fare_per_meter_waiting',
#     'meter_waiting_per_duration@meter_waiting_fare_per_duration',
#     'meter_waiting_per_duration@addtional_fare_per_fare',
#     'meter_waiting_per_duration@addtional_fare_per_distance',
#     'meter_waiting_per_duration@addtional_fare_per_duration',
#     'meter_waiting_per_duration@fare-additional_fare_per_distance',
#     'meter_waiting_per_duration@fare-additional_fare_per_duration',
#     'meter_waiting_per_duration@fare-additional_fare-meter_waiting_fare_per_distance',
#     'meter_waiting_per_duration@fare-additional_fare-meter_waiting_fare_per_duration',
#     'meter_waiting_per_duration@meter_waiting_till_pickup_per_meter_waiting',
#     'meter_waiting_per_duration@meter_waiting_after_pickup_per_duration',
#     'meter_waiting_per_duration@meter_waiting_till_pickup_per_duration',
#     'meter_waiting_per_duration@meter_waiting_till_pickup_per_distance',
#     'meter_waiting_per_duration@meter_waiting_after_pickup_per_distance',
#     'meter_waiting_per_duration@meter_waiting_till_pickup_per_fare',
#     'meter_waiting_per_duration@meter_waiting_after_pickup_per_fare',
#     'meter_waiting_per_duration@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_per_duration@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_fare_per_duration',
#     'meter_waiting_fare_per_meter_waiting@addtional_fare_per_fare',
#     'meter_waiting_fare_per_meter_waiting@addtional_fare_per_distance',
#     'meter_waiting_fare_per_meter_waiting@addtional_fare_per_duration',
#     'meter_waiting_fare_per_meter_waiting@fare-additional_fare_per_distance',
#     'meter_waiting_fare_per_meter_waiting@fare-additional_fare_per_duration',
#     'meter_waiting_fare_per_meter_waiting@fare-additional_fare-meter_waiting_fare_per_distance',
#     'meter_waiting_fare_per_meter_waiting@fare-additional_fare-meter_waiting_fare_per_duration',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_till_pickup_per_meter_waiting',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_after_pickup_per_duration',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_till_pickup_per_duration',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_till_pickup_per_distance',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_after_pickup_per_distance',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_till_pickup_per_fare',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_after_pickup_per_fare',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_fare_per_meter_waiting@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_fare_per_duration@addtional_fare_per_fare',
#     'meter_waiting_fare_per_duration@addtional_fare_per_distance',
#     'meter_waiting_fare_per_duration@addtional_fare_per_duration',
#     'meter_waiting_fare_per_duration@fare-additional_fare_per_distance',
#     'meter_waiting_fare_per_duration@fare-additional_fare_per_duration',
#     'meter_waiting_fare_per_duration@fare-additional_fare-meter_waiting_fare_per_distance',
#     'meter_waiting_fare_per_duration@fare-additional_fare-meter_waiting_fare_per_duration',
#     'meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_meter_waiting',
#     'meter_waiting_fare_per_duration@meter_waiting_after_pickup_per_duration',
#     'meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_duration',
#     'meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_distance',
#     'meter_waiting_fare_per_duration@meter_waiting_after_pickup_per_distance',
#     'meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_fare',
#     'meter_waiting_fare_per_duration@meter_waiting_after_pickup_per_fare',
#     'meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_fare_per_duration@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'addtional_fare_per_fare@addtional_fare_per_distance',
#     'addtional_fare_per_fare@addtional_fare_per_duration',
#     'addtional_fare_per_fare@fare-additional_fare_per_distance',
#     'addtional_fare_per_fare@fare-additional_fare_per_duration',
#     'addtional_fare_per_fare@fare-additional_fare-meter_waiting_fare_per_distance',
#     'addtional_fare_per_fare@fare-additional_fare-meter_waiting_fare_per_duration',
#     'addtional_fare_per_fare@meter_waiting_till_pickup_per_meter_waiting',
#     'addtional_fare_per_fare@meter_waiting_after_pickup_per_duration',
#     'addtional_fare_per_fare@meter_waiting_till_pickup_per_duration',
#     'addtional_fare_per_fare@meter_waiting_till_pickup_per_distance',
#     'addtional_fare_per_fare@meter_waiting_after_pickup_per_distance',
#     'addtional_fare_per_fare@meter_waiting_till_pickup_per_fare',
#     'addtional_fare_per_fare@meter_waiting_after_pickup_per_fare',
#     'addtional_fare_per_fare@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'addtional_fare_per_fare@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'addtional_fare_per_distance@addtional_fare_per_duration',
#     'addtional_fare_per_distance@fare-additional_fare_per_distance',
#     'addtional_fare_per_distance@fare-additional_fare_per_duration',
#     'addtional_fare_per_distance@fare-additional_fare-meter_waiting_fare_per_distance',
#     'addtional_fare_per_distance@fare-additional_fare-meter_waiting_fare_per_duration',
#     'addtional_fare_per_distance@meter_waiting_till_pickup_per_meter_waiting',
#     'addtional_fare_per_distance@meter_waiting_after_pickup_per_duration',
#     'addtional_fare_per_distance@meter_waiting_till_pickup_per_duration',
#     'addtional_fare_per_distance@meter_waiting_till_pickup_per_distance',
#     'addtional_fare_per_distance@meter_waiting_after_pickup_per_distance',
#     'addtional_fare_per_distance@meter_waiting_till_pickup_per_fare',
#     'addtional_fare_per_distance@meter_waiting_after_pickup_per_fare',
#     'addtional_fare_per_distance@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'addtional_fare_per_distance@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'addtional_fare_per_duration@fare-additional_fare_per_distance',
#     'addtional_fare_per_duration@fare-additional_fare_per_duration',
#     'addtional_fare_per_duration@fare-additional_fare-meter_waiting_fare_per_distance',
#     'addtional_fare_per_duration@fare-additional_fare-meter_waiting_fare_per_duration',
#     'addtional_fare_per_duration@meter_waiting_till_pickup_per_meter_waiting',
#     'addtional_fare_per_duration@meter_waiting_after_pickup_per_duration',
#     'addtional_fare_per_duration@meter_waiting_till_pickup_per_duration',
#     'addtional_fare_per_duration@meter_waiting_till_pickup_per_distance',
#     'addtional_fare_per_duration@meter_waiting_after_pickup_per_distance',
#     'addtional_fare_per_duration@meter_waiting_till_pickup_per_fare',
#     'addtional_fare_per_duration@meter_waiting_after_pickup_per_fare',
#     'addtional_fare_per_duration@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'addtional_fare_per_duration@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'fare-additional_fare_per_distance@fare-additional_fare_per_duration',
#     'fare-additional_fare_per_distance@fare-additional_fare-meter_waiting_fare_per_distance',
#     'fare-additional_fare_per_distance@fare-additional_fare-meter_waiting_fare_per_duration',
#     'fare-additional_fare_per_distance@meter_waiting_till_pickup_per_meter_waiting',
#     'fare-additional_fare_per_distance@meter_waiting_after_pickup_per_duration',
#     'fare-additional_fare_per_distance@meter_waiting_till_pickup_per_duration',
#     'fare-additional_fare_per_distance@meter_waiting_till_pickup_per_distance',
#     'fare-additional_fare_per_distance@meter_waiting_after_pickup_per_distance',
#     'fare-additional_fare_per_distance@meter_waiting_till_pickup_per_fare',
#     'fare-additional_fare_per_distance@meter_waiting_after_pickup_per_fare',
#     'fare-additional_fare_per_distance@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'fare-additional_fare_per_distance@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'fare-additional_fare_per_duration@fare-additional_fare-meter_waiting_fare_per_distance',
#     'fare-additional_fare_per_duration@fare-additional_fare-meter_waiting_fare_per_duration',
#     'fare-additional_fare_per_duration@meter_waiting_till_pickup_per_meter_waiting',
#     'fare-additional_fare_per_duration@meter_waiting_after_pickup_per_duration',
#     'fare-additional_fare_per_duration@meter_waiting_till_pickup_per_duration',
#     'fare-additional_fare_per_duration@meter_waiting_till_pickup_per_distance',
#     'fare-additional_fare_per_duration@meter_waiting_after_pickup_per_distance',
#     'fare-additional_fare_per_duration@meter_waiting_till_pickup_per_fare',
#     'fare-additional_fare_per_duration@meter_waiting_after_pickup_per_fare',
#     'fare-additional_fare_per_duration@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'fare-additional_fare_per_duration@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'fare-additional_fare-meter_waiting_fare_per_distance@fare-additional_fare-meter_waiting_fare_per_duration',
#     'fare-additional_fare-meter_waiting_fare_per_distance@meter_waiting_till_pickup_per_meter_waiting',
#     'fare-additional_fare-meter_waiting_fare_per_distance@meter_waiting_after_pickup_per_duration',
#     'fare-additional_fare-meter_waiting_fare_per_distance@meter_waiting_till_pickup_per_duration',
#     'fare-additional_fare-meter_waiting_fare_per_distance@meter_waiting_till_pickup_per_distance',
#     'fare-additional_fare-meter_waiting_fare_per_distance@meter_waiting_after_pickup_per_distance',
#     'fare-additional_fare-meter_waiting_fare_per_distance@meter_waiting_till_pickup_per_fare',
#     'fare-additional_fare-meter_waiting_fare_per_distance@meter_waiting_after_pickup_per_fare',
#     'fare-additional_fare-meter_waiting_fare_per_distance@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'fare-additional_fare-meter_waiting_fare_per_distance@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'fare-additional_fare-meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_meter_waiting',
#     'fare-additional_fare-meter_waiting_fare_per_duration@meter_waiting_after_pickup_per_duration',
#     'fare-additional_fare-meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_duration',
#     'fare-additional_fare-meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_distance',
#     'fare-additional_fare-meter_waiting_fare_per_duration@meter_waiting_after_pickup_per_distance',
#     'fare-additional_fare-meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_fare',
#     'fare-additional_fare-meter_waiting_fare_per_duration@meter_waiting_after_pickup_per_fare',
#     'fare-additional_fare-meter_waiting_fare_per_duration@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'fare-additional_fare-meter_waiting_fare_per_duration@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_till_pickup_per_meter_waiting@meter_waiting_after_pickup_per_duration',
#     'meter_waiting_till_pickup_per_meter_waiting@meter_waiting_till_pickup_per_duration',
#     'meter_waiting_till_pickup_per_meter_waiting@meter_waiting_till_pickup_per_distance',
#     'meter_waiting_till_pickup_per_meter_waiting@meter_waiting_after_pickup_per_distance',
#     'meter_waiting_till_pickup_per_meter_waiting@meter_waiting_till_pickup_per_fare',
#     'meter_waiting_till_pickup_per_meter_waiting@meter_waiting_after_pickup_per_fare',
#     'meter_waiting_till_pickup_per_meter_waiting@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_till_pickup_per_meter_waiting@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_after_pickup_per_duration@meter_waiting_till_pickup_per_duration',
#     'meter_waiting_after_pickup_per_duration@meter_waiting_till_pickup_per_distance',
#     'meter_waiting_after_pickup_per_duration@meter_waiting_after_pickup_per_distance',
#     'meter_waiting_after_pickup_per_duration@meter_waiting_till_pickup_per_fare',
#     'meter_waiting_after_pickup_per_duration@meter_waiting_after_pickup_per_fare',
#     'meter_waiting_after_pickup_per_duration@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_after_pickup_per_duration@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_till_pickup_per_duration@meter_waiting_till_pickup_per_distance',
#     'meter_waiting_till_pickup_per_duration@meter_waiting_after_pickup_per_distance',
#     'meter_waiting_till_pickup_per_duration@meter_waiting_till_pickup_per_fare',
#     'meter_waiting_till_pickup_per_duration@meter_waiting_after_pickup_per_fare',
#     'meter_waiting_till_pickup_per_duration@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_till_pickup_per_duration@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_till_pickup_per_distance@meter_waiting_after_pickup_per_distance',
#     'meter_waiting_till_pickup_per_distance@meter_waiting_till_pickup_per_fare',
#     'meter_waiting_till_pickup_per_distance@meter_waiting_after_pickup_per_fare',
#     'meter_waiting_till_pickup_per_distance@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_till_pickup_per_distance@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_after_pickup_per_distance@meter_waiting_till_pickup_per_fare',
#     'meter_waiting_after_pickup_per_distance@meter_waiting_after_pickup_per_fare',
#     'meter_waiting_after_pickup_per_distance@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_after_pickup_per_distance@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_till_pickup_per_fare@meter_waiting_after_pickup_per_fare',
#     'meter_waiting_till_pickup_per_fare@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_till_pickup_per_fare@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_after_pickup_per_fare@meter_waiting_till_pickup_per_meter_waiting_fare',
#     'meter_waiting_after_pickup_per_fare@meter_waiting_after_pickup_per_meter_waiting_fare',
#     'meter_waiting_till_pickup_per_meter_waiting_fare@meter_waiting_after_pickup_per_meter_waiting_fare'
]

cat_features = []

In [None]:
# Labels plus the three frames cut down to the active feature columns.
y = train_df['label']
train = train_df.loc[:, features]
test = test_df.loc[:, features]
train_org = train_df_org.loc[:, features]

In [None]:
# Hyper-parameters unpacked into every CatBoostClassifier of this experiment.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Unlabelled CatBoost pools that every fold model predicts on: the reference
# training frame and the test frame.
pool_options = {'cat_features': cat_features}
org_pool = Pool(data=train_org[features], **pool_options)
submission_pool = Pool(data=test_df[features], **pool_options)

In [None]:
# Stratified K-fold CV of a CatBoost classifier on the current feature set.
# Per fold: fit with the held-out fold as eval_set, store out-of-fold
# predictions, accumulate test-set predictions, and score the model on both
# the held-out fold and the reference frame behind org_pool / y_org.
# NOTE(review): the held-out fold also drives early stopping, so the
# validation scores are likely optimistic.
n_splits = 3  # single source for the fold count, the averaging and the vote threshold
train_preds = np.zeros(train_df.shape[0])  # out-of-fold predict_proba[:, 1]
test_preds = np.zeros(test_df.shape[0])    # fold-averaged predict_proba[:, 1]
train_class = np.zeros(train_df.shape[0])  # out-of-fold predicted class
test_class = np.zeros(test_df.shape[0])    # per-row sum of the fold class predictions
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
models = []
train_pools = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # Positional label selection, consistent with train.iloc above.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    train_pools.append(train_pool)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF array and the score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    org_scores.append(f1_score(y_org,model.predict(org_pool),average='micro'))
    models.append(model)
# The sum must exceed n_splits - 1, i.e. (assuming 0/1 class predictions) every
# fold model has to predict 1 for the final label to be 1.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
# Fold model with the highest validation score, and its feature importances.
best_fold = np.argmax(validation_scores)
best_model = models[best_fold]
best_model.get_feature_importance(prettified=True)

In [None]:
# Confusion matrix of the best fold model on the whole training frame,
# normalised per predicted class (normalize='pred').
# NOTE(review): best_model was fitted on two of the three folds of `train`, so
# most of these rows were seen during training and the matrix is optimistic.
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

In [None]:
best_model.plot_tree(0,train_pools[np.argmax(validation_scores)])

In [None]:
sns.scatterplot(x='meter_waiting_after_pickup_per_meter_waiting_fare',y='meter_waiting_fare_per_meter_waiting',data=train_df,hue='label')

In [None]:
sns.scatterplot(x='fare-additional_fare_per_duration',y='fare-additional_fare_per_distance',data=train_df,hue='label')

# Date minute encoding

In [None]:
sns.scatterplot(x='pickup_date',y='pickup_minute',data=train_df,hue='label')

In [None]:
sns.countplot(x='pickup_date',data=train_df,hue='label')

In [None]:
def date_minute_encoding(by='pickup'):
    """Build a row encoder that joins a date and a minute into one string key.

    Args:
        by: Column prefix; the encoder reads ``f'{by}_date'`` and
            ``f'{by}_minute'`` from the row it is given.

    Returns:
        A function ``row -> str`` producing ``'<date>_<minute>'``, where both
        parts are truncated to int and zero-padded to at least two digits.
    """
    date_col = f'{by}_date'
    minute_col = f'{by}_minute'

    def encode(record):
        return '{:02}_{:02}'.format(int(record[date_col]), int(record[minute_col]))

    return encode

In [None]:
# Attach the string 'DD_MM' pickup key to all three frames, row by row.
pickup_encoder = date_minute_encoding()
for frame in (train_df, test_df, train_df_org):
    frame['pickup_date_minute'] = frame.apply(pickup_encoder, axis=1)

In [None]:
# Map every distinct training key to its rank in sorted order.
keys = sorted(train_df['pickup_date_minute'].unique())
key_map = {key: i for i, key in enumerate(keys)}

In [None]:
def encode_with_key_map(key_map,col='pickup_date_minute'):
    """Build a row encoder that looks a column's value up in ``key_map``.

    Args:
        key_map: Mapping from raw key to encoded value.
        col: Name of the row field holding the raw key.

    Returns:
        A function ``row -> key_map[row[col]]``; a key missing from
        ``key_map`` raises ``KeyError``.
    """
    return lambda row: key_map[row[col]]

In [None]:
# Replace the string key with its integer code in all three frames.
# A key absent from key_map (built from train_df only) raises KeyError.
key_encoder = encode_with_key_map(key_map)
for frame in (train_df, test_df, train_df_org):
    frame['pickup_date_minute'] = frame.apply(key_encoder, axis=1)

In [None]:
sns.countplot(x='pickup_date_minute',data=train_df,hue='label')

In [None]:
# Feature set for the date-minute experiment: the engineered ratio columns
# plus the integer-encoded 'pickup_date_minute' key.
features = [
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare', 
    'cal_time_difference_per_duration',
    'pickup_date_minute'
]

# Columns passed to CatBoost as categorical features.
cat_features = [
    'pickup_date_minute'
]

In [None]:
# Labels plus the three frames cut down to the active feature columns.
y = train_df['label']
train = train_df.loc[:, features]
test = test_df.loc[:, features]
train_org = train_df_org.loc[:, features]

In [None]:
# Hyper-parameters unpacked into every CatBoostClassifier of this experiment.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Unlabelled CatBoost pools that every fold model predicts on: the reference
# training frame and the test frame.
pool_options = {'cat_features': cat_features}
org_pool = Pool(data=train_org[features], **pool_options)
submission_pool = Pool(data=test_df[features], **pool_options)

In [None]:
# Stratified K-fold CV of a CatBoost classifier on the current feature set.
# Per fold: fit with the held-out fold as eval_set, store out-of-fold
# predictions, accumulate test-set predictions, and score the model on both
# the held-out fold and the reference frame behind org_pool / y_org.
# NOTE(review): the held-out fold also drives early stopping, so the
# validation scores are likely optimistic.
n_splits = 3  # single source for the fold count, the averaging and the vote threshold
train_preds = np.zeros(train_df.shape[0])  # out-of-fold predict_proba[:, 1]
test_preds = np.zeros(test_df.shape[0])    # fold-averaged predict_proba[:, 1]
train_class = np.zeros(train_df.shape[0])  # out-of-fold predicted class
test_class = np.zeros(test_df.shape[0])    # per-row sum of the fold class predictions
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
models = []
train_pools = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # Positional label selection, consistent with train.iloc above.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    train_pools.append(train_pool)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF array and the score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    org_scores.append(f1_score(y_org,model.predict(org_pool),average='micro'))
    models.append(model)
# The sum must exceed n_splits - 1, i.e. (assuming 0/1 class predictions) every
# fold model has to predict 1 for the final label to be 1.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
# Fold model with the highest validation score, and its feature importances.
best_fold = np.argmax(validation_scores)
best_model = models[best_fold]
best_model.get_feature_importance(prettified=True)

In [None]:
# Confusion matrix of the best fold model on the whole training frame,
# normalised per predicted class (normalize='pred').
# NOTE(review): best_model was fitted on two of the three folds of `train`, so
# most of these rows were seen during training and the matrix is optimistic.
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

# Linear predictions

In [None]:
sns.violinplot(x='label',y='predicted_fare_per_duration',data=train_df)

In [None]:
sns.violinplot(x='label',y='predicted_duration_diff_bucket',data=train_df)

In [None]:
sns.violinplot(x='label',y='predicted_meter_waiting_per_duration',data=train_df)

In [None]:
sns.violinplot(x='label',y='predicted_fare_per_distance',data=train_df)

In [None]:
# The predicted_* columns plotted above; they are added to the feature list
# for this experiment.
predicted_cols = [
    'predicted_fare_per_duration',
    'predicted_duration_diff_bucket',
    'predicted_meter_waiting_per_duration',
    'predicted_fare_per_distance'
]

In [None]:
# Base engineered ratio features for the linear-prediction experiment.
features = [
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare', 
    'cal_time_difference_per_duration'
]

# No categorical columns in this experiment.
cat_features = []

In [None]:
# Append the predicted_* columns to the base feature list. The membership
# guard keeps the cell idempotent: re-running it no longer duplicates column
# names in `features`.
features = features + [col for col in predicted_cols if col not in features]

In [None]:
# Labels plus the three frames cut down to the active feature columns.
y = train_df['label']
train = train_df.loc[:, features]
test = test_df.loc[:, features]
train_org = train_df_org.loc[:, features]

In [None]:
# Hyper-parameters unpacked into every CatBoostClassifier of this experiment.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Unlabelled CatBoost pools that every fold model predicts on: the reference
# training frame and the test frame.
pool_options = {'cat_features': cat_features}
org_pool = Pool(data=train_org[features], **pool_options)
submission_pool = Pool(data=test_df[features], **pool_options)

In [None]:
# Stratified K-fold CV of a CatBoost classifier on the current feature set.
# Per fold: fit with the held-out fold as eval_set, store out-of-fold
# predictions, accumulate test-set predictions, and score the model on both
# the held-out fold and the reference frame behind org_pool / y_org.
# NOTE(review): the held-out fold also drives early stopping, so the
# validation scores are likely optimistic.
n_splits = 3  # single source for the fold count, the averaging and the vote threshold
train_preds = np.zeros(train_df.shape[0])  # out-of-fold predict_proba[:, 1]
test_preds = np.zeros(test_df.shape[0])    # fold-averaged predict_proba[:, 1]
train_class = np.zeros(train_df.shape[0])  # out-of-fold predicted class
test_class = np.zeros(test_df.shape[0])    # per-row sum of the fold class predictions
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
models = []
train_pools = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # Positional label selection, consistent with train.iloc above.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    train_pools.append(train_pool)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF array and the score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    org_scores.append(f1_score(y_org,model.predict(org_pool),average='micro'))
    models.append(model)
# The sum must exceed n_splits - 1, i.e. (assuming 0/1 class predictions) every
# fold model has to predict 1 for the final label to be 1.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
# Fold model with the highest validation score, and its feature importances.
best_fold = np.argmax(validation_scores)
best_model = models[best_fold]
best_model.get_feature_importance(prettified=True)

In [None]:
# Confusion matrix of the best fold model on the whole training frame,
# normalised per predicted class (normalize='pred').
# NOTE(review): best_model was fitted on two of the three folds of `train`, so
# most of these rows were seen during training and the matrix is optimistic.
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

In [None]:
# Store the fold-voted test labels and write the submission file.
# NOTE(review): several experiment sections write this same 'submission.csv',
# so the most recently executed one overwrites the others.
submission_df['prediction'] = test_class
submission_df.to_csv('submission.csv',index=False)

# Fraction of test rows predicted as class 1.
submission_df['prediction'].sum() / submission_df.shape[0]

# Original noise cols

In [None]:
# Raw (non-ratio) trip columns that this section adds on top of the engineered
# features.
noise_cols = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup'
]

In [None]:
sns.violinplot(x='label',y='additional_fare',data=train_df)

In [None]:
sns.violinplot(x='label',y='duration',data=train_df)

In [None]:
sns.violinplot(x='label',y='meter_waiting',data=train_df)

In [None]:
sns.violinplot(x='label',y='meter_waiting_fare',data=train_df)

In [None]:
sns.violinplot(x='label',y='meter_waiting_till_pickup',data=train_df)

In [None]:
# 99th percentile of additional_fare. np.percentile takes q on a 0-100 scale,
# so the previous q=0.99 returned the 0.99th percentile (bottom 1% cut-off).
np.percentile(train_df['additional_fare'],99)

In [None]:
# Base engineered ratio features for the raw-column ("noise cols") experiment.
features = [
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare',    
]

# No categorical columns in this experiment.
cat_features = []

In [None]:
# Append the raw trip columns to the base feature list. The membership guard
# keeps the cell idempotent: re-running it no longer duplicates column names
# in `features`.
features = features + [col for col in noise_cols if col not in features]

In [None]:
# Labels plus the three frames cut down to the active feature columns.
y = train_df['label']
train = train_df.loc[:, features]
test = test_df.loc[:, features]
train_org = train_df_org.loc[:, features]

In [None]:
# Hyper-parameters unpacked into every CatBoostClassifier of this experiment.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Unlabelled CatBoost pools that every fold model predicts on: the reference
# training frame and the test frame.
pool_options = {'cat_features': cat_features}
org_pool = Pool(data=train_org[features], **pool_options)
submission_pool = Pool(data=test_df[features], **pool_options)

In [None]:
# Stratified K-fold CV of a CatBoost classifier on the current feature set.
# Per fold: fit with the held-out fold as eval_set, store out-of-fold
# predictions, accumulate test-set predictions, and score the model on both
# the held-out fold and the reference frame behind org_pool / y_org.
# NOTE(review): the held-out fold also drives early stopping, so the
# validation scores are likely optimistic.
n_splits = 3  # single source for the fold count, the averaging and the vote threshold
train_preds = np.zeros(train_df.shape[0])  # out-of-fold predict_proba[:, 1]
test_preds = np.zeros(test_df.shape[0])    # fold-averaged predict_proba[:, 1]
train_class = np.zeros(train_df.shape[0])  # out-of-fold predicted class
test_class = np.zeros(test_df.shape[0])    # per-row sum of the fold class predictions
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
models = []
train_pools = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # Positional label selection, consistent with train.iloc above.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    train_pools.append(train_pool)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF array and the score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    org_scores.append(f1_score(y_org,model.predict(org_pool),average='micro'))
    models.append(model)
# The sum must exceed n_splits - 1, i.e. (assuming 0/1 class predictions) every
# fold model has to predict 1 for the final label to be 1.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
# Fold model with the highest validation score, and its feature importances.
best_fold = np.argmax(validation_scores)
best_model = models[best_fold]
best_model.get_feature_importance(prettified=True)

In [None]:
# Confusion matrix of the best fold model on the whole training frame,
# normalised per predicted class (normalize='pred').
# NOTE(review): best_model was fitted on two of the three folds of `train`, so
# most of these rows were seen during training and the matrix is optimistic.
y_hat = np.round(best_model.predict(train))
confusion_matrix(y,y_hat,normalize='pred')

# Original categorical encodings

In [None]:
# Hour-of-day columns added to both the feature list and the categorical
# feature list in this section; 'pick_cluster' is currently disabled.
original_cat_cols = [
    'pickup_hour',
    'drop_hour',
#     'pick_cluster'
]

In [None]:
# Feature set for the categorical-encoding experiment. Commented-out entries
# are candidates that are currently switched off.
features = [
    # Engineered ratio features.
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
#     'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare',    
    'meter_waiting_till_pickup',
    
    # Raw trip columns.
    'fare',
    'additional_fare',
    'duration',
#     'meter_waiting',
    'meter_waiting_fare',
    
    # predicted_* columns; only predicted_duration_diff is enabled.
    'predicted_duration_diff',
#     'predicted_fare_diff_per_predicted_fare',
#     'predicted_fare_diff_per_fare',
#     'predicted_fare_per_distance',
#     'predicted_fare_diff_per_distance',
#     'predicted_addtional_fare_per_distance',
#     'predicted_additional_fare_diff',
#     'predicted_avg_speed',
#     'predicted_fare_per_duration',
#     'predicted_fare_diff',
#     'predicted_additional_fare',
]

# Starts empty; the hour columns are appended a few cells below.
cat_features = []

In [None]:
# Hyper-parameters unpacked into every CatBoostClassifier of this experiment.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Register the hour columns both as features and as categorical features.
# The membership guards keep the cell idempotent: the previous in-place `+=`
# appended the same names again on every re-run.
features = features + [col for col in original_cat_cols if col not in features]
cat_features = cat_features + [col for col in original_cat_cols if col not in cat_features]

In [None]:
# Labels plus the three frames cut down to the active feature columns.
y = train_df['label']
train = train_df.loc[:, features]
test = test_df.loc[:, features]
train_org = train_df_org.loc[:, features]

In [None]:
# Unlabelled CatBoost pools that every fold model predicts on: the reference
# training frame and the test frame.
pool_options = {'cat_features': cat_features}
org_pool = Pool(data=train_org[features], **pool_options)
submission_pool = Pool(data=test_df[features], **pool_options)

In [None]:
# Stratified K-fold CV of a CatBoost classifier on the current feature set.
# Per fold: fit with the held-out fold as eval_set, store out-of-fold
# predictions, accumulate test-set predictions, and score the model on both
# the held-out fold and the reference frame behind org_pool / y_org.
# NOTE(review): the held-out fold also drives early stopping, so the
# validation scores are likely optimistic.
n_splits = 3  # single source for the fold count, the averaging and the vote threshold
train_preds = np.zeros(train_df.shape[0])  # out-of-fold predict_proba[:, 1]
test_preds = np.zeros(test_df.shape[0])    # fold-averaged predict_proba[:, 1]
train_class = np.zeros(train_df.shape[0])  # out-of-fold predicted class
test_class = np.zeros(test_df.shape[0])    # per-row sum of the fold class predictions
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
models = []
train_pools = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # Positional label selection, consistent with train.iloc above.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    train_pools.append(train_pool)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF array and the score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    org_scores.append(f1_score(y_org,model.predict(org_pool),average='micro'))
    models.append(model)
# The sum must exceed n_splits - 1, i.e. (assuming 0/1 class predictions) every
# fold model has to predict 1 for the final label to be 1.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
# Confusion matrix of the out-of-fold class predictions (each row predicted by
# the fold model that held it out), normalised per predicted class.
y_hat = train_class
confusion_matrix(y,y_hat,normalize='pred')

In [None]:
# Explain the best-scoring fold model with CatBoost SHAP values computed over
# the full training frame.
best_fold = np.argmax(validation_scores)
model, pool = models[best_fold], train_pools[best_fold]
shap_matrix = model.get_feature_importance(
    Pool(train,y,cat_features=cat_features),
    type='ShapValues',
)

# The last column is taken as the expected value; the remaining columns are
# the per-feature contributions.
expected_value = shap_matrix[0,-1]
shap_values = shap_matrix[:,:-1]

# Force plot explaining the prediction for the first training row.
shap.force_plot(expected_value, shap_values[0,:], train.iloc[0,:])



In [None]:
# best_model = models[np.argmax(validation_scores)]
# best_model.get_feature_importance(prettified=True)

In [None]:
# Store the fold-voted test labels and write the submission file.
# NOTE(review): several experiment sections write this same 'submission.csv',
# so the most recently executed one overwrites the others.
submission_df['prediction'] = test_class
submission_df.to_csv('submission.csv',index=False)

# Fraction of test rows predicted as class 1.
submission_df['prediction'].sum() / submission_df.shape[0]

In [None]:
# Numeric columns that get multiplied with each hour column to form
# '<hour_col>@<numeric_col>' interaction features. Commented-out entries are
# candidates that are currently switched off.
neumeric_cols_to_multiply = [
    'fare',
    'fare_per_distance',  
    'fare_per_duration',
    'fare-additional_fare_per_duration',
    'avg_speed',    
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_till_pickup',
    'predicted_duration_diff',
#     'predicted_fare_diff_per_predicted_fare',
#     'predicted_fare_diff_per_fare',
#     'predicted_fare_per_distance',
#     'predicted_fare_diff_per_distance',
#     'predicted_addtional_fare_per_distance',
#     'predicted_additional_fare_diff',
#     'predicted_avg_speed',
#     'predicted_fare_per_duration',
#     'predicted_fare_diff',
#     'predicted_additional_fare'
]

encoding_cols = []
for col1 in original_cat_cols:
    for col2 in neumeric_cols_to_multiply:
        name = f'{col1}@{col2}'
        # Each frame's interaction is built from that frame's own columns.
        # The previous code filled train_df_org[name] from train_df, which
        # index-aligned the wrong rows into the reference frame.
        for frame in (train_df, train_df_org, test_df):
            frame[name] = frame[col1] * frame[col2]
        encoding_cols.append(name)

In [None]:
# One hand-made interaction: (pickup_timeslot + 1) scaled by the trip distance,
# added to all three frames.
special_features = ['pickup_timeslot@distance']
for frame in (train_df, train_df_org, test_df):
    frame['pickup_timeslot@distance'] = (frame['pickup_timeslot']+1) * frame['distance_km']

In [None]:
# Add the interaction columns to the feature list, encoding_cols first and
# special_features second. The membership guard keeps the cell idempotent:
# the previous in-place `+=` appended the same names again on every re-run.
features = features + [col for col in encoding_cols + special_features if col not in features]

In [None]:
# Labels plus the three frames cut down to the active feature columns.
y = train_df['label']
train = train_df.loc[:, features]
test = test_df.loc[:, features]
train_org = train_df_org.loc[:, features]

In [None]:
# Unlabelled CatBoost pools that every fold model predicts on: the reference
# training frame and the test frame.
pool_options = {'cat_features': cat_features}
org_pool = Pool(data=train_org[features], **pool_options)
submission_pool = Pool(data=test_df[features], **pool_options)

In [None]:
# Stratified K-fold CV of a CatBoost classifier on the current feature set.
# Per fold: fit with the held-out fold as eval_set, store out-of-fold
# predictions, accumulate test-set predictions, and score the model on both
# the held-out fold and the reference frame behind org_pool / y_org.
# NOTE(review): the held-out fold also drives early stopping, so the
# validation scores are likely optimistic.
n_splits = 3  # single source for the fold count, the averaging and the vote threshold
train_preds = np.zeros(train_df.shape[0])  # out-of-fold predict_proba[:, 1]
test_preds = np.zeros(test_df.shape[0])    # fold-averaged predict_proba[:, 1]
train_class = np.zeros(train_df.shape[0])  # out-of-fold predicted class
test_class = np.zeros(test_df.shape[0])    # per-row sum of the fold class predictions
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
models = []
train_pools = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # Positional label selection, consistent with train.iloc above.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    train_pools.append(train_pool)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF array and the score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    org_scores.append(f1_score(y_org,model.predict(org_pool),average='micro'))
    models.append(model)
# The sum must exceed n_splits - 1, i.e. (assuming 0/1 class predictions) every
# fold model has to predict 1 for the final label to be 1.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
# Fold model with the highest validation score, and its feature importances.
best_fold = np.argmax(validation_scores)
best_model = models[best_fold]
best_model.get_feature_importance(prettified=True)

In [None]:
# Confusion matrix of the out-of-fold class predictions (each row predicted by
# the fold model that held it out), normalised per predicted class.
y_hat = train_class
confusion_matrix(y,y_hat,normalize='pred')

In [None]:

# Explain the best-scoring fold model with CatBoost SHAP values computed over
# the full training frame.
best_fold = np.argmax(validation_scores)
model, pool = models[best_fold], train_pools[best_fold]
shap_matrix = model.get_feature_importance(
    Pool(train,y,cat_features=cat_features),
    type='ShapValues',
)

# The last column is taken as the expected value; the remaining columns are
# the per-feature contributions.
expected_value = shap_matrix[0,-1]
shap_values = shap_matrix[:,:-1]

# Force plot explaining the prediction for the first training row.
shap.force_plot(expected_value, shap_values[0,:], train.iloc[0,:])



In [None]:
# Store the fold-voted test labels and write the submission file.
# NOTE(review): several experiment sections write this same 'submission.csv',
# so the most recently executed one overwrites the others.
submission_df['prediction'] = test_class
submission_df.to_csv('submission.csv',index=False)

# Fraction of test rows predicted as class 1.
submission_df['prediction'].sum() / submission_df.shape[0]