In [None]:
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor,ExtraTreesClassifier,RandomForestClassifier
from sklearn.preprocessing import StandardScaler,KBinsDiscretizer,LabelEncoder,MinMaxScaler
from sklearn.metrics import mean_squared_error,f1_score
from sklearn.kernel_approximation import Nystroem
from sklearn.neighbors import KNeighborsClassifier

from catboost import Pool, cv,CatBoostClassifier,CatBoostRegressor

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, concatenate,Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import regularizers
import tensorflow_addons as tfa

from tqdm import tqdm

import lightgbm as lgb

from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN

In [None]:
# Read the training file used for fitting, the original training file used as
# an extra evaluation set, the test file and the submission template.
# Missing values in the three feature tables are replaced with 0.
train_df = pd.read_csv('train_df_final_blanced.csv').fillna(0)
train_df_org = pd.read_csv('train_df_final.csv').fillna(0)
test_df = pd.read_csv('test_df_final.csv').fillna(0)
submission_df = pd.read_csv('sample_submission.csv')

In [None]:
# Target vectors as plain NumPy arrays: `y` for the fitting set, `y_org` for
# the original training set.
y = train_df['label'].to_numpy()
y_org = train_df_org['label'].to_numpy()

In [None]:
# Sum of the labels divided by the number of rows
# (the positive-class share, assuming 0/1 labels — TODO confirm).
np.sum(y) / len(y)

In [None]:
# Containers for the level-1 model outputs. The train frame also carries the
# target so the stacked features and label stay together.
model_predictions_train = pd.DataFrame({'label': train_df['label']})
model_predictions_test = pd.DataFrame()

In [None]:
# List every available column name, one per line.
for column_name in train_df.columns:
    print(column_name)

## Anomaly based

In [None]:
# Anomaly columns used by this section. There is one column per single
# measurement, plus one for every combination of 2, 3 or 4 of the five
# non-fare measurements. A column name is the measurement names joined by '_'
# followed by the '_anomaly' suffix.
_single_measurements = [
    'fare',
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
]
# 'fare' only appears on its own; the combined columns use the other five.
_combinable_measurements = _single_measurements[1:]

features = [name + '_anomaly' for name in _single_measurements] + [
    '_'.join(combo) + '_anomaly'
    for size in (2, 3, 4)
    for combo in itertools.combinations(_combinable_measurements, size)
]

# Every column in this section is also declared categorical.
cat_features = list(features)

In [None]:
# Feature matrices for this section. `.copy()` gives each frame its own data,
# so the dtype casts below are plain column assignments rather than writes
# into a selection of the source frames (which pandas flags with
# SettingWithCopyWarning and may not apply reliably).
train = train_df[features].copy()
test = test_df[features].copy()
train_org = train_df_org[features].copy()
y = train_df['label']
# Categorical columns are stored as integers in all three frames.
for each in cat_features:
    train[each] = train[each].values.astype(int)
    test[each] = test[each].values.astype(int)
    train_org[each] = train_org[each].values.astype(int)

In [None]:
def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation function: micro-averaged F1.

    Parameters
    ----------
    y_hat : array-like
        Raw model outputs for the rows of ``data``; they are rounded to the
        nearest integer before scoring because ``f1_score`` expects class
        labels, not probabilities.
    data : object exposing ``get_label()``
        Dataset whose true labels are scored against.

    Returns
    -------
    tuple
        ``('f1', score, True)`` — metric name, metric value, and a flag
        saying that larger values are better.
    """
    true_labels = data.get_label()
    hard_predictions = np.round(y_hat)
    score = f1_score(true_labels, hard_predictions, average='micro')
    return 'f1', score, True

### CatBoost

In [None]:
# Keyword arguments unpacked into `CatBoostClassifier(**catboost_params)`.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
)

In [None]:
# Pools that every fold model predicts on: the test set and the original
# training set. Both are built from the prepared `test` / `train_org` frames,
# whose categorical columns were cast to int, so they carry the same dtypes
# as the per-fold training pools. Building the test pool from
# `test_df[features]` would skip that cast.
submission_pool = Pool(data=test, cat_features=cat_features)
org_pool = Pool(data=train_org, cat_features=cat_features)

In [None]:
# Stratified K-fold CV for CatBoost. Produces out-of-fold probabilities for
# the training rows and fold-averaged probabilities for the test rows.
n_splits = 3
train_preds = np.zeros(train_df.shape[0])   # out-of-fold P(class 1)
test_preds = np.zeros(test_df.shape[0])     # mean P(class 1) over fold models
train_class = np.zeros(train_df.shape[0])   # out-of-fold hard predictions
test_class = np.zeros(test_df.shape[0])     # positive votes, thresholded below
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y_values[train_index], y_values[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:, 1] / n_splits
    # Add this fold's vote so the threshold after the loop sees every model.
    test_class += np.asarray(model.predict(submission_pool), dtype=float).ravel()
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
# A test row is positive only when every fold model voted positive
# (equivalent to `> 2` with three folds).
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'catboost_anomaly'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

## LightGBM

In [None]:
# Parameters passed to `lgb.train`.
# NOTE(review): 'f1' does not look like a built-in LightGBM metric name; the
# F1 used during training is supplied through `feval=lgb_f1_score` — confirm
# how the installed LightGBM version handles this value.
lgb_params = dict(
    objective='binary',
    learning_rate=0.05,
    seed=0,
    metric='f1',
    max_depth=6,
)

In [None]:
# Stratified K-fold CV for LightGBM. Test predictions are averaged over
# `folds` models; training rows get out-of-fold predictions.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y_values[train_index], y_values[test_index]
    train_data = lgb.Dataset(X_train, y_train)
    valid_data = lgb.Dataset(X_test, y_test)
    evals_result = {}
    model = lgb.train(lgb_params, train_data, num_boost_round=1000, early_stopping_rounds=50, valid_sets=valid_data, feval=lgb_f1_score, evals_result=evals_result, verbose_eval=False)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_preds = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test) / folds
    train_preds[test_index] = fold_valid_preds
    validation_scores.append(f1_score(y_test, np.round(fold_valid_preds), average='micro'))
    org_scores.append(f1_score(y_org, np.round(model.predict(train_org)), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'lgb_anomaly'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

## K-NN

In [None]:
# Keyword arguments unpacked into `KNeighborsClassifier(**knn_params)`.
knn_params = dict(
    n_neighbors=15,
    weights='uniform',
)

In [None]:
# Stratified K-fold CV for k-NN. Stores hard class predictions: out-of-fold
# for training rows, and the mean vote over `folds` models for test rows.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :].values, train.iloc[test_index, :].values
    y_train, y_test = y_values[train_index], y_values[test_index]

    model = KNeighborsClassifier(**knn_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_class = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test.values) / folds
    train_preds[test_index] = fold_valid_class
    validation_scores.append(f1_score(y_test, fold_valid_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(train_org.values), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'knn_anomaly'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

## Random forests

In [None]:
# Keyword arguments unpacked into `RandomForestClassifier(**rf_params)`.
rf_params = dict(
    n_estimators=50,
    max_depth=10,
    random_state=0,
)

In [None]:
# Stratified K-fold CV for the random forest. Stores hard class predictions:
# out-of-fold for training rows, mean vote over `folds` models for test rows.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :].values, train.iloc[test_index, :].values
    y_train, y_test = y_values[train_index], y_values[test_index]

    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_class = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test.values) / folds
    train_preds[test_index] = fold_valid_class
    validation_scores.append(f1_score(y_test, fold_valid_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(train_org.values), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'rf_anomaly'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

## SVC

In [None]:
# Linear-kernel settings unpacked into `SVC(**svc_params)`.
svc_params = dict(
    C=2,
    kernel='linear',
    random_state=0,
    probability=False,
    gamma='scale',
)

In [None]:
# Stratified K-fold CV for the SVC. Stores hard class predictions:
# out-of-fold for training rows, mean vote over `folds` models for test rows.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :].values, train.iloc[test_index, :].values
    y_train, y_test = y_values[train_index], y_values[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_class = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test.values) / folds
    train_preds[test_index] = fold_valid_class
    validation_scores.append(f1_score(y_test, fold_valid_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(train_org.values), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'svc_linear_anomaly'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

In [None]:
# RBF-kernel settings unpacked into `SVC(**svc_params)`.
svc_params = dict(
    C=2,
    kernel='rbf',
    random_state=0,
    probability=False,
    gamma='scale',
)

In [None]:
# Stratified K-fold CV for the SVC. Stores hard class predictions:
# out-of-fold for training rows, mean vote over `folds` models for test rows.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :].values, train.iloc[test_index, :].values
    y_train, y_test = y_values[train_index], y_values[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_class = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test.values) / folds
    train_preds[test_index] = fold_valid_class
    validation_scores.append(f1_score(y_test, fold_valid_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(train_org.values), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'svc_rbf_anomaly'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

# Base features

In [None]:
# Columns fed to the "base features" models in this section.
# NOTE(review): the names are used verbatim as DataFrame column keys, so
# spellings such as 'addtional_fare_per_fare' presumably mirror the CSV
# headers — confirm before "fixing" them.
features = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'fare',
    'pickup_date',
    'pickup_hour',
    'pickup_minute',
    'drop_date',
    'drop_hour',
    'drop_minute',
    'pick_cluster',
    'is_more_than_one_day',
    'distance_km',
    'fare_per_km',
    'pickup_timeslot',
    'day_of_week',
    'is_weekday',
    'cal_time_difference',
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare',
    'fare-additional_fare-meter_waiting_fare',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare',
    'fare_per_distance_mean',
    'fare_per_distance_mean_diff',
    'avg_speed_mean',
    'avg_speed_mean_diff',
    'meter_waiting_per_duration_mean',
    'meter_waiting_per_duration_mean_diff',
    'meter_waiting_fare_per_meter_waiting_mean',
    'meter_waiting_fare_per_meter_waiting_mean_diff',
    'meter_waiting_fare_per_duration_mean',
    'meter_waiting_fare_per_duration_mean_diff',
    'addtional_fare_per_fare_mean',
    'addtional_fare_per_fare_mean_diff',
    'addtional_fare_per_distance_mean',
    'addtional_fare_per_distance_mean_diff',
    'addtional_fare_per_duration_mean',
    'addtional_fare_per_duration_mean_diff',
]

# Subset of `features` that is cast to int and passed to CatBoost as
# categorical columns.
cat_features = [
    'pickup_date',
    'pickup_hour',
    'pickup_minute',
    'drop_date',
    'drop_hour',
    'drop_minute',
    'pick_cluster',
    'is_more_than_one_day',
    'pickup_timeslot',
    'day_of_week',
    'is_weekday',
]

In [None]:
# Feature matrices for this section. `.copy()` gives each frame its own data,
# so the dtype casts below are plain column assignments rather than writes
# into a selection of the source frames (which pandas flags with
# SettingWithCopyWarning and may not apply reliably).
train = train_df[features].copy()
test = test_df[features].copy()
train_org = train_df_org[features].copy()
y = train_df['label']
# Categorical columns are stored as integers in all three frames.
for each in cat_features:
    train[each] = train[each].values.astype(int)
    test[each] = test[each].values.astype(int)
    train_org[each] = train_org[each].values.astype(int)

### Catboost

In [None]:
# Keyword arguments unpacked into `CatBoostClassifier(**catboost_params)`.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Pools that every fold model predicts on: the test set and the original
# training set. Both are built from the prepared `test` / `train_org` frames,
# whose categorical columns were cast to int, so they carry the same dtypes
# as the per-fold training pools. Building the test pool from
# `test_df[features]` would skip that cast.
submission_pool = Pool(data=test, cat_features=cat_features)
org_pool = Pool(data=train_org, cat_features=cat_features)

In [None]:
# Stratified K-fold CV for CatBoost. Produces out-of-fold probabilities for
# the training rows and fold-averaged probabilities for the test rows.
n_splits = 3
train_preds = np.zeros(train_df.shape[0])   # out-of-fold P(class 1)
test_preds = np.zeros(test_df.shape[0])     # mean P(class 1) over fold models
train_class = np.zeros(train_df.shape[0])   # out-of-fold hard predictions
test_class = np.zeros(test_df.shape[0])     # positive votes, thresholded below
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y_values[train_index], y_values[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:, 1] / n_splits
    # Add this fold's vote so the threshold after the loop sees every model.
    test_class += np.asarray(model.predict(submission_pool), dtype=float).ravel()
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
# A test row is positive only when every fold model voted positive
# (equivalent to `> 2` with three folds).
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'catboost_base'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

### lgb

In [None]:
# Parameters passed to `lgb.train`.
# NOTE(review): 'f1' does not look like a built-in LightGBM metric name; the
# F1 used during training is supplied through `feval=lgb_f1_score` — confirm
# how the installed LightGBM version handles this value.
lgb_params = dict(
    objective='binary',
    learning_rate=0.05,
    seed=0,
    metric='f1',
    max_depth=6,
)

In [None]:
# Stratified K-fold CV for LightGBM. Test predictions are averaged over
# `folds` models; training rows get out-of-fold predictions.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y_values[train_index], y_values[test_index]
    train_data = lgb.Dataset(X_train, y_train)
    valid_data = lgb.Dataset(X_test, y_test)
    evals_result = {}
    model = lgb.train(lgb_params, train_data, num_boost_round=1000, early_stopping_rounds=50, valid_sets=valid_data, feval=lgb_f1_score, evals_result=evals_result, verbose_eval=False)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_preds = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test) / folds
    train_preds[test_index] = fold_valid_preds
    validation_scores.append(f1_score(y_test, np.round(fold_valid_preds), average='micro'))
    org_scores.append(f1_score(y_org, np.round(model.predict(train_org)), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'lgb_base'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

# Linear prediction difference

In [None]:
# Columns fed to the "linear prediction difference" models: the raw
# `predicted_*_diff*` columns followed by their `*_normalized` counterparts.
# NOTE(review): the names are used verbatim as DataFrame column keys, so
# spellings such as 'duraton' and 'addtional' presumably mirror the CSV
# headers — confirm before "fixing" them.
features = [
    'predicted_fare_diff',
    'predicted_fare_diff_per_fare',
    'predicted_fare_diff_per_predicted_fare',
    'predicted_fare_diff_per_distance',
    'predicted_duration_diff',
    'predicted_duraton_diff_per_duraton',
    'predicted_duraton_diff_per_predicted_duration',
    'predicted_duraton_diff_per_distance',
    'predicted_fare_per_duration_diff',
    'predicted_avg_speed_diff',
    'predicted_meter_waiting_diff',
    'predicted_meter_waiting_diff_per_meter_waiting',
    'predicted_meter_waiting_diff_per_distance',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting',
    'predicted_meter_waiting_per_duration_diff',
    'predicted_meter_waiting_fare_diff',
    'predicted_meter_waiting_fare_diff_per_distance',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare',
    'predicted_meter_waiting_fare_per_meter_waiting_diff',
    'predicted_meter_waiting_fare_per_duration_diff',
    'predicted_additional_fare_diff',
    'predicted_additional_fare_diff_per_additional_fare',
    'predicted_addtional_fare_diff_per_distance',
    'predicted_meter_waiting_till_pickup_diff',
    'predicted_meter_waiting_till_pickup_diff_per_meter_waiting_till_pickup',
    'predicted_meter_waiting_till_pickup_per_meter_waiting_diff',
    'predicted_fare_diff_per_distance_normalized',
    'predicted_fare_diff_normalized',
    'predicted_fare_diff_per_fare_normalized',
    'predicted_fare_diff_per_predicted_fare_normalized',
    'predicted_duraton_diff_per_duraton_normalized',
    'predicted_duraton_diff_per_predicted_duration_normalized',
    'predicted_fare_per_duration_diff_normalized',
    'predicted_avg_speed_diff_normalized',
    'predicted_meter_waiting_diff_normalized',
    'predicted_meter_waiting_diff_per_meter_waiting_normalized',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting_normalized',
    'predicted_meter_waiting_per_duration_diff_normalized',
    'predicted_meter_waiting_fare_diff_normalized',
    'predicted_meter_waiting_fare_diff_per_meter_waiting_fare_normalized',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare_normalized',
    'predicted_meter_waiting_fare_per_meter_waiting_diff_normalized',
    'predicted_meter_waiting_fare_per_duration_diff_normalized',
    'predicted_additional_fare_diff_normalized',
    'predicted_additional_fare_diff_per_additional_fare_normalized',
]

# No categorical columns in this feature set.
cat_features = []

In [None]:
# Feature matrices for this section. `.copy()` gives each frame its own data,
# so any dtype cast below is a plain column assignment rather than a write
# into a selection of the source frames (SettingWithCopyWarning).
train = train_df[features].copy()
test = test_df[features].copy()
train_org = train_df_org[features].copy()
y = train_df['label']
# Categorical columns are stored as integers in all three frames
# (a no-op while `cat_features` is empty).
for each in cat_features:
    train[each] = train[each].values.astype(int)
    test[each] = test[each].values.astype(int)
    train_org[each] = train_org[each].values.astype(int)

### Catboost

In [None]:
# Keyword arguments unpacked into `CatBoostClassifier(**catboost_params)`.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Pools that every fold model predicts on: the test set and the original
# training set, restricted to this section's feature columns.
test_feature_frame = test_df[features]
org_feature_frame = train_org[features]
submission_pool = Pool(data=test_feature_frame, cat_features=cat_features)
org_pool = Pool(data=org_feature_frame, cat_features=cat_features)

In [None]:
# Stratified K-fold CV for CatBoost. Produces out-of-fold probabilities for
# the training rows and fold-averaged probabilities for the test rows.
n_splits = 3
train_preds = np.zeros(train_df.shape[0])   # out-of-fold P(class 1)
test_preds = np.zeros(test_df.shape[0])     # mean P(class 1) over fold models
train_class = np.zeros(train_df.shape[0])   # out-of-fold hard predictions
test_class = np.zeros(test_df.shape[0])     # positive votes, thresholded below
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y_values[train_index], y_values[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:, 1] / n_splits
    # Add this fold's vote so the threshold after the loop sees every model.
    test_class += np.asarray(model.predict(submission_pool), dtype=float).ravel()
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
# A test row is positive only when every fold model voted positive
# (equivalent to `> 2` with three folds).
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'catboost_pred_diff'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

### lgb

In [None]:
# Parameters passed to `lgb.train`.
# NOTE(review): 'f1' does not look like a built-in LightGBM metric name; the
# F1 used during training is supplied through `feval=lgb_f1_score` — confirm
# how the installed LightGBM version handles this value.
lgb_params = dict(
    objective='binary',
    learning_rate=0.05,
    seed=0,
    metric='f1',
    max_depth=6,
)

In [None]:
# Stratified K-fold CV for LightGBM. Test predictions are averaged over
# `folds` models; training rows get out-of-fold predictions.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y_values[train_index], y_values[test_index]
    train_data = lgb.Dataset(X_train, y_train)
    valid_data = lgb.Dataset(X_test, y_test)
    evals_result = {}
    model = lgb.train(lgb_params, train_data, num_boost_round=1000, early_stopping_rounds=50, valid_sets=valid_data, feval=lgb_f1_score, evals_result=evals_result, verbose_eval=False)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_preds = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test) / folds
    train_preds[test_index] = fold_valid_preds
    validation_scores.append(f1_score(y_test, np.round(fold_valid_preds), average='micro'))
    org_scores.append(f1_score(y_org, np.round(model.predict(train_org)), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'lgb_pred_diff'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

## unit differences

In [None]:
# Columns fed to the "unit differences" models: the ratio-style
# (`*_per_*`) prediction-difference columns only.
# NOTE(review): spellings such as 'duraton' and 'addtional' are used verbatim
# as column keys and presumably mirror the CSV headers — confirm.
features = [
    'predicted_fare_diff_per_fare',
    'predicted_fare_diff_per_predicted_fare',
    'predicted_fare_diff_per_distance',
    'predicted_duraton_diff_per_duraton',
    'predicted_duraton_diff_per_predicted_duration',
    'predicted_duraton_diff_per_distance',
    'predicted_fare_per_duration_diff',
    'predicted_meter_waiting_diff_per_meter_waiting',
    'predicted_meter_waiting_diff_per_distance',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting',
    'predicted_meter_waiting_per_duration_diff',
    'predicted_meter_waiting_fare_diff_per_distance',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare',
    'predicted_meter_waiting_fare_per_meter_waiting_diff',
    'predicted_meter_waiting_fare_per_duration_diff',
    'predicted_additional_fare_diff_per_additional_fare',
    'predicted_addtional_fare_diff_per_distance',
    'predicted_meter_waiting_till_pickup_diff_per_meter_waiting_till_pickup',
    'predicted_meter_waiting_till_pickup_per_meter_waiting_diff'
]

# No categorical columns in this feature set.
cat_features = []

In [None]:
# Feature matrices for this section. `.copy()` gives each frame its own data,
# so any dtype cast below is a plain column assignment rather than a write
# into a selection of the source frames (SettingWithCopyWarning).
train = train_df[features].copy()
test = test_df[features].copy()
train_org = train_df_org[features].copy()
y = train_df['label']
# Categorical columns are stored as integers in all three frames
# (a no-op while `cat_features` is empty).
for each in cat_features:
    train[each] = train[each].values.astype(int)
    test[each] = test[each].values.astype(int)
    train_org[each] = train_org[each].values.astype(int)

### KNN

In [None]:
# Keyword arguments unpacked into `KNeighborsClassifier(**knn_params)`.
knn_params = dict(
    n_neighbors=10,
    weights='distance',
)

In [None]:
# Stratified K-fold CV for k-NN. Stores hard class predictions: out-of-fold
# for training rows, and the mean vote over `folds` models for test rows.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :].values, train.iloc[test_index, :].values
    y_train, y_test = y_values[train_index], y_values[test_index]

    model = KNeighborsClassifier(**knn_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_class = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test.values) / folds
    train_preds[test_index] = fold_valid_class
    validation_scores.append(f1_score(y_test, fold_valid_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(train_org.values), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'knn_unit_diff'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

### SVM

In [None]:
# RBF-kernel settings unpacked into `SVC(**svc_params)`.
svc_params = dict(
    C=5,
    kernel='rbf',
    random_state=0,
    probability=False,
    gamma='auto',
)

In [None]:
# Stratified K-fold CV for the SVC. Stores hard class predictions:
# out-of-fold for training rows, mean vote over `folds` models for test rows.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :].values, train.iloc[test_index, :].values
    y_train, y_test = y_values[train_index], y_values[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_class = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test.values) / folds
    train_preds[test_index] = fold_valid_class
    validation_scores.append(f1_score(y_test, fold_valid_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(train_org.values), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'rbf_svm_unit_diff'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

## Normalized differences

In [None]:
# Columns fed to the "diff norm" models: only the `*_normalized`
# prediction-difference columns.
# NOTE(review): the spelling 'duraton' is used verbatim as a column key and
# presumably mirrors the CSV headers — confirm.
features = [
    'predicted_fare_diff_per_distance_normalized',
    'predicted_fare_diff_normalized',
    'predicted_fare_diff_per_fare_normalized',
    'predicted_fare_diff_per_predicted_fare_normalized',
    'predicted_duraton_diff_per_duraton_normalized',
    'predicted_duraton_diff_per_predicted_duration_normalized',
    'predicted_fare_per_duration_diff_normalized',
    'predicted_avg_speed_diff_normalized',
    'predicted_meter_waiting_diff_normalized',
    'predicted_meter_waiting_diff_per_meter_waiting_normalized',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting_normalized',
    'predicted_meter_waiting_per_duration_diff_normalized',
    'predicted_meter_waiting_fare_diff_normalized',
    'predicted_meter_waiting_fare_diff_per_meter_waiting_fare_normalized',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare_normalized',
    'predicted_meter_waiting_fare_per_meter_waiting_diff_normalized',
    'predicted_meter_waiting_fare_per_duration_diff_normalized',
    'predicted_additional_fare_diff_normalized',
    'predicted_additional_fare_diff_per_additional_fare_normalized',
]

# No categorical columns in this feature set.
cat_features = []

In [None]:
# Feature matrices for this section. `.copy()` gives each frame its own data,
# so any dtype cast below is a plain column assignment rather than a write
# into a selection of the source frames (SettingWithCopyWarning).
train = train_df[features].copy()
test = test_df[features].copy()
train_org = train_df_org[features].copy()
y = train_df['label']
# Categorical columns are stored as integers in all three frames
# (a no-op while `cat_features` is empty).
for each in cat_features:
    train[each] = train[each].values.astype(int)
    test[each] = test[each].values.astype(int)
    train_org[each] = train_org[each].values.astype(int)

### KNN

In [None]:
# Keyword arguments unpacked into `KNeighborsClassifier(**knn_params)`.
knn_params = dict(
    n_neighbors=15,
    weights='distance',
)

In [None]:
# Stratified K-fold CV for k-NN. Stores hard class predictions: out-of-fold
# for training rows, and the mean vote over `folds` models for test rows.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :].values, train.iloc[test_index, :].values
    y_train, y_test = y_values[train_index], y_values[test_index]

    model = KNeighborsClassifier(**knn_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_class = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test.values) / folds
    train_preds[test_index] = fold_valid_class
    validation_scores.append(f1_score(y_test, fold_valid_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(train_org.values), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'knn_unit_diff_norm'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

### SVM

In [None]:
# RBF-kernel settings unpacked into `SVC(**svc_params)`.
svc_params = dict(
    C=5,
    kernel='rbf',
    random_state=0,
    probability=False,
    gamma='scale',
)

In [None]:
# Stratified K-fold CV for the SVC. Stores hard class predictions:
# out-of-fold for training rows, mean vote over `folds` models for test rows.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
org_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :].values, train.iloc[test_index, :].values
    y_train, y_test = y_values[train_index], y_values[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_class = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test.values) / folds
    train_preds[test_index] = fold_valid_class
    validation_scores.append(f1_score(y_test, fold_valid_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(train_org.values), average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

In [None]:
# Mean, spread and worst per-fold score on the original training set.
tuple(stat(org_scores) for stat in (np.mean, np.std, min))

In [None]:
# Store this model's out-of-fold and test predictions as stacking columns.
name = 'svm_rbf_unit_diff_norm'
model_predictions_train[name], model_predictions_test[name] = train_preds, test_preds

In [None]:
# Persist the level-1 predictions so the stacking stage can be rerun without
# refitting the base models.
for stack_frame, stack_path in (
    (model_predictions_train, 'stack_2_balanced_train.csv'),
    (model_predictions_test, 'stack_2_balanced_test.csv'),
):
    stack_frame.to_csv(stack_path, index=False)

# Stack level 2

In [None]:
# Level-2 inputs: every level-1 prediction column stored in the test frame.
features = list(model_predictions_test.columns)
# Columns that hold hard class votes (k-NN, random forest and SVC outputs).
# Each name is listed once; they are cast to int in the next cell.
cat_cols = [
    'knn_anomaly',
    'rf_anomaly',
    'knn_unit_diff',
    'svc_linear_anomaly',
    'svc_rbf_anomaly',
    'knn_unit_diff_norm',
    'rbf_svm_unit_diff',
    'svm_rbf_unit_diff_norm'
]
# The level-2 CatBoost pools are built with `cat_features`. Point it at the
# vote columns so it no longer carries the value left over from the previous
# feature section.
cat_features = cat_cols

In [None]:
# Convert the vote columns to integer class labels. Train-side values are
# already 0/1, but test-side values are averages of the fold votes
# (e.g. 1/3, 2/3). Rounding first turns those into a majority vote; a bare
# int cast would truncate 2/3 to 0 and make the test column disagree with
# the train column.
for each in cat_cols:
    model_predictions_train[each] = model_predictions_train[each].round().astype(int)
    model_predictions_test[each] = model_predictions_test[each].round().astype(int)

In [None]:
# Level-2 design matrices: the stored level-1 predictions restricted to the
# columns that exist for both train and test. The target is unchanged.
train = model_predictions_train.loc[:, features]
test = model_predictions_test.loc[:, features]
y = train_df['label']

In [None]:
# Pairwise correlation between the label and every level-1 prediction column.
stack_corr = model_predictions_train.corr()
sns.heatmap(stack_corr)

### Catboost

In [None]:
# Keyword arguments unpacked into `CatBoostClassifier(**catboost_params)`.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
)

In [None]:
# Test-side pool for the level-2 model. `cat_features` must name the same
# categorical columns as the per-fold training pools.
stack_test_frame = model_predictions_test[features]
submission_pool = Pool(data=stack_test_frame, cat_features=cat_features)

In [None]:
# Stratified K-fold CV for the level-2 CatBoost model. Produces out-of-fold
# probabilities for the training rows and fold-averaged probabilities for
# the test rows.
n_splits = 3
train_preds = np.zeros(train_df.shape[0])   # out-of-fold P(class 1)
test_preds = np.zeros(test_df.shape[0])     # mean P(class 1) over fold models
train_class = np.zeros(train_df.shape[0])   # out-of-fold hard predictions
test_class = np.zeros(test_df.shape[0])     # positive votes, thresholded below
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y_values[train_index], y_values[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:, 1] / n_splits
    # Add this fold's vote so the threshold after the loop sees every model.
    test_class += np.asarray(model.predict(submission_pool), dtype=float).ravel()
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
# A test row is positive only when every fold model voted positive
# (equivalent to `> 2` with three folds).
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))

### ExtraTrees

In [None]:
# Keyword arguments unpacked into `ExtraTreesClassifier(**et_params)`.
et_params = dict(
    n_estimators=10,
    max_depth=3,
    random_state=0,
)

In [None]:
# Stratified K-fold CV for the level-2 ExtraTrees model. Stores hard class
# predictions: out-of-fold for training rows, mean vote over `folds` models
# for test rows.
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
# Fold indices are positions, so index a plain array rather than relying on
# the label Series having a 0..n-1 index.
y_values = np.asarray(y)
validation_scores = []
for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :].values, train.iloc[test_index, :].values
    y_train, y_test = y_values[train_index], y_values[test_index]

    model = ExtraTreesClassifier(**et_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for storage and scoring.
    fold_valid_class = model.predict(X_test)
    # Divide by the actual number of folds so the average stays correct if
    # `folds` is changed.
    test_preds += model.predict(test.values) / folds
    train_preds[test_index] = fold_valid_class
    validation_scores.append(f1_score(y_test, fold_valid_class, average='micro'))

In [None]:
# Mean, spread and worst score across the validation folds.
tuple(stat(validation_scores) for stat in (np.mean, np.std, min))