In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import datetime
import geopy.distance
import itertools

from sklearn.mixture import BayesianGaussianMixture
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVR,SVC

from sklearn.ensemble import RandomForestRegressor, IsolationForest,ExtraTreesClassifier,RandomForestClassifier
from sklearn.preprocessing import StandardScaler,KBinsDiscretizer,LabelEncoder,MinMaxScaler
from sklearn.metrics import mean_squared_error,f1_score,confusion_matrix,log_loss
from sklearn.kernel_approximation import Nystroem
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier
from sklearn.decomposition import PCA,NMF

from catboost import Pool, cv,CatBoostClassifier,CatBoostRegressor

from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, concatenate,Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import regularizers
import tensorflow_addons as tfa

import lightgbm as lgb

import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor,DMatrix,plot_tree

from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN

In [None]:
# Read the three input CSV files from the working directory.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Feature engineering

## Shortened eda notebook

In [None]:
def extract_time(feature='date'):
    """Build a parser that extracts one integer field from a timestamp string.

    The returned callable expects text of the form ``'<a>/<b>/<c> <h>:<m>'``
    (a date token and a time token separated by whitespace) and reads the
    fields positionally: ``month`` is ``a``, ``date`` is ``b``, ``year`` is
    ``c``, ``hour`` is ``h`` and ``minute`` is ``m``.

    Parameters
    ----------
    feature : str
        One of ``'date'``, ``'month'``, ``'year'``, ``'hour'``, ``'minute'``.

    Returns
    -------
    callable
        ``f(time_stamp) -> int``.

    Raises
    ------
    ValueError
        If ``feature`` is not one of the supported names. Previously such a
        parser was created anyway and silently returned ``None`` per row.
    """
    # feature -> (0 for the date token / 1 for the time token, position inside it)
    field_position = {
        'date': (0, 1),
        'month': (0, 0),
        'year': (0, 2),
        'hour': (1, 0),
        'minute': (1, 1),
    }
    if feature not in field_position:
        raise ValueError(
            f"unknown feature {feature!r}; expected one of {sorted(field_position)}"
        )
    token_index, position = field_position[feature]

    def f(time_stamp):
        date, time = time_stamp.strip().split()
        # Only the token holding the requested field is converted to integers.
        pieces = date.split('/') if token_index == 0 else time.split(':')
        return list(map(int, pieces))[position]
    return f
        

In [None]:
def get_combined_dataset(cols, train=train_df, test=test_df):
    """Stack the selected columns of the train and test frames.

    Parameters
    ----------
    cols : list of str
        Column names to take from both frames.
    train, test : pandas.DataFrame
        Source frames. The defaults are the ``train_df`` / ``test_df`` objects
        that exist when this cell is executed (Python binds default values at
        definition time), so pass frames explicitly to use newer versions.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, original indexes kept, with an extra
        ``'dataset'`` column holding ``'train'`` or ``'test'``.
    """
    # Copy the slices so adding the marker column never touches the inputs.
    train_part = train.loc[:, cols].copy()
    train_part['dataset'] = 'train'

    test_part = test.loc[:, cols].copy()
    test_part['dataset'] = 'test'

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([train_part, test_part])
    

In [None]:
# Working copy of the training data: rows with missing values removed, id column dropped.
data = train_df.dropna().drop(columns=['tripid'])

In [None]:
# (target column, source timestamp column, field to extract) in creation order.
time_part_columns = (
    ('pickup_month', 'pickup_time', 'month'),
    ('pickup_date', 'pickup_time', 'date'),
    ('pickup_year', 'pickup_time', 'year'),
    ('pickup_hour', 'pickup_time', 'hour'),
    ('pickup_minute', 'pickup_time', 'minute'),
    ('drop_month', 'drop_time', 'month'),
    ('drop_date', 'drop_time', 'date'),
    ('drop_year', 'drop_time', 'year'),
    ('drop_hour', 'drop_time', 'hour'),
    ('drop_minute', 'drop_time', 'minute'),
)
for target, source, part in time_part_columns:
    data[target] = data[source].map(extract_time(part))

In [None]:
# (target column, source timestamp column, field to extract) in creation order.
time_part_columns = (
    ('pickup_month', 'pickup_time', 'month'),
    ('pickup_date', 'pickup_time', 'date'),
    ('pickup_year', 'pickup_time', 'year'),
    ('pickup_hour', 'pickup_time', 'hour'),
    ('pickup_minute', 'pickup_time', 'minute'),
    ('drop_month', 'drop_time', 'month'),
    ('drop_date', 'drop_time', 'date'),
    ('drop_year', 'drop_time', 'year'),
    ('drop_hour', 'drop_time', 'hour'),
    ('drop_minute', 'drop_time', 'minute'),
)
for target, source, part in time_part_columns:
    test_df[target] = test_df[source].map(extract_time(part))

In [None]:
# Keep only training rows whose drop-off latitude is below 30.
data = data.loc[data['drop_lat'].lt(30)]

In [None]:
def is_more_than_one_day(row):
    """Return 1 when the row's 'pickup_date' and 'drop_date' values differ, else 0."""
    return int(row['pickup_date'] != row['drop_date'])

In [None]:
# Row-wise flag on both frames: train first, then test.
for frame in (data, test_df):
    frame['is_more_than_one_day'] = frame.apply(is_more_than_one_day, axis=1)

In [None]:
# Cluster pickup coordinates with a 3-component Bayesian Gaussian mixture that
# is fitted on the train and test rows together.
cols = ['pick_lat','pick_lon']
comb_data = get_combined_dataset(cols, train=data)

# random_state pins the stochastic initialisation so the cluster ids written
# below are the same on every Restart & Run All.
gmm_pick = BayesianGaussianMixture(n_components=3, random_state=0)
gmm_pick.fit(comb_data[cols].values)

data['pick_cluster'] = gmm_pick.predict(data[cols].values)
test_df['pick_cluster'] = gmm_pick.predict(test_df[cols].values)

In [None]:
# Cluster drop-off coordinates with a 3-component Bayesian Gaussian mixture
# that is fitted on the train and test rows together.
cols = ['drop_lat','drop_lon']
comb_data = get_combined_dataset(cols, train=data)

# random_state pins the stochastic initialisation so the cluster ids written
# below are the same on every Restart & Run All.
gmm_drop = BayesianGaussianMixture(n_components=3, max_iter=1000, random_state=0)
gmm_drop.fit(comb_data[cols].values)

data['drop_cluster'] = gmm_drop.predict(data[cols].values)
test_df['drop_cluster'] = gmm_drop.predict(test_df[cols].values)

In [None]:
def calculate_trip_distance(row):
    """Geodesic distance in kilometres between the row's pickup and drop-off points."""
    pickup_point = row['pick_lat'], row['pick_lon']
    drop_point = row['drop_lat'], row['drop_lon']
    trip = geopy.distance.geodesic(pickup_point, drop_point)
    return trip.km

In [None]:
# Trip length per row, limited to the range [0, 100] km.
for frame in (data, test_df):
    trip_km = frame.apply(calculate_trip_distance, axis=1)
    frame['distance_km'] = trip_km.clip(0, 100)

In [None]:
def fare_per_distance(row):
    """Return the row's fare divided by its distance in km padded by 0.01."""
    # The small offset keeps the division defined for zero-length trips.
    padded_km = row['distance_km'] + 0.01
    return row['fare'] / padded_km

In [None]:
# Fare per (padded) kilometre on both frames.
for frame in (data, test_df):
    frame['fare_per_km'] = frame.apply(fare_per_distance, axis=1)

In [None]:
def time_slot(row, by='pickup'):
    """Bucket the row's ``'<by>_hour'`` value into a coarse time-of-day slot.

    Parameters
    ----------
    row : mapping
        Must provide the key ``f'{by}_hour'`` holding an hour on the 24-hour clock.
    by : str
        Column prefix, ``'pickup'`` by default.

    Returns
    -------
    int
        1 for hours 7-9, 2 for hours 12-14, 3 for hours 16-18, otherwise 0.
    """
    hour = row[f'{by}_hour']
    if 7 <= hour <= 9:
        return 1
    # The PM windows used to be written in 12-hour form (12..2 and 4..6).
    # ``12 <= hour <= 2`` can never hold, so slot 2 was never produced, and
    # 4..6 matched early morning. Both are expressed on the 24-hour clock now.
    if 12 <= hour <= 14:
        return 2
    if 16 <= hour <= 18:
        return 3
    return 0

In [None]:
# Pickup time-of-day slot on both frames.
for frame in (data, test_df):
    frame['pickup_timeslot'] = frame.apply(time_slot, axis=1)

In [None]:
def day_of_week(row, by='pickup'):
    """Weekday index (Monday=0 ... Sunday=6) of the row's ``<by>`` year/month/date fields."""
    calendar_day = datetime.date(
        row[f'{by}_year'],
        row[f'{by}_month'],
        row[f'{by}_date'],
    )
    return calendar_day.weekday()

In [None]:
# Weekday index of the pickup date on both frames.
for frame in (data, test_df):
    frame['day_of_week'] = frame.apply(day_of_week, axis=1)

In [None]:
def is_weekday(row, by='pickup'):
    """Return 1 when the row's 'day_of_week' value is below 5, else 0.

    ``by`` is accepted for signature parity with the other row helpers and is
    not used.
    """
    return int(row['day_of_week'] < 5)

In [None]:
# Weekday flag derived from 'day_of_week' on both frames.
for frame in (data, test_df):
    frame['is_weekday'] = frame.apply(is_weekday, axis=1)

In [None]:
def cal_time_difference(row):
    """Seconds between the pickup and drop timestamps minus the row's 'duration'.

    Parameters
    ----------
    row : mapping
        Must provide ``pickup_*`` and ``drop_*`` entries for ``year``,
        ``month``, ``date``, ``hour`` and ``minute``, plus ``'duration'``.

    Returns
    -------
    number
        ``(drop - pickup)`` in whole seconds minus ``row['duration']``.
    """
    pickup_time = datetime.datetime(
        row['pickup_year'], row['pickup_month'], row['pickup_date'],
        row['pickup_hour'], row['pickup_minute'],
    )
    drop_time = datetime.datetime(
        row['drop_year'], row['drop_month'], row['drop_date'],
        row['drop_hour'], row['drop_minute'],
    )

    delta = drop_time - pickup_time
    # timedelta.seconds only holds the sub-day remainder (0..86399), so trips
    # crossing more than 24h, or drops recorded before the pickup, were
    # misreported. total_seconds() includes the days component and the sign.
    # Both timestamps have minute resolution, so the int() conversion is exact.
    return int(delta.total_seconds()) - row['duration']

In [None]:
# Timestamp gap minus reported duration on both frames.
for frame in (data, test_df):
    frame['cal_time_difference'] = frame.apply(cal_time_difference, axis=1)

In [None]:
def encode_label(label):
    """Map 'correct' to 1 and 'incorrect' to 0; any other value is returned unchanged."""
    if label == 'correct':
        return 1
    return 0 if label == 'incorrect' else label

In [None]:
# Replace the textual labels with their numeric encoding.
data = data.assign(label=data['label'].map(encode_label))

In [None]:
# Columns written to the intermediate CSVs; the training file additionally gets 'label'.
# NOTE(review): 'drop_cluster' and the month/year parts computed above are not
# in this list - confirm that is intentional.
columns_to_save = [
    'additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
    'meter_waiting_till_pickup', 'fare',
    'pickup_date', 'pickup_hour', 'pickup_minute',
    'drop_date', 'drop_hour', 'drop_minute',
    'pick_cluster', 'is_more_than_one_day', 'distance_km', 'fare_per_km',
    'pickup_timeslot', 'day_of_week', 'is_weekday', 'cal_time_difference',
]
data[columns_to_save + ['label']].to_csv('train_df.csv', index=False)
test_df[columns_to_save].to_csv('test_df.csv', index=False)

## Shortened anomaly_detection notebook

In [None]:
# Reload the engineered frames written above, plus the submission template.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df.csv', 'test_df.csv', 'sample_submission.csv')
)

# Complete rows whose label is 1.
data = train_df.loc[train_df['label'].eq(1)].dropna()

In [None]:
def anomaly_pred(col, train_df=train_df, test_df=test_df, folds=3):
    """Add a fold-consensus IsolationForest flag for one column to both frames.

    One ``IsolationForest`` is fitted per stratified fold on that fold's
    training part of ``train_df[col]``. Every fold model then scores all train
    and all test rows, and ``'<col>_anomaly'`` is written in place to both
    frames: 1 where every fold model returned a positive prediction, 0
    otherwise. Per scikit-learn, ``predict`` returns 1 for inliers and -1 for
    outliers, so the column value 1 marks rows no model considered an outlier.

    Parameters
    ----------
    col : str
        Column to model.
    train_df, test_df : pandas.DataFrame
        Frames to read from and to extend. The defaults are the objects that
        exist when this cell is executed.
    folds : int
        Number of stratified folds. It used to be ignored in favour of a
        hard-coded 3; it now drives both the split and the consensus threshold.

    Returns
    -------
    list of float
        F1 of each fold model's clipped predictions against ``'label'`` on the
        held-out part.
    """
    labels = train_df['label'].values
    # IsolationForest expects 2-D input, so the column becomes an (n, 1) matrix.
    X_train_df = train_df[col].values.reshape(-1, 1)
    X_test_df = test_df[col].values.reshape(-1, 1)

    skf = StratifiedKFold(n_splits=folds)
    validation_scores = []
    train_preds = np.zeros(train_df.shape[0])
    test_preds = np.zeros(test_df.shape[0])

    for train_index, test_index in skf.split(X_train_df, labels):
        model = IsolationForest(random_state=0).fit(X_train_df[train_index])

        # clip(0, 1) turns the -1 / 1 predictions into 0 / 1.
        preds = model.predict(X_train_df[test_index]).clip(0, 1)
        validation_scores.append(f1_score(labels[test_index], preds))

        train_preds += model.predict(X_train_df).clip(0, 1)
        test_preds += model.predict(X_test_df).clip(0, 1)

    # The sums are whole numbers in [0, folds]; ">= folds" means all models
    # agreed (identical to the former "> 2" when folds == 3).
    train_df[f'{col}_anomaly'] = np.where(train_preds >= folds, 1, 0)
    test_df[f'{col}_anomaly'] = np.where(test_preds >= folds, 1, 0)
    return validation_scores

In [None]:
# One consensus flag per raw numeric column.
cols = [
    'fare',
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
]
for col in tqdm(cols):
    validation_scores = anomaly_pred(col)


In [None]:
def anomaly_pred_multi(cols, train_df=train_df, test_df=test_df, folds=3):
    """Add a fold-consensus IsolationForest flag for a group of columns.

    One ``IsolationForest`` is fitted per stratified fold on that fold's
    training part of ``train_df[cols]``. Every fold model then scores all
    train and all test rows, and ``'<c1>_<c2>_..._anomaly'`` (the names in
    ``cols`` joined by ``'_'``) is written in place to both frames: 1 where
    every fold model returned a positive prediction, 0 otherwise. Per
    scikit-learn, ``predict`` returns 1 for inliers and -1 for outliers.

    Parameters
    ----------
    cols : list of str
        Columns to model jointly.
    train_df, test_df : pandas.DataFrame
        Frames to read from and to extend. The defaults are the objects that
        exist when this cell is executed.
    folds : int
        Number of stratified folds. It used to be ignored in favour of a
        hard-coded 3; it now drives both the split and the consensus threshold.

    Returns
    -------
    list of float
        F1 of each fold model's clipped predictions against ``'label'`` on the
        held-out part.
    """
    labels = train_df['label'].values
    X_train_df = train_df[cols].values
    X_test_df = test_df[cols].values

    skf = StratifiedKFold(n_splits=folds)
    validation_scores = []
    train_preds = np.zeros(train_df.shape[0])
    test_preds = np.zeros(test_df.shape[0])

    for train_index, test_index in skf.split(X_train_df, labels):
        model = IsolationForest(random_state=0).fit(X_train_df[train_index])

        # clip(0, 1) turns the -1 / 1 predictions into 0 / 1.
        preds = model.predict(X_train_df[test_index]).clip(0, 1)
        validation_scores.append(f1_score(labels[test_index], preds))

        train_preds += model.predict(X_train_df).clip(0, 1)
        test_preds += model.predict(X_test_df).clip(0, 1)

    name = '_'.join(cols)
    # The sums are whole numbers in [0, folds]; ">= folds" means all models
    # agreed (identical to the former "> 2" when folds == 3).
    train_df[f'{name}_anomaly'] = np.where(train_preds >= folds, 1, 0)
    test_df[f'{name}_anomaly'] = np.where(test_preds >= folds, 1, 0)
    return validation_scores

In [None]:
cols = [
    'fare',
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
]
# Every unordered pair of columns, in list order.
for pair in itertools.combinations(cols, 2):
    validation_scores = anomaly_pred_multi(list(pair))

In [None]:
# Every unordered triple of columns, in list order.
for triple in itertools.combinations(cols, 3):
    validation_scores = anomaly_pred_multi(list(triple))

In [None]:
# Every unordered group of four columns, in list order.
for quad in itertools.combinations(cols, 4):
    validation_scores = anomaly_pred_multi(list(quad))
    

In [None]:
# Keyword arguments passed to CatBoostClassifier below.
params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
)

In [None]:
# Anomaly flags declared categorical for CatBoost: the single-column flags
# (all except 'fare_anomaly') and the 2-, 3- and 4-column group flags that do
# not involve 'fare'.
cat_features = [
    # single columns
    'additional_fare_anomaly',
    'duration_anomaly',
    'meter_waiting_anomaly',
    'meter_waiting_fare_anomaly',
    'meter_waiting_till_pickup_anomaly',
    # pairs
    'additional_fare_duration_anomaly',
    'additional_fare_meter_waiting_anomaly',
    'additional_fare_meter_waiting_fare_anomaly',
    'additional_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_anomaly',
    'duration_meter_waiting_fare_anomaly',
    'duration_meter_waiting_till_pickup_anomaly',
    'meter_waiting_meter_waiting_fare_anomaly',
    'meter_waiting_meter_waiting_till_pickup_anomaly',
    'meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    # triples
    'additional_fare_duration_meter_waiting_anomaly',
    'additional_fare_duration_meter_waiting_fare_anomaly',
    'additional_fare_duration_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_meter_waiting_fare_anomaly',
    'additional_fare_meter_waiting_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_meter_waiting_fare_anomaly',
    'duration_meter_waiting_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    # quadruples
    'additional_fare_duration_meter_waiting_meter_waiting_fare_anomaly',
    'additional_fare_duration_meter_waiting_meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
]

# Model inputs: 'fare_anomaly' followed by every categorical flag, as a separate list.
features = ['fare_anomaly', *cat_features]

In [None]:
# Separate the target, then keep only the model input columns.
labels = train_df['label'].values
train_df = train_df.drop(columns=['label'])[features]

In [None]:
# Unlabelled pools over the full test and train feature matrices.
submission_pool, train_df_pool = (
    Pool(data=frame[features], cat_features=cat_features)
    for frame in (test_df, train_df)
)

In [None]:
# 3-fold stratified cross-validation of a CatBoost classifier on the anomaly
# flag features. Each fold's model is kept, and its predictions for the full
# train and test sets are accumulated.
skf = StratifiedKFold(n_splits=3)
validation_scores = []
# NOTE(review): the three accumulators below are summed over the folds and are
# not divided by the fold count anywhere in this cell - confirm how (or
# whether) later cells consume them.
submission_preds = np.zeros(submission_df.shape[0])
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_pools = []
models = []
for train_index, test_index in skf.split(train_df, labels):
    X_train, X_test = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train, y_test = labels[train_index], labels[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)    
    model = CatBoostClassifier(**params)
    # The held-out fold doubles as the eval set (params enable early stopping).
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # `pred` is not used again inside this cell.
    pred = model.predict(test_pool)
    # Best F1 that CatBoost recorded on the eval set during training.
    validation_score = model.best_score_['validation']['F1']
    print('Validation f1',validation_score)
    validation_scores.append(validation_score)
    models.append(model)
    train_pools.append(train_pool)
    # Sum of predict() outputs on the test pool - presumably hard class
    # labels; verify against the CatBoost docs.
    submission_preds += model.predict(submission_pool)
    # Sum of the second predict_proba column on the train and test pools.
    train_preds += model.predict_proba(train_df_pool)[:,1]
    test_preds += model.predict_proba(submission_pool)[:,1]

In [None]:
# Persist both frames. At this point train_df holds only the `features`
# columns (its 'label' column was dropped in an earlier cell).
for frame, csv_name in ((train_df, 'train_df_anomaly.csv'), (test_df, 'test_df_anomaly.csv')):
    frame.to_csv(csv_name, index=False)

## Shortened noise_pre_eda notebook

In [None]:
# Start again from the engineered frames (without the anomaly columns).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train_df.csv', 'test_df.csv'))

# Complete rows whose label is 1; the regressions below are fitted on these.
data = train_df.loc[train_df['label'].eq(1)].dropna()

In [None]:
# Inputs for the 'fare' regression.
cols = ['duration','meter_waiting','meter_waiting_fare','is_more_than_one_day']
X = data[cols].values
y = data['fare'].values

# The scaler is fitted on `data` only and then reused for the full frames.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_df = scaler.transform(train_df[cols].values)
X_test_df = scaler.transform(test_df[cols].values)

In [None]:
# K-fold linear regression of 'fare'; predictions for the full train and test
# frames are averaged over the fold models.
folds = 3
kf = KFold(n_splits=folds)
validation_scores, models = [], []
train_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

for train_index, test_index in kf.split(X, y):
    X_train, y_train = X_scaled[train_index], y[train_index]
    X_test, y_test = X_scaled[test_index], y[test_index]

    model = LinearRegression().fit(X_train, y_train)
    models.append(model)

    pred = model.predict(X_test)
    score = mean_squared_error(y_test, pred) ** 0.5
    print('RMSE:', score)
    validation_scores.append(score)

    train_preds += model.predict(X_train_df)
    test_preds += model.predict(X_test_df)

train_preds /= folds
test_preds /= folds

In [None]:
# Attach the fare predictions and derive residual / ratio features on both
# frames. 'predicted_fare_diff_per_fare' used to be computed twice with the
# same expression; the redundant second assignment is gone (values and column
# order are unchanged).
for frame, fare_hat in ((train_df, train_preds), (test_df, test_preds)):
    frame['predicted_fare'] = fare_hat

    # Residual of the reported fare against the regression estimate.
    frame['predicted_fare_diff'] = frame['fare'] - frame['predicted_fare']

    # Each denominator is shifted by 1 so a value of 0 does not divide by zero.
    frame['predicted_fare_diff_per_fare'] = frame['predicted_fare_diff'] / (frame['fare']+1)
    frame['predicted_fare_diff_per_predicted_fare'] = frame['predicted_fare_diff'] / (frame['predicted_fare']+1)

    frame['fare_per_distance'] = frame['fare'] / (frame['distance_km']+1)
    frame['predicted_fare_per_distance'] = frame['predicted_fare'] / (frame['distance_km']+1)
    frame['predicted_fare_diff_per_distance'] = frame['predicted_fare_diff'] / (frame['distance_km']+1)

In [None]:
# Inputs for the 'duration' regression.
cols = ['meter_waiting','meter_waiting_fare','fare','is_more_than_one_day','cal_time_difference']
X = data[cols].values
y = data['duration'].values

# The scaler is fitted on `data` only and then reused for the full frames.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_df = scaler.transform(train_df[cols].values)
X_test_df = scaler.transform(test_df[cols].values)

In [None]:
# K-fold linear regression of 'duration'; predictions for the full train and
# test frames are averaged over the fold models.
folds = 3
kf = KFold(n_splits=folds)
validation_scores, models = [], []
train_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

for train_index, test_index in kf.split(X, y):
    X_train, y_train = X_scaled[train_index], y[train_index]
    X_test, y_test = X_scaled[test_index], y[test_index]

    model = LinearRegression().fit(X_train, y_train)
    models.append(model)

    pred = model.predict(X_test)
    score = mean_squared_error(y_test, pred) ** 0.5
    print('RMSE:', score)
    validation_scores.append(score)

    train_preds += model.predict(X_train_df)
    test_preds += model.predict(X_test_df)

train_preds /= folds
test_preds /= folds

In [None]:
# Attach the duration predictions and derive residual / ratio / speed features
# on both frames. Column names (including their spelling) are kept as-is.
for frame, duration_hat in ((train_df, train_preds), (test_df, test_preds)):
    frame['predicted_duration'] = duration_hat

    frame['predicted_duration_diff'] = frame['duration'] - frame['predicted_duration']

    # Denominators are shifted by 1 so a value of 0 does not divide by zero.
    frame['predicted_duraton_diff_per_duraton'] = frame['predicted_duration_diff'] / (frame['duration']+1)
    frame['predicted_duraton_diff_per_predicted_duration'] = frame['predicted_duration_diff'] / (frame['predicted_duration']+1)
    frame['predicted_duraton_diff_per_distance'] = frame['predicted_duration_diff'] / (frame['distance_km']+1)

    # Fare rate: observed, predicted, and their gap.
    frame['fare_per_duration'] = frame['fare'] / (frame['duration']+1)
    frame['predicted_fare_per_duration'] = frame['predicted_fare'] / (frame['predicted_duration']+1)
    frame['predicted_fare_per_duration_diff'] = frame['fare_per_duration'] - frame['predicted_fare_per_duration']

    # Distance per unit of duration: observed, predicted, and their gap.
    frame['avg_speed'] = frame['distance_km'] / (frame['duration'] + 1)
    frame['predicted_avg_speed'] = frame['distance_km'] / (frame['predicted_duration'] + 1)
    frame['predicted_avg_speed_diff'] = frame['avg_speed'] - frame['predicted_avg_speed']

In [None]:
# Inputs for the 'meter_waiting' regression.
cols = ['duration','meter_waiting_fare','fare','cal_time_difference']
X = data[cols].values
y = data['meter_waiting'].values

# The scaler is fitted on `data` only and then reused for the full frames.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_df = scaler.transform(train_df[cols].values)
X_test_df = scaler.transform(test_df[cols].values)

In [None]:
# K-fold linear regression of 'meter_waiting'; predictions for the full train
# and test frames are averaged over the fold models.
folds = 3
kf = KFold(n_splits=folds)
validation_scores, models = [], []
train_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

for train_index, test_index in kf.split(X, y):
    X_train, y_train = X_scaled[train_index], y[train_index]
    X_test, y_test = X_scaled[test_index], y[test_index]

    model = LinearRegression().fit(X_train, y_train)
    models.append(model)

    pred = model.predict(X_test)
    score = mean_squared_error(y_test, pred) ** 0.5
    print('RMSE:', score)
    validation_scores.append(score)

    train_preds += model.predict(X_train_df)
    test_preds += model.predict(X_test_df)

train_preds /= folds
test_preds /= folds

In [None]:
# Attach the meter_waiting predictions and derive residual / ratio features on
# both frames.
for frame, waiting_hat in ((train_df, train_preds), (test_df, test_preds)):
    frame['predicted_meter_waiting'] = waiting_hat

    frame['predicted_meter_waiting_diff'] = frame['meter_waiting'] - frame['predicted_meter_waiting']

    # Denominators are shifted by 1 so a value of 0 does not divide by zero.
    frame['predicted_meter_waiting_diff_per_meter_waiting'] = frame['predicted_meter_waiting_diff'] / (frame['meter_waiting'] + 1)
    frame['predicted_meter_waiting_diff_per_distance'] = frame['predicted_meter_waiting_diff'] / (frame['distance_km'] + 1)
    frame['predicted_meter_waiting_diff_per_predicted_meter_waiting'] = frame['predicted_meter_waiting_diff'] / (frame['predicted_meter_waiting'] + 1)

    # Waiting share of the trip: observed, predicted, and their gap.
    frame['meter_waiting_per_duration'] = frame['meter_waiting'] / (frame['duration']+1)
    frame['predicted_meter_waiting_per_duration'] = frame['predicted_meter_waiting'] / (frame['predicted_duration']+1)
    frame['predicted_meter_waiting_per_duration_diff'] = frame['meter_waiting_per_duration'] - frame['predicted_meter_waiting_per_duration']

In [None]:
# Inputs for the 'meter_waiting_fare' regression.
cols = ['duration','meter_waiting','fare','is_more_than_one_day','cal_time_difference']
X = data[cols].values
y = data['meter_waiting_fare'].values

# The scaler is fitted on `data` only and then reused for the full frames.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_df = scaler.transform(train_df[cols].values)
X_test_df = scaler.transform(test_df[cols].values)

In [None]:
# K-fold linear regression of 'meter_waiting_fare'; predictions for the full
# train and test frames are averaged over the fold models.
folds = 3
kf = KFold(n_splits=folds)
validation_scores, models = [], []
train_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

for train_index, test_index in kf.split(X, y):
    X_train, y_train = X_scaled[train_index], y[train_index]
    X_test, y_test = X_scaled[test_index], y[test_index]

    model = LinearRegression().fit(X_train, y_train)
    models.append(model)

    pred = model.predict(X_test)
    score = mean_squared_error(y_test, pred) ** 0.5
    print('RMSE:', score)
    validation_scores.append(score)

    train_preds += model.predict(X_train_df)
    test_preds += model.predict(X_test_df)

train_preds /= folds
test_preds /= folds

In [None]:
# Attach the meter_waiting_fare predictions and derive residual / ratio
# features on both frames.
for frame, waiting_fare_hat in ((train_df, train_preds), (test_df, test_preds)):
    frame['predicted_meter_waiting_fare'] = waiting_fare_hat

    frame['predicted_meter_waiting_fare_diff'] = frame['meter_waiting_fare'] - frame['predicted_meter_waiting_fare']

    # Denominators here are shifted by 1 so a value of 0 does not divide by zero.
    frame['predicted_meter_waiting_fare_diff_per_meter_waiting_fare'] = frame['predicted_meter_waiting_fare_diff'] / (frame['meter_waiting_fare']+1)
    frame['predicted_meter_waiting_fare_diff_per_distance'] = frame['predicted_meter_waiting_fare_diff'] / (frame['distance_km']+1)
    frame['predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare'] = frame['predicted_meter_waiting_fare_diff'] / (frame['predicted_meter_waiting_fare']+1)

    # NOTE(review): unlike the ratios above, the divisions below have no +1
    # offset, so a zero denominator yields inf/NaN in these columns - confirm
    # this is intended before changing it.
    frame['meter_waiting_fare_per_meter_waiting'] = frame['meter_waiting_fare'] / frame['meter_waiting']
    frame['predicted_meter_waiting_fare_per_meter_waiting'] = frame['predicted_meter_waiting_fare'] / frame['predicted_meter_waiting']
    frame['predicted_meter_waiting_fare_per_meter_waiting_diff'] = frame['meter_waiting_fare_per_meter_waiting'] - frame['predicted_meter_waiting_fare_per_meter_waiting']

    frame['meter_waiting_fare_per_duration'] = frame['meter_waiting_fare'] / frame['duration']
    frame['predicted_meter_waiting_fare_per_duration'] = frame['predicted_meter_waiting_fare'] / frame['predicted_duration']
    frame['predicted_meter_waiting_fare_per_duration_diff'] = frame['meter_waiting_fare_per_duration'] - frame['predicted_meter_waiting_fare_per_duration']

In [None]:
# Inputs for the 'additional_fare' regression, built from the rate features.
cols = ['meter_waiting_fare_per_duration','meter_waiting_per_duration','fare_per_duration']

# Re-select the label == 1 rows so the derived columns are present; rows with
# a NaN in any column are dropped.
data = train_df.loc[train_df['label'].eq(1)].dropna()
X = data[cols].values
y = data['additional_fare'].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The full frames can still contain non-finite ratios; nan_to_num replaces them after scaling.
X_train_df = np.nan_to_num(scaler.transform(train_df[cols].values))
X_test_df = np.nan_to_num(scaler.transform(test_df[cols].values))

In [None]:
# K-fold linear regression of 'additional_fare'; predictions for the full
# train and test frames are averaged over the fold models.
folds = 3
kf = KFold(n_splits=folds)
validation_scores, models = [], []
train_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

for train_index, test_index in kf.split(X, y):
    X_train, y_train = X_scaled[train_index], y[train_index]
    X_test, y_test = X_scaled[test_index], y[test_index]

    model = LinearRegression().fit(X_train, y_train)
    models.append(model)

    pred = model.predict(X_test)
    score = mean_squared_error(y_test, pred) ** 0.5
    print('RMSE:', score)
    validation_scores.append(score)

    train_preds += model.predict(X_train_df)
    test_preds += model.predict(X_test_df)

train_preds /= folds
test_preds /= folds

In [None]:
# Attach the fold-averaged additional-fare predictions, then derive residual
# and ratio features. The derivations run once per frame so that train and
# test always receive identical columns in identical order.
train_df['predicted_additional_fare'] = train_preds
test_df['predicted_additional_fare'] = test_preds

for frame in (train_df, test_df):
    # Residual of the observed additional fare against the prediction.
    frame['predicted_additional_fare_diff'] = frame['additional_fare'] - frame['predicted_additional_fare']
    frame['predicted_additional_fare_diff_per_additional_fare'] = frame['predicted_additional_fare_diff'] / (frame['additional_fare']+1)

    # Additional fare relative to fare, distance and duration. All ratios add
    # 1 to the denominator, so a denominator column equal to 0 divides by 1.
    frame['predicted_addtional_fare_per_fare'] = frame['predicted_additional_fare'] / (frame['predicted_fare']+1)
    frame['addtional_fare_per_fare'] = frame['additional_fare'] / (frame['fare']+1)
    frame['addtional_fare_per_distance'] = frame['additional_fare'] / (frame['distance_km']+1)
    frame['predicted_addtional_fare_per_distance'] = frame['predicted_additional_fare'] / (frame['distance_km']+1)
    frame['predicted_addtional_fare_diff_per_distance'] = frame['predicted_additional_fare_diff'] / (frame['distance_km']+1)
    frame['addtional_fare_per_duration'] = frame['additional_fare'] / (frame['duration']+1)
    frame['predicted_addtional_fare_per_duration'] = frame['predicted_additional_fare'] / (frame['predicted_duration']+1)

    # Fare with the additional fare (and then also the waiting fare) removed.
    frame['fare-additional_fare'] = frame['fare'] - frame['additional_fare']
    frame['predicted_fare-additional_fare'] = frame['predicted_fare'] - frame['predicted_additional_fare']
    frame['fare-additional_fare-meter_waiting_fare'] = frame['fare'] - (frame['additional_fare'] + frame['meter_waiting_fare'])
    frame['predicted_fare-additional_fare-meter_waiting_fare'] = frame['predicted_fare'] - (frame['predicted_additional_fare'] + frame['predicted_meter_waiting_fare'])

    # The reduced fares per unit distance / duration.
    frame['fare-additional_fare_per_distance'] = frame['fare-additional_fare'] / (frame['distance_km']+1)
    frame['predicted_fare-additional_fare_per_distance'] = frame['predicted_fare-additional_fare'] / (frame['distance_km']+1)
    frame['fare-additional_fare_per_duration'] = frame['fare-additional_fare'] / (frame['duration']+1)
    frame['predicted_fare-additional_fare_per_duration'] = frame['predicted_fare-additional_fare'] / (frame['predicted_duration']+1)
    frame['fare-additional_fare-meter_waiting_fare_per_distance'] = frame['fare-additional_fare-meter_waiting_fare'] / (frame['distance_km']+1)
    frame['predicted_fare-additional_fare-meter_waiting_fare_per_distance'] = frame['predicted_fare-additional_fare-meter_waiting_fare'] / (frame['distance_km']+1)
    frame['fare-additional_fare-meter_waiting_fare_per_duration'] = frame['fare-additional_fare-meter_waiting_fare'] / (frame['duration']+1)
    frame['predicted_fare-additional_fare-meter_waiting_fare_per_duration'] = frame['predicted_fare-additional_fare-meter_waiting_fare'] / (frame['predicted_duration']+1)

In [None]:
# Regression data for 'meter_waiting_till_pickup': complete (NaN-free) training
# rows with label == 1, using every remaining column as a predictor.
data = train_df.loc[train_df['label'] == 1].dropna()
y = data['meter_waiting_till_pickup'].values
X = data.drop(columns=['label', 'meter_waiting_till_pickup'])
cols = X.columns
X = X.values

# Scaler statistics come from the label-1 rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Full train/test matrices in the same column order; NaNs left after scaling
# are replaced by nan_to_num.
X_train_df = np.nan_to_num(scaler.transform(train_df[cols].values))
X_test_df = np.nan_to_num(scaler.transform(test_df[cols].values))

In [None]:
# Keyword arguments forwarded to CatBoostRegressor in the CV cell below.
params = dict(
    loss_function='RMSE',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='RMSE',
)

In [None]:
# K-fold CatBoost regression on the scaled matrix. The held-out fold doubles
# as CatBoost's eval_set (used with the early-stopping setting in `params`).
# Each fold's model also scores the complete train/test matrices, and those
# predictions are averaged over the folds.
folds = 3
validation_scores, models = [], []

train_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))
kf = KFold(n_splits=folds)
for train_index, test_index in kf.split(X, y):
    X_train, y_train = X_scaled[train_index], y[train_index]
    X_test, y_test = X_scaled[test_index], y[test_index]

    model = CatBoostRegressor(**params)
    model.fit(X=X_train, y=y_train, eval_set=(X_test, y_test), verbose=10)

    # RMSE on the held-out part of this fold.
    pred = model.predict(X_test)
    score = mean_squared_error(y_test, pred) ** 0.5
    print('RMSE:', score)
    validation_scores.append(score)
    models.append(model)

    # Accumulate this fold's predictions for every train/test row.
    train_preds += model.predict(X_train_df)
    test_preds += model.predict(X_test_df)

# Turn the accumulated sums into per-row means over the folds.
train_preds /= folds
test_preds /= folds

In [None]:
# Attach the fold-averaged 'meter_waiting_till_pickup' predictions, then derive
# residual and ratio features. The derivations run once per frame so that train
# and test always receive identical columns in identical order.
train_df['predicted_meter_waiting_till_pickup'] = train_preds
test_df['predicted_meter_waiting_till_pickup'] = test_preds

for frame in (train_df, test_df):
    # Residual of the observed value against the prediction.
    frame['predicted_meter_waiting_till_pickup_diff'] = frame['meter_waiting_till_pickup'] - frame['predicted_meter_waiting_till_pickup']
    frame['predicted_meter_waiting_till_pickup_diff_per_meter_waiting_till_pickup'] = frame['predicted_meter_waiting_till_pickup_diff'] / (frame['meter_waiting_till_pickup']+1)

    # Share of the total meter waiting that happened before pickup.
    frame['meter_waiting_till_pickup_per_meter_waiting'] = frame['meter_waiting_till_pickup'] / (frame['meter_waiting'] + 1)
    frame['predicted_meter_waiting_till_pickup_per_meter_waiting'] = frame['predicted_meter_waiting_till_pickup'] / (frame['predicted_meter_waiting'] + 1)
    frame['predicted_meter_waiting_till_pickup_per_meter_waiting_diff'] = frame['meter_waiting_till_pickup_per_meter_waiting'] - frame['predicted_meter_waiting_till_pickup_per_meter_waiting']

    # Remaining meter waiting once the till-pickup part is subtracted.
    frame['meter_waiting_after_pickup'] = frame['meter_waiting'] - frame['meter_waiting_till_pickup']
    frame['predicted_meter_waiting_after_pickup'] = frame['predicted_meter_waiting'] - frame['predicted_meter_waiting_till_pickup']

    # Both waiting components per unit duration, distance, fare and waiting
    # fare. All ratios add 1 to the denominator.
    frame['meter_waiting_after_pickup_per_duration'] = frame['meter_waiting_after_pickup'] / (frame['duration'] + 1)
    frame['predicted_meter_waiting_after_pickup_per_duration'] = frame['predicted_meter_waiting_after_pickup'] / (frame['predicted_duration'] + 1)
    frame['meter_waiting_till_pickup_per_duration'] = frame['meter_waiting_till_pickup'] / (frame['duration'] + 1)
    frame['predicted_meter_waiting_till_pickup_per_duration'] = frame['predicted_meter_waiting_till_pickup'] / (frame['predicted_duration'] + 1)

    frame['meter_waiting_till_pickup_per_distance'] = frame['meter_waiting_till_pickup'] / (frame['distance_km'] + 1)
    frame['predicted_meter_waiting_till_pickup_per_distance'] = frame['predicted_meter_waiting_till_pickup'] / (frame['distance_km'] + 1)
    frame['meter_waiting_after_pickup_per_distance'] = frame['meter_waiting_after_pickup'] / (frame['distance_km'] + 1)
    frame['predicted_meter_waiting_after_pickup_per_distance'] = frame['predicted_meter_waiting_after_pickup'] / (frame['distance_km'] + 1)

    frame['meter_waiting_till_pickup_per_fare'] = frame['meter_waiting_till_pickup'] / (frame['fare'] + 1)
    frame['predicted_meter_waiting_till_pickup_per_fare'] = frame['predicted_meter_waiting_till_pickup'] / (frame['predicted_fare'] + 1)
    frame['meter_waiting_after_pickup_per_fare'] = frame['meter_waiting_after_pickup'] / (frame['fare'] + 1)
    frame['predicted_meter_waiting_after_pickup_per_fare'] = frame['predicted_meter_waiting_after_pickup'] / (frame['predicted_fare'] + 1)

    frame['meter_waiting_till_pickup_per_meter_waiting_fare'] = frame['meter_waiting_till_pickup'] / (frame['meter_waiting_fare'] + 1)
    frame['predicted_meter_waiting_till_pickup_per_meter_waiting_fare'] = frame['predicted_meter_waiting_till_pickup'] / (frame['predicted_meter_waiting_fare'] + 1)
    frame['meter_waiting_after_pickup_per_meter_waiting_fare'] = frame['meter_waiting_after_pickup'] / (frame['meter_waiting_fare'] + 1)
    frame['predicted_meter_waiting_after_pickup_per_meter_waiting_fare'] = frame['predicted_meter_waiting_after_pickup'] / (frame['predicted_meter_waiting_fare'] + 1)

In [None]:
# Anomaly columns are read from CSV files; the code that writes these files is
# not part of this section of the notebook.
train_anomaly = pd.read_csv('train_df_anomaly.csv')
test_anomaly = pd.read_csv('test_df_anomaly.csv')

In [None]:
# Names of the anomaly columns taken from the anomaly CSVs: the standalone
# 'fare_anomaly' column followed by one column per combination of 1 to 4 of
# the base features below (the 5-feature combination is not included).
_anomaly_base_features = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
]
anomaly_columns = ['fare_anomaly'] + [
    '_'.join(combo) + '_anomaly'
    for size in range(1, 5)
    for combo in itertools.combinations(_anomaly_base_features, size)
]

In [None]:
# Store "1 - value" of every anomaly column on the matching feature frame.
for frame, anomaly_frame in ((train_df, train_anomaly), (test_df, test_anomaly)):
    for col in anomaly_columns:
        frame[col] = 1 - anomaly_frame[col]

In [None]:
# Maps an anomaly column to the residual-style columns it is multiplied with
# in the next cell. The misspelled names ('addtional', 'duraton') are the
# actual column names and must stay as they are.
anomaly_multiplicatives = {
    'fare_anomaly':[
        'predicted_fare_diff',
        'predicted_fare_diff_per_fare',
        'predicted_fare_diff_per_distance',
    ],
    'additional_fare_anomaly':[
        'predicted_additional_fare_diff',
        'predicted_additional_fare_diff_per_additional_fare',
        # NOTE(review): every other group ends with a '*_diff_per_distance'
        # column; this one has no 'diff'. A 'predicted_addtional_fare_diff_per_distance'
        # column also exists - confirm which one was intended.
        'predicted_addtional_fare_per_distance',
    ],
    'duration_anomaly':[
        'predicted_duration_diff', 
        'predicted_duraton_diff_per_duraton',
        'predicted_duraton_diff_per_distance', 
    ],
    'meter_waiting_anomaly':[
        'predicted_meter_waiting_diff',
        'predicted_meter_waiting_diff_per_meter_waiting',
        'predicted_meter_waiting_diff_per_distance'
    ],
    'meter_waiting_fare_anomaly':[
        'predicted_meter_waiting_fare_diff',
        'predicted_meter_waiting_fare_diff_per_meter_waiting_fare',
        'predicted_meter_waiting_fare_diff_per_distance'
    ]
    
}

In [None]:
# One '<anomaly>_<column>_prod' feature per (anomaly column, partner column)
# pair, written to both frames; the created names are collected in new_cols.
new_cols = []
for col1, partner_cols in anomaly_multiplicatives.items():
    for col2 in partner_cols:
        name = f'{col1}_{col2}_prod'
        for frame in (train_df, test_df):
            frame[name] = frame[col1] * frame[col2]
        new_cols.append(name)

In [None]:
# Product of the two relative duration residuals, for both frames.
for frame in (train_df, test_df):
    frame['predicted_duraton_diff_per_duraton@predicted_duraton_diff_per_distance'] = (
        frame['predicted_duraton_diff_per_duraton'] * frame['predicted_duraton_diff_per_distance']
    )

In [None]:
def normalize_diff(col_name):
    """Add a standardised copy of `col_name` to train_df and test_df.

    A StandardScaler is fitted on the training rows with label == 1 and then
    applied to the whole column of each frame. The result is written to
    f'{col_name}_normalized'. Returns None.
    """
    label_one_rows = train_df[train_df['label'] == 1]
    normalizer = StandardScaler().fit(label_one_rows[col_name].values.reshape(-1,1))

    target = f'{col_name}_normalized'
    for frame in (train_df, test_df):
        frame[target] = normalizer.transform(frame[col_name].values.reshape(-1,1))

In [None]:
# Residual-style columns that receive a '<name>_normalized' companion column.
# The misspelled names ('duraton') are the actual column names.
diff_cols = [
    'predicted_fare_diff',
    'predicted_fare_diff_per_fare',
    'predicted_fare_diff_per_predicted_fare', 
    'predicted_fare_diff_per_distance',
    'predicted_duraton_diff_per_duraton',
    'predicted_duraton_diff_per_predicted_duration', 
    'predicted_fare_per_duration_diff',
    'predicted_avg_speed_diff',
    'predicted_meter_waiting_diff',
    'predicted_meter_waiting_diff_per_meter_waiting',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting',
    'predicted_meter_waiting_per_duration_diff',
    'predicted_meter_waiting_fare_diff',
    'predicted_meter_waiting_fare_diff_per_meter_waiting_fare',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare',
    'predicted_meter_waiting_fare_per_meter_waiting_diff',
    'predicted_meter_waiting_fare_per_duration_diff',
    'predicted_additional_fare_diff',
    'predicted_additional_fare_diff_per_additional_fare'
]
# normalize_diff writes the new column into train_df and test_df in place.
for col in diff_cols:
    normalize_diff(col)

In [None]:
def mean_value(col):
    """Build a row-wise lookup for the label-1 group mean of `col`.

    The mean of `col` is taken over training rows with label == 1, grouped by
    ('pick_cluster', 'pickup_timeslot').

    Parameters
    ----------
    col : str
        Column of `train_df` to average.

    Returns
    -------
    callable
        f(row) -> group mean for the row's (pick_cluster, pickup_timeslot)
        pair. Pairs without any label-1 training row yield NaN; the chained
        Series lookup used before raised KeyError for them.
    """
    grouping_order = ['pick_cluster','pickup_timeslot']
    group = train_df[train_df['label'] == 1].groupby(grouping_order)[col].mean()
    # A plain dict keyed by (pick_cluster, pickup_timeslot) tuples gives a
    # constant-time lookup per row and a natural default for unseen pairs.
    lookup = group.to_dict()
    def f(row):
        return lookup.get((row['pick_cluster'], row['pickup_timeslot']), np.nan)
    return f

In [None]:
def mean_encoding(col):
    """Add group-mean features for `col` to train_df and test_df.

    Creates f'{col}_mean' (value returned by the mean_value(col) lookup for
    each row) and f'{col}_mean_diff' (that mean minus the row's own value) on
    both frames. Returns None.
    """
    # Build the lookup once: it depends only on `col` in the label-1 training
    # rows, so the second frame can re-use it instead of repeating the groupby.
    lookup = mean_value(col)
    mean_name = f'{col}_mean'
    train_df[mean_name] = train_df.apply(lookup, axis=1)
    test_df[mean_name] = test_df.apply(lookup, axis=1)

    train_df[f'{col}_mean_diff'] = train_df[mean_name] - train_df[col]
    test_df[f'{col}_mean_diff'] = test_df[mean_name] - test_df[col]

In [None]:
# Columns that receive '<name>_mean' and '<name>_mean_diff' features.
# The misspelled names ('addtional') are the actual column names.
mean_cols = [
    'fare_per_distance',
    'avg_speed', 
    'meter_waiting_per_duration', 
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance', 
    'addtional_fare_per_duration'
]
# mean_encoding writes the new columns into train_df and test_df in place.
for col in mean_cols:
    mean_encoding(col)

In [None]:
# Residual columns that are turned into '<name>_bucket' columns below.
# Note: this rebinds `cols`, which earlier held the regression feature columns.
cols = [
    'predicted_fare_diff',
    'predicted_duration_diff',
    'predicted_meter_waiting_diff',
    'predicted_meter_waiting_fare_diff',
    'predicted_additional_fare_diff',    
]

In [None]:
def col_bucket(column):
    """Add an integer bucket column derived from `column` to both frames.

    bucket = round(value / std + 1) cast to int, where std is the standard
    deviation of `column` over the training rows with label == 1. The result
    is stored as f'{column}_bucket' on train_df and test_df. Returns None.
    """
    label_one_rows = train_df[train_df['label']==1]
    std = label_one_rows[column].std()
    name = f'{column}_bucket'
    for frame in (train_df, test_df):
        frame[name] = np.round((frame[column]/std)+1).astype(int)

In [None]:
# Create the '<column>_bucket' column for every entry of `cols`.
for each in cols:
    col_bucket(each)

In [None]:
# Maps a multiplier column to the columns it is multiplied with in the next
# cell (results are named '<key>@<column>'). The misspelled names
# ('addtional') are the actual column names.
bin_multiplicatives = {
    'predicted_fare_diff_bucket':[
        'fare',
        'predicted_fare',
        'fare_per_distance',
        'predicted_fare_per_distance',         
    ],
    'predicted_duration_diff_bucket':[
        'duration',
        'predicted_duration',
        'avg_speed', 
        'predicted_avg_speed',         
    ],
    'predicted_meter_waiting_diff_bucket':[
        'meter_waiting', 
        'predicted_meter_waiting', 
        'meter_waiting_per_duration', 
        'predicted_meter_waiting_per_duration',
    ],
    'predicted_meter_waiting_fare_diff_bucket':[
        'meter_waiting_fare',
        'predicted_meter_waiting_fare',
        'meter_waiting_fare_per_meter_waiting',
        'predicted_meter_waiting_fare_per_meter_waiting',
        'meter_waiting_fare_per_duration',
        'predicted_meter_waiting_fare_per_duration',
    ],
    # NOTE(review): unlike the four keys above this key has no '_bucket'
    # suffix, so the raw residual is used as multiplier although
    # 'predicted_additional_fare_diff_bucket' is created by col_bucket.
    # Confirm whether that is intended; renaming would change the names of
    # the generated '@' columns.
    'predicted_additional_fare_diff':[
        'additional_fare',
        'predicted_additional_fare', 
        'predicted_addtional_fare_per_fare', 
        'addtional_fare_per_fare',
        'addtional_fare_per_distance', 
        'predicted_addtional_fare_per_distance',
        'addtional_fare_per_duration', 
        'predicted_addtional_fare_per_duration',
    ]
    
}

In [None]:
# One '<multiplier>@<column>' feature per pair listed in bin_multiplicatives,
# written to both frames.
for bucket, partner_cols in tqdm(bin_multiplicatives.items()):
    for col in partner_cols:
        name = f'{bucket}@{col}'
        for frame in (train_df, test_df):
            frame[name] = frame[bucket] * frame[col]

In [None]:
# Persist the engineered frames; the model sections below reload these files.
train_df.to_csv('train_df_final.csv',index=False)
test_df.to_csv('test_df_final.csv',index=False)

# Model 1

In [None]:
# Reload the engineered frames for Model 1; missing values become 0.
train_df = pd.read_csv('train_df_final.csv').fillna(0)
test_df = pd.read_csv('test_df_final.csv').fillna(0)
submission_df = pd.read_csv('sample_submission.csv')

In [None]:
# Target as a NumPy array (y is rebound to the 'label' Series further down).
y = train_df['label'].values

In [None]:
# Columns passed to CatBoost as categorical features.
original_cat_cols = [
    'pickup_hour',
    'drop_hour',
#     'pick_cluster'
]

In [None]:
# Numeric input columns of Model 1 (categorical and interaction columns are
# appended in later cells). The misspelled names ('addtional') are the actual
# column names.
features = [
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare',    
    'meter_waiting_till_pickup'
]

# Filled with the categorical column names in a later cell.
cat_features = []

In [None]:
# Keyword arguments forwarded to CatBoostClassifier in the CV cell below.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Append the categorical columns to both lists. Only names that are not yet
# present are added, so re-running this cell cannot introduce duplicate
# column names (a plain `+=` appended them again on every execution).
features.extend(c for c in original_cat_cols if c not in features)
cat_features.extend(c for c in original_cat_cols if c not in cat_features)

In [None]:
# Feature matrices and target for the current feature list. The same three
# assignments are repeated after the interaction columns have been appended.
train = train_df[features]
test = test_df[features]
y = train_df['label']

In [None]:
# Numeric columns that are multiplied with every categorical column.
neumeric_cols_to_multiply = [
    'fare_per_distance',
    'avg_speed',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_till_pickup'
]

# One '<categorical>@<numeric>' product per pair, written to both frames; the
# created names are collected in encoding_cols.
encoding_cols = []
for col1, col2 in itertools.product(original_cat_cols, neumeric_cols_to_multiply):
    name = f'{col1}@{col2}'
    for frame in (train_df, test_df):
        frame[name] = frame[col1] * frame[col2]
    encoding_cols.append(name)

In [None]:
# Append the interaction columns; names already present are skipped so that
# re-running this cell does not duplicate them (a plain `+=` did).
features.extend(c for c in encoding_cols if c not in features)

In [None]:
# Final Model 1 matrices; y is the 'label' Series (not a NumPy array).
train = train_df[features]
test = test_df[features]
y = train_df['label']

In [None]:
# CatBoost Pool over the test rows; every fold model predicts on it.
submission_pool = Pool(data=test_df[features], cat_features=cat_features)

In [None]:
# Stratified K-fold training of Model 1.
#   train_preds / train_class : out-of-fold probability / class per training row
#   test_preds                : fold-averaged probability per test row
#   test_class                : number of fold models predicting class 1, turned
#                               into the final 0/1 prediction after the loop
# The fold count is defined once; it was previously repeated as the literals
# 3 (n_splits, probability average) and 2 (vote threshold).
n_splits = 3
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
models = []
train_pools = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select positionally; plain y[...] on a
    # Series is label-based and only agrees with positions for a default index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    train_pools.append(train_pool)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out classes once and re-use them for the out-of-fold
    # array and the validation score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    models.append(model)
# NOTE(review): a test row is labelled 1 only if every fold model predicted 1
# (vote sum > n_splits - 1). If a majority vote was intended the threshold
# would be n_splits / 2 - confirm before changing.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
# Write the Model 1 class predictions to 'submission.csv'.
submission_df['prediction'] = test_class
submission_df.to_csv('submission.csv',index=False)

# Model 2

In [None]:
# Reload the engineered frames for Model 2; missing values become 0.
train_df = pd.read_csv('train_df_final.csv').fillna(0)
test_df = pd.read_csv('test_df_final.csv').fillna(0)
submission_df = pd.read_csv('sample_submission.csv')

# Target as a NumPy array (y is rebound to the 'label' Series further down).
y = train_df['label'].values

In [None]:
# Columns passed to CatBoost as categorical features.
original_cat_cols = [
    'pickup_hour',
    'drop_hour',
#     'pick_cluster'
]

In [None]:
# Input columns of Model 2: the Model 1 ratio features, four raw trip columns
# and four prediction-residual columns (categorical, interaction and special
# columns are appended in later cells). The misspelled names ('addtional') are
# the actual column names.
features = [
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare',    
    'meter_waiting_till_pickup',
    
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    
    'predicted_duration_diff',
    'predicted_fare_diff_per_predicted_fare',
    'predicted_fare_diff_per_fare',
    'predicted_fare_diff_per_distance',
]

# Filled with the categorical column names in a later cell.
cat_features = []

In [None]:
# Keyword arguments forwarded to CatBoostClassifier in the CV cell below.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Append the categorical columns to both lists. Only names that are not yet
# present are added, so re-running this cell cannot introduce duplicate
# column names (a plain `+=` appended them again on every execution).
features.extend(c for c in original_cat_cols if c not in features)
cat_features.extend(c for c in original_cat_cols if c not in cat_features)

In [None]:
# Numeric columns that are multiplied with every categorical column.
neumeric_cols_to_multiply = [
    'fare_per_distance',
#     'fare_per_duration',
#     'fare-additional_fare_per_duration',
    'avg_speed',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_till_pickup'
]

# One '<categorical>@<numeric>' product per pair, written to both frames; the
# created names are collected in encoding_cols.
encoding_cols = []
for col1, col2 in itertools.product(original_cat_cols, neumeric_cols_to_multiply):
    name = f'{col1}@{col2}'
    for frame in (train_df, test_df):
        frame[name] = frame[col1] * frame[col2]
    encoding_cols.append(name)

In [None]:
# Extra hand-made feature: (pickup_timeslot + 1) * distance_km on both frames.
special_features = ['pickup_timeslot@distance']
for frame in (train_df, test_df):
    frame['pickup_timeslot@distance'] = (frame['pickup_timeslot']+1) * frame['distance_km']

In [None]:
# Append the interaction and special columns; names already present are
# skipped so that re-running this cell does not duplicate them (a plain `+=`
# did).
features.extend(c for c in encoding_cols if c not in features)
features.extend(c for c in special_features if c not in features)

In [None]:
# Final Model 2 matrices; y is the 'label' Series (not a NumPy array).
train = train_df[features]
test = test_df[features]
y = train_df['label']

In [None]:
# CatBoost Pool over the test rows; every fold model predicts on it.
submission_pool = Pool(data=test_df[features], cat_features=cat_features)


In [None]:
# Stratified K-fold training of Model 2.
#   train_preds / train_class : out-of-fold probability / class per training row
#   test_preds                : fold-averaged probability per test row
#   test_class                : number of fold models predicting class 1, turned
#                               into the final 0/1 prediction after the loop
# The fold count is defined once; it was previously repeated as the literals
# 3 (n_splits, probability average) and 2 (vote threshold).
n_splits = 3
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
models = []
train_pools = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select positionally; plain y[...] on a
    # Series is label-based and only agrees with positions for a default index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    train_pools.append(train_pool)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out classes once and re-use them for the out-of-fold
    # array and the validation score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    models.append(model)
# NOTE(review): a test row is labelled 1 only if every fold model predicted 1
# (vote sum > n_splits - 1). If a majority vote was intended the threshold
# would be n_splits / 2 - confirm before changing.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
# Write the Model 2 class predictions.
# NOTE(review): this is the same path the Model 1 section writes to, so running
# both sections leaves only the Model 2 predictions in 'submission.csv'.
submission_df['prediction'] = test_class
submission_df.to_csv('submission.csv',index=False)

# Model 3

In [None]:
def lgb_f1_score(y_hat, data):
    """Micro-averaged F1 of rounded predictions against the dataset labels.

    `data` must provide get_label(). Returns the tuple ('f1', score, True);
    the trailing True is presumably the "higher is better" flag of a LightGBM
    custom metric - confirm against the caller.
    """
    labels = data.get_label()
    # f1_score needs hard class labels, so round the predicted scores first.
    hard_preds = np.round(y_hat)
    score = f1_score(labels, hard_preds, average='micro')
    return 'f1', score, True

def f1_eval(y_pred, dtrain):
    """Return ('f1_err', 1 - micro-averaged F1) for rounded predictions.

    `dtrain` must expose `get_label()`.
    NOTE(review): presumably an xgboost-style eval function (lower is better) - confirm.
    """
    labels = dtrain.get_label()
    micro_f1 = f1_score(labels, np.round(y_pred), average='micro')
    return 'f1_err', 1 - micro_f1

In [None]:
# Reload the engineered feature tables; missing values are replaced with 0.
train_df = pd.read_csv('train_df_final.csv').fillna(0)
test_df = pd.read_csv('test_df_final.csv').fillna(0)
submission_df = pd.read_csv('sample_submission.csv')

In [None]:
# Target as a plain array.
y = train_df.loc[:, 'label'].values

In [None]:
# Containers for the per-model prediction columns (train side also keeps the label).
model_predictions_test = pd.DataFrame()
model_predictions_train = pd.DataFrame({'label': train_df['label']})

In [None]:
# Feature set for the anomaly-flag models: every `*_anomaly` column listed below.
features = [
    'fare_anomaly',
    'additional_fare_anomaly',
    'duration_anomaly',
    'meter_waiting_anomaly',
    'meter_waiting_fare_anomaly',
    'meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_anomaly',
    'additional_fare_meter_waiting_anomaly',
    'additional_fare_meter_waiting_fare_anomaly',
    'additional_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_anomaly',
    'duration_meter_waiting_fare_anomaly',
    'duration_meter_waiting_till_pickup_anomaly',
    'meter_waiting_meter_waiting_fare_anomaly',
    'meter_waiting_meter_waiting_till_pickup_anomaly',
    'meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_meter_waiting_anomaly',
    'additional_fare_duration_meter_waiting_fare_anomaly',
    'additional_fare_duration_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_meter_waiting_fare_anomaly',
    'additional_fare_meter_waiting_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_meter_waiting_fare_anomaly',
    'duration_meter_waiting_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_meter_waiting_meter_waiting_fare_anomaly',
    'additional_fare_duration_meter_waiting_meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
]

# Every feature in this set is categorical (same names, same order).
cat_features = list(features)

y = train_df['label']

# Cast the categorical columns to int while building the frames. `astype`
# returns new DataFrames, so nothing is assigned into a slice of
# train_df/test_df (which raised SettingWithCopyWarning before).
_cat_dtypes = {col: int for col in cat_features}
train = train_df[features].astype(_cat_dtypes)
test = test_df[features].astype(_cat_dtypes)
    

In [None]:
# CatBoost settings: Logloss objective, F1 as eval metric, early stopping
# after 50 rounds.
catboost_params = {
    'loss_function':'Logloss',
    'random_state':0,
    'early_stopping_rounds':50,
    'eval_metric':'F1',
}

# Build the test pool from `test`, whose categorical columns were cast to int,
# i.e. the same representation the training folds use. The raw
# `test_df[features]` slice bypasses that cast, and CatBoost rejects
# float-valued categorical features.
submission_pool = Pool(data=test, cat_features=cat_features)

# Stratified CV:
#   train_preds / train_class : out-of-fold probability / class per training row
#   test_preds                : positive-class probability averaged over fold models
#   test_class                : count of fold models voting positive, thresholded below
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
n_folds = skf.get_n_splits()
validation_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF classes and the fold score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_folds
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    models.append(model)
# Same cut-off as the former hard-coded `> 2` for 3 folds.
test_class = np.where(test_class > n_folds - 1, 1, 0)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'catboost_anomaly'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# LightGBM settings for the anomaly-flag model.
# NOTE(review): 'f1' is not one of LightGBM's built-in metric names; scoring
# appears to come from the custom `feval` given to lgb.train - confirm.
lgb_params = dict(
    objective='binary',
    learning_rate=0.05,
    seed=0,
    metric='f1',
    max_depth=6,
)

In [None]:
# Stratified CV for LightGBM: out-of-fold probabilities for the training rows
# (train_preds) and fold-averaged probabilities for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_data = lgb.Dataset(X_train,y_train)
    valid_data = lgb.Dataset(X_test,y_test)
    evals_result = {}
    model = lgb.train(lgb_params, train_data,num_boost_round=1000,early_stopping_rounds=50, valid_sets=valid_data,feval=lgb_f1_score, evals_result=evals_result,verbose_eval=False)

    # Score the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test) / folds
    validation_scores.append(f1_score(y_test,np.round(fold_preds),average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'lgb_anomaly'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# k-NN settings for the anomaly-flag model.
knn_params = dict(n_neighbors=15, weights='uniform')

In [None]:
# Stratified CV for k-NN: out-of-fold predicted labels for the training rows
# (train_preds) and fold-averaged predicted labels for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = KNeighborsClassifier(**knn_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'knn_anomaly'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Random-forest settings for the anomaly-flag model.
rf_params = dict(n_estimators=50, max_depth=10, random_state=0)

In [None]:
# Stratified CV for the random forest: out-of-fold predicted labels for the
# training rows (train_preds) and fold-averaged predicted labels for the test
# rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = RandomForestClassifier(**rf_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)


In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'rf_anomaly'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Linear-kernel SVC settings for the anomaly-flag model.
svc_params = dict(
    C=2,
    kernel='linear',
    random_state=0,
    probability=False,
    gamma='scale',
)

In [None]:
# Stratified CV for the SVC: out-of-fold predicted labels for the training rows
# (train_preds) and fold-averaged predicted labels for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'svc_linear_anomaly'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# RBF-kernel SVC settings for the anomaly-flag model.
svc_params = dict(
    C=2,
    kernel='rbf',
    random_state=0,
    probability=False,
    gamma='scale',
)

In [None]:
# Stratified CV for the SVC: out-of-fold predicted labels for the training rows
# (train_preds) and fold-averaged predicted labels for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'svc_rbf_anomaly'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Feature set for the "base" models: raw trip fields, time parts, ratio
# features and the `*_mean` / `*_mean_diff` columns.
features = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'fare',
    'pickup_date',
    'pickup_hour',
    'pickup_minute',
    'drop_date',
    'drop_hour',
    'drop_minute',
    'pick_cluster',
    'is_more_than_one_day',
    'distance_km',
    'fare_per_km',
    'pickup_timeslot',
    'day_of_week',
    'is_weekday',
    'cal_time_difference',
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare',
    'fare-additional_fare-meter_waiting_fare',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare',
    'fare_per_distance_mean',
    'fare_per_distance_mean_diff',
    'avg_speed_mean',
    'avg_speed_mean_diff',
    'meter_waiting_per_duration_mean',
    'meter_waiting_per_duration_mean_diff',
    'meter_waiting_fare_per_meter_waiting_mean',
    'meter_waiting_fare_per_meter_waiting_mean_diff',
    'meter_waiting_fare_per_duration_mean',
    'meter_waiting_fare_per_duration_mean_diff',
    'addtional_fare_per_fare_mean',
    'addtional_fare_per_fare_mean_diff',
    'addtional_fare_per_distance_mean',
    'addtional_fare_per_distance_mean_diff',
    'addtional_fare_per_duration_mean',
    'addtional_fare_per_duration_mean_diff',
]

# Columns handed to CatBoost as categorical.
cat_features = [
    'pickup_date',
    'pickup_hour',
    'pickup_minute',
    'drop_date',
    'drop_hour',
    'drop_minute',
    'pick_cluster',
    'is_more_than_one_day',
    'pickup_timeslot',
    'day_of_week',
    'is_weekday',
]

y = train_df['label']

# Cast the categorical columns to int while building the frames. `astype`
# returns new DataFrames, so nothing is assigned into a slice of
# train_df/test_df (which raised SettingWithCopyWarning before).
_cat_dtypes = {col: int for col in cat_features}
train = train_df[features].astype(_cat_dtypes)
test = test_df[features].astype(_cat_dtypes)

In [None]:
# CatBoost settings for the base-feature model.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Build the test pool from `test`, whose categorical columns were cast to int,
# rather than the raw `test_df[features]` slice: this matches the frames the
# fold models are trained on, and CatBoost rejects float-valued categorical
# features.
submission_pool = Pool(data=test, cat_features=cat_features)

In [None]:
# Stratified CV for CatBoost.
#   train_preds / train_class : out-of-fold probability / class per training row
#   test_preds                : positive-class probability averaged over fold models
#   test_class                : count of fold models voting positive, thresholded below
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
n_folds = skf.get_n_splits()
validation_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF classes and the fold score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_folds
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    models.append(model)
# Same cut-off as the former hard-coded `> 2` for 3 folds.
test_class = np.where(test_class > n_folds - 1, 1, 0)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'catboost_base'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# LightGBM settings for the base-feature model.
# NOTE(review): 'f1' is not one of LightGBM's built-in metric names; scoring
# appears to come from the custom `feval` given to lgb.train - confirm.
lgb_params = dict(
    objective='binary',
    learning_rate=0.05,
    seed=0,
    metric='f1',
    max_depth=6,
)

In [None]:
# Stratified CV for LightGBM: out-of-fold probabilities for the training rows
# (train_preds) and fold-averaged probabilities for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_data = lgb.Dataset(X_train,y_train)
    valid_data = lgb.Dataset(X_test,y_test)
    evals_result = {}
    model = lgb.train(lgb_params, train_data,num_boost_round=1000,early_stopping_rounds=50, valid_sets=valid_data,feval=lgb_f1_score, evals_result=evals_result,verbose_eval=False)

    # Score the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test) / folds
    validation_scores.append(f1_score(y_test,np.round(fold_preds),average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'lgb_base'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Feature set for the "pred_diff" models: the `predicted_*_diff*` columns,
# including their `*_normalized` variants.
features = [
    'predicted_fare_diff',
    'predicted_fare_diff_per_fare',
    'predicted_fare_diff_per_predicted_fare',
    'predicted_fare_diff_per_distance',
    'predicted_duration_diff',
    'predicted_duraton_diff_per_duraton',
    'predicted_duraton_diff_per_predicted_duration',
    'predicted_duraton_diff_per_distance',
    'predicted_fare_per_duration_diff',
    'predicted_avg_speed_diff',
    'predicted_meter_waiting_diff',
    'predicted_meter_waiting_diff_per_meter_waiting',
    'predicted_meter_waiting_diff_per_distance',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting',
    'predicted_meter_waiting_per_duration_diff',
    'predicted_meter_waiting_fare_diff',
    'predicted_meter_waiting_fare_diff_per_distance',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare',
    'predicted_meter_waiting_fare_per_meter_waiting_diff',
    'predicted_meter_waiting_fare_per_duration_diff',
    'predicted_additional_fare_diff',
    'predicted_additional_fare_diff_per_additional_fare',
    'predicted_addtional_fare_diff_per_distance',
    'predicted_meter_waiting_till_pickup_diff',
    'predicted_meter_waiting_till_pickup_diff_per_meter_waiting_till_pickup',
    'predicted_meter_waiting_till_pickup_per_meter_waiting_diff',
    'predicted_fare_diff_per_distance_normalized',
    'predicted_fare_diff_normalized',
    'predicted_fare_diff_per_fare_normalized',
    'predicted_fare_diff_per_predicted_fare_normalized',
    'predicted_duraton_diff_per_duraton_normalized',
    'predicted_duraton_diff_per_predicted_duration_normalized',
    'predicted_fare_per_duration_diff_normalized',
    'predicted_avg_speed_diff_normalized',
    'predicted_meter_waiting_diff_normalized',
    'predicted_meter_waiting_diff_per_meter_waiting_normalized',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting_normalized',
    'predicted_meter_waiting_per_duration_diff_normalized',
    'predicted_meter_waiting_fare_diff_normalized',
    'predicted_meter_waiting_fare_diff_per_meter_waiting_fare_normalized',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare_normalized',
    'predicted_meter_waiting_fare_per_meter_waiting_diff_normalized',
    'predicted_meter_waiting_fare_per_duration_diff_normalized',
    'predicted_additional_fare_diff_normalized',
    'predicted_additional_fare_diff_per_additional_fare_normalized',
]

# No categorical columns in this feature set.
cat_features = []

y = train_df['label']
train, test = train_df[features], test_df[features]
# Integer cast for categorical columns (no-op while cat_features is empty).
for each in cat_features:
    test[each] = test[each].values.astype(int)
    train[each] = train[each].values.astype(int)

In [None]:
# CatBoost settings for the pred_diff model.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# CatBoost pool over the test rows (`test` is the `features` slice of test_df).
submission_pool = Pool(data=test, cat_features=cat_features)

In [None]:
# Stratified CV for CatBoost.
#   train_preds / train_class : out-of-fold probability / class per training row
#   test_preds                : positive-class probability averaged over fold models
#   test_class                : count of fold models voting positive, thresholded below
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
n_folds = skf.get_n_splits()
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF classes and the fold score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_folds
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    models.append(model)
# Same cut-off as the former hard-coded `> 2` for 3 folds.
test_class = np.where(test_class > n_folds - 1, 1, 0)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'catboost_pred_diff'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# LightGBM settings for the pred_diff model.
# NOTE(review): 'f1' is not one of LightGBM's built-in metric names; scoring
# appears to come from the custom `feval` given to lgb.train - confirm.
lgb_params = dict(
    objective='binary',
    learning_rate=0.05,
    seed=0,
    metric='f1',
    max_depth=6,
)

In [None]:
# Stratified CV for LightGBM: out-of-fold probabilities for the training rows
# (train_preds) and fold-averaged probabilities for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_data = lgb.Dataset(X_train,y_train)
    valid_data = lgb.Dataset(X_test,y_test)
    evals_result = {}
    model = lgb.train(lgb_params, train_data,num_boost_round=1000,early_stopping_rounds=50, valid_sets=valid_data,feval=lgb_f1_score, evals_result=evals_result,verbose_eval=False)

    # Score the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test) / folds
    validation_scores.append(f1_score(y_test,np.round(fold_preds),average='micro'))
    models.append(model)


In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'lgb_pred_diff'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# k-NN settings for the pred_diff model.
knn_params = dict(n_neighbors=10, weights='uniform')

In [None]:
# Stratified CV for k-NN: out-of-fold predicted labels for the training rows
# (train_preds) and fold-averaged predicted labels for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = KNeighborsClassifier(**knn_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'knn_pred_diff'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# RBF-kernel SVC settings for the pred_diff model.
svc_params = dict(
    C=1,
    kernel='rbf',
    random_state=0,
    probability=False,
    gamma='auto',
)

In [None]:
# Stratified CV for the SVC: out-of-fold predicted labels for the training rows
# (train_preds) and fold-averaged predicted labels for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'svm_pred_diff'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Feature set for the "unit_diff" models: the `predicted_*_diff` columns listed below.
features = [
    'predicted_fare_diff_per_fare',
    'predicted_fare_diff_per_predicted_fare',
    'predicted_fare_diff_per_distance',
    'predicted_duraton_diff_per_duraton',
    'predicted_duraton_diff_per_predicted_duration',
    'predicted_duraton_diff_per_distance',
    'predicted_fare_per_duration_diff',
    'predicted_meter_waiting_diff_per_meter_waiting',
    'predicted_meter_waiting_diff_per_distance',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting',
    'predicted_meter_waiting_per_duration_diff',
    'predicted_meter_waiting_fare_diff_per_distance',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare',
    'predicted_meter_waiting_fare_per_meter_waiting_diff',
    'predicted_meter_waiting_fare_per_duration_diff',
    'predicted_additional_fare_diff_per_additional_fare',
    'predicted_addtional_fare_diff_per_distance',
    'predicted_meter_waiting_till_pickup_diff_per_meter_waiting_till_pickup',
    'predicted_meter_waiting_till_pickup_per_meter_waiting_diff'
]

# No categorical columns in this feature set.
cat_features = []

y = train_df['label']
train, test = train_df[features], test_df[features]
# Integer cast for categorical columns (no-op while cat_features is empty).
for each in cat_features:
    test[each] = test[each].values.astype(int)
    train[each] = train[each].values.astype(int)

In [None]:
# CatBoost settings for the unit_diff model.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# CatBoost pool over the test rows (`test` is the `features` slice of test_df).
submission_pool = Pool(data=test, cat_features=cat_features)

In [None]:
# Stratified CV for CatBoost.
#   train_preds / train_class : out-of-fold probability / class per training row
#   test_preds                : positive-class probability averaged over fold models
#   test_class                : count of fold models voting positive, thresholded below
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=3)
n_folds = skf.get_n_splits()
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)
    # Predict the held-out fold once; reused for the OOF classes and the fold score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:,1]/n_folds
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test,fold_class,average='micro'))
    models.append(model)
# Same cut-off as the former hard-coded `> 2` for 3 folds.
test_class = np.where(test_class > n_folds - 1, 1, 0)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'catboost_unit_diff'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# LightGBM settings for the unit_diff model.
# NOTE(review): 'f1' is not one of LightGBM's built-in metric names; scoring
# appears to come from the custom `feval` given to lgb.train - confirm.
lgb_params = dict(
    objective='binary',
    learning_rate=0.05,
    seed=0,
    metric='f1',
    max_depth=8,
)

In [None]:
# Stratified CV for LightGBM: out-of-fold probabilities for the training rows
# (train_preds) and fold-averaged probabilities for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_data = lgb.Dataset(X_train,y_train)
    valid_data = lgb.Dataset(X_test,y_test)
    evals_result = {}
    model = lgb.train(lgb_params, train_data,num_boost_round=1000,early_stopping_rounds=50, valid_sets=valid_data,feval=lgb_f1_score, evals_result=evals_result,verbose_eval=False)

    # Score the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test) / folds
    validation_scores.append(f1_score(y_test,np.round(fold_preds),average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'lgb_unit_diff'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# k-NN settings for the unit_diff model.
knn_params = dict(n_neighbors=10, weights='distance')

In [None]:
# Stratified CV for k-NN: out-of-fold predicted labels for the training rows
# (train_preds) and fold-averaged predicted labels for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = KNeighborsClassifier(**knn_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'knn_unit_diff'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# RBF-kernel SVC settings for the unit_diff model.
svc_params = dict(
    C=5,
    kernel='rbf',
    random_state=0,
    probability=False,
    gamma='auto',
)

In [None]:
# Stratified CV for the SVC: out-of-fold predicted labels for the training rows
# (train_preds) and fold-averaged predicted labels for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'rbf_svm_unit_diff'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Feature set for the "unit_diff_norm" models: the `*_normalized` columns listed below.
features = [
    'predicted_fare_diff_per_distance_normalized',
    'predicted_fare_diff_normalized',
    'predicted_fare_diff_per_fare_normalized',
    'predicted_fare_diff_per_predicted_fare_normalized',
    'predicted_duraton_diff_per_duraton_normalized',
    'predicted_duraton_diff_per_predicted_duration_normalized',
    'predicted_fare_per_duration_diff_normalized',
    'predicted_avg_speed_diff_normalized',
    'predicted_meter_waiting_diff_normalized',
    'predicted_meter_waiting_diff_per_meter_waiting_normalized',
    'predicted_meter_waiting_diff_per_predicted_meter_waiting_normalized',
    'predicted_meter_waiting_per_duration_diff_normalized',
    'predicted_meter_waiting_fare_diff_normalized',
    'predicted_meter_waiting_fare_diff_per_meter_waiting_fare_normalized',
    'predicted_meter_waiting_fare_diff_per_predicted_meter_waiting_fare_normalized',
    'predicted_meter_waiting_fare_per_meter_waiting_diff_normalized',
    'predicted_meter_waiting_fare_per_duration_diff_normalized',
    'predicted_additional_fare_diff_normalized',
    'predicted_additional_fare_diff_per_additional_fare_normalized',
]

# No categorical columns in this feature set.
cat_features = []

y = train_df['label']
train, test = train_df[features], test_df[features]
# Integer cast for categorical columns (no-op while cat_features is empty).
for each in cat_features:
    test[each] = test[each].values.astype(int)
    train[each] = train[each].values.astype(int)

In [None]:
# k-NN settings for the unit_diff_norm model.
knn_params = dict(n_neighbors=15, weights='distance')

In [None]:
# Stratified CV for k-NN: out-of-fold predicted labels for the training rows
# (train_preds) and fold-averaged predicted labels for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
# `org_scores` stays empty, as in the sibling CV loops: no `train_org`/`y_org`
# hold-out is defined in this section, so scoring against it would fail.
# NOTE(review): restore the extra score only if that hold-out set is rebuilt
# with this cell's feature columns.
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = KNeighborsClassifier(**knn_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'knn_unit_diff_norm'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# RBF-kernel SVC settings for the unit_diff_norm model.
svc_params = dict(
    C=5,
    kernel='rbf',
    random_state=0,
    probability=False,
    gamma='scale',
)

In [None]:
# Stratified CV for the SVC: out-of-fold predicted labels for the training rows
# (train_preds) and fold-averaged predicted labels for the test rows (test_preds).
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select labels positionally like the features.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once; reused for the OOF column and the fold F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured number of folds instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test,fold_preds,average='micro'))
    models.append(model)

In [None]:
# Register this model's out-of-fold (train) and fold-averaged (test) predictions.
name = 'svm_rbf_unit_diff_norm'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Feature set for the "unit" models: the per-distance / per-duration / per-fare
# ratio columns listed below.
features = [
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare',
]

# No categorical columns in this feature set.
cat_features = []

y = train_df['label']
train, test = train_df[features], test_df[features]
# Integer cast for categorical columns (no-op while cat_features is empty).
for each in cat_features:
    test[each] = test[each].values.astype(int)
    train[each] = train[each].values.astype(int)

In [None]:
# Keyword arguments unpacked into CatBoostClassifier(**catboost_params)
# by the fold loop further down.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
# Wrap the test-set feature columns in a CatBoost Pool so that every fold
# model can score the same data.
submission_features = test_df[features]
submission_pool = Pool(data=submission_features, cat_features=cat_features)

In [None]:
# Stratified k-fold training of a CatBoost classifier.
#   train_preds / train_class - out-of-fold column-1 probability / predicted class per training row
#   test_preds                - mean column-1 probability over the fold models on the test set
#   test_class                - number of fold models that predicted 1, thresholded after the loop
#   validation_scores         - micro-F1 of each fold model on its held-out fold
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []  # kept for parity with sibling cells; not filled in this loop
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select positionally; plain y[...] is a
    # label lookup that only matches on a default RangeIndex.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    # The held-out fold doubles as the eval set for the early stopping configured in catboost_params.
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)

    # Predict the held-out classes once; reuse for the OOF vector and the score.
    fold_class = model.predict(test_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    train_class[test_index] = fold_class
    # Average over the configured fold count instead of a literal 3.
    test_preds += model.predict_proba(submission_pool)[:,1] / folds
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    models.append(model)
# A test row is labelled 1 only when every fold model voted 1 (identical to the
# former `> 2` for 3 folds).
# NOTE(review): a majority vote would use `> folds / 2` - confirm which is intended.
test_class = np.where(test_class > folds - 1, 1, 0)

In [None]:
# Register the CatBoost probabilities from the preceding fold loop under one
# shared column name in both prediction tables.
name = 'catboost_unit'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Parameter dict handed to lgb.train by the fold loop that follows.
# NOTE(review): 'f1' does not appear among LightGBM's documented built-in
# metrics; the F1 value presumably comes from the `feval` passed to lgb.train - confirm.
lgb_params = dict(
    objective='binary',
    learning_rate=0.05,
    seed=0,
    metric='f1',
    max_depth=8,
)

In [None]:
# Stratified k-fold training of a LightGBM booster.
#   train_preds       - out-of-fold raw model outputs for every training row
#   test_preds        - mean of the per-fold raw outputs on the test set
#   validation_scores - micro-F1 of the rounded held-out predictions per fold
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []  # kept for parity with sibling cells; not filled in this loop
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:], train.iloc[test_index,:]
    # skf.split yields positions, so select positionally; plain y[...] is a
    # label lookup that only matches on a default RangeIndex.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_data = lgb.Dataset(X_train,y_train)
    valid_data = lgb.Dataset(X_test,y_test)
    evals_result = {}
    model = lgb.train(lgb_params, train_data,num_boost_round=1000,early_stopping_rounds=50, valid_sets=valid_data,feval=lgb_f1_score, evals_result=evals_result,verbose_eval=False)

    # Score the held-out fold once and reuse it for the OOF vector and the F1.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured fold count instead of a literal 3.
    test_preds += model.predict(test) / folds
    validation_scores.append(f1_score(y_test, np.round(fold_preds), average='micro'))
    models.append(model)

In [None]:
# Register the LightGBM outputs from the preceding fold loop under one shared
# column name in both prediction tables.
name = 'lgb_unit'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Keyword arguments unpacked into KNeighborsClassifier(**knn_params)
# by the fold loop that follows.
knn_params = dict(
    n_neighbors=10,
    weights='distance',
)

In [None]:
# Stratified k-fold training of a k-nearest-neighbours classifier.
#   train_preds       - out-of-fold class predictions for every training row
#   test_preds        - mean of the per-fold class predictions on the test set
#   validation_scores - micro-F1 of each fold model on its held-out fold
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []  # kept for parity with sibling cells; not filled in this loop
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select positionally; plain y[...] is a
    # label lookup that only matches on a default RangeIndex.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = KNeighborsClassifier(**knn_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for the OOF vector and the score.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured fold count instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test, fold_preds, average='micro'))
    models.append(model)

In [None]:
# Register the KNN predictions from the preceding fold loop under one shared
# column name in both prediction tables.
name = 'knn_unit'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
# Keyword arguments unpacked into SVC(**svc_params) by the fold loop that follows.
svc_params = dict(
    C=10,
    kernel='rbf',
    random_state=0,
    probability=False,  # hard class labels only; no probability estimates
    gamma='auto',
)

In [None]:
# Stratified k-fold training of an SVC.
#   train_preds       - out-of-fold class predictions for every training row
#   test_preds        - mean of the per-fold class predictions on the test set
#   validation_scores - micro-F1 of each fold model on its held-out fold
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
org_scores = []  # kept for parity with sibling cells; not filled in this loop
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select positionally; plain y[...] is a
    # label lookup that only matches on a default RangeIndex.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = SVC(**svc_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for the OOF vector and the score.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured fold count instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test, fold_preds, average='micro'))
    models.append(model)

In [None]:
# Register the SVC predictions from the preceding fold loop under one shared
# column name in both prediction tables.
name = 'svm_unit'
model_predictions_test[name] = test_preds
model_predictions_train[name] = train_preds

In [None]:
def _to_int_labels(values):
    """Round to the nearest integer (via np.round) and cast to int."""
    return np.round(values).astype(int)


# Convert every stored prediction column to integers in both tables. The column
# list is taken from the test table, so the train table must hold the same columns.
for col in model_predictions_test.columns:
    model_predictions_train[col] = _to_int_labels(model_predictions_train[col])
    model_predictions_test[col] = _to_int_labels(model_predictions_test[col])

In [None]:
# First-level model outputs used as inputs to the stacking model below.
features = [
    'catboost_unit',
    'knn_pred_diff',
    'catboost_pred_diff',
    'lgb_pred_diff',
    'catboost_unit_diff',
    'lgb_unit_diff',
    'svm_pred_diff',
    'svc_linear_anomaly',
    'catboost_anomaly',
    'rbf_svm_unit_diff',
    'rf_anomaly'
]
# Columns that get cast to int in the next cell. 'svc_linear_anomaly' used to
# be listed twice; the redundant entry is removed (the cast is idempotent).
# NOTE(review): the duplicate may have been meant to be another column - confirm.
cat_cols = [
    'knn_pred_diff',
    'svm_pred_diff',
    'svc_linear_anomaly',
    'rbf_svm_unit_diff',
    'rf_anomaly',
]

In [None]:
# Cast each column listed in cat_cols to int, train table first, then test table.
for each in cat_cols:
    for frame in (model_predictions_train, model_predictions_test):
        frame[each] = frame[each].astype(int)

In [None]:
# Inputs for the stacking model: the target comes from the raw training frame,
# the design matrices from the selected first-level prediction columns.
y = train_df['label']
train = model_predictions_train[features]
test = model_predictions_test[features]

In [None]:
# Keyword arguments unpacked into LogisticRegression(**logistic_reg_params)
# by the fold loop that follows.
logistic_reg_params = dict(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=100,
)

In [None]:
# Stratified k-fold training of the logistic-regression stacking model.
#   train_preds       - out-of-fold class predictions for every training row
#   test_preds        - mean of the per-fold class predictions on the test set
#   validation_scores - micro-F1 of each fold model on its held-out fold
folds = 3
skf = StratifiedKFold(n_splits=folds)
train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
validation_scores = []
models = []
for train_index, test_index in skf.split(train, y):
    X_train, X_test = train.iloc[train_index,:].values, train.iloc[test_index,:].values
    # skf.split yields positions, so select positionally; plain y[...] is a
    # label lookup that only matches on a default RangeIndex.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = LogisticRegression(**logistic_reg_params)
    model.fit(X_train, y_train)

    # Predict the held-out fold once and reuse it for the OOF vector and the score.
    fold_preds = model.predict(X_test)
    train_preds[test_index] = fold_preds
    # Average over the configured fold count instead of a literal 3.
    test_preds += model.predict(test.values) / folds
    validation_scores.append(f1_score(y_test, fold_preds, average='micro'))
    models.append(model)

In [None]:
# Round the fold-averaged test predictions to integer labels and write the
# submission file.
final_labels = np.round(test_preds).astype(int)
submission_df['prediction'] = final_labels
submission_df.to_csv('submission.csv', index=False)