In [44]:
from itertools import combinations

import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,IsolationForest
from sklearn.preprocessing import StandardScaler,KBinsDiscretizer,LabelEncoder
from sklearn.metrics import mean_squared_error,f1_score
from sklearn.kernel_approximation import Nystroem
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import StratifiedKFold

from catboost import Pool, cv,CatBoostClassifier,CatBoostRegressor

from tqdm import tqdm

In [45]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [46]:
# Load the training table, the test table and the sample submission from CSV.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_df.csv', 'test_df.csv', 'sample_submission.csv')
)

# Training rows with label == 1 that contain no missing values.
data = train_df.loc[train_df['label'] == 1].dropna()

# Isolation forests for anomaly detection on noise columns

In [47]:
def anomaly_pred(col, train_df=train_df, test_df=test_df, folds=3):
    """Add a per-column IsolationForest consensus flag to both frames.

    One IsolationForest is fitted on the training part of each stratified
    fold, using only the values of ``col``. Every fold model then predicts on
    the complete train and test columns, and the votes are combined.

    Parameters
    ----------
    col : str
        Name of the single numeric column to model.
    train_df : pandas.DataFrame
        Training frame; must contain ``col`` and ``'label'``. Modified in
        place: the column ``f'{col}_anomaly'`` is added.
    test_df : pandas.DataFrame
        Test frame; must contain ``col``. Modified in place in the same way.
    folds : int, default 3
        Number of stratified folds, and therefore of fitted models.

    Returns
    -------
    list of float
        F1 score of each fold model on its held-out part, measured against
        ``train_df['label']``.

    Notes
    -----
    ``IsolationForest.predict`` returns +1 for inliers and -1 for outliers;
    ``clip(0, 1)`` maps that to 1 / 0. The ``*_anomaly`` column is therefore
    1 only when every fold model called the row an inlier, and 0 as soon as
    one model called it an outlier.

    The frame defaults are bound when this function is defined, so rebinding
    the global ``train_df`` later does not change what the defaults point to.
    """
    labels = train_df['label'].values
    # scikit-learn estimators expect 2-D input, hence the single-column reshape.
    train_values = train_df[col].values.reshape(-1, 1)
    test_values = test_df[col].values.reshape(-1, 1)

    # Honour `folds` (it used to be ignored in favour of a hard-coded 3).
    skf = StratifiedKFold(n_splits=folds)

    validation_scores = []
    train_votes = np.zeros(train_df.shape[0])
    test_votes = np.zeros(test_df.shape[0])

    for fit_idx, val_idx in skf.split(train_values, labels):
        model = IsolationForest(random_state=0).fit(train_values[fit_idx])

        held_out_pred = model.predict(train_values[val_idx]).clip(0, 1)
        validation_scores.append(f1_score(labels[val_idx], held_out_pred))

        # Each fold model votes on every train and test row.
        train_votes += model.predict(train_values).clip(0, 1)
        test_votes += model.predict(test_values).clip(0, 1)

    # Unanimous vote required; with folds=3 this is the former `> 2` rule.
    train_df[f'{col}_anomaly'] = np.where(train_votes >= folds, 1, 0)
    test_df[f'{col}_anomaly'] = np.where(test_votes >= folds, 1, 0)
    return validation_scores

In [48]:
cols = [
    'fare',
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
]

# Fit the single-column detector on each column and report its fold F1 spread.
for col in tqdm(cols):
    validation_scores = anomaly_pred(col)
    mean_f1, std_f1 = np.mean(validation_scores), np.std(validation_scores)
    print(f'col:{col}, mean:{mean_f1:.4f}, std:{std_f1:.4f}')

 17%|█▋        | 1/6 [00:03<00:15,  3.16s/it]

col:fare, mean:0.9242, std:0.0019


 33%|███▎      | 2/6 [00:05<00:11,  2.79s/it]

col:additional_fare, mean:0.9250, std:0.0115


 50%|█████     | 3/6 [00:08<00:08,  2.88s/it]

col:duration, mean:0.9226, std:0.0041


 67%|██████▋   | 4/6 [00:11<00:05,  2.93s/it]

col:meter_waiting, mean:0.9141, std:0.0041


 83%|████████▎ | 5/6 [00:13<00:02,  2.88s/it]

col:meter_waiting_fare, mean:0.9107, std:0.0049


100%|██████████| 6/6 [00:17<00:00,  2.86s/it]

col:meter_waiting_till_pickup, mean:0.9122, std:0.0019





# Multi-column anomaly detection

In [49]:
def anomaly_pred_multi(cols, train_df=train_df, test_df=test_df, folds=3):
    """Add a multi-column IsolationForest consensus flag to both frames.

    One IsolationForest is fitted on the training part of each stratified
    fold, using the columns in ``cols`` jointly. Every fold model then
    predicts on the complete train and test frames, and the votes are
    combined.

    Parameters
    ----------
    cols : sequence of str
        Column names to model together. The output column is named
        ``'_'.join(cols) + '_anomaly'``.
    train_df : pandas.DataFrame
        Training frame; must contain ``cols`` and ``'label'``. Modified in
        place: the output column is added.
    test_df : pandas.DataFrame
        Test frame; must contain ``cols``. Modified in place in the same way.
    folds : int, default 3
        Number of stratified folds, and therefore of fitted models.

    Returns
    -------
    list of float
        F1 score of each fold model on its held-out part, measured against
        ``train_df['label']``.

    Notes
    -----
    ``IsolationForest.predict`` returns +1 for inliers and -1 for outliers;
    ``clip(0, 1)`` maps that to 1 / 0. The ``*_anomaly`` column is therefore
    1 only when every fold model called the row an inlier, and 0 as soon as
    one model called it an outlier.

    The frame defaults are bound when this function is defined, so rebinding
    the global ``train_df`` later does not change what the defaults point to.
    """
    # A list is needed for DataFrame column selection; a tuple would be read
    # as a single key.
    cols = list(cols)

    labels = train_df['label'].values
    train_values = train_df[cols].values
    test_values = test_df[cols].values

    # Honour `folds` (it used to be ignored in favour of a hard-coded 3).
    skf = StratifiedKFold(n_splits=folds)

    validation_scores = []
    train_votes = np.zeros(train_df.shape[0])
    test_votes = np.zeros(test_df.shape[0])

    for fit_idx, val_idx in skf.split(train_values, labels):
        model = IsolationForest(random_state=0).fit(train_values[fit_idx])

        held_out_pred = model.predict(train_values[val_idx]).clip(0, 1)
        validation_scores.append(f1_score(labels[val_idx], held_out_pred))

        # Each fold model votes on every train and test row.
        train_votes += model.predict(train_values).clip(0, 1)
        test_votes += model.predict(test_values).clip(0, 1)

    name = '_'.join(cols)
    # Unanimous vote required; with folds=3 this is the former `> 2` rule.
    train_df[f'{name}_anomaly'] = np.where(train_votes >= folds, 1, 0)
    test_df[f'{name}_anomaly'] = np.where(test_votes >= folds, 1, 0)
    return validation_scores

In [50]:
cols = ['fare','additional_fare','duration','meter_waiting','meter_waiting_fare','meter_waiting_till_pickup']

# Score every unordered pair of columns. combinations() yields the pairs in
# the same order as nested "for i ... for j > i" loops would.
for pair in map(list, combinations(cols, 2)):
    validation_scores = anomaly_pred_multi(pair)
    print(f'cols:{pair}, mean:{np.mean(validation_scores):.4f}, std:{np.std(validation_scores):.4f}')

cols:['fare', 'additional_fare'], mean:0.9179, std:0.0168
cols:['fare', 'duration'], mean:0.9361, std:0.0016
cols:['fare', 'meter_waiting'], mean:0.9223, std:0.0029
cols:['fare', 'meter_waiting_fare'], mean:0.9187, std:0.0019
cols:['fare', 'meter_waiting_till_pickup'], mean:0.9228, std:0.0023
cols:['additional_fare', 'duration'], mean:0.9169, std:0.0165
cols:['additional_fare', 'meter_waiting'], mean:0.9141, std:0.0130
cols:['additional_fare', 'meter_waiting_fare'], mean:0.9097, std:0.0120
cols:['additional_fare', 'meter_waiting_till_pickup'], mean:0.9144, std:0.0153
cols:['duration', 'meter_waiting'], mean:0.9176, std:0.0031
cols:['duration', 'meter_waiting_fare'], mean:0.9131, std:0.0045
cols:['duration', 'meter_waiting_till_pickup'], mean:0.9174, std:0.0016
cols:['meter_waiting', 'meter_waiting_fare'], mean:0.9146, std:0.0041
cols:['meter_waiting', 'meter_waiting_till_pickup'], mean:0.9164, std:0.0008
cols:['meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9137, std:0.0027


In [51]:
# Score every 3-column subset. combinations() replaces the nested loops and
# their cols.index(...) lookups, which are linear scans and would pick the
# wrong position if a column name were ever repeated. The order is unchanged.
for trio in map(list, combinations(cols, 3)):
    validation_scores = anomaly_pred_multi(trio)
    print(f'cols:{trio}, mean:{np.mean(validation_scores):.4f}, std:{np.std(validation_scores):.4f}')

cols:['fare', 'additional_fare', 'duration'], mean:0.9183, std:0.0138
cols:['fare', 'additional_fare', 'meter_waiting'], mean:0.9158, std:0.0123
cols:['fare', 'additional_fare', 'meter_waiting_fare'], mean:0.9140, std:0.0120
cols:['fare', 'additional_fare', 'meter_waiting_till_pickup'], mean:0.9152, std:0.0132
cols:['fare', 'duration', 'meter_waiting'], mean:0.9259, std:0.0015
cols:['fare', 'duration', 'meter_waiting_fare'], mean:0.9216, std:0.0020
cols:['fare', 'duration', 'meter_waiting_till_pickup'], mean:0.9251, std:0.0017
cols:['fare', 'meter_waiting', 'meter_waiting_fare'], mean:0.9199, std:0.0029
cols:['fare', 'meter_waiting', 'meter_waiting_till_pickup'], mean:0.9217, std:0.0023
cols:['fare', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9193, std:0.0015
cols:['additional_fare', 'duration', 'meter_waiting'], mean:0.9130, std:0.0125
cols:['additional_fare', 'duration', 'meter_waiting_fare'], mean:0.9126, std:0.0125
cols:['additional_fare', 'duration', 'meter_waiting

In [52]:
# Score every 4-column subset. combinations() replaces the nested loops and
# their cols.index(...) lookups, which are linear scans and would pick the
# wrong position if a column name were ever repeated. The order is unchanged.
for quad in map(list, combinations(cols, 4)):
    validation_scores = anomaly_pred_multi(quad)
    print(f'cols:{quad}, mean:{np.mean(validation_scores):.4f}, std:{np.std(validation_scores):.4f}')

cols:['fare', 'additional_fare', 'duration', 'meter_waiting'], mean:0.9194, std:0.0075
cols:['fare', 'additional_fare', 'duration', 'meter_waiting_fare'], mean:0.9191, std:0.0059
cols:['fare', 'additional_fare', 'duration', 'meter_waiting_till_pickup'], mean:0.9192, std:0.0090
cols:['fare', 'additional_fare', 'meter_waiting', 'meter_waiting_fare'], mean:0.9165, std:0.0061
cols:['fare', 'additional_fare', 'meter_waiting', 'meter_waiting_till_pickup'], mean:0.9184, std:0.0073
cols:['fare', 'additional_fare', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9202, std:0.0036
cols:['fare', 'duration', 'meter_waiting', 'meter_waiting_fare'], mean:0.9197, std:0.0029
cols:['fare', 'duration', 'meter_waiting', 'meter_waiting_till_pickup'], mean:0.9221, std:0.0017
cols:['fare', 'duration', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9201, std:0.0021
cols:['fare', 'meter_waiting', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9186, std:0.0025
cols:['additiona

In [53]:
# Model every column in `cols` jointly and show (mean, std) of the fold F1 scores.
validation_scores = anomaly_pred_multi(cols)
tuple(stat(validation_scores) for stat in (np.mean, np.std))

(0.9221976028898832, 0.0014306015564709482)

# Classifier

In [54]:
# Keyword arguments passed to every CatBoostClassifier built below.
params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    # class_weights=class_weights,  (left disabled)
)

In [55]:
# Show every column now on train_df, including the *_anomaly flags added by the cells above.
train_df.columns

Index(['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup', 'fare', 'pickup_date', 'pickup_hour',
       'pickup_minute', 'drop_date', 'drop_hour', 'drop_minute',
       'pick_cluster', 'is_more_than_one_day', 'distance_km', 'fare_per_km',
       'pickup_timeslot', 'day_of_week', 'is_weekday', 'cal_time_difference',
       'label', 'fare_anomaly', 'additional_fare_anomaly', 'duration_anomaly',
       'meter_waiting_anomaly', 'meter_waiting_fare_anomaly',
       'meter_waiting_till_pickup_anomaly', 'fare_additional_fare_anomaly',
       'fare_duration_anomaly', 'fare_meter_waiting_anomaly',
       'fare_meter_waiting_fare_anomaly',
       'fare_meter_waiting_till_pickup_anomaly',
       'additional_fare_duration_anomaly',
       'additional_fare_meter_waiting_anomaly',
       'additional_fare_meter_waiting_fare_anomaly',
       'additional_fare_meter_waiting_till_pickup_anomaly',
       'duration_meter_waiting_anomaly', 'duration_mete

In [56]:
# Anomaly flags fed to the classifier. Apart from 'fare_anomaly', no flag
# built from a column set containing 'fare' is used.
features = [
    # --- single-column flags ---
    'fare_anomaly',
    'additional_fare_anomaly',
    'duration_anomaly',
    'meter_waiting_anomaly',
    'meter_waiting_fare_anomaly',
    'meter_waiting_till_pickup_anomaly',
    # --- two-column flags ---
    'additional_fare_duration_anomaly',
    'additional_fare_meter_waiting_anomaly',
    'additional_fare_meter_waiting_fare_anomaly',
    'additional_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_anomaly',
    'duration_meter_waiting_fare_anomaly',
    'duration_meter_waiting_till_pickup_anomaly',
    'meter_waiting_meter_waiting_fare_anomaly',
    'meter_waiting_meter_waiting_till_pickup_anomaly',
    'meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    # --- three-column flags ---
    'additional_fare_duration_meter_waiting_anomaly',
    'additional_fare_duration_meter_waiting_fare_anomaly',
    'additional_fare_duration_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_meter_waiting_fare_anomaly',
    'additional_fare_meter_waiting_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_meter_waiting_fare_anomaly',
    'duration_meter_waiting_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    # --- four-column flags ---
    'additional_fare_duration_meter_waiting_meter_waiting_fare_anomaly',
    'additional_fare_duration_meter_waiting_meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
]

# Every feature except 'fare_anomaly' is declared categorical, in the same order.
# NOTE(review): 'fare_anomaly' is therefore passed to CatBoost as a numeric
# column - confirm that this is intended.
cat_features = [flag for flag in features if flag != 'fare_anomaly']

In [57]:
# Pull the target out first, then narrow train_df to the model inputs.
# NOTE(review): train_df is rebound here, so this cell fails if re-run on its
# own, and the later train_df.to_csv(...) writes only these feature columns.
labels = train_df['label'].values
train_df = train_df.drop(columns=['label'])[features]

In [58]:
# Label-free pools, built once and scored by every fold model in the CV loop.
train_df_pool = Pool(data=train_df[features], cat_features=cat_features)
submission_pool = Pool(data=test_df[features], cat_features=cat_features)

In [59]:
# 3-fold stratified CV: one CatBoostClassifier per fold, early-stopped on the
# held-out part, then scored on the full train pool and the submission pool.
#
# Accumulators (summed over the folds, never divided by the fold count):
#   submission_preds - hard class predictions on the submission pool (votes)
#   train_preds      - second predict_proba column on the full train pool
#   test_preds       - second predict_proba column on the submission pool
# NOTE(review): train_preds scores rows each model was also fitted on, so it
# is not an out-of-fold prediction - confirm before using it for stacking.
skf = StratifiedKFold(n_splits=3)
validation_scores, models, train_pools = [], [], []
submission_preds = np.zeros(len(submission_df))
train_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

for train_index, test_index in skf.split(train_df, labels):
    X_train, y_train = train_df.iloc[train_index], labels[train_index]
    X_test, y_test = train_df.iloc[test_index], labels[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    model = CatBoostClassifier(**params)
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)
    pred = model.predict(test_pool)

    # Best F1 reached on the eval set during training.
    validation_score = model.best_score_['validation']['F1']
    print('Validation f1',validation_score)

    validation_scores.append(validation_score)
    models.append(model)
    train_pools.append(train_pool)

    submission_preds += model.predict(submission_pool)
    train_preds += model.predict_proba(train_df_pool)[:, 1]
    test_preds += model.predict_proba(submission_pool)[:, 1]

Learning rate set to 0.057693
0:	learn: 0.9532710	test: 0.9525581	best: 0.9525581 (0)	total: 7.61ms	remaining: 7.6s
10:	learn: 0.9532762	test: 0.9531481	best: 0.9535810 (1)	total: 58.7ms	remaining: 5.28s
20:	learn: 0.9537179	test: 0.9533160	best: 0.9535810 (1)	total: 110ms	remaining: 5.13s
30:	learn: 0.9545096	test: 0.9534754	best: 0.9535810 (1)	total: 165ms	remaining: 5.16s
40:	learn: 0.9552059	test: 0.9537862	best: 0.9538747 (35)	total: 233ms	remaining: 5.45s
50:	learn: 0.9556051	test: 0.9540347	best: 0.9540433 (45)	total: 293ms	remaining: 5.45s
60:	learn: 0.9557341	test: 0.9540176	best: 0.9541148 (55)	total: 353ms	remaining: 5.43s
70:	learn: 0.9559561	test: 0.9542836	best: 0.9542836 (70)	total: 414ms	remaining: 5.41s
80:	learn: 0.9559883	test: 0.9544525	best: 0.9544610 (78)	total: 489ms	remaining: 5.54s
90:	learn: 0.9561983	test: 0.9547103	best: 0.9547103 (88)	total: 552ms	remaining: 5.51s
100:	learn: 0.9563843	test: 0.9546131	best: 0.9548075 (92)	total: 614ms	remaining: 5.46s
110:	

In [60]:
# (mean, std) of the per-fold validation F1 scores.
tuple(stat(validation_scores) for stat in (np.mean, np.std))

(0.954733892471095, 0.0004623882893981669)

In [61]:
# submission_preds holds the summed hard predictions of the fold models;
# predict 1 only where that sum exceeds 2.
submission_df['prediction'] = (submission_preds > 2).astype(int)
submission_df.to_csv('submission_anomaly.csv', index=False)

In [62]:
# Save both frames as CSV without the index column.
# NOTE(review): train_df was narrowed to `features` earlier, while test_df
# still has all of its columns - confirm the two files are meant to differ.
for _frame, _csv_name in ((train_df, 'train_df_anomaly.csv'), (test_df, 'test_df_anomaly.csv')):
    _frame.to_csv(_csv_name, index=False)

In [63]:
# Add this model's output to the stacking tables as 'catboost_anomaly'.
stacking_train_df = pd.read_csv('stacking_train_df.csv')
stacking_test_df = pd.read_csv('stacking_test_df.csv')

# Both tables must carry the same kind of value. train_preds is the sum of
# predict_proba outputs over the folds, so the test side uses test_preds (the
# same sum on the submission pool). It previously used submission_preds, the
# sum of hard class votes, which is on a different scale.
stacking_train_df['catboost_anomaly'] = train_preds
stacking_test_df['catboost_anomaly'] = test_preds

In [64]:
# Write the stacking tables back over the files they were read from.
stacking_train_df.to_csv('stacking_train_df.csv', index=False)
stacking_test_df.to_csv('stacking_test_df.csv', index=False)