In [39]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor,IsolationForest
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler,KBinsDiscretizer,LabelEncoder
from sklearn.svm import SVR

from catboost import Pool, cv,CatBoostClassifier,CatBoostRegressor

from tqdm import tqdm

In [40]:
import warnings

# Silence the two noisy warning families for the rest of the notebook.
for ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

In [41]:
# Read the training frame, the test frame and the frame that later receives
# the `prediction` column, all from the working directory.
train_df, test_df, submission_df = map(
    pd.read_csv,
    ('train_df.csv', 'test_df.csv', 'sample_submission.csv'),
)

# Rows with label == 1 and no missing values.
# NOTE(review): `data` is not referenced again in the cells shown here.
data = train_df.loc[train_df['label'] == 1].dropna()

# Isolation forests for anomaly detection on noise columns

In [42]:
def anomaly_pred(col, train_df=train_df, test_df=test_df, folds=3):
    """Fit per-fold IsolationForests on one column and add a ``{col}_anomaly`` flag.

    For each stratified fold an ``IsolationForest`` is fitted on the training
    part of ``train_df[col]``. Its +1/-1 output is clipped to 1/0 (1 = inlier,
    0 = outlier), scored with F1 against ``label`` on the held-out part, and
    accumulated as a vote over every row of both frames.

    Parameters
    ----------
    col : str
        Name of the column to model; must exist in both frames.
    train_df, test_df : pandas.DataFrame
        Frames to read from and to write the new flag column into (modified in
        place). The defaults are bound to the notebook's frames as they exist
        when this cell is executed.
    folds : int, default 3
        Number of stratified folds, i.e. the number of forests that vote.

    Returns
    -------
    list of float
        Held-out F1 score of each fold.

    Notes
    -----
    Despite its name, ``{col}_anomaly`` is 1 only when *every* fold model
    considered the row an inlier, and 0 otherwise.
    """
    labels = train_df['label'].values
    # IsolationForest expects a 2-D (n_samples, n_features) matrix.
    X_train_df = train_df[col].values.reshape(-1, 1)
    X_test_df = test_df[col].values.reshape(-1, 1)

    skf = StratifiedKFold(n_splits=folds)

    validation_scores = []
    train_preds = np.zeros(train_df.shape[0])
    test_preds = np.zeros(test_df.shape[0])

    for train_index, test_index in skf.split(X_train_df, labels):
        model = IsolationForest(random_state=0).fit(X_train_df[train_index])

        # predict() yields +1 / -1; clipping maps that to 1 / 0.
        fold_preds = model.predict(X_train_df[test_index]).clip(0, 1)
        validation_scores.append(f1_score(labels[test_index], fold_preds))

        # Each fold model votes on every row of both frames.
        train_preds += model.predict(X_train_df).clip(0, 1)
        test_preds += model.predict(X_test_df).clip(0, 1)

    # Flag is 1 only with a unanimous vote (more than folds - 1 votes).
    train_df[f'{col}_anomaly'] = np.where(train_preds > folds - 1, 1, 0)
    test_df[f'{col}_anomaly'] = np.where(test_preds > folds - 1, 1, 0)
    return validation_scores

In [43]:
cols = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
]

# Score each column on its own; anomaly_pred also writes the `<col>_anomaly`
# flag into both frames as a side effect.
for col in tqdm(cols):
    validation_scores = anomaly_pred(col)
    print(f'col:{col}, mean:{np.mean(validation_scores):.4f}, std:{np.std(validation_scores):.4f}')

 20%|██        | 1/5 [00:01<00:07,  1.88s/it]

col:additional_fare, mean:0.9250, std:0.0115


 40%|████      | 2/5 [00:04<00:06,  2.23s/it]

col:duration, mean:0.9226, std:0.0041


 60%|██████    | 3/5 [00:08<00:05,  2.51s/it]

col:meter_waiting, mean:0.9141, std:0.0041


 80%|████████  | 4/5 [00:10<00:02,  2.62s/it]

col:meter_waiting_fare, mean:0.9107, std:0.0049


100%|██████████| 5/5 [00:14<00:00,  2.82s/it]

col:meter_waiting_till_pickup, mean:0.9122, std:0.0019





# Multi-column anomaly detection

In [44]:
def anomaly_pred_multi(cols, train_df=train_df, test_df=test_df, folds=3):
    """Fit per-fold IsolationForests on several columns jointly and add a flag column.

    Works like the single-column variant, but the forests see all of ``cols``
    at once. The new column is named ``'_'.join(cols) + '_anomaly'``.

    Parameters
    ----------
    cols : list of str
        Column names to model together; must exist in both frames.
    train_df, test_df : pandas.DataFrame
        Frames to read from and to write the new flag column into (modified in
        place). The defaults are bound to the notebook's frames as they exist
        when this cell is executed.
    folds : int, default 3
        Number of stratified folds, i.e. the number of forests that vote.

    Returns
    -------
    list of float
        Held-out F1 score of each fold, computed against ``label`` after
        clipping the forest's +1/-1 output to 1/0.

    Notes
    -----
    Despite its name, the flag is 1 only when *every* fold model considered
    the row an inlier, and 0 otherwise.
    """
    labels = train_df['label'].values
    X_train_df = train_df[cols].values
    X_test_df = test_df[cols].values

    skf = StratifiedKFold(n_splits=folds)

    validation_scores = []
    train_preds = np.zeros(train_df.shape[0])
    test_preds = np.zeros(test_df.shape[0])

    for train_index, test_index in skf.split(X_train_df, labels):
        model = IsolationForest(random_state=0).fit(X_train_df[train_index])

        # predict() yields +1 / -1; clipping maps that to 1 / 0.
        fold_preds = model.predict(X_train_df[test_index]).clip(0, 1)
        validation_scores.append(f1_score(labels[test_index], fold_preds))

        # Each fold model votes on every row of both frames.
        train_preds += model.predict(X_train_df).clip(0, 1)
        test_preds += model.predict(X_test_df).clip(0, 1)

    name = '_'.join(cols)
    # Flag is 1 only with a unanimous vote (more than folds - 1 votes).
    train_df[f'{name}_anomaly'] = np.where(train_preds > folds - 1, 1, 0)
    test_df[f'{name}_anomaly'] = np.where(test_preds > folds - 1, 1, 0)
    return validation_scores

In [45]:
cols = ['additional_fare','duration','meter_waiting','meter_waiting_fare','meter_waiting_till_pickup']

# Every unordered pair of columns, in list order. Each call also adds the
# joint `<col_1>_<col_2>_anomaly` flag to both frames.
for col_1, col_2 in combinations(cols, 2):
    validation_scores = anomaly_pred_multi([col_1,col_2])
    print(f'cols:{[col_1,col_2]}, mean:{np.mean(validation_scores):.4f}, std:{np.std(validation_scores):.4f}')

cols:['additional_fare', 'duration'], mean:0.9169, std:0.0165
cols:['additional_fare', 'meter_waiting'], mean:0.9141, std:0.0130
cols:['additional_fare', 'meter_waiting_fare'], mean:0.9097, std:0.0120
cols:['additional_fare', 'meter_waiting_till_pickup'], mean:0.9144, std:0.0153
cols:['duration', 'meter_waiting'], mean:0.9176, std:0.0031
cols:['duration', 'meter_waiting_fare'], mean:0.9131, std:0.0045
cols:['duration', 'meter_waiting_till_pickup'], mean:0.9174, std:0.0016
cols:['meter_waiting', 'meter_waiting_fare'], mean:0.9146, std:0.0041
cols:['meter_waiting', 'meter_waiting_till_pickup'], mean:0.9164, std:0.0008
cols:['meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9137, std:0.0027


In [46]:
# Every unordered triple of columns, in list order. Each call also adds the
# joint anomaly flag for that triple to both frames.
for col_1, col_2, col_3 in combinations(cols, 3):
    validation_scores = anomaly_pred_multi([col_1,col_2,col_3])
    print(f'cols:{[col_1,col_2,col_3]}, mean:{np.mean(validation_scores):.4f}, std:{np.std(validation_scores):.4f}')

cols:['additional_fare', 'duration', 'meter_waiting'], mean:0.9130, std:0.0125
cols:['additional_fare', 'duration', 'meter_waiting_fare'], mean:0.9126, std:0.0125
cols:['additional_fare', 'duration', 'meter_waiting_till_pickup'], mean:0.9143, std:0.0137
cols:['additional_fare', 'meter_waiting', 'meter_waiting_fare'], mean:0.9104, std:0.0124
cols:['additional_fare', 'meter_waiting', 'meter_waiting_till_pickup'], mean:0.9136, std:0.0123
cols:['additional_fare', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9114, std:0.0123
cols:['duration', 'meter_waiting', 'meter_waiting_fare'], mean:0.9162, std:0.0033
cols:['duration', 'meter_waiting', 'meter_waiting_till_pickup'], mean:0.9177, std:0.0012
cols:['duration', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9156, std:0.0020
cols:['meter_waiting', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9161, std:0.0025


In [47]:
# Every unordered group of four columns, in list order. Each call also adds
# the joint anomaly flag for that group to both frames.
for col_1, col_2, col_3, col_4 in combinations(cols, 4):
    validation_scores = anomaly_pred_multi([col_1,col_2,col_3,col_4])
    print(f'cols:{[col_1,col_2,col_3,col_4]}, mean:{np.mean(validation_scores):.4f}, std:{np.std(validation_scores):.4f}')

cols:['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare'], mean:0.9163, std:0.0031
cols:['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_till_pickup'], mean:0.9192, std:0.0028
cols:['additional_fare', 'duration', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9176, std:0.0036
cols:['additional_fare', 'meter_waiting', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9169, std:0.0031
cols:['duration', 'meter_waiting', 'meter_waiting_fare', 'meter_waiting_till_pickup'], mean:0.9166, std:0.0032


In [48]:
# Model all columns in `cols` jointly; this also adds the combined anomaly
# flag to both frames. The cell displays the mean and std of the fold F1 scores.
validation_scores = anomaly_pred_multi(cols)
np.mean(validation_scores) , np.std(validation_scores)

(0.9189834563552332, 0.0020987658400050775)

# Classifier

In [49]:
# Keyword arguments unpacked into CatBoostClassifier for every fold.
params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
)

In [50]:
# Display the columns currently on the training frame, including the
# `*_anomaly` flags written by the anomaly_pred / anomaly_pred_multi calls.
train_df.columns

Index(['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare',
       'meter_waiting_till_pickup', 'fare', 'pickup_date', 'pickup_hour',
       'pickup_minute', 'drop_date', 'drop_hour', 'drop_minute',
       'pick_cluster', 'is_more_than_one_day', 'distance_km', 'fare_per_km',
       'pickup_timeslot', 'day_of_week', 'is_weekday', 'cal_time_difference',
       'label', 'additional_fare_anomaly', 'duration_anomaly',
       'meter_waiting_anomaly', 'meter_waiting_fare_anomaly',
       'meter_waiting_till_pickup_anomaly', 'additional_fare_duration_anomaly',
       'additional_fare_meter_waiting_anomaly',
       'additional_fare_meter_waiting_fare_anomaly',
       'additional_fare_meter_waiting_till_pickup_anomaly',
       'duration_meter_waiting_anomaly', 'duration_meter_waiting_fare_anomaly',
       'duration_meter_waiting_till_pickup_anomaly',
       'meter_waiting_meter_waiting_fare_anomaly',
       'meter_waiting_meter_waiting_till_pickup_anomaly',
       'meter_waiting

In [51]:
# Model inputs: one anomaly flag per non-empty combination of the five
# columns, ordered by combination size (singles, pairs, triples, quads, all five).
features = [
    'additional_fare_anomaly', 
    'duration_anomaly',
    'meter_waiting_anomaly', 
    'meter_waiting_fare_anomaly',
    'meter_waiting_till_pickup_anomaly', 
    'additional_fare_duration_anomaly',
    'additional_fare_meter_waiting_anomaly',
    'additional_fare_meter_waiting_fare_anomaly',
    'additional_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_anomaly', 
    'duration_meter_waiting_fare_anomaly',
    'duration_meter_waiting_till_pickup_anomaly',
    'meter_waiting_meter_waiting_fare_anomaly',
    'meter_waiting_meter_waiting_till_pickup_anomaly',
    'meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_meter_waiting_anomaly',
    'additional_fare_duration_meter_waiting_fare_anomaly',
    'additional_fare_duration_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_meter_waiting_fare_anomaly',
    'additional_fare_meter_waiting_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_meter_waiting_fare_anomaly',
    'duration_meter_waiting_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_meter_waiting_meter_waiting_fare_anomaly',
    'additional_fare_duration_meter_waiting_meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'additional_fare_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'duration_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly',
    'additional_fare_duration_meter_waiting_meter_waiting_fare_meter_waiting_till_pickup_anomaly'
]

# Every model input is treated as categorical. Derive the list from `features`
# (as an independent copy) so the two can never drift apart.
cat_features = list(features)

In [52]:
# Keep the target as a plain array, then narrow the training frame to the
# model inputs only.
# NOTE: this rebinds `train_df`, so the cell cannot be re-run on its own:
# a second run fails because the 'label' column is already gone.
labels = train_df['label'].values
train_df = train_df.drop(columns=['label'])[features]

In [53]:
# Unlabelled pools over the full train and test frames; every fold model
# predicts on these.
train_df_pool = Pool(train_df[features], cat_features=cat_features)
submission_pool = Pool(test_df[features], cat_features=cat_features)

In [54]:
# Number of stratified folds = number of CatBoost models trained below.
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits)

validation_scores = []
train_pools = []
models = []

# Hard 0/1 class votes on the test rows, summed over the fold models.
submission_preds = np.zeros(submission_df.shape[0])
# Out-of-fold P(label == 1): each training row is scored only by the model
# that did not train on it, so this column carries no in-fold leakage when it
# is reused as a stacking feature.
train_preds = np.zeros(train_df.shape[0])
# Mean P(label == 1) on the test rows across the fold models, which keeps it
# on the same [0, 1] scale as train_preds.
test_preds = np.zeros(test_df.shape[0])

for train_index, test_index in skf.split(train_df, labels):
    X_train, X_test = train_df.iloc[train_index,:], train_df.iloc[test_index,:]
    y_train, y_test = labels[train_index], labels[test_index]
    train_pool = Pool(data=X_train, label=y_train,cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

    model = CatBoostClassifier(**params)
    # The held-out fold doubles as the early-stopping set, so the reported
    # validation F1 is likely somewhat optimistic.
    model.fit(X=train_pool, eval_set=test_pool,verbose=10)

    validation_score = model.best_score_['validation']['F1']
    print('Validation f1',validation_score)
    validation_scores.append(validation_score)
    models.append(model)
    train_pools.append(train_pool)

    submission_preds += model.predict(submission_pool)
    train_preds[test_index] = model.predict_proba(test_pool)[:,1]
    test_preds += model.predict_proba(submission_pool)[:,1] / n_splits

Learning rate set to 0.057693
0:	learn: 0.9530599	test: 0.9521957	best: 0.9521957 (0)	total: 4.68ms	remaining: 4.67s
10:	learn: 0.9533645	test: 0.9531395	best: 0.9531395 (5)	total: 46.2ms	remaining: 4.16s
20:	learn: 0.9536295	test: 0.9531395	best: 0.9531395 (5)	total: 89.6ms	remaining: 4.17s
30:	learn: 0.9538504	test: 0.9529455	best: 0.9531395 (5)	total: 136ms	remaining: 4.24s
40:	learn: 0.9541514	test: 0.9533871	best: 0.9534754 (39)	total: 179ms	remaining: 4.17s
50:	learn: 0.9543199	test: 0.9534668	best: 0.9534754 (39)	total: 238ms	remaining: 4.44s
60:	learn: 0.9544084	test: 0.9533698	best: 0.9534754 (39)	total: 283ms	remaining: 4.36s
70:	learn: 0.9546214	test: 0.9535294	best: 0.9535294 (68)	total: 328ms	remaining: 4.3s
80:	learn: 0.9547058	test: 0.9537063	best: 0.9537063 (72)	total: 374ms	remaining: 4.24s
90:	learn: 0.9547986	test: 0.9536977	best: 0.9537948 (82)	total: 419ms	remaining: 4.19s
100:	learn: 0.9550160	test: 0.9535920	best: 0.9537948 (82)	total: 473ms	remaining: 4.21s
110:

In [55]:
# Mean and standard deviation of the per-fold validation F1 scores.
np.mean(validation_scores), np.std(validation_scores)

(0.9538121763873875, 0.00023510056290938325)

In [56]:
# submission_preds holds the summed 0/1 votes of the fold models. A row is
# predicted 1 only when every model voted 1, so the threshold follows the
# number of trained models instead of a hard-coded 2.
submission_df['prediction'] = np.where(submission_preds > len(models) - 1, 1, 0)
submission_df.to_csv('submission_anomaly.csv',index=False)

In [57]:
# Persist both frames without the index.
# NOTE(review): `train_df` was narrowed to `features` earlier while `test_df`
# was not, so the two files have different column sets — confirm this is intended.
for frame, out_path in (
    (train_df, 'train_df_anomaly.csv'),
    (test_df, 'test_df_anomaly.csv'),
):
    frame.to_csv(out_path, index=False)

In [58]:
stacking_train_df = pd.read_csv('stacking_train_df.csv')
stacking_test_df = pd.read_csv('stacking_test_df.csv')

# train_preds and test_preds are both built from predict_proba, whereas
# submission_preds accumulates hard class votes. Pairing train_preds with
# submission_preds would give the stacker differently-scaled train and test
# features, so the test column uses test_preds.
stacking_train_df['catboost_anomaly'] = train_preds
stacking_test_df['catboost_anomaly'] = test_preds

In [59]:
# Write the stacking frames back over the files they were loaded from, now
# carrying the extra `catboost_anomaly` column.
for frame, out_path in (
    (stacking_test_df, 'stacking_test_df.csv'),
    (stacking_train_df, 'stacking_train_df.csv'),
):
    frame.to_csv(out_path, index=False)