In [None]:
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor,ExtraTreesClassifier,RandomForestClassifier
from sklearn.preprocessing import StandardScaler,KBinsDiscretizer,LabelEncoder,MinMaxScaler
from sklearn.metrics import mean_squared_error,f1_score,confusion_matrix,log_loss
from sklearn.kernel_approximation import Nystroem
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA


from catboost import Pool, cv,CatBoostClassifier,CatBoostRegressor

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, concatenate,Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import regularizers
import tensorflow_addons as tfa

from tqdm import tqdm

import lightgbm as lgb

import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor,DMatrix,plot_tree

from imblearn.over_sampling import RandomOverSampler,SMOTE, ADASYN

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the modelling tables; missing values are replaced with 0 straight after reading.
train_df = pd.read_csv('train_df_final.csv').fillna(0)
train_df_org = pd.read_csv('train_df_final_blanced.csv').fillna(0)
test_df = pd.read_csv('test_df_final.csv').fillna(0)
# The sample submission is read as-is (no NaN filling).
submission_df = pd.read_csv('sample_submission.csv')

# Target vectors as plain NumPy arrays.
y, y_org = train_df['label'].values, train_df_org['label'].values

In [None]:
# Starting feature set for the greedy feature search: it seeds
# `selected_features` and is scored first to obtain the baseline CV score.
# The strings are used verbatim as DataFrame column keys, so spellings such as
# 'addtional_*' must stay exactly as written — presumably they match the CSV
# headers (TODO confirm against the feature-engineering step).
base_features = [
    'fare_per_distance',
    'fare_per_duration',
    'avg_speed',
    'meter_waiting_per_duration',
    'meter_waiting_fare_per_meter_waiting',
    'meter_waiting_fare_per_duration',
    'addtional_fare_per_fare',
    'addtional_fare_per_distance',
    'addtional_fare_per_duration',
    'fare-additional_fare_per_distance',
    'fare-additional_fare_per_duration',
    'fare-additional_fare-meter_waiting_fare_per_distance',
    'fare-additional_fare-meter_waiting_fare_per_duration',
    'meter_waiting_till_pickup_per_meter_waiting',
    'meter_waiting_after_pickup_per_duration',
    'meter_waiting_till_pickup_per_duration',
    'meter_waiting_till_pickup_per_distance',
    'meter_waiting_after_pickup_per_distance',
    'meter_waiting_till_pickup_per_fare',
    'meter_waiting_after_pickup_per_fare',
    'meter_waiting_till_pickup_per_meter_waiting_fare',
    'meter_waiting_after_pickup_per_meter_waiting_fare',    
]

# None of the base features is treated as categorical.
base_cat_features = []

# Columns that must be handed to CatBoost as categorical features.
#
# Each of these signals has its own '<signal>_anomaly' flag column.
_single_anomaly_signals = [
    'fare',
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
]
# Joint anomaly flags exist for every 2-, 3- and 4-way combination of the
# signals below ('fare' takes part in no joint flag). The column name is the
# member signals joined with '_' in the order listed here.
_joint_anomaly_signals = _single_anomaly_signals[1:]

cat_cols = [f'{signal}_anomaly' for signal in _single_anomaly_signals]
cat_cols += [
    '_'.join(combo) + '_anomaly'
    for size in (2, 3, 4)
    for combo in itertools.combinations(_joint_anomaly_signals, size)
]
# Calendar / location style categoricals.
cat_cols += [
    'pickup_date',
    'pickup_hour',
    'pickup_minute',
    'drop_date',
    'drop_hour',
    'drop_minute',
    'pick_cluster',
    'is_more_than_one_day',
    'pickup_timeslot',
    'day_of_week',
    'is_weekday',
]

In [None]:
# Keyword arguments shared by every CatBoostClassifier built in this notebook.
catboost_params = dict(
    loss_function='Logloss',
    random_state=0,
    early_stopping_rounds=50,
    eval_metric='F1',
    border_count=512,
)

In [None]:
def get_mean_accuracy(features, cat_features, y):
    """Score a feature subset with 3-fold stratified CV of a CatBoost classifier.

    The model is trained on ``train_df[features]`` (module-level frame) with the
    shared ``catboost_params``; each fold is scored with micro-averaged F1 on
    its held-out part.

    Parameters
    ----------
    features : list of str
        Columns of ``train_df`` to train on.
    cat_features : list of str
        Subset of ``features`` to cast to ``int`` and pass to CatBoost as
        categorical.
    y : array-like
        Target aligned row-by-row with ``train_df``.

    Returns
    -------
    tuple of (float, float)
        Mean and minimum of the per-fold scores.

    Notes
    -----
    The held-out fold is also passed as ``eval_set``, so early stopping sees
    the data the fold is scored on; the scores are therefore optimistic.
    """
    # Explicit copy: the casts below must write into a frame we own rather than
    # into a selection of ``train_df`` (chained assignment).
    train = train_df[features].copy()
    for each in cat_features:
        train[each] = train[each].values.astype(int)

    # Positional indexing below must not depend on a pandas index.
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=3)
    validation_scores = []
    for train_index, test_index in skf.split(train, y):
        X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
        y_train, y_test = y[train_index], y[test_index]
        train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
        test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
        model = CatBoostClassifier(**catboost_params)
        model.fit(X=train_pool, eval_set=test_pool, verbose=0)

        validation_scores.append(f1_score(y_test, model.predict(test_pool), average='micro'))
    return np.mean(validation_scores), np.min(validation_scores)

In [None]:
# The search starts from a copy of the base features; every other column of
# the test frame is a candidate for addition.
selected_features = list(base_features)
other_features = [column for column in test_df.columns if column not in selected_features]

In [None]:
# Sentinel for "best candidate found in the current pass" used by the search loop.
feature = None
# Baseline to beat: mean CV score of the base features alone (the fold minimum is discarded).
mean_accuracy, _ = get_mean_accuracy(features=base_features, cat_features=base_cat_features, y=y)

In [None]:
# Candidate pool for the greedy search (a copy, so `other_features` stays intact).
remaining_features = list(other_features)
# Optional manual seed: uncomment to force a feature into the selection up front.
# remaining_features.remove('predicted_duration_diff_bucket@predicted_avg_speed')
# selected_features.append('predicted_duration_diff_bucket@predicted_avg_speed')

In [None]:
%%time
while len(remaining_features)>0:
    for new_feature in remaining_features:
        features = selected_features + [new_feature]
        cat_features = [feature for feature in features if feature in cat_cols]
        mean,_ = get_mean_accuracy(features,cat_features,y)
        if mean > mean_accuracy:
            mean_accuracy = mean
            feature = new_feature
            print(new_feature, mean)
    if feature != None:
        selected_features.append(feature)
        remaining_features.remove(feature)
        feature = None
    else:
        break

In [None]:
# Display the feature list after the greedy search (base features plus any additions).
selected_features

In [None]:
# Freeze the search result and derive which of the chosen columns are categorical.
features = list(selected_features)
cat_features = [name for name in features if name in cat_cols]

In [None]:
# Restrict each table to the selected features. Explicit copies are taken so
# the int casts below write into frames we own instead of into selections of
# the source tables (chained assignment).
train = train_df[features].copy()
test = test_df[features].copy()
train_org = train_df_org[features].copy()
y = train_df['label']
# Categorical columns are cast to int before being handed to CatBoost.
for each in cat_features:
    train[each] = train[each].values.astype(int)
    test[each] = test[each].values.astype(int)
    train_org[each] = train_org[each].values.astype(int)

In [None]:
# Build the prediction pools from the frames whose categorical columns were
# cast to int in the previous cell. Re-selecting from the raw `test_df` would
# discard those casts; CatBoost documents categorical features as integer or
# string values, so un-cast float columns are expected to be rejected.
submission_pool = Pool(data=test, cat_features=cat_features)
org_pool = Pool(data=train_org, cat_features=cat_features)

In [None]:
# Stratified K-fold training on the selected features.
#   train_preds / train_class : out-of-fold probability of class 1 / predicted class
#   test_preds                : fold-averaged probability of class 1 on the test set
#   test_class                : per-fold class votes on the test set, thresholded below
n_splits = 3  # single source of truth for fold count, probability averaging and voting

train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
models = []

# Fold indices are positional; index a plain array so the result does not
# depend on the labels of a pandas index.
y_values = np.asarray(y)

for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y_values[train_index], y_values[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    # The held-out fold doubles as the early-stopping eval set.
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)

    fold_class = model.predict(test_pool)  # reused for the OOF array and the fold score
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:, 1] / n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
    models.append(model)

# A test row is labelled 1 only when every fold model voted 1 (same as the
# previous `> 2` with three folds).
# NOTE(review): if a simple majority was intended, use `test_class > n_splits / 2` — confirm.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
# Mean, standard deviation and minimum of the per-fold validation scores.
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
# Same summary for each fold model's score on the `train_df_org` table.
np.mean(org_scores), np.std(org_scores), min(org_scores)

In [None]:
# Attach the thresholded fold votes to the sample submission and write it out
# without the index column.
submission_df['prediction'] = test_class
submission_df.to_csv('submission.csv',index=False)

# PCA

In [None]:
# PCA input: every column present in the test table, taken from each table as
# a plain NumPy matrix.
features = test_df.columns
X_train, X_train_org, X_test = (
    frame[features].values for frame in (train_df, train_df_org, test_df)
)

In [None]:
# Fit the projection on the training matrix only, then apply the same
# projection to the test and `train_df_org` matrices.
# `random_state` is fixed (matching the seed in `catboost_params`) so the
# components are reproducible when scikit-learn selects a randomized SVD solver.
# NOTE(review): the inputs are not standardised before PCA, so large-scale
# columns will dominate the components — confirm this is intended.
pca = PCA(n_components=40, random_state=0)
X_train_transformed = pca.fit_transform(X_train)
X_test_transformed = pca.transform(X_test)
X_train_org_transformed = pca.transform(X_train_org)

In [None]:
# Wrap the projected matrices in DataFrames with columns col_0 .. col_{k-1}.
# The number of components is read from the projected matrix instead of being
# hard-coded, so this cell stays correct if the PCA size changes.
n_pca_components = X_train_transformed.shape[1]
pca_columns = [f'col_{i}' for i in range(n_pca_components)]

pca_train = pd.DataFrame(X_train_transformed, columns=pca_columns)
# `.values` keeps the assignment positional (no index alignment); label goes first.
pca_train.insert(0, 'label', train_df['label'].values)

pca_train_org = pd.DataFrame(X_train_org_transformed, columns=pca_columns)
pca_train_org.insert(0, 'label', y_org)

pca_test = pd.DataFrame(X_test_transformed, columns=pca_columns)

# Only the train and test projections are persisted; `pca_train_org` stays in memory.
pca_train.to_csv('pca_train.csv',index=False)
pca_test.to_csv('pca_test.csv',index=False)

In [None]:
# Training rows projected on PCA components 0 and 1, coloured by label.
sns.scatterplot(x='col_0',y='col_1',data=pca_train,hue='label')

In [None]:
# Training rows projected on PCA components 0 and 3, coloured by label.
sns.scatterplot(x='col_0',y='col_3',data=pca_train,hue='label')

In [None]:
# The PCA model trains on all component columns; none of them is categorical.
cat_features = []
features = pca_test.columns

In [None]:
# Restrict each PCA table to the component columns. Explicit copies are taken
# so any cast below writes into frames we own rather than into selections of
# the PCA tables (chained assignment).
train = pca_train[features].copy()
test = pca_test[features].copy()
train_org = pca_train_org[features].copy()
y = pca_train['label']
# Categorical columns (if any) are cast to int before being handed to CatBoost.
for each in cat_features:
    train[each] = train[each].values.astype(int)
    test[each] = test[each].values.astype(int)
    train_org[each] = train_org[each].values.astype(int)

In [None]:
# Build the prediction pools from the frames prepared in the previous cell, so
# any categorical casts applied there are what CatBoost actually sees.
submission_pool = Pool(data=test, cat_features=cat_features)
org_pool = Pool(data=train_org, cat_features=cat_features)

In [None]:
# Stratified K-fold training on the PCA components.
#   train_preds / train_class : out-of-fold probability of class 1 / predicted class
#   test_preds                : fold-averaged probability of class 1 on the test set
#   test_class                : per-fold class votes on the test set, thresholded below
n_splits = 3  # single source of truth for fold count, probability averaging and voting

train_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])
train_class = np.zeros(train_df.shape[0])
test_class = np.zeros(test_df.shape[0])
skf = StratifiedKFold(n_splits=n_splits)
validation_scores = []
org_scores = []
models = []

# Fold indices are positional; index a plain array so the result does not
# depend on the labels of a pandas index.
y_values = np.asarray(y)

for train_index, test_index in skf.split(train, y_values):
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_test = y_values[train_index], y_values[test_index]
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)
    model = CatBoostClassifier(**catboost_params)
    # The held-out fold doubles as the early-stopping eval set.
    model.fit(X=train_pool, eval_set=test_pool, verbose=10)

    fold_class = model.predict(test_pool)  # reused for the OOF array and the fold score
    train_preds[test_index] = model.predict_proba(test_pool)[:, 1]
    train_class[test_index] = fold_class
    test_preds += model.predict_proba(submission_pool)[:, 1] / n_splits
    test_class += model.predict(submission_pool)
    validation_scores.append(f1_score(y_test, fold_class, average='micro'))
    org_scores.append(f1_score(y_org, model.predict(org_pool), average='micro'))
    models.append(model)

# A test row is labelled 1 only when every fold model voted 1 (same as the
# previous `> 2` with three folds).
# NOTE(review): if a simple majority was intended, use `test_class > n_splits / 2` — confirm.
test_class = np.where(test_class > n_splits - 1, 1, 0)

In [None]:
# Mean, standard deviation and minimum of the per-fold validation scores (PCA model).
np.mean(validation_scores), np.std(validation_scores), min(validation_scores)

In [None]:
# Same summary for each PCA fold model's score on the projected `train_df_org` table.
np.mean(org_scores), np.std(org_scores), min(org_scores)