# Data Analysis

## Data Glimpsing

In [1]:
import os
from pathlib import Path
import warnings
import random

import numpy as np
import pandas as pd
from typing import Tuple
from scipy.stats import ks_2samp
from tqdm import tqdm, tqdm_notebook
from pylab import *

from IPython.display import Image
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

import dabl
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler, QuantileTransformer, LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import f1_score, roc_auc_score, recall_score, accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, IsolationForest, GradientBoostingClassifier
from sklearn.utils import resample

from imblearn.over_sampling import SMOTE

from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Dense, LeakyReLU, Embedding, SpatialDropout1D,
                                     BatchNormalization, Dropout, Concatenate, Reshape)
from tensorflow.keras import callbacks, backend
from tensorflow.keras.utils import plot_model, to_categorical
import tensorflow as tf

from pandas.plotting import scatter_matrix

In [2]:
%matplotlib inline
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore')

pd.set_option('display.width', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('precision', 5)

tqdm_notebook().pandas()

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

In [3]:
# Load the transactions CSV into a dataframe and preview its first rows
fraud_data = pd.read_csv('Datasets/creditcard.csv')
fraud_data.head()




Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.35981,-0.07278,2.53635,1.37816,-0.33832,0.46239,0.2396,0.0987,0.36379,0.09079,-0.5516,-0.6178,-0.99139,-0.31117,1.46818,-0.4704,0.20797,0.02579,0.40399,0.25141,-0.01831,0.27784,-0.11047,0.06693,0.12854,-0.18911,0.13356,-0.02105,149.62,0
1,0.0,1.19186,0.26615,0.16648,0.44815,0.06002,-0.08236,-0.0788,0.0851,-0.25543,-0.16697,1.61273,1.06524,0.4891,-0.14377,0.63556,0.46392,-0.1148,-0.18336,-0.14578,-0.06908,-0.22578,-0.63867,0.10129,-0.33985,0.16717,0.12589,-0.00898,0.01472,2.69,0
2,1.0,-1.35835,-1.34016,1.77321,0.37978,-0.5032,1.8005,0.79146,0.24768,-1.51465,0.20764,0.6245,0.06608,0.71729,-0.16595,2.34586,-2.89008,1.10997,-0.12136,-2.26186,0.52498,0.248,0.77168,0.90941,-0.68928,-0.32764,-0.1391,-0.05535,-0.05975,378.66,0
3,1.0,-0.96627,-0.18523,1.79299,-0.86329,-0.01031,1.2472,0.23761,0.37744,-1.38702,-0.05495,-0.22649,0.17823,0.50776,-0.28792,-0.63142,-1.05965,-0.68409,1.96578,-1.23262,-0.20804,-0.1083,0.00527,-0.19032,-1.17558,0.64738,-0.22193,0.06272,0.06146,123.5,0
4,2.0,-1.15823,0.87774,1.54872,0.40303,-0.40719,0.09592,0.59294,-0.27053,0.81774,0.75307,-0.82284,0.5382,1.34585,-1.11967,0.17512,-0.45145,-0.23703,-0.03819,0.80349,0.40854,-0.00943,0.79828,-0.13746,0.14127,-0.20601,0.50229,0.21942,0.21515,69.99,0


In [4]:
# Dimensions of the dataset as (rows, columns)
fraud_data.shape

(284807, 31)

In [5]:
# Data type and non-null count of every column
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

<font color='#00AA00'>
Observations: <br />
1. There are 30 input variables (Time, V1–V28 and Amount) and 1 output variable (Class); <br />
2. All input variables are of float type, whereas the output variable (Class) is int64; <br />
3. No NaN or null values are found in the dataset, as the non-null count of each variable matches the total number of rows in the dataset.
</font>

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
fraud_data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.85958,3.91956e-15,5.68817e-16,-8.76907e-15,2.78231e-15,-1.55256e-15,2.01066e-15,-1.69425e-15,-1.92703e-16,-3.13702e-15,1.76863e-15,9.17032e-16,-1.81066e-15,1.69344e-15,1.47905e-15,3.48234e-15,1.39201e-15,-7.52849e-16,4.32877e-16,9.04973e-16,5.0855e-16,1.53729e-16,7.95991e-16,5.36759e-16,4.45811e-15,1.453e-15,1.6991e-15,-3.66016e-16,-1.20605e-16,88.34962,0.00173
std,47488.14595,1.9587,1.65131,1.51626,1.41587,1.38025,1.33227,1.23709,1.19435,1.09863,1.08885,1.02071,0.999201,0.995274,0.958596,0.915316,0.876253,0.849337,0.838176,0.814041,0.770925,0.734524,0.725702,0.62446,0.605647,0.521278,0.482227,0.403632,0.330083,250.12011,0.04153
min,0.0,-56.4075,-72.7157,-48.3256,-5.68317,-113.743,-26.1605,-43.5572,-73.2167,-13.4341,-24.5883,-4.79747,-18.6837,-5.79188,-19.2143,-4.49894,-14.1299,-25.1628,-9.49875,-7.21353,-54.4977,-34.8304,-10.9331,-44.8077,-2.83663,-10.2954,-2.60455,-22.5657,-15.4301,0.0,0.0
25%,54201.5,-0.920373,-0.59855,-0.890365,-0.84864,-0.691597,-0.768296,-0.554076,-0.20863,-0.643098,-0.535426,-0.762494,-0.405571,-0.648539,-0.425574,-0.582884,-0.468037,-0.483748,-0.49885,-0.456299,-0.211721,-0.228395,-0.54235,-0.161846,-0.354586,-0.317145,-0.326984,-0.0708395,-0.0529598,5.6,0.0
50%,84692.0,0.0181088,0.0654856,0.179846,-0.0198465,-0.0543358,-0.274187,0.0401031,0.022358,-0.0514287,-0.0929174,-0.0327574,0.140033,-0.0135681,0.0506013,0.0480715,0.0664133,-0.0656758,-0.00363631,0.00373482,-0.0624811,-0.0294502,0.00678194,-0.0111929,0.0409761,0.0165935,-0.0521391,0.00134215,0.0112438,22.0,0.0
75%,139320.5,1.31564,0.803724,1.0272,0.743341,0.611926,0.398565,0.570436,0.327346,0.597139,0.453923,0.739593,0.618238,0.662505,0.49315,0.648821,0.523296,0.399675,0.500807,0.458949,0.133041,0.186377,0.528554,0.147642,0.439527,0.350716,0.240952,0.0910451,0.07828,77.165,0.0
max,172792.0,2.45493,22.0577,9.38256,16.8753,34.8017,73.3016,120.589,20.0072,15.595,23.7451,12.0189,7.84839,7.12688,10.5268,8.87774,17.3151,9.25353,5.04107,5.59197,39.4209,27.2028,10.5031,22.5284,4.58455,7.51959,3.51735,31.6122,33.8478,25691.16,1.0


In [7]:
# Response variable analysis: number of rows per Class value,
# with the 0/1 labels replaced by readable names for display
class_names = {0:'Not Fraud', 1:'Fraud'}
fraud_data.Class.value_counts().rename(index = class_names)

Not Fraud    284315
Fraud           492
Name: Class, dtype: int64

In [8]:
random_seed = 1017

def set_seed(seed: int):
    """
    Seed every random number generator used in this notebook.

    :param seed: value applied to TensorFlow, NumPy, Python's ``random`` module
                 and the PYTHONHASHSEED environment variable
    """
    # Import the stdlib module locally under an alias: the notebook also runs
    # `from pylab import *`, which rebinds the global name `random` to a NumPy
    # object, so the global name cannot be relied on to expose `seed`.
    import random as py_random

    tf.random.set_seed(seed)
    np.random.seed(seed)
    py_random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(random_seed)

In [9]:
def split_feature_target(df: pd.DataFrame, target: str) -> Tuple[np.array, np.array]:
    """
    Separate a dataframe into a feature matrix and a column-vector target
    :param df: dataset that contains the target column
    :param target: name of target column
    :return: (features, labels); labels has shape (n_rows, 1)
    """
    labels = df[target].values
    features = df.drop(target, axis=1).values
    return features, labels.reshape(len(labels), 1)

def split_df(df: pd.DataFrame, random_seed: int) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Randomly partition the dataset into train (80%), dev (10%) and test (10%)
    :param df: raw dataset
    :param random_seed: random seed used for both sampling steps
    :return: (train, dev, test) dataframes with disjoint indices
    """

    # For Demo
    train = df.sample(frac=0.8, random_state=random_seed)

    # everything not drawn for training is shared equally between dev and test
    holdout = df.drop(index=train.index)
    dev = holdout.sample(frac=0.5, random_state=random_seed)

    return train, dev, holdout.drop(index=dev.index)

def check_score(old_met, new_met):
    """Print the mean AUC and mean recall of the first three rows of both metric tables."""
    for label, column in (("auc", "auc"), ("rcal", "recall")):
        old_mean = round(old_met[:3][column].mean(), 4)
        new_mean = round(new_met[:3][column].mean(), 4)
        print(f"mean of top3 {label} for old models: {old_mean}",
              f"new models: {new_mean}")

def benchmark_models(train_x:np.array, dev_x:np.array, train_y:np.array, dev_y:np.array, 
                     models_dict: dict, random_s: int, suffix: str = None) -> Tuple[pd.DataFrame, dict]:
    """
    fit models and return the model scores, without model tuning.

    :param train_x, dev_x, train_y, dev_y: input data feature and target
    :param models_dict: dictionary of target models (name -> unfitted estimator)
    :param random_s: random seed for the cross-validation shuffling
    :param suffix: suffix for model name
    :return: (model score dataframe indexed by model name and sorted by
             auc then recall, descending; dictionary of fitted models by name)
    """

    # Estimators and metrics expect 1-D label vectors; callers in this notebook
    # pass (n, 1) column vectors, so flatten once up front.
    train_y = np.ravel(train_y)
    dev_y = np.ravel(dev_y)

    records = []
    models = {}
    # iterate train and fit model
    for name, model in tqdm(models_dict.items()):
        if suffix:
            name = f"{name}_{suffix}"

        # fit model on the full training split
        model.fit(train_x, train_y)
        # 5-fold recall on the training split only
        cv_score = cross_val_score(model, train_x, train_y, scoring='recall', 
                                   cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=random_s))

        # dev validation
        predict_y = model.predict(dev_x)
        # probability of the positive class (second column) drives the AUC
        probability_y = model.predict_proba(dev_x)

        records.append({"model": name,
                        "recall": recall_score(dev_y, predict_y),
                        "f1": f1_score(dev_y, predict_y),
                        "auc": roc_auc_score(dev_y, probability_y[:, 1]),
                        "accuracy": accuracy_score(dev_y, predict_y),
                        "train_cv": cv_score.mean()})

        # save model
        models[name] = model

    # save val result
    model_metrics = pd.DataFrame(
        records, columns=["model", "recall", "f1", "auc", "accuracy", "train_cv"])
    model_metrics = model_metrics.sort_values(["auc", "recall"], ascending=False)

    return model_metrics.set_index("model"), models


In [11]:
# Baseline: benchmark every candidate model on the raw, unprocessed columns.
# `raw_metrics` is reused by later cells as the reference passed to check_score,
# and `benchmark_model` (the fitted models) by the feature-importance plot.

# split the data into train / dev / test partitions
train, dev, test = split_df(fraud_data, random_seed)

x_train, y_train = split_feature_target(train, 'Class')
x_dev, y_dev = split_feature_target(dev, 'Class')

raw_metrics, benchmark_model = benchmark_models(
    train_x=x_train, dev_x=x_dev, train_y=y_train, dev_y=y_dev,
    models_dict={"LReg": LogisticRegression(),
                 "LR_balance": LogisticRegression(class_weight="balanced"),
                 "LR_balance_libl": LogisticRegression(solver="liblinear",
                                                       class_weight="balanced"),
                 "DecisionTree": DecisionTreeClassifier(),
                 "KNN": KNeighborsClassifier(),
                 "GaussianNB": GaussianNB(),
                 "RandomForest": RandomForestClassifier(n_estimators=50),
                 "XGB": XGBClassifier(n_estimators=50, eval_metric="logloss"),
                 "GBT": GradientBoostingClassifier(n_estimators=50),
                 "LightGBM": LGBMClassifier(n_estimators=50)},
    random_s=random_seed)

raw_metrics

 40%|████      | 4/10 [10:49<16:14, 162.44s/it]


KeyboardInterrupt: 

## EDA and Feature engineering

In [None]:
# Feature importance reported by the RandomForest fitted in the raw-data benchmark
# NOTE(review): `train.columns[:-1]` assumes the target is the last column of `train`,
# so that the remaining names line up with the feature matrix — confirm.
feature_score = pd.DataFrame({'feature_score': benchmark_model["RandomForest"].feature_importances_, 
                              'feature_name': train.columns[:-1]}).sort_values(
                                  by='feature_score', ascending=False)
plt.figure(figsize=(10, 5))
plt.bar(x='feature_name', height='feature_score', data=feature_score)

plt.title('feature_score')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Check feature correlation
def plot_corr_map(df: pd.DataFrame) -> None:
    """
    Draw an annotated heatmap of the pairwise column correlations
    :param df: input dataframe
    """
    plt.figure(figsize=(36, 14))

    # correlation matrix rendered directly as the heatmap input
    sns.heatmap(df.corr(), annot=True)
    plt.show()

def get_large_correlation(df: pd.DataFrame, top_corr: int) -> None:
    """
    Plot the correlation heatmap, then print the strongest absolute correlations
    :param df: input dataframe
    :param top_corr: number of correlation pairs to print
    """
    # plot corr matrix
    plot_corr_map(df)

    print("Let's select some correlations automatically")
    # absolute correlation of every ordered column pair
    pair_corr = df.corr().abs().unstack()

    # self-pairs and one of each mirrored (a, b) / (b, a) pair are redundant
    names = df.columns
    redundant = {(names[row], names[col])
                 for row in range(df.shape[1])
                 for col in range(row + 1)}
    pair_corr = pair_corr.drop(labels=redundant)

    # report the largest remaining correlations
    print(f" Top {top_corr} Correlations ".center(60, '*'))
    print(pair_corr.sort_values(ascending=False)[:top_corr])
    return None

get_large_correlation(fraud_data, 20)

<font color='#00AA00'>
1. No pair of variables is strongly correlated; <br />
2. The pairs "V2, Amount", "V5, Amount", "V7, Amount" and "V20, Amount" are weakly correlated; <br />
3. The pairs "V14, Class" and "V17, Class" are weakly correlated.
</font>

In [None]:
# Check numerical distribution
def check_numerical_distribution(df: pd.DataFrame, col: str) -> None:
    """
    Print the share of fraud rows (Class == 1) among the outliers of a column.

    Outliers are the values beyond 1.5 times the interquartile range below the
    first quartile or above the third quartile.

    :param df: dataframe holding ``col`` and a ``Class`` column
    :param col: name of the numerical column to inspect
    :return: None; the result is printed (``nan`` when the column has no outliers)
    """
    quant1 = df[col].quantile(0.25)
    quant3 = df[col].quantile(0.75)
    fence = (quant3 - quant1) * 1.5

    is_outlier = (df[col] < quant1 - fence) | (df[col] > quant3 + fence)

    n_outliers = int(is_outlier.sum())
    n_fraud_outliers = int((is_outlier & (df['Class'] == 1)).sum())
    # a column without any outlier has no defined ratio; avoid dividing by zero
    ratio = n_fraud_outliers / n_outliers if n_outliers else float("nan")

    print(f"for {col} distribution ratio between fraud and normal is "
          f"{ratio}")

# Report the fraud share among outliers for V1..V28 and Amount
for col in [*(f"V{idx}" for idx in range(1, 29)), "Amount"]:
    check_numerical_distribution(fraud_data, col)

<font color='#00AA00'>
The distributions of the variables differ considerably between fraud and normal transactions (trx).
</font>

In [None]:
class Preprocessing():
    """Column-level clean-up applied before the dataset is split."""

    @staticmethod
    def remove_corr_col(df: pd.DataFrame) -> pd.DataFrame:
        """Return a new dataframe without the columns V2, V3, V5, V7 and V20."""
        dropped = ["V2", "V3", "V5", "V7", "V20"]
        return df.drop(dropped, axis=1)

class FeatureEngineering():
    """Static helpers for scaling, outlier flags, class re-balancing and autoencoder features."""

    @staticmethod
    def normalization(train_: pd.DataFrame, dev_: pd.DataFrame,
                      test_: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Standardise every column except ``Class``.

        The scaler is fitted on the training split only and then applied to dev
        and test, so all three splits share one scale and no statistics of the
        evaluation data are used.

        :return: scaled copies of (train, dev, test); the inputs are not modified
        """
        train_, dev_, test_ = train_.copy(), dev_.copy(), test_.copy()

        scaler = StandardScaler()
        cols_to_norm = [col for col in train_.columns if col not in ["Class"]]
        train_[cols_to_norm] = scaler.fit_transform(train_[cols_to_norm])

        # reuse the statistics learned on train instead of re-fitting
        dev_[cols_to_norm] = scaler.transform(dev_[cols_to_norm])
        test_[cols_to_norm] = scaler.transform(test_[cols_to_norm])

        return train_, dev_, test_

    @staticmethod
    def new_feature(train_: pd.DataFrame, dev_: pd.DataFrame,
                    test_: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Add a binary ``<col>_outlier`` flag for Time, V1..V28 and Amount.

        A value is flagged (1) when it lies more than 1.5 times the interquartile
        range outside the quartiles of the *training* column; the same thresholds
        are applied to dev and test. Candidate columns that are absent from the
        training frame (e.g. already dropped upstream) are skipped.

        :return: copies of (train, dev, test) with the new columns appended
        """
        train_, dev_, test_ = train_.copy(), dev_.copy(), test_.copy()

        def _flag_outliers(col: str) -> None:
            """Append the outlier flag of one column to all three splits."""
            quant1 = train_[col].quantile(0.25)
            quant3 = train_[col].quantile(0.75)
            fence = (quant3 - quant1) * 1.5
            lower, upper = quant1 - fence, quant3 + fence

            new_col = col + "_outlier"
            for part in (train_, dev_, test_):
                # use the boolean mask itself (not its index) so only true outliers get 1
                part[new_col] = ((part[col] < lower) | (part[col] > upper)).astype(int)

        # outlier since these have different distribution between fraud and normal trx
        candidates = ["Time"] + [f"V{idx}" for idx in range(1, 29)] + ["Amount"]
        for col in candidates:
            if col in train_.columns:
                _flag_outliers(col)

        return train_, dev_, test_

    @staticmethod
    def resample_with_smote(df: pd.DataFrame, random_s: int) -> pd.DataFrame:
        """Re-balance the training data with SMOTE and return features joined with ``Class``."""
        t_x = df.drop(columns="Class")
        t_y = df[["Class"]]

        t_x_new, t_y_new = SMOTE(random_state=random_s).fit_resample(t_x, t_y)
        resample_df = t_x_new.join(t_y_new)
        return resample_df

    @staticmethod
    def down_sample(df: pd.DataFrame, random_s: int) -> pd.DataFrame:
        """Randomly under-sample the normal rows down to the number of fraud rows."""
        fraud_trx = df.loc[df["Class"] == 1]
        normal_trx = df.drop(fraud_trx.index)

        normal_trx = resample(normal_trx, replace=False, n_samples=len(fraud_trx), random_state=random_s)

        resample_df = pd.concat([fraud_trx, normal_trx])
        return resample_df

    @staticmethod
    def over_sample(df: pd.DataFrame, random_s: int) -> pd.DataFrame:
        """Randomly over-sample the fraud rows up to the number of normal rows."""
        fraud_trx = df.loc[df["Class"] == 1]
        normal_trx = df.drop(fraud_trx.index)

        # growing the minority class beyond its own size requires sampling with replacement
        fraud_trx = resample(fraud_trx, replace=True, n_samples=len(normal_trx), random_state=random_s)

        resample_df = pd.concat([fraud_trx, normal_trx])
        return resample_df

    @staticmethod
    def manual_feature(df: pd.DataFrame, random_s: int) -> pd.DataFrame:
        """
        Placeholder for hand-crafted features.

        No manual feature is implemented; the method performs the same random
        under-sampling as ``down_sample`` and is kept for interface compatibility.
        """
        return FeatureEngineering.down_sample(df, random_s)

    @staticmethod
    def autoencoder_feature(x_tr: np.ndarray, x_de: np.ndarray, x_te: np.ndarray, 
                            encode_size: int, random_s: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Train a dense autoencoder on the training features and append its
        bottleneck encoding to each split.

        :param x_tr, x_de, x_te: 2-D feature arrays for train / dev / test
        :param encode_size: width of the bottleneck layer
        :param random_s: random seed applied before the model is built
        :return: (train, dev, test) arrays, each the original features
                 concatenated column-wise with the ``encode_size`` encoded features
        """
        set_seed(random_s)
        # define encoder
        n_inputs = x_tr.shape[1]
        print(f"autoencoder n_inputs {n_inputs}, encode_size {encode_size}")

        visible = Input(shape=(n_inputs,))
        # encoder level 1
        e = Dense(n_inputs*2)(visible)
        e = BatchNormalization()(e)
        e = LeakyReLU()(e)

        # bottleneck
        bottleneck = Dense(encode_size)(e)

        # decoder, level 1
        d = Dense(n_inputs*2)(bottleneck)
        d = BatchNormalization()(d)
        d = LeakyReLU()(d)

        # output layer
        output = Dense(n_inputs, activation="linear")(d)

        # The autoencoder must end at the reconstruction layer: it is trained to
        # reproduce its n_inputs-wide input, which the bottleneck cannot match.
        model = Model(inputs=visible, outputs=output)
        model.compile(optimizer='adam', loss='mse')

        # fit the autoencoder model to reconstruct input
        history = model.fit(x_tr, x_tr, epochs=80, batch_size=256, verbose=0, validation_data=(x_de, x_de))

        # plot loss
        plt.plot(history.history['loss'], label='train')
        plt.plot(history.history['val_loss'], label='val')
        plt.legend()
        plt.show()

        # define an encoder model (without the decoder); it shares the trained layers
        encoder = Model(inputs=visible, outputs=bottleneck)

        # append the encoding to the original features of each split
        x_train_encode, x_dev_encode, x_test_encode = (
            np.concatenate((part, encoder.predict(part)), axis=1)
            for part in (x_tr, x_de, x_te)
        )

        return x_train_encode, x_dev_encode, x_test_encode

def get_df(df: pd.DataFrame,
           random_s: int,
           drop_corr_col: bool=True,
           apply_normalization: bool=True,
           add_feature: bool=True,
           resample: bool=True,
           downsample: bool=False,
           oversample: bool=False,
           use_autoencoder: bool=False,
           encode_size: int=8) -> Tuple[np.ndarray, np.ndarray, np.ndarray,
                                        np.ndarray, np.ndarray, np.ndarray]:
    """
    Data preprocessing: split the data and build model-ready arrays.

    :param df: input df (not modified)
    :param random_s: random seed used for splitting, resampling and the autoencoder
    :param drop_corr_col: drop the columns removed by Preprocessing.remove_corr_col
    :param apply_normalization: standardise the feature columns
    :param add_feature: add the outlier-flag features
    :param resample: re-balance the training split with SMOTE; when set,
                     ``downsample`` is ignored
    :param downsample: randomly under-sample the training split; when applied,
                       ``oversample`` is ignored
    :param oversample: randomly over-sample the training split
    :param use_autoencoder: append autoencoder bottleneck features
    :param encode_size: bottleneck width of the autoencoder
    :return: x_train, y_train, x_dev, y_dev, x_test, y_test
    """
    new_df = df.copy()

    # remove high correlation columns
    if drop_corr_col:
        new_df = Preprocessing.remove_corr_col(new_df)

    # ==========================
    # split data
    train, dev, test = split_df(new_df, random_s)

    # add new features
    if add_feature:
        train, dev, test = FeatureEngineering.new_feature(train, dev, test)

    # re-balance the training split only; SMOTE takes precedence over
    # downsampling, and downsampling disables oversampling
    if resample:
        train = FeatureEngineering.resample_with_smote(train, random_s)
    elif downsample:
        oversample = False
        train = FeatureEngineering.down_sample(train, random_s)

    if oversample:
        train = FeatureEngineering.over_sample(train, random_s)

    # scaler
    if apply_normalization:
        train, dev, test = FeatureEngineering.normalization(train, dev, test)

    # df to array as input
    x_train, y_train = split_feature_target(train, 'Class')
    x_dev, y_dev = split_feature_target(dev, 'Class')
    x_test, y_test = split_feature_target(test, 'Class')

    # apply autoencoder
    if use_autoencoder:
        # seed with the caller-supplied value rather than the notebook-level global
        x_train, x_dev, x_test = FeatureEngineering.autoencoder_feature(
            x_train, x_dev, x_test, encode_size=encode_size, random_s=random_s
        )

    return x_train, y_train, x_dev, y_dev, x_test, y_test

## Check model performance with/without feature engineering and resampling techniques

In [None]:
# Benchmark: drop correlated columns / scale / no resampling
# (get_df defaults keep drop_corr_col and apply_normalization on; every resampling flag is off)
# NOTE(review): the "_ds" suffix and variable name suggest downsampling, which is not applied here.
x_train, y_train, x_dev, y_dev, x_test, y_test = get_df(fraud_data, random_s=random_seed, add_feature=False,
                                                        resample=False, downsample=False)

metrics_sc_rm_var_rm_col_ds, _ = benchmark_models(
    train_x=x_train, dev_x=x_dev, train_y=y_train, dev_y=y_dev,
    suffix="sc_rm_var_rm_col_ds",
    models_dict={"LReg": LogisticRegression(),
                 "LR_balance": LogisticRegression(class_weight="balanced"),
                 "LR_balance_libl": LogisticRegression(solver="liblinear",
                                                       class_weight="balanced"),
                 "DecisionTree": DecisionTreeClassifier(criterion="entropy", max_depth=5),
                 "KNN": KNeighborsClassifier(),
                 "GaussianNB": GaussianNB(),
                 "RandomForest": RandomForestClassifier(n_estimators=50),
                 "XGB": XGBClassifier(n_estimators=50, eval_metric="logloss"),
                 "GBT": GradientBoostingClassifier(n_estimators=50),
                 "LightGBM": LGBMClassifier(n_estimators=50)},
    random_s=random_seed)

# compare against the raw-data baseline
check_score(raw_metrics, metrics_sc_rm_var_rm_col_ds)
metrics_sc_rm_var_rm_col_ds

In [None]:
# Benchmark: drop correlated columns / scale / random oversampling (oversample=True)
# NOTE(review): despite the "_ds" suffix and variable name, this run oversamples;
# it also overwrites the metrics variable assigned by the no-resampling benchmark.
x_train, y_train, x_dev, y_dev, x_test, y_test = get_df(fraud_data, random_s=random_seed, add_feature=False,
                                                        resample=False, oversample=True)

metrics_sc_rm_var_rm_col_ds, _ = benchmark_models(
    train_x=x_train, dev_x=x_dev, train_y=y_train, dev_y=y_dev,
    suffix="sc_rm_var_rm_col_ds",
    models_dict={"LReg": LogisticRegression(),
                 "LR_balance": LogisticRegression(class_weight="balanced"),
                 "LR_balance_libl": LogisticRegression(solver="liblinear",
                                                       class_weight="balanced"),
                 "DecisionTree": DecisionTreeClassifier(criterion="entropy", max_depth=5),
                 "KNN": KNeighborsClassifier(),
                 "GaussianNB": GaussianNB(),
                 "RandomForest": RandomForestClassifier(n_estimators=50),
                 "XGB": XGBClassifier(n_estimators=50, eval_metric="logloss"),
                 "GBT": GradientBoostingClassifier(n_estimators=50),
                 "LightGBM": LGBMClassifier(n_estimators=50)},
    random_s=random_seed)

# compare against the raw-data baseline
check_score(raw_metrics, metrics_sc_rm_var_rm_col_ds)
metrics_sc_rm_var_rm_col_ds

In [None]:
# Benchmark: same pipeline as the oversampling run, plus 16 autoencoder
# bottleneck features appended to the inputs (use_autoencoder=True, encode_size=16)
x_train, y_train, x_dev, y_dev, x_test, y_test = get_df(fraud_data, random_s=random_seed, add_feature=False,
                                                        resample=False, oversample=True, use_autoencoder=True,
                                                        encode_size=16)

metrics_sc_rm_var_rm_col_ds_autoe, _ = benchmark_models(
    train_x=x_train, dev_x=x_dev, train_y=y_train, dev_y=y_dev,
    suffix="sc_rm_var_rm_col_ds",
    models_dict={"LReg": LogisticRegression(),
                 "LR_balance": LogisticRegression(class_weight="balanced"),
                 "LR_balance_libl": LogisticRegression(solver="liblinear",
                                                       class_weight="balanced"),
                 "DecisionTree": DecisionTreeClassifier(criterion="entropy", max_depth=5),
                 "KNN": KNeighborsClassifier(),
                 "GaussianNB": GaussianNB(),
                 "RandomForest": RandomForestClassifier(n_estimators=50),
                 "XGB": XGBClassifier(n_estimators=50, eval_metric="logloss"),
                 "GBT": GradientBoostingClassifier(n_estimators=50),
                 "LightGBM": LGBMClassifier(n_estimators=50)},
    random_s=random_seed)

# compare against the most recent metrics computed without the autoencoder features
check_score(metrics_sc_rm_var_rm_col_ds, metrics_sc_rm_var_rm_col_ds_autoe)
metrics_sc_rm_var_rm_col_ds_autoe

## Try Embedding

In [None]:
def get_cat_data():
    """
    Build categorical (binned, label-encoded) train and dev frames for the embedding model.

    Every feature is standardised with a scaler fitted on train, cut into
    integer-coded ranges derived from the training column, and finally
    label-encoded jointly over train and dev.

    :return: (train, dev) dataframes holding ``<col>_range`` codes and ``Class``
    """
    cat_fraud_data = fraud_data.copy()

    # keep a subset of the columns
    # NOTE(review): the previous comment listed "V2", "V3", "V5", "V7", "V20" as
    # removed, but this selection omits V2, V4, V7 and V20 — confirm the intended set.
    cat_fraud_data = cat_fraud_data[["Time", "V1", "V3", "V5", "V6", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V28", "Amount", "Class"]]

    train, dev, _ = split_df(cat_fraud_data, random_seed)

    cols_to_norm = [x for x in cat_fraud_data.columns if x not in ["Class"]]
    scaler = StandardScaler()
    train[cols_to_norm] = scaler.fit_transform(train[cols_to_norm])
    # apply the train-fitted scaler; re-fitting on dev would put it on a different scale
    dev[cols_to_norm] = scaler.transform(dev[cols_to_norm])

    # iterate each column, get new feature.
    for col in cols_to_norm:
        new_col = col + "_range"
        max_v = train[col].max()
        min_v = train[col].min()
        std_v = train[col].std()
        step_v = max(int(max(max_v-min_v, 2)) / 10, 1)

        # bin edges and upper bound come from the training column and are shared with dev
        bins = range(int(min_v+std_v), int(max_v-std_v), int(step_v))
        upper = int(max_v-std_v)

        # change numeric to category
        train[new_col] = pd.cut(train[col], bins).cat.codes
        # NOTE(review): this compares bin *codes* with a value-space bound; it was
        # presumably meant to test train[col] — kept unchanged, confirm intent.
        train.loc[train[new_col] > upper, new_col] = train[new_col].max()+1
        train.drop(columns=col, inplace=True)

        dev[new_col] = pd.cut(dev[col], bins).cat.codes
        dev.loc[dev[new_col] > upper, new_col] = dev[new_col].max()+1

        # Map every dev code unseen in train onto the next larger train code
        # (or the largest one when none is larger), so the remap always
        # terminates and always lands on a code present in train.
        train_codes = np.sort(train[new_col].unique())
        for code in dev[new_col].unique():
            if code in train_codes:
                continue
            larger = train_codes[train_codes > code]
            replacement = larger[0] if len(larger) else train_codes[-1]
            dev.loc[dev[new_col]==code, new_col] = replacement
        dev.drop(columns=col, inplace=True)

    # labelencoder, fitted jointly so train and dev share one code space
    train["indi"] = 1
    dev["indi"] = 0
    data = pd.concat([train, dev])
    # the split indicator is bookkeeping, not a feature, so it is left untouched
    for col in [x for x in data.columns if x not in ["Class", "indi"]]:
        lbl = LabelEncoder()
        data[col] = lbl.fit_transform(data[col].fillna("-1").astype(str).values)

    train = data.loc[data["indi"]==1].drop(columns="indi")
    # dev rows carry indicator 0; selecting 1 here would return the training rows again
    dev = data.loc[data["indi"]==0].drop(columns="indi")

    return train, dev

# auc for tf
def auc(y_t, y_pred):
    """
    ROC AUC metric usable from Keras, computed with scikit-learn via tf.py_function.

    :param y_t: true labels of the batch
    :param y_pred: predicted scores of the batch
    :return: tf.double tensor holding the AUC, or 0.5 when it cannot be computed
    """
    def fallback_auc(y_t, y_pred):
        try:
            return roc_auc_score(y_t, y_pred)
        except ValueError:
            # roc_auc_score rejects inputs it cannot score (e.g. a batch holding a
            # single class); report the chance level instead of failing the epoch.
            # Other exceptions are no longer swallowed.
            return 0.5
    return tf.py_function(fallback_auc, (y_t, y_pred), tf.double)


def create_model(df, cols, hidden_units: int = None):
    """
    Build an entity-embedding classifier with one embedding per categorical feature.

    :param df: dataframe used to count the distinct values of each feature
    :param cols: names of the (integer-coded) feature columns; one model input each
    :param hidden_units: width of the two dense layers; when omitted it defaults to
                         twice the number of distinct values of the last feature
                         in ``cols`` (the width the original implementation used)
    :return: uncompiled Keras model with a 2-unit softmax output
    :raises ValueError: if ``cols`` is empty
    """
    inputs = []
    outputs = []
    last_cardinality = None

    # each feature get itself embedding
    for c in cols:
        num_unique_values = int(df[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 8))

        inp = Input(shape=(1,))
        out = Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        out = SpatialDropout1D(0.2)(out)
        out = Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
        last_cardinality = num_unique_values

    if not inputs:
        raise ValueError("cols must contain at least one feature name")

    if hidden_units is None:
        # legacy default, now explicit instead of relying on a leaked loop variable
        hidden_units = last_cardinality * 2

    # all feature together become one layer
    x = Concatenate()(outputs)
    x = BatchNormalization()(x)

    # two identical dense blocks
    for _ in range(2):
        x = Dense(hidden_units, activation="relu")(x)
        x = Dropout(0.2)(x)
        x = BatchNormalization()(x)

    # output
    y = Dense(2, activation="softmax")(x)

    model = Model(inputs=inputs, outputs=y)
    return model


def embedding_model():
    """
    Train the embedding classifier on the categorical data and print its AUC.

    The second frame returned by get_cat_data serves both as validation data
    during training and as the evaluation set for the printed score.
    """
    # get train and test data
    X_train, X_test = get_cat_data()
    features = [x for x in X_train.columns if x not in ["Class"]]

    # init model
    model = create_model(pd.concat([X_train, X_test]), features)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])

    def _as_model_inputs(frame):
        """Return (list with one 1-D array per feature, label array) for a frame."""
        frame = frame.reset_index(drop=True)
        values = frame.loc[:, features].values
        return [values[:, k] for k in range(values.shape[1])], frame.Class.values

    X_train, y_train = _as_model_inputs(X_train)
    X_test, y_test = _as_model_inputs(X_test)

    # callback for early stop and reduce learning rate
    estop = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=3,
                                    verbose=0, mode="max", baseline=None, restore_best_weights=True)
    rlearn = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=3, 
                                    min_lr=1e-5, mode='max', verbose=0)

    # fit model
    model.fit(X_train, to_categorical(y_train), 
              validation_data=(X_test, to_categorical(y_test)),
              verbose=1, batch_size=512, callbacks=[estop, rlearn], epochs=30)

    # check auc score; column 1 of the softmax output is the positive class
    test_pred = model.predict(X_test)[:, 1]
    print(f"AUC {roc_auc_score(y_test, test_pred)}")
    # `backend` is the Keras backend module imported at the top of the notebook
    backend.clear_session()

embedding_model()