In [1]:
# Based on https://github.com/GoogleCloudPlatform/ml-design-patterns/blob/master/03_problem_representation/rebalancing.ipynb.
#   !gsutil cp gs://ml-design-patterns/fraud_data_kaggle.csv .

import xgboost as xgb
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import tensorflow as tf
import kerastuner as kt
from tensorflow import keras

  import kerastuner as kt


In [2]:
# Random seed reused by the train/test split, the down-sampling and the shuffle below.
SEED = 42
# Row limit handed to pd.read_csv(nrows=...); None reads the whole CSV.
NROWS = None 

def process_data(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw transaction frame into standardized features plus the label.

    The identifier columns and ``isFlaggedFraud`` are removed, the remaining
    non-numeric columns are one-hot encoded, and every feature column is
    standardized with ``StandardScaler``. The ``isFraud`` label is kept
    unscaled and appended as the last column.

    The input frame is NOT modified (the previous implementation dropped
    columns in place, so calling it twice on the same frame raised KeyError).

    Args:
        df: Raw data containing at least ``nameOrig``, ``nameDest``,
            ``isFlaggedFraud`` and ``isFraud``.

    Returns:
        A new DataFrame with the scaled feature columns followed by ``isFraud``.
    """
    # drop() without inplace returns a new frame and leaves the caller's df intact.
    encoded = pd.get_dummies(df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud']))

    target = encoded['isFraud']
    features = encoded.drop(columns=['isFraud'])

    # NOTE(review): the scaler is fitted on all rows, i.e. before the
    # train/test split done later in train_split -- confirm this is intended.
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    scaled['isFraud'] = target
    return scaled

def fraud_rate_df(df: pd.DataFrame) -> float:
    """Return the fraction of rows in ``df`` whose ``isFraud`` column equals 1."""
    is_fraud = df['isFraud'] == 1
    # int() keeps the division in plain Python numbers, as len()/len() was.
    n_fraud = int(is_fraud.sum())
    n_rows = len(df)
    return n_fraud / n_rows

def fraud_rate(y) -> float:
    """Return the fraction of entries in the label array ``y`` that equal 1."""
    fraud_mask = np.asarray(y == 1)
    fraud_positions = fraud_mask.nonzero()[0]
    n_fraud = len(fraud_positions)
    return n_fraud / len(y) # type: ignore

def train_split(df: pd.DataFrame):
    """Split ``df`` into a training frame and a held-out test set (80/20).

    The split is stratified on ``isFraud`` and seeded with ``SEED``.

    Args:
        df: Processed data including the ``isFraud`` label column.

    Returns:
        A tuple ``(train_df, X_test, y_test)`` where ``train_df`` still
        contains the ``isFraud`` column, ``X_test`` is the test frame without
        it, and ``y_test`` holds the matching test labels.
    """
    y = df.isFraud.values
    # The whole frame (label included) is split so train_df keeps isFraud for
    # the later down-sampling; the train labels returned separately are unused.
    X_train, test_df, _, y_test = train_test_split(df, y, stratify=y, test_size=0.2, random_state=SEED)
    train_df = pd.DataFrame(data=X_train, columns=df.columns)
    # Non-mutating drop: the split result is derived from df, and dropping in
    # place on such a frame may emit SettingWithCopyWarning on some pandas versions.
    X_test = test_df.drop(columns=['isFraud'])
    return train_df, X_test, y_test

# This is from the sklearn docs
# https://scikit-learn.org/0.18/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: Square confusion matrix (rows = true labels, columns = predicted
            labels), e.g. the result of ``confusion_matrix``.
        classes: Tick labels, one per class, in matrix order.
        normalize: If True, each row is divided by its total so the cells
            show per-true-class fractions.
        title: Plot title.
        cmap: Matplotlib colormap used for the heat map.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class without any true samples has a zero row total; keep that row
        # at 0 instead of dividing by zero, which would put NaN in the cells
        # and in the text-colour threshold below.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    # Raw counts are shown as integers; fractions keep three decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.3f'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, f'{cm[i, j]:{cell_format}}',
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the axis labels are taken into account by the layout.
    plt.tight_layout()

def _down_sample_majority(train_df: pd.DataFrame, frac: float) -> pd.DataFrame:
    """Keep every fraud row plus a random ``frac`` of the non-fraud rows, shuffled."""
    fraud = train_df[train_df['isFraud'] == 1]
    not_fraud = train_df[train_df['isFraud'] == 0]
    not_fraud_sample = not_fraud.sample(random_state=SEED, frac=frac)
    return shuffle(pd.concat([not_fraud_sample, fraud]), random_state=SEED)


def _build_baseline_model(n_features: int, output_bias_initializer):
    """Build the fixed network: one 10-unit ReLU hidden layer and a sigmoid output."""
    return keras.Sequential([
        keras.layers.Dense(10, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias_initializer)
    ])


def _report_metrics(y_test, y_pred) -> None:
    """Print accuracy/precision/recall/F1 and plot the normalized confusion matrix."""
    print(f'label={y_test} pred={y_pred}')
    print(f'accuracy_score: {accuracy_score(y_test, y_pred):.4f}')
    print(f'precision_score: {precision_score(y_test, y_pred):.4f}')
    print(f'recall_score: {recall_score(y_test, y_pred):.4f}')
    print(f'f1_score: {f1_score(y_test, y_pred):.4f}')
    cm = confusion_matrix(y_test, y_pred)
    print(f'cm={cm}')

    classes = ['not fraud', 'fraud']
    plot_confusion_matrix(cm, classes, normalize=True)


def train_and_eval(df: pd.DataFrame, should_down_sample: bool=False, frac: float=1.0, use_class_weight=False, initialize_bias=False):
    """Train a small Keras fraud classifier and report its test metrics.

    Steps: stratified train/test split, optional down-sampling of the
    non-fraud training rows, optional class weighting, a KerasTuner Bayesian
    search over 1-2 hidden layers, then training and evaluation of the fixed
    baseline model (metrics are printed and a confusion matrix is plotted).

    Args:
        df: Processed data including the ``isFraud`` label column.
        should_down_sample: If True, keep only a fraction ``frac`` of the
            non-fraud training rows (all fraud rows are kept).
        frac: Fraction of non-fraud training rows kept when down-sampling.
        use_class_weight: If True, weight each class by ``1 / (2 * class share)``
            in the final fit.
        initialize_bias: If True, initialize the output-layer bias to
            ``log(n_fraud / n_not_fraud)`` of the training set; otherwise the
            Keras default (zeros) is used.

    Raises:
        ValueError: If ``initialize_bias`` is True and the training set lacks
            one of the two classes (the log-odds would be undefined).
    """
    train_df, X_test, y_test = train_split(df)

    # down_sample
    if should_down_sample:
        print(f"Down sample train {frac}")
        train_df = _down_sample_majority(train_df, frac)
    else:
        print("Original train")

    X_train = train_df.drop(columns=['isFraud']).values
    y_train = train_df.isFraud.values
    n_features = X_train.shape[1]

    print(f'Fraud Rate in (all, train, test)=({fraud_rate_df(df):.5f}, {fraud_rate(y_train):.5f}, {fraud_rate(y_test):.5f})')

    if use_class_weight:
        train_fraud_rate = fraud_rate(y_train)
        minor_weight = 1 / train_fraud_rate / 2
        major_weight = 1 / (1 - train_fraud_rate) / 2
        class_weight = {1: minor_weight, 0: major_weight}
    else:
        class_weight = None

    # The two branches used to be swapped (initialize_bias=True built the model
    # WITHOUT the bias initializer) and bias_value only existed in one of them,
    # so build_model raised NameError for initialize_bias=True.
    if initialize_bias:
        len_minority = len(train_df[train_df['isFraud'] == 1])
        len_majority = len(train_df[train_df['isFraud'] == 0])
        if len_minority == 0 or len_majority == 0:
            raise ValueError(
                'initialize_bias=True needs both classes in the training data '
                f'(fraud={len_minority}, not fraud={len_majority})'
            )
        # Start the sigmoid output at the training-set log-odds of fraud.
        bias_value = np.log(len_minority / len_majority)
        output_bias_initializer = keras.initializers.Constant(value=bias_value)
    else:
        output_bias_initializer = 'zeros'  # Keras' default bias initializer

    model = _build_baseline_model(n_features, output_bias_initializer)

    def build_model(hp):
        """Hypermodel for the tuner: 1-2 hidden ReLU layers with tunable widths."""
        tuned_model = keras.Sequential()
        for i in range(hp.Int('num_layers', 1, 2)):
            # Only the first layer describes the model input.
            input_kwargs = {'input_dim': n_features} if i == 0 else {}
            tuned_model.add(keras.layers.Dense(hp.Int(f'hidden_{i}th', 16, 256, step=32), kernel_initializer='normal', activation='relu', **input_kwargs))
        tuned_model.add(keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias_initializer))
        tuned_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return tuned_model

    tuner = kt.BayesianOptimization(
        build_model,
        objective='val_accuracy',
        max_trials=30
    )

    tuner.search(X_train, y_train, validation_split=0.1, epochs=10)
    best_hps = tuner.get_best_hyperparameters(num_trials = 1)[0]
    # results_summary() prints its report and returns None, so it must not be
    # passed to print().
    tuner.results_summary()
    print("best_hps", best_hps.values)

    # NOTE(review): the search result above is only reported; the model that is
    # trained and evaluated below is still the fixed baseline. Confirm whether
    # the tuned architecture should be used instead.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=2, batch_size=512, verbose=0, class_weight=class_weight)

    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
    y_pred = model.predict(X_test)
    y_pred = y_pred >= 0.5
    y_pred = y_pred.reshape(-1).astype(int)

    _report_metrics(y_test, y_pred)

In [3]:
# Read the transactions (every row when NROWS is None) and convert them
# straight into the scaled feature frame used by the training cells.
df = process_data(
    pd.read_csv('/kaggle/input/fraud_data_kaggle.csv', nrows=NROWS)
)
df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,isFraud
0,-1.703042,-0.28156,-0.22981,-0.237622,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,1.399036,-0.302345,0
1,-1.703042,-0.294767,-0.281359,-0.285812,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,1.399036,-0.302345,0
2,-1.703042,-0.297555,-0.288654,-0.292442,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,-0.714778,3.307478,1
3,-1.703042,-0.297555,-0.288654,-0.292442,-0.317582,-0.333411,-0.530965,1.357803,-0.08096,-0.714778,-0.302345,1
4,-1.703042,-0.278532,-0.274329,-0.282221,-0.323814,-0.333411,-0.530965,-0.736484,-0.08096,1.399036,-0.302345,0


In [4]:
# Train on the full (not down-sampled) training split and pass Keras a
# class_weight derived from the isFraud rate of the training labels.
train_and_eval(df, use_class_weight=True)

Trial 6 Complete [06h 51m 06s]
val_accuracy: 0.9995146989822388

Best val_accuracy So Far: 0.9995619058609009
Total elapsed time: 11h 17m 39s

Search: Running Trial #7

Hyperparameter    |Value             |Best Value So Far 
num_layers        |1                 |1                 
hidden_0th        |176               |176               
hidden_1th        |80                |None              

Epoch 1/10
    12/143159 [..............................] - ETA: 11:10 - loss: 0.0186 - accuracy: 0.9974  

2021-12-04 05:34:31.226759: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-12-04 06:14:34.458701: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10