Hi all,

This is my first notebook. I've studied a few notebooks on this dataset, and I learned a lot from this one in particular: https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets, written by Janio Martinez. Most importantly, I learned that you need to split the data into train/test sets first, and only then do undersampling or oversampling.

In this notebook I'll do the following:

1. Load and scale the data (there are already plenty of visualization studies of this dataset, so I'll skip that part). I know from other studies that the data is clean.
2. Split the data.
3. Train a VAE only on the fraud cases of the training data.
4. Use the VAE to generate as many fraud samples as necessary to match the number of non-fraud cases, and build an augmented data set.
5. Train an ensemble classifier.
6. Test on out-of-sample data using different metrics.
7. Wait for comments and critiques.



Import a few packages, classifiers, metrics, etc.

In [ ]:
from os.path import exists
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, average_precision_score, precision_recall_curve
import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle
import matplotlib.pyplot as plt

I like to put everything in a function for easier handling of data. I also make sure that the output is treated uniformly across experiments.

Here is what I'm doing in this function:
1. load the data
2. scale using StandardScaler in sklearn
3. go through each column except the 'Class' and scale it. Drop the column and replace it with a new name that has a prefix 'scaled_'
4. Put everything in the X data except 'Class' column that goes to y
5. split data
6. return as dataframe

In [ ]:
def prepare_data(path, random_state=42):
    """
    Load the data set, split it into train/test parts and standardize the features.

    The split is done first and the scaler is fitted on the training rows only,
    so that no statistics of the test rows leak into the training data. Every
    column except 'Class' is replaced by a standardized copy whose name carries
    the prefix 'scaled_'.

    parameters:
    path (str): The path to the input data set
    random_state (int): A seed to set the random state of the split

    Returns:
    pandas.DataFrame: The train data set (scaled feature columns followed by 'Class')
    pandas.DataFrame: The test data set (scaled feature columns followed by 'Class')
    """
    df = pd.read_csv(path)
    feature_columns = df.columns.drop(['Class'])
    X = df[feature_columns]
    y = df['Class']
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Fit on the training split only; the test split is transformed with the
    # mean/std learned from the training rows.
    std_scaler = StandardScaler().fit(xtrain)
    scaled_names = ['scaled_' + str(col) for col in feature_columns]
    xtrain = pd.DataFrame(std_scaler.transform(xtrain), columns=scaled_names, index=xtrain.index)
    xtest = pd.DataFrame(std_scaler.transform(xtest), columns=scaled_names, index=xtest.index)
    return pd.concat((xtrain, ytrain), axis=1), pd.concat((xtest, ytest), axis=1)

A simple function that returns the features (X) of the fraud cases for the train and the test data.

In [ ]:
def fraud_or_not(dftrain, dftest):
    """
    Extract the shuffled feature rows of the fraud cases (Class == 1).

    parameters:
    dftrain (pandas.DataFrame): The train data frame, including the 'Class' column
    dftest (pandas.DataFrame): The test data frame, including the 'Class' column

    Returns:
    pandas.DataFrame: The fraud rows of the train data frame, without 'Class'
    pandas.DataFrame: The fraud rows of the test data frame, without 'Class'
    """
    def _shuffled_fraud_features(frame):
        # keep the positive class only, shuffle the rows, then drop the label
        positives = frame[frame['Class'] == 1]
        return positives.sample(frac=1).drop(columns='Class')

    train_fraud = _shuffled_fraud_features(dftrain)
    test_fraud = _shuffled_fraud_features(dftest)
    return train_fraud, test_fraud

Before we go any further, we want to define our VAE class and a training function for the VAE. Read the inline comments.

In [ ]:
import tensorflow as tf
from tensorflow.keras.layers import InputLayer, Dense, Flatten, Reshape
from tensorflow.keras import Sequential
import time
import numpy as np

# This code is adapted from the TensorFlow tutorial on VAEs. The CNN used there is replaced by a simple fully-connected network, which is more appropriate for the tabular data here.

# ReLU is pretty much the default activation these days; alias it so the
# layer definitions below stay short.
relu = tf.nn.relu


class VAE(tf.keras.Model):
    """
    A VAE class inherited from keras.Model, built from two small dense networks.

    parameters:
    ndim (int): number of dimensions of the input data
    latent_dim (int): number of dimensions of the latent variable

    attributes:
    ndim (int): number of dimensions of the input
    latent_dim (int): number of dimensions of the latent variable
    inference_net (keras.Sequential): The inference model that takes an input of size=(None, ndim) and returns a matrix of size=(None, 2 * latent_dim) (mean and log variance side by side)
    generative_net (keras.Sequential): The generative model that takes an input of size=(None, latent_dim) and returns a matrix of size=(None, ndim)
    """
    def __init__(self, ndim, latent_dim):
        super().__init__()
        self.latent_dim = latent_dim
        self.ndim = ndim
        # Encoder: the last layer has no activation and outputs the mean and
        # the log variance concatenated, hence 2 * latent_dim units.
        self.inference_net = Sequential(
            [
                InputLayer(input_shape=(ndim,)),
                Dense(100, activation=relu),
                Dense(2 * latent_dim)
            ]
        )

        # Decoder: maps a latent vector back to the input space (linear output).
        self.generative_net = Sequential(
            [
                InputLayer(input_shape=(latent_dim,)),
                Dense(100, activation=relu),
                Dense(ndim)
            ])

    @tf.function
    def sample(self, num_samples=100, eps=None):
        """
        Given an input noise of size (num_samples, latent_dim), generate samples of size (num_samples, ndim)

        parameters:
        num_samples (int): number of samples
        eps (tensor or array): input noise. if specified, num_samples is ignored

        returns:
        tensor: the decoded samples
        """
        if eps is None:
            eps = tf.random.normal(shape=(num_samples, self.latent_dim))
        return self.decode(eps)

    def encode(self, x):
        """
        Run the inference network and split its output into mean and log variance

        parameters:
        x (tensor or array): the input data with size (None, ndim)

        returns:
        tensor: the mean of the latent variables, size (None, latent_dim)
        tensor: the log variance of the latent variables, size (None, latent_dim)
        """
        mean, logvar = tf.split(self.inference_net(x), num_or_size_splits=2, axis=1)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        """
        Reparameterize the input for backpropagation

        parameters:
        mean (tensor): the mean of the latent variables
        logvar (tensor): the log variance of the latent variables

        returns:
        tensor: the noise samples from a normal distribution around mean with standard deviation exp(logvar / 2)
        """
        # tf.shape is evaluated at run time, so this also works when the batch
        # dimension is not known statically (mean.shape would contain None).
        eps = tf.random.normal(shape=tf.shape(mean))
        return eps * tf.exp(logvar * 0.5) + mean

    def decode(self, z):
        """
        Given an input noise generates the decoded samples

        parameters:
        z (tensor or array): the input noise (None, latent_dim)

        returns:
        tensor: the decoded samples of size (None, ndim)
        """
        return self.generative_net(z)


In [ ]:
# log-density of a diagonal Gaussian; the loss below combines two of these
# (prior and approximate posterior)
def log_normal_pdf(sample, mean, logvar, raxis=1):
    """
    Log-density of `sample` under a Gaussian with the given mean and log variance,
    summed over the axis `raxis`.
    """
    log2pi = tf.math.log(2. * np.pi)
    # squared deviation divided by the variance (exp(-logvar) == 1 / variance)
    scaled_sq_dev = (sample - mean) ** 2. * tf.exp(-logvar)
    per_dimension = -0.5 * (scaled_sq_dev + logvar + log2pi)
    return tf.reduce_sum(per_dimension, axis=raxis)

# the VAE loss: the negative of a single-sample (pseudo-)ELBO, averaged over the batch
@tf.function
def compute_loss(model, x):
    """
    Encode x, draw one latent sample per row, decode it and return the negated
    batch mean of (reconstruction term + log prior - log posterior).
    """
    mu, raw_logvar = model.encode(x)
    # presumably clipped so that exp(+-logvar) stays within float32 range
    log_sigma2 = tf.clip_by_value(raw_logvar, -88., 88.)
    latent = model.reparameterize(mu, log_sigma2)
    reconstruction = model.decode(latent)
    # ad-hoc l2 reconstruction term: close to the log-prob of a Gaussian, but
    # without taking the variance into account
    squared_error = tf.reduce_sum((x - reconstruction) ** 2, axis=1)
    log_prior = log_normal_pdf(latent, 0.0, 0.0)
    log_posterior = log_normal_pdf(latent, mu, log_sigma2)
    elbo_per_row = -squared_error + log_prior - log_posterior
    return -tf.reduce_mean(elbo_per_row)

# One optimisation step: compute the loss on a batch, differentiate it with
# respect to the model parameters and let the optimizer update them
@tf.function
def compute_apply_gradients(model, x, optimizer):
    """
    Apply a single gradient update of compute_loss(model, x) to the model's
    trainable variables. Nothing is returned.
    """
    with tf.GradientTape() as recorder:
        batch_loss = compute_loss(model, x)
    variables = model.trainable_variables
    grads = recorder.gradient(batch_loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

The training function is simple and self-explanatory. I added a few lines so that a previously saved model can be re-loaded whenever we want, instead of being trained again.

In [ ]:
def train(xtrain, xtest, model=None, load=False, filepath=None,
          epochs=2000, latent_dim=20, batch_size=32, learning_rate=1e-3):
    """
    Train a VAE on xtrain (or load previously saved weights) and return it.

    parameters:
    xtrain (pandas.DataFrame): the rows to train on (features only)
    xtest (pandas.DataFrame): the rows used to report the test pseudo-ELBO
    model (VAE): an existing model to continue with; a new VAE is created if None
    load (bool): if True and filepath is given, load the weights and skip training
    filepath (str): path of the saved weights; only used when load is True.
        If load is True but filepath is None, the model is trained instead.
    epochs (int): number of passes over the training data
    latent_dim (int): latent size of a newly created VAE
    batch_size (int): mini-batch size used for training
    learning_rate (float): learning rate of the Adam optimizer

    Returns:
    VAE: the trained (or loaded) model
    """
    num_train, ndim = xtrain.shape
    num_test, _ = xtest.shape
    if model is None:
        model = VAE(ndim, latent_dim)
    if load and filepath is not None:
        model.load_weights(filepath=filepath)
        return model

    # Label the checkpoints with the latent size of the model that is actually
    # trained, which may differ from the argument when a model was passed in.
    latent_dim = getattr(model, 'latent_dim', latent_dim)
    optimizer = tf.keras.optimizers.Adam(learning_rate)
    train_dataset = tf.data.Dataset.from_tensor_slices(
        xtrain.values.astype(np.float32)).shuffle(num_train).batch(batch_size)
    # The whole test set is evaluated as one batch, so no shuffling is needed.
    test_dataset = tf.data.Dataset.from_tensor_slices(
        xtest.values.astype(np.float32)).batch(num_test)

    for epoch in range(1, epochs + 1):
        start_time = time.time()
        for train_x in train_dataset:
            compute_apply_gradients(model, train_x, optimizer)
        end_time = time.time()

        # every 100 epochs: report the test pseudo-ELBO and save a checkpoint
        if epoch % 100 == 0:
            loss = tf.keras.metrics.Mean()
            for test_x in test_dataset:
                loss(compute_loss(model, test_x))
            elbo = -loss.result()
            print('Epoch: {}, Test set psudo-ELBO: {}, '
                  'time elapse for current epoch {}'.format(epoch, elbo, end_time - start_time))
            model.save_weights('saved_models/model_%d_at_%d' % (latent_dim, epoch))
    return model

Once we have the VAE trained, we can pass it to this function and sample from it to increase the number of fraud cases.

In [ ]:
def augment_data(data, model):
    """
    Balance the data set by appending synthetic fraud rows sampled from the VAE.

    parameters:
    data (pandas.DataFrame): feature columns plus a 'Class' column (0 = non-fraud, 1 = fraud)
    model (VAE): trained model; model.sample(num_samples=k) must return an object
        whose .numpy() is an array with k rows and one column per feature

    Returns:
    pandas.DataFrame: the original rows plus the generated fraud rows, shuffled,
        with a fresh integer index. If the fraud class is not the minority,
        the original rows are returned shuffled and nothing is generated.
    """
    counts = data['Class'].value_counts()
    # Plain Python int: the sample method is a traced function and the count
    # is used as a static shape.
    num_samples = int(counts.get(0, 0) - counts.get(1, 0))
    if num_samples <= 0:
        return data.reset_index(drop=True).sample(frac=1)
    samples = model.sample(num_samples=num_samples).numpy()
    dfnew = pd.DataFrame(samples, columns=data.columns.drop('Class'))
    # np.int was removed from NumPy; the builtin int is what it aliased.
    dfnew['Class'] = np.ones(len(samples), dtype=int)
    return pd.concat((data, dfnew), ignore_index=True).sample(frac=1)

In [ ]:
# get the data and split it
# (prepare_data is the loader defined above; the old name
# get_scaled_data_splitted is not defined anywhere in this notebook)
dtrain, dtest = prepare_data(path='creditcard.csv')
# get the fraud cases from train and test
Xf, Xft = fraud_or_not(dtrain, dtest)
# get the trained VAE model (set load=True to re-use the saved weights at filepath)
model = train(Xf, Xft, load=False, filepath='saved_models/model_20_at_1900')
# augment the data using the VAE model
augmented = augment_data(dtrain, model)
X = augmented.drop('Class', axis=1)
y = augmented['Class']
Xt = dtest.drop('Class', axis=1)
yt = dtest['Class']

Now that we have the augmented X, y, we are going to train a classifier. Then we are going to test it on the held-out test set. Be careful: if you're loading a saved VAE model, it may have been trained on rows that are now in your test set, because the model was trained on a specific train/test split, and upon loading you might get a different split.

In [ ]:
classifiers = {
    "XGBClassifier": XGBClassifier(),
    "BaggingClassifier": BaggingClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
}

# number of augmented rows used for the quick 5-fold cross-validation
N = 10000
for key, classifier in classifiers.items():
    print('_' * 50)
    name = key
    print(name)
    if exists(name):
        print('loading...')
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by this notebook.
        with open(name, 'rb') as model_file:
            classifier = pickle.load(model_file)
    else:
        classifier.fit(X, y)
        # cross_val_score below fits clones, so the fitted classifier can be
        # saved right away
        with open(name, 'wb') as model_file:
            pickle.dump(classifier, model_file)
        print('validating ...')
    training_score = cross_val_score(classifier, X[:N], y[:N], cv=5)
    print("Classifiers: ", name, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")
    # accuracy on the augmented training data and on the held-out test data
    print(classifier.score(X, y))
    print(classifier.score(Xt, yt))
    y_pred = classifier.predict(Xt)
    cm = confusion_matrix(yt, y_pred)
    print(cm)
    print(classification_report(yt, y_pred))
    # average precision summarizes the precision-recall curve, so it needs
    # scores (probability of the fraud class), not thresholded 0/1 predictions
    y_score = classifier.predict_proba(Xt)[:, 1]
    print(average_precision_score(yt, y_score))


In [ ]:
def augment_data_interpolation(data, model):
    """
    Balance the data set with synthetic fraud rows obtained by interpolating
    between the latent codes of real fraud rows.

    The fraud rows are encoded, random pairs of latent means are mixed with a
    random weight per latent dimension, and the mixtures are decoded.

    parameters:
    data (pandas.DataFrame): feature columns plus a 'Class' column (0 = non-fraud, 1 = fraud)
    model (VAE): trained model providing encode(x) -> (mean, logvar) and
        decode(z), whose result has a .numpy() method

    Returns:
    pandas.DataFrame: the original rows plus the generated fraud rows, shuffled,
        with a fresh integer index. If the fraud class is not the minority,
        the original rows are returned shuffled and nothing is generated.

    Raises:
    ValueError: if rows must be generated but data contains no fraud rows
    """
    counts = data['Class'].value_counts()
    num_samples = int(counts.get(0, 0) - counts.get(1, 0))
    if num_samples <= 0:
        return data.reset_index(drop=True).sample(frac=1)
    X = data[data['Class'] == 1].drop(['Class'], axis=1)
    if X.empty:
        raise ValueError("data contains no fraud rows (Class == 1) to interpolate between")
    z, _ = model.encode(X.values.astype(np.float32))
    # draw exactly num_samples latent codes (with replacement) ...
    z1 = pd.DataFrame(np.asarray(z)).sample(n=num_samples, replace=True)
    # ... and pair each of them with another one by permuting the same draw
    z2 = z1.sample(frac=1)
    # independent mixing weight in [0, 1) for every latent coordinate
    r = np.random.rand(*z1.shape)
    z = r * z1.values + (1 - r) * z2.values
    samples = model.decode(z.astype(np.float32)).numpy()
    dfnew = pd.DataFrame(samples, columns=data.columns.drop('Class'))
    # np.int was removed from NumPy; the builtin int is what it aliased.
    dfnew['Class'] = np.ones(len(samples), dtype=int)
    return pd.concat((data, dfnew), ignore_index=True).sample(frac=1)

In [ ]:
# augment the training data by interpolating in the latent space of the VAE
# (computed once; a second call whose result was thrown away has been removed)
augmented_interpolate = augment_data_interpolation(dtrain, model)
X = augmented_interpolate.drop('Class', axis=1)
y = augmented_interpolate['Class']
Xt = dtest.drop('Class', axis=1)
yt = dtest['Class']

In [ ]:
classifiers = {
    "XGBClassifier": XGBClassifier(),
    "BaggingClassifier": BaggingClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
}

# number of augmented rows used for the quick 5-fold cross-validation
N = 10000
for key, classifier in classifiers.items():
    print('_' * 50)
    name = key
    print(name)
    # Use a file name of its own for this experiment. The plain classifier
    # name is what the sampling-based experiment saves to, so re-using it here
    # would load those models instead of training on the interpolated data.
    model_path = name + '_interpolation'
    if exists(model_path):
        print('loading...')
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by this notebook.
        with open(model_path, 'rb') as model_file:
            classifier = pickle.load(model_file)
    else:
        classifier.fit(X, y)
        # cross_val_score below fits clones, so the fitted classifier can be
        # saved right away
        with open(model_path, 'wb') as model_file:
            pickle.dump(classifier, model_file)
        print('validating ...')
    training_score = cross_val_score(classifier, X[:N], y[:N], cv=5)
    print("Classifiers: ", name, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")
    # accuracy on the augmented training data and on the held-out test data
    print(classifier.score(X, y))
    print(classifier.score(Xt, yt))
    y_pred = classifier.predict(Xt)
    cm = confusion_matrix(yt, y_pred)
    print(cm)
    print(classification_report(yt, y_pred))
    # average precision summarizes the precision-recall curve, so it needs
    # scores (probability of the fraud class), not thresholded 0/1 predictions
    y_score = classifier.predict_proba(Xt)[:, 1]
    print(average_precision_score(yt, y_score))
