# Classification of fraudulent credit card transactions using PyTorch


In [1]:
import pandas as pd                # for reading csv data
import numpy as np                 # for efficient matrix multiplication
from random import shuffle         # for shuffling our datasets
import matplotlib.pyplot as plt    # for plotting the training process
import torch                       # deep learning framework
from torch.utils.data import random_split

## Download and read the dataset
You can download the dataset from https://www.kaggle.com/mlg-ulb/creditcardfraud/downloads/creditcard.csv/3

This dataset consists of 284807 examples of credit card transactions. Each example has 31 columns. The first column is the time of the transaction (in seconds elapsed since the first transaction in the dataset), the next 28 columns are anonymised features (V1 to V28), and the second-to-last column is the transaction amount. The last column is the label of whether the transaction was fraudulent (1) or not (0) - this can be interpreted as the true probability of fraud, which is what a perfect model should predict.

In [2]:
     # read the data from a spreadsheet
                    # convert to matrix so we can use it in the math

#data = data[:, :]                             # you can slice the matrix to only use certain features or examples
'''
labels = data[:, -1]         # the labels are the last column (0 for legitimate or 1 for fraudulent)
features = data[:, :-1]      # crop off the indices and labels
print(features.shape)        # show the shape of the features
print(labels.shape)          # show the shape of the labels
'''
                  # show the data
            # show the shape of the data

[[ 0.00000000e+00 -1.35980713e+00 -7.27811733e-02 ... -2.10530535e-02
   1.49620000e+02  0.00000000e+00]
 [ 0.00000000e+00  1.19185711e+00  2.66150712e-01 ...  1.47241692e-02
   2.69000000e+00  0.00000000e+00]
 [ 1.00000000e+00 -1.35835406e+00 -1.34016307e+00 ... -5.97518406e-02
   3.78660000e+02  0.00000000e+00]
 ...
 [ 1.72788000e+05  1.91956501e+00 -3.01253846e-01 ... -2.65608286e-02
   6.78800000e+01  0.00000000e+00]
 [ 1.72788000e+05 -2.40440050e-01  5.30482513e-01 ...  1.04532821e-01
   1.00000000e+01  0.00000000e+00]
 [ 1.72792000e+05 -5.33412522e-01 -1.89733337e-01 ...  1.36489143e-02
   2.17000000e+02  0.00000000e+00]]
(284807, 31)


# What the hell are these features?

The features of this dataset are not the raw features that we might collect about a card transaction, like how many tries it took to get the PIN right, or how quickly the transaction was made etc.
Instead, they are actually a transformation of those original features, such that the new axes point in the direction in which the features vary the most. The new axes are still orthogonal to one another (the transformation is just a rotation of the axes). Transforming the features in this way is called principal component analysis (PCA). Performing PCA should mean that the first few features represent most of the variation of the data, and hence are the most important. This means that we could probably get decent results using only those first few features. This is a form of 'dimensionality reduction'.

This isn't the main focus of the session, but is the explanation for what the features represent, and their strange names (V1, V2... )

# Making the dataset unbiased

Most credit card transactions aren't fraudulent. As such, there are far fewer examples of fraudulent transactions than there are legitimate ones. If we train our model on all of these examples then it will be able to achieve a 99.83% accuracy by just classifying every example as legitimate! 

In order to counteract the bias of the dataset, we will adjust it so that it contains an equal number of examples from each class.

In [3]:
labels = data[:, -1]                # binary vector which has 1s where there is a fraud and 0s otherwise
n_fraud = int(np.sum(labels))       # sum of the labels is the number of fraudulent examples (cast to int: it is a count)
n_legit = len(data) - n_fraud       # every example that is not fraudulent is legitimate

# print the stats
print('Number of fraudulent transactions:', n_fraud)
print('Number of legitimate transactions:', n_legit)
# the colon belongs after the description, not after the percentage value
print('Percentage of examples that are fraudulent: {:.2f}%'.format(100 * n_fraud / len(data)))

Number of fraudulent transactions: 492.0
Number of legitimate transactions: 284315.0
Percentage of examples that are fraudulent 0.17%:


In [4]:
print(data.shape)

fraud_indices = data[:, -1] == 1                # get a boolean vector which has TRUE at the index of each fraudulent example
print(fraud_indices)
fraud_examples = data[fraud_indices]            # index the data with the boolean vector to get the fraudulent examples

legit_indices = ~fraud_indices                  # the tilde (~) inverts the boolean vector
print(legit_indices)
legit_examples = data[legit_indices]            # index the data to get the legit examples
clipped_legit_examples = legit_examples[:len(fraud_examples)]   # clip the legit examples so that we have the same number of examples with each label
print(legit_examples.shape)

# vertically stack the fraudulent and legit examples into a dataset with an equal number of examples from each class
# NOTE: the rows are now ordered by class (fraud first), so they must be shuffled before batching
data = np.vstack((fraud_examples, clipped_legit_examples))
print(data.shape)

(284807, 31)
[False False False ... False False False]
[ True  True  True ...  True  True  True]
(284315, 31)
(984, 31)


## Normalising the dataset


In [5]:
# CENTER AROUND MEAN
# we dont want to normalise the labels! so separate them from the features
features = data[:, :-1]                          # get features (every column except the last)
labels = data[:, -1]                             # get labels (the last column)

features = features - features.mean(axis=0)      # subtract the mean of each feature (over all rows, axis=0) from each feature

# DIVIDE BY RANGE
feature_max = features.max(axis=0)               # find the largest value of each feature
feature_min = features.min(axis=0)               # find the smallest value of each feature

feature_range = feature_max - feature_min        # find the range of each feature
feature_range[feature_range == 0] = 1            # a constant feature has zero range: leave it at 0 instead of dividing by zero

features = features / feature_range              # divide by range
print(features)

print(features.shape)
labels = labels.reshape(-1, 1)                   # turn labels from a m-vector into a mx1 matrix so that we can stack
print(labels.shape)

data = np.hstack((features, labels))             # horizontally stack labels back onto the normalised features


[[-0.23513274  0.00474767  0.00136628 ...  0.01599127 -0.03791971
  -0.02513678]
 [-0.23474529 -0.0176271  -0.1481511  ... -0.03382708  0.00175016
   0.11305405]
 [-0.21132265  0.00501926 -0.00427415 ... -0.00548823 -0.04008067
   0.03754021]
 ...
 [-0.23541451  0.06249815 -0.03953566 ... -0.00188655 -0.02803576
  -0.01425915]
 [-0.23540864  0.11408986 -0.04825472 ... -0.01209214 -0.00195925
  -0.02479979]
 [-0.23540277  0.11512425 -0.04706851 ... -0.01201526 -0.00088955
  -0.02490428]]
(984, 30)
(984, 1)


# Split the dataset into training, validation and test sets

It's no use training our model to just perform well on the data we show it. It needs to be able to perform well on unseen examples.

These unseen examples will come from a part of our dataset that we break off into a 'test set'.

For less obvious reasons, we also need to create a 'validation set'. We will see that there are some design choices of our model (hyperparameters, rather than model parameters) that we shouldn't learn from the training set. If we instead adjusted these based on the model's performance on the test set, then we would effectively be training the hyperparameters on the test set, and performance on the test set would no longer be representative of performance on unseen examples! The point of the validation set is to tune these hyperparameters.

# Setting up hyperparameters

Now we have sorted out the data, we are ready to start building the rest of the model.

Firstly we will set some hyperparameters.

Hyperparameters are different to parameters because it doesn't make sense to learn them during training. Some examples include:
- the depth of our model
- the width of our model
- batch size (how many examples we show the model at once)
- learning rate (how much we change our parameters by on each update)
- epochs to train for (how many times we pass the whole dataset through our model)

In [7]:
# Hyperparameters: values we choose by hand rather than learn during training
batch_size = 16      # number of examples passed through the model in one mini-batch
lr = 1e-3            # learning rate: scale of each parameter update step
momentum = 0.6       # fraction of the previous parameter update carried over into the next one
epochs = 1           # number of full passes of the dataset through the model

# Create the data loaders

Now we have the dataset, we will create something to pass us the data in mini-batches and shuffle it for us - a data loader.

We want to pass our data to our model in mini-batches (rather than the whole batch at once) for 2 main reasons:
- passing the whole batch through the model will take longer, and slow each training step
- How badly the model performs is a function of the data we pass through it. If we update our model parameters based on what will improve predictions for the batch as a whole, we may actually end up not optimising for any of them specifically.

This is an implementation from scratch. In the regression example we will use a pre-built class that PyTorch provides.

In [8]:
class DataLoader():                           # create a data loader to pass our examples to us in batches
    """Serve a 2-D dataset as shuffled mini-batches of (features, labels) tensors.

    Each row of `dataset` is one example; the last column is the label and
    every other column is a feature. Batches are contiguous slices of the
    dataset, so the rows should already be in random order.

    Args:
        dataset: sliceable 2-D collection of examples (e.g. a numpy array).
        batch_size: number of examples per batch; must be a positive integer.

    Raises:
        ValueError: if `batch_size` is not positive (the batching loop would never end).
    """

    def __init__(self, dataset, batch_size):     # what happens when we create a dataloader instance
        if batch_size <= 0:
            raise ValueError('batch_size must be a positive integer')
        # MAKE BATCHES OF DATA
        # step through the dataset one batch at a time; the last batch may be
        # smaller than batch_size, and an empty dataset produces no batches
        self.batches = [dataset[i:i + batch_size] for i in range(0, len(dataset), batch_size)]
        shuffle(self.batches)                    # shuffle the batches

    def __len__(self):
        """Return the number of batches."""
        return len(self.batches)

    def __getitem__(self, idx):           # this function is called when we index the data loader e.g. dataset[4]
        """Return batch `idx` as a (features, labels) pair of float32 tensors.

        Indexing past the last batch raises IndexError, which is what lets a
        plain `for batch in loader` loop terminate.
        """
        if idx == 0:
            shuffle(self.batches)                # shuffle the batches each epoch
        batch = np.asarray(self.batches[idx])    # get a batch of examples from the list of batches
        # get the features from that batch (all rows and all columns up until the last one)
        # and turn them into a float32 torch tensor
        features = torch.tensor(batch[:, :-1], dtype=torch.float32)
        # get the labels from the batch (all rows, last column) as a float32 torch tensor
        labels = torch.tensor(batch[:, -1], dtype=torch.float32)
        # change from vector to matrix (so the labels come out as the same size as our predictions)
        labels = labels.unsqueeze(1)
        return features, labels                  # return the features and labels

# SPLIT THE DATA
# The balanced dataset was built by stacking one class on top of the other, so shuffle
# the rows first: otherwise each contiguous batch would contain a single class.
shuffled_data = np.random.permutation(data)      # returns a row-shuffled copy, `data` is left untouched
n_train = int(0.7 * len(shuffled_data))          # 70% of the examples for training
n_val = int(0.15 * len(shuffled_data))           # 15% for validation, the remainder for testing
train_data, val_data, test_data = np.split(shuffled_data, [n_train, n_train + n_val])

# CREATE THE DATA LOADERS
train_loader = DataLoader(train_data, batch_size)   # create a data loader from the normalised dataset of a certain batch size
val_loader = DataLoader(val_data, batch_size)
test_loader = DataLoader(test_data, batch_size)

# SHOW AN EXAMPLE OF A BATCH PRODUCED
x, y = train_loader[1]
print((x, y))
print(x.shape)
print(y.shape)


(tensor([[ 0.6595,  0.1160, -0.0805,  0.0283, -0.1020,  0.0411, -0.0461,  0.0651,
         -0.0100,  0.1243,  0.0399, -0.1353,  0.1669,  0.1271,  0.0558,  0.1087,
          0.1516,  0.1515,  0.1588, -0.0891,  0.0246, -0.0044, -0.0045, -0.0070,
         -0.0639, -0.0266, -0.1485, -0.0091,  0.0099,  0.0684],
        [-0.2370,  0.0838, -0.1030,  0.1187, -0.0478, -0.0053,  0.0496,  0.0490,
         -0.0028,  0.1256,  0.0737, -0.1956,  0.1429, -0.0653,  0.1504,  0.1290,
          0.1572,  0.0911,  0.1402, -0.1171,  0.0358,  0.0024,  0.0313, -0.0180,
          0.1438,  0.0212, -0.0676, -0.0094,  0.0204,  0.0949],
        [ 0.3164, -0.7340,  0.4270, -0.7768,  0.3877, -0.4919, -0.0509, -0.5767,
         -0.1916, -0.4325, -0.5816,  0.1719, -0.3646, -0.1032, -0.0145, -0.3534,
         -0.2694, -0.2851, -0.3222, -0.1014,  0.1713, -0.1822,  0.2032,  0.0377,
          0.0932, -0.1181, -0.2303, -0.7134, -0.2997, -0.0249],
        [ 0.2631, -0.1396,  0.0444, -0.1847,  0.1692, -0.0742, -0.1780, -0.086

## Creating the model

We are going to create a function to map our inputs to our output (confidence of the transaction being fraudulent)

We call the function that we will create to perform this mapping a model, because it should model some ideal function that really maps these types of inputs to these outputs.

We build a neural network of adjustable width and depth, that takes in a vector the size of our inputs and outputs a scalar probability of an example being fraudulent.

In [9]:
# DEFINE SIZES FOR EACH OF OUR NEURAL NETWORK'S LAYERS
n_features = 30      # number of input features per example (every column except the label)
units1 = 16
units2 = 16
units3 = 16

class NeuralNetwork(torch.nn.Module):                 # create a neural network class
    """Fully connected network mapping a feature vector to a fraud probability.

    Args:
        n_features: size of the input feature vector.
    """

    def __init__(self, n_features=n_features):        # what happens when we create a neural network instance
        super().__init__()                            # initialise the parent class
        # DEFINE LAYERS TO TAKE FEATURES TO A PROBABILITY OF THIS EXAMPLE BEING FRAUDULENT
        self.layers = torch.nn.Sequential(            # define the layers of the model sequentially
            torch.nn.Linear(n_features, units1),      # linear layers form weighted combinations of their inputs
            torch.nn.ReLU(),                          # Rectified Linear Unit activation function
            torch.nn.Linear(units1, units2),
            torch.nn.ReLU(),
            torch.nn.Linear(units2, units3),
            torch.nn.ReLU(),
            torch.nn.Linear(units3, 1),
            torch.nn.Sigmoid(),                       # sigmoid function squashes our output in the range 0-1, so it can be a probability
        )

    def forward(self, x):                 # this function is called when we call our model with some input e.g. model(x)
        """Return the predicted fraud probability for each row of `x` as an (m, 1) tensor."""
        x = self.layers(x)                # pass the features of the example through the layers of our model
        return x                          # return the transformed output (should tell us whether we predict fraud or not)

model = NeuralNetwork()                   # now actually create an instance of a neural network

# Creating the loss function

The loss function is a measure of how badly the model is currently performing. To measure this when our labels are binary (either 0 or 1) we can use the binary cross entropy loss.

When the label is y=1 (fraudulent) the second term is multiplied by 0 (1-y) and only the first term contributes to the loss. $-\log(\hat{y})$ increases as the prediction ($\hat{y}$) moves away from 1.

When the label is y=0 (legitimate) the first term is multiplied by 0 (y) and only the second term contributes to the loss. $-\log(1 - \hat{y})$ increases as the prediction ($\hat{y}$) moves away from 0.

$$L = - \left[ \, y \log(\hat{y}) + (1-y) \log(1 - \hat{y}) \, \right]$$

In [10]:
           # binary cross entropy loss function. Can be called to return loss between prediction and label

# Creating the optimiser

The optimiser will update the parameters (weights and biases) of the model in a direction that reduces the error.

We will use stochastic gradient descent (SGD), which updates the weights based on the update rule: 
# $w \leftarrow w - \alpha \frac{\partial L}{\partial w}$ 

This means that the weights are moved in the direction that decreases the loss. The size of each step is the gradient scaled by the learning rate ($\alpha$).

In [11]:
# define how we will update our weights: plain stochastic gradient descent
# (pass momentum=momentum as an extra argument to also use the momentum hyperparameter)
optimiser = torch.optim.SGD(model.parameters(), lr=lr)

# Train the model

Here we repeatedly pass mini-batches of examples through our model, then compare our output to the corresponding labels and update the model weights based on the loss.

In [12]:
                # define a training function
                          # initialise an empty list of the losses
    
    
    
    
    
                  # for however many epochs we specified
        # TRAINING
        print('Training')
             # for each batch in the training data loader
                              # unpack the batch
            print(labels)
                           # pass an example's features forward through the model
            print(prediction)
                   # calculate the loss
                                  # zero the gradients (otherwise they will accumulate)
                                        # find rate of change of loss with respect to model params
                                       # update the weights of our model
            print('Epoch:', epoch, '\tBatch:', batch_idx, '\tLoss:', loss.item())
                       # add this loss to the list of losses
            
            batch_idx_for_val += 1 # counter for plotting validation losses       
             
            if batch_idx == 1000:    # tell your model to stop at some batch_idx if you want
                #pass
                break
        
        # VALIDATING
        print('Validating')
        val_losses = []
        validation_loss_idxs.append(batch_idx_for_val)
            # for each batch in the validation loader
                               # unpack the batch
                           # pass an example's features forward through the model
            # evaluate the validation loss
            # add the val loss to the list of validation losses for this run through the val dataset
        
        
               
        # return the list of losses from the training function
            
   # call the training function and store the losses

Training
tensor([[0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]])
tensor([[0.4968],
        [0.4969],
        [0.4969],
        [0.4968],
        [0.4969],
        [0.4967],
        [0.4968],
        [0.4969],
        [0.4969],
        [0.4969],
        [0.4970],
        [0.4969],
        [0.4969],
        [0.4968],
        [0.4969],
        [0.4969]], grad_fn=<SigmoidBackward>)
Epoch: 0 	Batch: 0 	Loss: 0.6962948441505432
tensor([[1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.]])
tensor([[0.4977],
        [0.4978],
        [0.4978],
        [0.4977],
        [0.4978],
        [0.4978],
        [0.4977],
        [0.4977],
        [0.4977],
        [0.4978],


NameError: name 'validation_loss_idxs' is not defined

In [None]:
     # plot the training losses
     # plot the validation losses
     # show the training curve