In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Data from https://www.kaggle.com/mlg-ulb/creditcardfraud

Dataset description:
The datasets contains transactions made by credit cards in September 2013 by european cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

## Predicting Fraud

In [2]:
df = pd.read_csv('datafiles/creditcard.csv')

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

Definitely a class imbalance problem here. With less than 1% of transactions being fraud transactions, the model is going to correctly predict not fraud most of the them, and fraud less of the time.   This is going to bias the model towards non-fraud so it will fail to detect fraud cases. 


# Addressing class imbalance

We can address the class imbalance in two ways.  We can downsample the majority class, or upsample the minority class. More info found here:  https://elitedatascience.com/imbalanced-classes

Downsampling is going to be a much smaller dataset.  

In [5]:
from sklearn.utils import resample

In [6]:
# Employing downsampling of majority class to address class imbalance issue.

# Separate majority and minority classes
df_majority = df[df.Class==0]
df_minority = df[df.Class==1]
 
len(df_minority)

492

In [7]:
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=len(df_minority))    # to match minority class
                                
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled.Class.value_counts()

1    492
0    492
Name: Class, dtype: int64

In [8]:
# Separate majority and minority classes
# Taking a fourth of the original data from the majority class for easier processing
df_majority = df[df.Class==0].sample(frac=0.25)
df_minority = df[df.Class==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority))    # to match majority class
                                 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.Class.value_counts()

1    71079
0    71079
Name: Class, dtype: int64

### Random Forest with Downsampled dataset

In [9]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
X = df_downsampled.drop('Class', 1)
Y = df_downsampled['Class']

cross_val_score(rfc, X, Y, cv=5)

array([0.96464646, 0.92424242, 0.90306122, 0.95408163, 0.90306122])

In [10]:
y_pred = rfc.fit(X, Y).predict(X)

In [11]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y, y_pred)

array([[491,   1],
       [  5, 487]])

Not bad! only 6 cases are misidentified here. 1 false positive and 5 false negatives  

### Support Vector Machine from downsampled dataset

In [12]:
from sklearn.svm import SVC

svc = SVC(C=1e-9, kernel='rbf')
svc.fit(X, Y)

SVC(C=1e-09, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
svc.score(X, Y)

0.5101626016260162

In [14]:
cross_val_score(svc, X, Y, cv=5)

array([0.5, 0.5, 0.5, 0.5, 0.5])

In [15]:
y_pred = svc.fit(X, Y).predict(X)

In [16]:
confusion_matrix(Y, y_pred)

array([[492,   0],
       [482,  10]])

Yeahh not great. Most cases were predicted as not fraud. SVM would need more tuning in order forus to get a more accurate model.

# Challenge 4.3.6 Make Your Network

Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing. Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP!

The downsample dataset has very little datapoints, so we need to have a simplified neural network.

In [17]:
# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [18]:
cross_val_score(mlp, X, Y, cv=5)

array([0.49494949, 0.5       , 0.5       , 0.5       , 0.5       ])

In [19]:
# Establish and fit the model, with two, 1000 perceptron layers.
mlp = MLPClassifier(hidden_layer_sizes=(1000,2))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [20]:
cross_val_score(mlp, X, Y, cv=5)

array([0.50505051, 0.5       , 0.5       , 0.5       , 0.5       ])

In [21]:
# Multi-Layer perceptron model
# Establish and fit the model, with a 5 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(5,))
mlp.fit(X, Y)

cross_val_score(mlp, X, Y, cv=5)

array([0.49494949, 0.5       , 0.5       , 0.5       , 0.5       ])

In [22]:
y_pred = mlp.fit(X, Y).predict(X)

In [23]:
confusion_matrix(Y, y_pred)

array([[364, 128],
       [219, 273]])

In [29]:
print("Sensitivity is " + str(273/(273+128)))
print('Specificity is ' + str(364/(364+219)))

Sensitivity is 0.6807980049875312
Specificity is 0.6243567753001715


Better than the un-tuned SVM, but still not great.  

Since neural networks work better with more data, let's use the upsampled dataset.

In [24]:
X = df_upsampled.drop('Class', 1)
Y = df_upsampled['Class']

In [25]:
# Import the model.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

cross_val_score(mlp, X, Y, cv=5)

array([0.91685425, 0.93127462, 0.92016038, 0.9391179 , 0.92764685])

In [26]:
y_pred = mlp.fit(X, Y).predict(X)

confusion_matrix(Y, y_pred)

array([[69812,  1267],
       [ 8479, 62600]])

In [31]:
# Sensitivity. Percentage of positives correctly identified
62600/(62600+1267)

0.9801618989462477

In [30]:
#Specificity
69812/(69812+8479)

0.8916989181387388

Hooray! I made a pretty accurate neural network for detecting credit card fraud!  With a higher sensitivity, it is actually better at detecting fraud than not fraud, which is good.  We want to err on the safe side. 