In [134]:
!pip install -U imbalanced-learn



### Import libraries 

In [135]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn import metrics
from imblearn.over_sampling import SMOTE

### Import the file

In [136]:
df = pd.read_csv('creditcard.csv')

In [137]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


Here we see that this dataset is highly imbalanced. This is very common for fraud detection, but we will need to take some measures to get useful results

In [138]:
df[['Class', 'Amount']].groupby(['Class']).count()

Unnamed: 0_level_0,Amount
Class,Unnamed: 1_level_1
0,284315
1,492


### Create a sample to test different models before training the whole dataset

This step is important because our dataset has almost 30 features and more than 280 thousand rows, if we tried testing different models and parameters directly, it would take a long time and be computationally expensive.

In [139]:
sample = df.sample(frac=0.2, random_state=1)

### Create the sample train and test split

In [140]:
X_sample = sample.drop(columns=['Class'])
y_sample = sample[['Class']]

In [141]:
X_train_sample, X_test_sample, y_train_sample, y_test_sample = model_selection.train_test_split(X_sample, y_sample, test_size=0.3)

### Oversample the minority class

The method I chose to deal with the imbalance in this dataset is oversampling. I chose this after testing a few others like undersampling and random oversampling, the SMOTE performed best in this scenario. I also used a set random_state to guarantee the reproducibility of the results.

In [142]:
os = SMOTE(random_state=1)
X_train_sample_over, y_train_sample_over = os.fit_resample(X_train_sample, y_train_sample)

### Test model parameters in sample

Because the main goal of this model is to detect fraudulent transactions, we would rather have a safe transaction classified as fraud (false positive) than a fraud transaction classified as safe (false negative).

To reach this goal, my main metric to follow was the recall, but I also kept an eye on the precision and the F1 score to make sure they didn't get out of hand.

In [143]:
rfc_sample = RandomForestClassifier().fit(X_train_sample_over, y_train_sample_over.values.ravel())
y_pred_sample = rfc_sample.predict(X_test_sample)

print('Recall:', metrics.recall_score(y_test_sample, y_pred_sample))
print('Precision:', metrics.precision_score(y_test_sample, y_pred_sample))
print('F1 score:', metrics.f1_score(y_test_sample, y_pred_sample))

Recall: 0.7619047619047619
Precision: 0.7272727272727273
F1 score: 0.7441860465116279


In [144]:
results_rfc_sample = pd.DataFrame(y_pred_sample)
results_rfc_sample['real'] = y_test_sample['Class'].to_numpy()
results_rfc_sample.rename(columns={0:'predicted'}, inplace=True )

conditions = [((results_rfc_sample['predicted']==1) & (results_rfc_sample['real']==1)),((results_rfc_sample['predicted']== 1) & (results_rfc_sample['real']==0)),((results_rfc_sample['predicted']==0) & (results_rfc_sample['real']==0)),((results_rfc_sample['predicted']==0) & (results_rfc_sample['real']==1))]
choice = ['True positive', 'False positive', 'True negative', 'False negative']
results_rfc_sample['eval'] = np.select(conditions, choice)

res_group_sample = results_rfc[['eval','real']].groupby(['eval']).count()

This table serves as a simplified confusion matrix

In [145]:
res_group_sample

Unnamed: 0_level_0,real
eval,Unnamed: 1_level_1
False negative,18
False positive,10
True negative,56865
True positive,69


### Apply a random search to tune the parameters

After testing and opting for the Random Forest Classifier model, we can start testing the hyperparameters. 

Although hyperparameters in a sample don't reflect perfectly on the final model with the full dataset, I find it best to use a sample first to filter out most values and test just a few on the more computationally demanding model.

In [146]:
rfc1 = RandomForestClassifier(class_weight='balanced', random_state=1)

param_distributions_continuous = {'n_estimators': [int(x) for x in np.linspace(start = 20, stop = 22, num = 5)]}
#param_distributions_categorical = {'max_features':['auto', 'sqrt', 'log2']}

#clf1 = model_selection.RandomizedSearchCV(rfc1, param_distributions_continuous, random_state=1, scoring='f1')
clf2 = model_selection.GridSearchCV(rfc1, param_distributions_continuous, scoring='f1')
#clf3 = model_selection.GridSearchCV(rfc1, param_distributions_categorical, scoring='f1')

search = clf2.fit(X_train_sample_over, y_train_sample_over.values.ravel())

In [147]:
print(search.best_params_)
print(search.best_score_)

{'n_estimators': 20}
0.9999120879114937


### Create the train and test split and oversample the minority class

After running some tests with the parameters, we can go for the model with the full dataset.

We use the same logic as before, just abandoning the sample.

In [167]:
X = df.drop(columns=['Class'])
y = df[['Class']]

I chose a split of 90/10 because our dataset is pretty large, so 10% still leaves us with plenty of test data (almost 30k)

In [230]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1)

In [231]:
os = SMOTE()
X_train_over, y_train_over = os.fit_resample(X_train, y_train)

This is the final model, but this code has been altered a few times as I tested different parameter to fine tune the scores.

The parameters shown here were selected after many rounds of testing for having the best scores overall.

In [232]:
rfc = RandomForestClassifier(n_estimators=90, max_features='auto', bootstrap=True, class_weight='balanced').fit(X_train_over, y_train_over.values.ravel())
y_pred = rfc.predict(X_test)

print('Recall:', metrics.recall_score(y_test, y_pred))
print('Precision:', metrics.precision_score(y_test, y_pred))
print('F1 score:', metrics.f1_score(y_test, y_pred))

Recall: 0.8085106382978723
Precision: 0.8636363636363636
F1 score: 0.8351648351648351


In [229]:
rfc.score(X_test, y_test)

0.9995435553526912

In [221]:
results_rfc = pd.DataFrame(y_pred)
results_rfc['real'] = y_test['Class'].to_numpy()
results_rfc.rename(columns={0:'predicted'}, inplace=True )

conditions = [((results_rfc['predicted']==1) & (results_rfc['real']==1)),((results_rfc['predicted']==1) & (results_rfc['real']==0)),((results_rfc['predicted']==0) & (results_rfc['real']==0)),((results_rfc['predicted']==0) & (results_rfc['real']==1))]
choice = ['True positive', 'False positive', 'True negative', 'False negative']
results_rfc['eval'] = np.select(conditions, choice)

res_group = results_rfc[['eval','real']].groupby(['eval']).count()

Here we have the same table acting as a confusion matrix

In [222]:
res_group

Unnamed: 0_level_0,real
eval,Unnamed: 1_level_1
False negative,6
False positive,7
True negative,28431
True positive,37


And with this final version of the model, we can generally get a score of over 99.9% with a true positive rate around 85% ± 3%