In [1]:
import pandas as pd
import numpy as np

## Reading the file and checking the number of fraud cases

In [2]:
data = pd.read_csv('banksim.csv', index_col=0)
data.shape

(7200, 5)

In [3]:
data
data.head()

Unnamed: 0,age,gender,category,amount,fraud
171915,3,F,es_transportation,49.71,0
426989,4,F,es_health,39.29,0
310539,3,F,es_transportation,18.76,0
215216,4,M,es_transportation,13.95,0
569244,2,M,es_transportation,49.87,0


In [4]:
data.fraud.value_counts()

fraud
0    7000
1     200
Name: count, dtype: int64

In [5]:
data.gender.value_counts()

gender
F    3972
M    3212
E      11
U       5
Name: count, dtype: int64

In [6]:
data.category.value_counts()

category
es_transportation        5975
es_food                   294
es_health                 243
es_wellnessandbeauty      198
es_fashion                 97
es_barsandrestaurants      89
es_hyper                   80
es_sportsandtoys           76
es_tech                    39
es_hotelservices           31
es_home                    24
es_travel                  18
es_contents                13
es_leisure                 13
es_otherservices           10
Name: count, dtype: int64

## Data cleaning

In [7]:
# Dropping gender E and U
data_a = data[(data.gender=='M')|(data.gender=='F')]
data_a.shape

(7184, 5)

In [8]:
# Keep the numerical fields
data_pro = data_a[['age', 'amount', 'fraud']].copy()

In [9]:
ohc_gender = pd.get_dummies(data_a.gender, drop_first=True)
data_pro['M'] = ohc_gender['M']
data_pro.head()

Unnamed: 0,age,amount,fraud,M
171915,3,49.71,0,False
426989,4,39.29,0,False
310539,3,18.76,0,False
215216,4,13.95,0,True
569244,2,49.87,0,True


In [10]:
ohc_category = pd.get_dummies(data_a.category, drop_first=True)
data_pro = data_pro.merge(ohc_category, left_index=True,right_index=True)
data_pro.head(1)

Unnamed: 0,age,amount,fraud,M,es_contents,es_fashion,es_food,es_health,es_home,es_hotelservices,es_hyper,es_leisure,es_otherservices,es_sportsandtoys,es_tech,es_transportation,es_travel,es_wellnessandbeauty
171915,3,49.71,0,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False


## Novelty detection (semi-supervised learning)

We use novelty detection by training a model to detect anomalies (frauds), and on the test data we classify outliers detected by the model as fraud. Then we check the performance metrics against the actual labels.

In [11]:
from sklearn.neighbors import LocalOutlierFactor as lof
from sklearn.model_selection import train_test_split

In [12]:
y = data_pro['fraud']
x = data_pro.drop(columns='fraud')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.1, random_state=1, stratify=y)

y_train.value_counts()

fraud
0    6285
1     180
Name: count, dtype: int64

In [13]:
print(f'{180/6285*100:.2f}% of the training dataset contains outliers')

2.86% of the training dataset contains outliers


In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score
from sklearn.ensemble import IsolationForest as isof

In [15]:
# Random contamination factor
otld = lof(contamination=.1)
preds = otld.fit_predict(x_train)
preds
# 1 is considered an outlier, 0 is an inlier
print(accuracy_score(y_train, [0 if x==1 else 1 for x in preds]))
print(confusion_matrix(y_train, [0 if x==1 else 1 for x in preds]))

# Proper contamination factor
otld = lof(contamination=.0286)
preds = otld.fit_predict(x_train)
preds
# 1 is considered an outlier, 0 is an inlier
print(accuracy_score(y_train, [0 if x==1 else 1 for x in preds]))
print(confusion_matrix(y_train, [0 if x==1 else 1 for x in preds]))

found 0 physical cores < 1
  File "C:\Users\jerro\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


0.8779582366589327
[[5657  628]
 [ 161   19]]
0.9450889404485692
[[6105  180]
 [ 175    5]]


In [16]:
otld = lof(contamination=.0286, novelty=True)
otld.fit(x_train)
test_preds = otld.predict(x_test)

print('Accuray, recall, and confusion matrix')
print(accuracy_score(y_test, [0 if x==1 else 1 for x in test_preds]))
print(recall_score(y_test, [0 if x==1 else 1 for x in test_preds], pos_label=0))
print(confusion_matrix(y_test, [0 if x==1 else 1 for x in test_preds]))

Accuray, recall, and confusion matrix
0.9513212795549374
0.977110157367668
[[683  16]
 [ 19   1]]




## Conclusion: Excellent novelty detection recall scores on the test dataset