In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from subprocess import check_output  # retained so any other cell that uses check_output keeps working

# List the input directory in pure Python instead of spawning an external `ls`
# process, which only exists on POSIX-like systems.
print("\n".join(sorted(os.listdir("../input"))))

# Any results you write to the current directory are saved as output.

In [None]:
df = pd.read_csv('../input/creditcard.csv')
df.info()

In [None]:
df.describe().T

In [None]:
df.head(5)

**Scatter Plots**

In [None]:
# Amount of every transaction against its row index, coloured by class:
# green = Class 0 (legend 'G'), red = Class 1 (legend 'F').
fig, ax = plt.subplots(figsize=(10, 5))
ax.set(xlabel="Index ->", ylabel="Amount ->")
ax.set_yticks(np.arange(0, 28000, 2500))
genuine_amount = df.loc[df.Class == 0, 'Amount']
fraud_amount = df.loc[df.Class == 1, 'Amount']
x1 = plt.scatter(x=genuine_amount.index, y=genuine_amount, alpha=0.4, c='g')
x2 = plt.scatter(x=fraud_amount.index, y=fraud_amount, c='r')
plt.legend([x1, x2], ['G', 'F'])
plt.title('Transaction Amount')

**ZOOM IN**

In [None]:
# Zoomed view: only transactions with Amount >= 2500, coloured by class
# (green = Class 0, red = Class 1).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set(xlabel="Index ->", ylabel="Amount ->")
is_large = df.Amount >= 2500
genuine_amount = df.loc[(df.Class == 0) & is_large, 'Amount']
fraud_amount = df.loc[(df.Class == 1) & is_large, 'Amount']
x1 = plt.scatter(x=genuine_amount.index, y=genuine_amount, alpha=0.4, c='g')
x2 = plt.scatter(x=fraud_amount.index, y=fraud_amount, c='r')
plt.legend([x1, x2], ['G', 'F'])
plt.title('Transaction Amount => 2,500')

In [None]:
# Zoomed view of the low range: only transactions with Amount < 2500,
# coloured by class (green = Class 0, red = Class 1).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_yticks(np.arange(0, 2500, 100))
ax.set_ylabel("Amount ->")
ax.set_xlabel("Index ->")
# Apply the same Amount filter to BOTH classes so the figure matches its title;
# the fraud series was previously plotted unfiltered.
is_small = df.Amount < 2500
genuine_amount = df.loc[(df.Class == 0) & is_small, 'Amount']
fraud_amount = df.loc[(df.Class == 1) & is_small, 'Amount']
x1 = plt.scatter(x=genuine_amount.index, y=genuine_amount, alpha=0.4, c='g')
x2 = plt.scatter(x=fraud_amount.index, y=fraud_amount, c='r')
plt.legend([x1, x2], ['G', 'F'])
plt.title('Transaction Amount < 2,500')

Amounts in fraudulent transactions were always below 2,500.

Drop the Time column, as it doesn't look like a useful feature.

In [None]:
# Reassign instead of dropping in place, and ignore a missing column, so that
# re-running this cell after 'Time' is already gone does not raise KeyError.
df = df.drop(columns='Time', errors='ignore')

The feature "Amount" is not on the same scale as the other features; this would cause the algorithm to give undeserved importance to this feature.

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
df.Amount = sc.fit_transform(df.Amount.values.reshape(-1, 1))

Separate fraudulent and genuine transactions 

In [None]:
df_nf = df[df.Class == 0]

In [None]:
y_nf = df_nf.Class

In [None]:
df_nf.drop('Class', axis=1, inplace=True)

In [None]:
df_f = df[df.Class == 1]

In [None]:
y_f = df_f.Class

In [None]:
df_f.drop('Class', axis=1, inplace=True)

Split the data into train and test sets. For anomaly detection algorithms, it is preferable not to include the outliers in the training data; here the outliers are the fraudulent transactions.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the first 50,000 genuine rows 70/30. Note that X_nf / y_nf are rebound
# here to the held-out 30% of those rows.
X_train, X_nf, y_train, y_nf = train_test_split(
    df_nf.iloc[:50000],
    y_nf.iloc[:50000],
    test_size=0.3,
    random_state=101,
)

Add all the fraudulent transactions to the test set. If the model is good, all of these fraudulent transactions should be identified as outliers.

In [None]:
X_test = pd.concat([X_nf,df_f])

In [None]:
y_test = pd.concat([y_nf,y_f])

In [None]:
y_test.sort_index(inplace=True)

In [None]:
X_test.sort_index(inplace=True)

**Isolation Forest model**

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
isf = IsolationForest(contamination=0.0, bootstrap=True, n_estimators=200)

In [None]:
isf.fit(X_train)

In [None]:
pred = isf.predict(X_test)

In [None]:
# Convert Isolation Forest output to the Class encoding used by y_test.
# predict() marks outliers with -1, and the fraudulent rows added to the test
# set carry Class 1, so an outlier must become 1 and everything else 0.
# (The mapping was previously inverted, which flipped every metric computed
# from pred_t.)
pred_t = [1 if each == -1 else 0 for each in pred]
# Number of test observations flagged as outliers.
count = sum(pred_t)

In [None]:
# Distinct labels present in the converted predictions.
set(pred_t)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Compare the true test labels with the converted Isolation Forest predictions.
confusion_matrix(y_test, pred_t)

In [None]:
# Text report for the same true labels / predictions pair.
print(classification_report(y_test, pred_t))

Isolation Forest predicted all observations as outliers. Since anomaly detection failed miserably on the given data, we will try our luck with some supervised learning algorithms next.

The major issue with the given data is that it is highly skewed. If we use the entire data set to train the model, it would predict the majority of fraudulent transactions as genuine. The best method to avoid this issue is to down-sample the genuine transactions and mix them with all of the fraudulent transactions. This helps reduce the skew, and the model will perform noticeably better.

In [None]:
print("SKEW :-" , df.Class.skew())

In [None]:
df_nf = df[df.Class == 0].sample(frac=0.005, random_state=101)

In [None]:
df_nf.shape

In [None]:
y_nf = df_nf.Class

In [None]:
df_nf.drop('Class', axis=1, inplace=True)

In [None]:
df_new = pd.concat([df_nf, df_f])

In [None]:
y_new = pd.concat([y_nf, y_f])

In [None]:
print("SKEW :-" , y_new.skew())

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 split of the down-sampled data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_new,
    y_new,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.linear_model import LogisticRegression

**Logistic Regression**

In [None]:
# Logistic regression with C=2.
lr = LogisticRegression(C=2)

In [None]:
# Fit on the training split.
lr.fit(X_train,y_train)

In [None]:
# Predicted labels for the test split.
pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Compare the true test labels with the predicted labels.
confusion_matrix(y_test, pred)

In [None]:
# Text report for the same true labels / predictions pair.
print(classification_report(y_test, pred))

With a recall score of 0.97, logistic regression is doing better with the sampled data. We will increase the recall score of fraudulent transactions to 1.0 so that none of them goes undetected. In the process, a few genuine transactions will be classified as fraudulent, but for this particular scenario that is acceptable.

In [None]:
pred_prob  = lr.predict_proba(X_test)

In [None]:
pred_prob_C = []
for each in pred_prob:
    if each[0] > .9831:
        pred_prob_C.append(0)
    else:
        pred_prob_C.append(1)

In [None]:
confusion_matrix(y_test, pred_prob_C)

In [None]:
print(classification_report(y_test, pred_prob_C))

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=50)

In [None]:
rf.fit(X_train, y_train)

In [None]:
pred = rf.predict(X_test)

In [None]:
confusion_matrix(y_test, pred)

In [None]:
print(classification_report(y_test, pred))

In [None]:
pred_prob_C = rf.predict_proba(X_test)

In [None]:
pred_prob_C = []
for each in pred_prob:
    if each[0] > .9831:
        pred_prob_C.append(0)
    else:
        pred_prob_C.append(1)

In [None]:
confusion_matrix(y_test, pred_prob_C)

In [None]:
print(classification_report(y_test, pred_prob_C))

**Support Vector Classifier**

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC(C=3, probability=True)

In [None]:
svc.fit(X_train,y_train)

In [None]:
pred = svc.predict(X_test)

In [None]:
confusion_matrix(y_test, pred)

In [None]:
print(classification_report(y_test, pred))

In [None]:
pred_prob_C = svc.predict_proba(X_test)

In [None]:
pred_prob_C = []
for each in pred_prob:
    if each[0] > .9831:
        pred_prob_C.append(0)
    else:
        pred_prob_C.append(1)

In [None]:
confusion_matrix(y_test, pred_prob_C)

In [None]:
print(classification_report(y_test, pred_prob_C))

**Nearest Neighbor**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
knn.fit(X_train,y_train)

In [None]:
pred = knn.predict(X_test)

In [None]:
confusion_matrix(y_test, pred)

In [None]:
print(classification_report(y_test, pred))

In [None]:
pred_prob_C = svc.predict_proba(X_test)
pred_prob_C = []
for each in pred_prob:
    if each[0] > .9831:
        pred_prob_C.append(0)
    else:
        pred_prob_C.append(1)

In [None]:
confusion_matrix(y_test, pred_prob_C)

In [None]:
print(classification_report(y_test, pred_prob_C))

**Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
nb = GaussianNB()

In [None]:
nb.fit(X_train,y_train)

In [None]:
pred = nb.predict(X_test)

In [None]:
confusion_matrix(y_test, pred)

In [None]:
print(classification_report(y_test, pred))

In [None]:
pred_prob_C = nb.predict_proba(X_test)
pred_prob_C = []
for each in pred_prob:
    if each[0] > .9831:
        pred_prob_C.append(0)
    else:
        pred_prob_C.append(1)

In [None]:
confusion_matrix(y_test, pred_prob_C)

In [None]:
print(classification_report(y_test, pred_prob_C))

**If we had not fixed the skew and had used the data set as is, what would the model's score have been?**

In [None]:
y = df.Class

In [None]:
df.drop('Class', axis=1, inplace=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.30, random_state=101)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression(C=2)

In [None]:
lr.fit(X_train,y_train)

In [None]:
pred = lr.predict(X_test)

In [None]:
confusion_matrix(y_test, pred)

In [None]:
print(classification_report(y_test, pred))

A recall score of 0.62, while with the down-sampled dataset we received a score of 0.91. This is why we should always fix the class skew.