In [1]:
# import basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import ml libraries
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


# import metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, precision_recall_curve, fbeta_score

from collections import Counter

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# loading the dataset and peeking few rows
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# upsampling the time feature to minutes and hours
dfFe = df.copy() 
delta = pd.to_timedelta(dfFe['Time'], unit='s')
dfFe['Time_minutes'] = (delta.dt.components.minutes).astype(int)
dfFe['Time_hours'] = (delta.dt.components.hours).astype(int)

# taking a look at the new features
dfFe.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Time_minutes,Time_hours
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0,0,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0,0,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0,0,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0,0,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0,0,0


In [4]:
# amount feature is not a good scaled - log(Amount)
# adding 0.0001 to handle negative values
print('Amount before transformation: ')
print(dfFe.Amount.head())
dfFe.Amount = np.log(dfFe.Amount+0.0001)
print('Amount after transformation: ')
print(dfFe.Amount.head())

Amount before transformation: 
0    149.62
1      2.69
2    378.66
3    123.50
4     69.99
Name: Amount, dtype: float64
Amount after transformation: 
0    5.008099
1    0.989578
2    5.936639
3    4.816242
4    4.248354
Name: Amount, dtype: float64


In [5]:
def printMetrics(title, true, pred):
    print(title)
    print('Accuracy: {}'.format(accuracy_score(true, pred)))
    print('Precision: {}'.format(precision_score(true, pred)))
    print('Recall: {}'.format(recall_score(true, pred)))
    print('F2: {}'.format(fbeta_score(true, pred, beta=2)))

In [6]:
# seperating target and features for modeling
X = dfFe.drop(['Class'], axis=1).values
y = dfFe.Class.values

In [7]:
# split data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

#### Logistic Regression

In [8]:
lr = LogisticRegression()
model = lr.fit(X_train, y_train)
prediction = model.predict(X_test)

In [9]:
print('Confusion Matrix: ')
print(confusion_matrix(y_test, prediction))
print('\nLogistic Regression score: {}'.format(model.score(X_test, y_test)))
printMetrics('\nLogistic Regression evaluation results: ', y_test, prediction)

Confusion Matrix: 
[[56838    30]
 [   38    56]]

Logistic Regression score: 0.9988062216916541

Logistic Regression evaluation results: 
Accuracy: 0.9988062216916541
Precision: 0.6511627906976745
Recall: 0.5957446808510638
F2: 0.6060606060606061


#### KNN Classifier

In [10]:
knn = KNeighborsClassifier()
model = knn.fit(X_train, y_train)
prediction = model.predict(X_test)

In [11]:
print('Confusion Matrix: ')
print(confusion_matrix(y_test, prediction))
print('\nLogistic Regression score: {}'.format(model.score(X_test, y_test)))
printMetrics('\nLogistic Regression evaluation results: ', y_test, prediction)

Confusion Matrix: 
[[56868     0]
 [   85     9]]

Logistic Regression score: 0.9985077771145676

Logistic Regression evaluation results: 
Accuracy: 0.9985077771145676
Precision: 1.0
Recall: 0.09574468085106383
F2: 0.11688311688311687


#### Support Vector Classifier

In [12]:
svc = SVC()
model = svc.fit(X_train, y_train)
prediction = model.predict(X_test)

In [13]:
print('Confusion Matrix: ')
print(confusion_matrix(y_test, prediction))
print('\nLogistic Regression score: {}'.format(model.score(X_test, y_test)))
printMetrics('\nLogistic Regression evaluation results: ', y_test, prediction)

Confusion Matrix: 
[[56868     0]
 [   94     0]]

Logistic Regression score: 0.9983497770443454

Logistic Regression evaluation results: 
Accuracy: 0.9983497770443454
Precision: 0.0
Recall: 0.0
F2: 0.0


#### Decision Tree Classifier

In [14]:
tree = DecisionTreeClassifier()
model = tree.fit(X_train, y_train)
prediction = model.predict(X_test)

In [15]:
print('Confusion Matrix: ')
print(confusion_matrix(y_test, prediction))
print('\nLogistic Regression score: {}'.format(model.score(X_test, y_test)))
printMetrics('\nLogistic Regression evaluation results: ', y_test, prediction)

Confusion Matrix: 
[[56850    18]
 [   19    75]]

Logistic Regression score: 0.9993504441557529

Logistic Regression evaluation results: 
Accuracy: 0.9993504441557529
Precision: 0.8064516129032258
Recall: 0.7978723404255319
F2: 0.7995735607675906


#### RandomForestClassifier

In [16]:
rf = RandomForestClassifier()
model = rf.fit(X_train, y_train)
prediction = model.predict(X_test)

In [17]:
print('Confusion Matrix: ')
print(confusion_matrix(y_test, prediction))
print('\nLogistic Regression score: {}'.format(model.score(X_test, y_test)))
printMetrics('\nLogistic Regression evaluation results: ', y_test, prediction)

Confusion Matrix: 
[[56866     2]
 [   17    77]]

Logistic Regression score: 0.9996664442961974

Logistic Regression evaluation results: 
Accuracy: 0.9996664442961974
Precision: 0.9746835443037974
Recall: 0.8191489361702128
F2: 0.8461538461538461


#### AdaBoost Classifier

In [18]:
ab = RandomForestClassifier()
model = ab.fit(X_train, y_train)
prediction = model.predict(X_test)

In [19]:
print('Confusion Matrix: ')
print(confusion_matrix(y_test, prediction))
print('\nLogistic Regression score: {}'.format(model.score(X_test, y_test)))
printMetrics('\nLogistic Regression evaluation results: ', y_test, prediction)

Confusion Matrix: 
[[56867     1]
 [   17    77]]

Logistic Regression score: 0.9996839998595555

Logistic Regression evaluation results: 
Accuracy: 0.9996839998595555
Precision: 0.9871794871794872
Recall: 0.8191489361702128
F2: 0.8480176211453745


#### MLPClassifier

In [None]:
mlp = MLPClassifierPClassifier()
model = mlp.fit(X_train, y_train)
prediction = model.predict(X_test)

In [None]:
print('Confusion Matrix: ')
print(confusion_matrix(y_test, prediction))
print('\nLogistic Regression score: {}'.format(model.score(X_test, y_test)))
printMetrics('\nLogistic Regression evaluation results: ', y_test, prediction)