# Credit Card Fraud

Import libraries

In [None]:
import numpy as np
import pylab as pl
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show the entries of the input directory so the available data files can be checked.
print(os.listdir(path="../input"))

# Any results you write to the current directory are saved as output.

Read dataset

In [None]:
# Load the credit card transactions CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="../input/creditcardfraud/creditcard.csv",
)


In [None]:
# Print the column names, dtypes and non-null counts.
data.info()
# Preview the first ten rows (rendered as the cell output).
data.head(10)

# Check missing data

Some might quibble over our usage of missing. By “missing” we simply mean NA (“not available”) or “not present for whatever reason”. Many data sets simply arrive with missing data, either because it exists and was not collected or it never existed.

In [None]:
# `data` is the complete dataset here (it has not been split into train/test
# yet), so the message reports on the whole dataset rather than a "test set".
print("Any missing sample in dataset:", data.isnull().values.any(), "\n")

In [None]:
# Frequency distribution of classes:
# one row per distinct Class value, with a single column named "count".
train_outcome = pd.crosstab(index=data["Class"], columns="count")

# Render the frequency table as the cell output.
train_outcome

VISUALIZING THE DATA

In [None]:
# Number of transactions per value of the Class column.
cnt_pro = data['Class'].value_counts()
plt.figure(figsize=(6,4))
# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
sns.barplot(x=cnt_pro.index, y=cnt_pro.values, alpha=0.8)
# Label the axes after what is actually plotted (the Class column and its counts).
plt.ylabel('Number of transactions', fontsize=12)
plt.xlabel('Class', fontsize=12)
plt.xticks(rotation=80)
plt.show()

In [None]:
#Top 10 credit card fraud losses
# Restrict to fraudulent transactions before ranking by amount; otherwise the
# "top 10" is simply the ten largest transactions of any class.
# NOTE(review): assumes Class == 1 marks fraud, as in the Kaggle creditcardfraud dataset — confirm.
top_fraud = data[data['Class'] == 1].sort_values(by='Amount', ascending=False)[:10]
figure = plt.figure(figsize=(10,6))
# One horizontal bar per transaction, identified by its row index. Plotting the
# constant Class value against Amount would collapse every bar into one category.
sns.barplot(x=top_fraud['Amount'].values, y=top_fraud.index.astype(str))
plt.xlabel('Amount')
plt.ylabel('Transaction (row index)')
plt.title('Credit card fraud losses')
plt.show()

# Plotting Heatmap
Heatmap can be defined as a method of graphically representing numerical data where individual data points contained in the matrix are represented using different colors. 
The colors in the heatmap can denote the frequency of an event, the performance of various metrics in the data set, and so on. Different color schemes are selected by varying businesses to present the data they want to be plotted on a heatmap [[3](https://vwo.com/blog/heatmap/)].

In [None]:
# Subsetting the data: keep Time, V1..V28, Amount and Class, in that order.
data = data[['Time', *('V%d' % i for i in range(1, 29)), 'Amount', 'Class']]
# Pairwise correlation between the selected columns.
cor = data.corr()
# Draw the correlation matrix as a heat map with square cells.
sns.heatmap(data=cor, square=True)

As you can see above, we obtain the heatmap of correlation among the variables. The color palette in the side represents the amount of correlation among the variables. The lighter shade represents a high correlation.

# SPLITTING DATA

Data for training and testing
We set aside part of the data for testing so that we can check whether the trained classification algorithm generalizes well to new data. In this study 30% of the samples are held out as the test set, which we assume to be a reasonable ratio between training and testing.

In [None]:
from sklearn.model_selection import train_test_split

# Target: the Class column; features: every other column.
Y = data['Class']
X = data.drop(columns=['Class'])
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# stratify=Y keeps the class proportions the same in both splits, so a rare
# class cannot end up under-represented in either split by chance.
# NOTE(review): presumes the fraud class is rare — confirm with the class counts computed earlier.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=9, stratify=Y
)

In [None]:
# Report the dimensions of each split, one line per array.
for _label, _split in (
    ('X train shape: ', X_train),
    ('Y train shape: ', Y_train),
    ('X test shape: ', X_test),
    ('Y test shape: ', Y_test),
):
    print(_label, _split.shape)

> ## Naive bayes classification
> 
>Using Bayes theorem $\left(P(X|Y)=\frac{P(Y|X)P(X)}{P(Y)}\right)$, we can find the probability of $X$ happening, given that $Y$ has occurred. Here, $Y$ is the evidence and $X$ is the hypothesis. 

In [None]:
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian Naive Bayes classifier and train it on the training
# split in one step (fit hands back the estimator, so the calls can be chained).
nbcla = GaussianNB().fit(X_train, Y_train)

# Predicted class labels for the test split.
Y_predict3 = nbcla.predict(X_test)

In [None]:
# Accuracy (in percent, two decimals) on the test and training splits.
# nbcla has already been fitted on (X_train, Y_train), so it is scored
# directly instead of being refitted before each call.
test_acc_nbcla = round(nbcla.score(X_test, Y_test) * 100, 2)
train_acc_nbcla = round(nbcla.score(X_train, Y_train) * 100, 2)

# Confusion Matrix 
A confusion matrix is commonly used to summarize the prediction results of a classification problem. The numbers of correct and incorrect predictions are counted and broken down by class. It shows where the classification model is confused when it makes predictions, giving insight not only into the errors made by a classifier but also into the types of errors being made [[4](https://www.geeksforgeeks.org/confusion-matrix-machine-learning/)].

In [None]:
# The confusion matrix
nbcla_cm = confusion_matrix(Y_test, Y_predict3)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(nbcla_cm, annot=True, linewidth=0.7, linecolor='black', fmt='g', ax=ax, cmap="BuPu")
plt.title('Naive Bayes Classification Confusion Matrix')
plt.xlabel('Y predict')
plt.ylabel('Y test')
plt.show()

# Accuracy
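Accuracy is the closeness of measurements to a specific value; for a classifier, it is the fraction of predictions that match the true labels. In measurement terms, accuracy has two definitions: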
1. More commonly, it is a description of systematic errors, a measure of statistical bias; low accuracy causes a difference between a result and a "true" value. ISO calls this trueness.
1. Alternatively, ISO defines[[1](https://en.wikipedia.org/wiki/Accuracy_and_precision)] accuracy as describing a combination of both types of observational error above (random and systematic), so high accuracy requires both high precision and high trueness.

In [None]:
# One-row summary table holding the train and test accuracy of the model.
model1 = pd.DataFrame(
    [['Naive Bayes', train_acc_nbcla, test_acc_nbcla]],
    columns=['Model', 'Train Score', 'Test Score'],
)
# Show the table ordered by test score, best first (rendered as the cell output).
model1.sort_values('Test Score', ascending=False)

# Precision and Recall

In measurement terms, precision is a description of random errors, a measure of statistical variability.
In simpler terms, given a set of data points from repeated measurements of the same quantity, the set can be said to be accurate if their average is close to the true value of the quantity being measured, while the set can be said to be precise if the values are close to each other. For a classifier, precision is the fraction of samples predicted as positive that are actually positive (true positives divided by true positives + false positives). Recall is defined as the fraction of relevant items retrieved (in information retrieval, relevant documents) compared to the total number of relevant items (true positives divided by true positives + false negatives).

In [None]:
from sklearn.metrics import average_precision_score

# Average precision summarises the precision-recall curve over all decision
# thresholds, so it needs a continuous score per sample. Hard 0/1 predictions
# would reduce the curve to a single operating point. Column 1 of
# predict_proba is the probability of the positive class (label 1).
average_precision = average_precision_score(
    Y_test, nbcla.predict_proba(X_test)[:, 1]
)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

In [None]:

from sklearn import metrics
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# plot_precision_recall_curve was removed from newer scikit-learn releases in
# favour of PrecisionRecallDisplay.from_estimator; use whichever this
# installation provides so the cell runs on both old and new versions.
_pr_display = getattr(metrics, 'PrecisionRecallDisplay', None)
# The curve is drawn on the test split so that it matches the AP in the title,
# which is computed from Y_test.
if hasattr(_pr_display, 'from_estimator'):
    disp = _pr_display.from_estimator(nbcla, X_test, Y_test)
else:
    disp = metrics.plot_precision_recall_curve(nbcla, X_test, Y_test)
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

# ROC Curve
A ROC curve is a graphical plot that illustrates the diagnostic ability of a binary classifier system as its discrimination threshold is varied.

In [None]:
from sklearn.metrics import roc_curve


# Naive Bayes Classification
# The ROC curve is built from the predicted probability of the positive class
# (column 1 of predict_proba), not from hard class labels.
Y_predict3_proba = nbcla.predict_proba(X_test)
Y_predict3_proba = Y_predict3_proba[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, Y_predict3_proba)
# NOTE(review): slot 2 of a 3x3 grid together with the oversized
# subplots_adjust below looks like a layout meant for comparing several
# models — confirm before simplifying it to a single figure.
plt.subplot(332)
# Dashed diagonal: reference line where tpr equals fpr.
plt.plot([0,1],[0,1],'k--')
# Label the curve after the model that produced it and show the legend,
# which was previously never drawn.
plt.plot(fpr,tpr, label='Naive Bayes')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve Naive Bayes')
plt.legend(loc='lower right')
plt.grid(True)
plt.subplots_adjust(top=2, bottom=0.08, left=0.10, right=1.4, hspace=0.45, wspace=0.45)
plt.show()