# Modules required to do credit card fraud analysis

In [17]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

## Loading and reading data

In [2]:
df = pd.read_csv("creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Shape and information of data

In [3]:
df.shape

(284807, 31)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

## Checking for missing values

In [5]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
df.Class.value_counts(dropna=False)

0    284315
1       492
Name: Class, dtype: int64

## Splitting the data into input parameters and target parameters

In [7]:
x = df.iloc[: , 1:30].values
y = df.iloc[:, 30].values

In [9]:
print("Input Shape : ", x.shape)
print("Output Shape : ", y.shape)

Input Shape :  (284807, 29)
Output Shape :  (284807,)


## Splitting the data into test parameters and train parameters

In [10]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state = 0)
print("xtrain.shape : ", xtrain.shape)
print("xtest.shape  : ", xtest.shape)
print("ytrain.shape : ", ytrain.shape)
print("xtest.shape  : ", xtest.shape)

xtrain.shape :  (213605, 29)
xtest.shape  :  (71202, 29)
ytrain.shape :  (213605,)
xtest.shape  :  (71202, 29)


## Standerdising the imbalanced dataset

In [11]:
sc_x = StandardScaler()
xtrain = sc_x.fit_transform(xtrain)
xtest = sc_x.transform(xtest)
print("Standardised Training Set : \n", xtrain[0])
print("Length of Train set:", len(xtrain))
print("Standardised Testing Set : \n", xtest[0])
print("Length of Test set:", len(xtest))

Standardised Training Set : 
 [ 1.04272047  0.06657394 -1.19051456  0.05060912  0.18235446 -1.31399333
  0.58133086 -0.40257892 -0.09319222  0.16481198  1.60036637  1.18028602
 -0.24273404  1.08764203 -0.35935009 -0.76863613 -0.28881862 -0.39536117
  0.13774039 -0.34055771  0.32484688  1.13026957  0.03716189  0.90724443
  0.61754959  0.39904973 -0.21031503 -0.2607924  -0.35356699]
Length of Train set: 213605
Standardised Testing Set : 
 [-0.16231908  0.64204284 -0.0340602  -0.42857654  0.88841768 -0.06927409
  0.91303517 -0.11320903 -0.16026301 -1.52155306 -1.09536149  0.19785178
  1.16136332 -1.86651646 -0.26840412 -0.06941666  1.00181025  0.45021705
  1.04837163  0.24329246 -0.29091123 -0.60071305 -0.41993263 -0.07379807
  0.40741336  0.01410364  0.26301127  0.46235694 -0.18685593]
Length of Test set: 71202


## Validating the split data

In [12]:
len(xtest)+len(xtrain) == df.shape[0]

True

## Decision Tree Classifier(DTC)

In [13]:
dt_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dt_classifier.fit(xtrain, ytrain)
prediction1 = dt_classifier.predict(xtest)
cm_decision = confusion_matrix(ytest, prediction1)
print("confusion Marix : \n", cm_decision)
print('\n Accuracy: {}% \n'.format(accuracy_score(ytest, prediction1) * 100))
print('\n',classification_report(ytest, prediction1))

confusion Marix : 
 [[71058    24]
 [   25    95]]

 Accuracy: 99.93118170837899% 


               precision    recall  f1-score   support

           0       1.00      1.00      1.00     71082
           1       0.80      0.79      0.79       120

    accuracy                           1.00     71202
   macro avg       0.90      0.90      0.90     71202
weighted avg       1.00      1.00      1.00     71202



## Support Vector Classifier (SVC)

In [14]:
svc_classifier = SVC(kernel = 'rbf', random_state =0)
svc_classifier.fit(xtrain, ytrain)
prediction2 = svc_classifier.predict(xtest)
cm2 = confusion_matrix(ytest, prediction2)
print("Confusion Matrix : \n\n", cm2)
print('\n Accuracy: {}% \n'.format(accuracy_score(ytest, prediction2) * 100))
print('\n',classification_report(ytest, prediction2))

Confusion Matrix : 

 [[71077     5]
 [   45    75]]

 Accuracy: 99.92977725344794% 


               precision    recall  f1-score   support

           0       1.00      1.00      1.00     71082
           1       0.94      0.62      0.75       120

    accuracy                           1.00     71202
   macro avg       0.97      0.81      0.87     71202
weighted avg       1.00      1.00      1.00     71202



## Logistic Regression(LR)

In [16]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state = 0)
LR.fit(xtrain, ytrain)
prediction3 = LR.predict(xtest)
cm3 = confusion_matrix(ytest, prediction3)
print("Confusion Matrix : \n\n", cm3)
print('\n Accuracy: {}% \n'.format(accuracy_score(ytest, prediction3) * 100))
print('\n',classification_report(ytest, prediction3))

Confusion Matrix : 

 [[71071    11]
 [   39    81]]

 Accuracy: 99.92977725344794% 


               precision    recall  f1-score   support

           0       1.00      1.00      1.00     71082
           1       0.88      0.68      0.76       120

    accuracy                           1.00     71202
   macro avg       0.94      0.84      0.88     71202
weighted avg       1.00      1.00      1.00     71202



### From the analysis it is understood that 'Desicion Tree Classifier' has higher accuracy than 
### 'Support Vector Classifier' and 'Logistic Regression'.
### Also, SVC and LR have similar accuracy as both of the models use similar loss functions.