# **Credit Card Fraud Detection**

In [131]:
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [132]:
#loading the dataset to pandas dataframe
df = pd.read_csv('/content/creditcard.csv')

In [133]:
#printing first 5 rows of the dataframe
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [134]:
#information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25838 entries, 0 to 25837
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    25838 non-null  int64  
 1   V1      25838 non-null  float64
 2   V2      25838 non-null  float64
 3   V3      25837 non-null  float64
 4   V4      25837 non-null  float64
 5   V5      25837 non-null  float64
 6   V6      25837 non-null  float64
 7   V7      25837 non-null  float64
 8   V8      25837 non-null  float64
 9   V9      25837 non-null  float64
 10  V10     25837 non-null  float64
 11  V11     25837 non-null  float64
 12  V12     25837 non-null  float64
 13  V13     25837 non-null  float64
 14  V14     25837 non-null  float64
 15  V15     25837 non-null  float64
 16  V16     25837 non-null  float64
 17  V17     25837 non-null  float64
 18  V18     25837 non-null  float64
 19  V19     25837 non-null  float64
 20  V20     25837 non-null  float64
 21  V21     25837 non-null  float64
 22

In [135]:
#checking the number of missing values
df.isnull().sum()

Time      0
V1        0
V2        0
V3        1
V4        1
V5        1
V6        1
V7        1
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [136]:
df.shape

(25838, 31)

##**Handling The Missing Value**

In [137]:
#drop the missing rows
df.dropna(inplace=True)
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25832,33784,1.261980,-0.325252,-0.143104,-1.105445,-0.156888,0.006692,-0.258470,0.171292,1.351646,...,-0.184571,-0.361529,-0.094535,-0.843711,0.618429,-0.742185,0.061109,-0.001626,1.00,0.0
25833,33785,-0.187402,0.445507,2.199225,3.030535,-0.361376,1.095100,-0.119594,0.245950,-0.371890,...,0.154001,0.813468,0.377732,0.201209,-1.671394,-0.064077,0.067891,0.021521,42.33,0.0
25834,33785,1.237064,0.312096,0.191934,0.496764,-0.129777,-0.551625,-0.007160,-0.048880,-0.240393,...,-0.255114,-0.759423,0.072774,-0.035961,0.238605,0.096858,-0.025772,0.017772,1.79,0.0
25835,33785,1.245292,-0.594493,0.275982,-0.578346,-0.920599,-0.865570,-0.321689,-0.217249,-1.007295,...,0.303333,0.736260,-0.102900,0.452253,0.510125,-0.114928,0.003129,0.021850,70.75,0.0


In [138]:
#distribution of legit and fraudulent transactions
#Conclusion - There are very less data points for fraudulent transactions. Hence model can be errorneous
df['Class'].value_counts()

Class
0.0    25749
1.0       88
Name: count, dtype: int64

##**Separating Data for Analysis**

In [139]:
#0 ---> legit
#1 ---> fraud
legit = df[df.Class == 0]
fraud = df[df.Class == 1]

In [140]:
#statistical measures of the data
legit.Amount.describe()

count    25749.000000
mean        75.726227
std        219.488324
min          0.000000
25%          6.200000
50%         18.960000
75%         67.500000
max       7879.420000
Name: Amount, dtype: float64

In [141]:
fraud.Amount.describe()

count      88.000000
mean      100.010000
std       265.845031
min         0.000000
25%         1.000000
50%         1.000000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [142]:
#noticing the difference between the means of fraud and legit transactions
#huge difference in the mean values
df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,19380.491087,-0.197045,0.145743,0.767372,0.208854,-0.17442,0.088572,-0.100712,0.011278,0.453641,...,0.039986,-0.03957,-0.131409,-0.038716,0.012076,0.12867,0.022883,0.010424,0.003824,75.726227
1.0,17935.875,-8.613716,6.376169,-12.221731,6.231847,-6.027247,-2.48708,-8.308784,4.351326,-2.987199,...,0.714069,0.539387,-0.381823,-0.350615,-0.25297,0.346695,0.17976,0.856336,0.100578,100.01


###*since there is uneven sample of fraud and legit transactions, we will perform under sampling which will reduce the no. of legit transactions to even out the data and achieve better results.*

##**Under Sampling**

In [143]:
#create a new legit sample set with 88 rows of legit data
legit_sample = legit.sample(n=88)

In [144]:
#conacatenate the legit_sample and fraud data
new_df = pd.concat([legit_sample, fraud], axis=0)
new_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
2291,1835,-4.174416,2.748653,-0.776032,-2.024336,-1.54975,0.011858,0.564681,0.680609,2.448067,...,-0.63972,-0.40121,0.034804,-0.310505,0.34134,0.777287,1.070616,0.459016,153.76,0.0
11506,19902,1.442948,-0.52928,0.398502,-0.366609,-0.879701,-0.495975,-0.749817,-0.139545,1.074936,...,-0.225115,-0.357382,-0.067422,-0.209998,0.564233,-0.245127,-0.021123,-0.002161,5.0,0.0
7524,10285,1.132864,0.379761,0.67857,1.140004,-0.036583,-0.030695,-0.12368,0.006221,0.997185,...,-0.292208,-0.59713,0.120025,-0.067999,0.256047,-0.659405,0.01203,0.012008,10.69,0.0
15560,26954,-0.39317,1.061871,1.208159,-0.290506,0.615807,0.083701,0.692449,0.061387,-0.512063,...,-0.242669,-0.54263,-0.108975,-0.755168,-0.117059,0.149146,0.287971,0.111417,3.57,0.0
23004,32538,0.997286,-0.090812,0.204585,1.261164,0.001744,0.361995,0.064309,0.129303,0.034339,...,-0.024581,-0.039185,-0.183244,-0.281634,0.651185,-0.318287,0.021624,0.015997,84.88,0.0


In [145]:
new_df.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
18773,29753,0.269614,3.549755,-5.810353,5.80937,1.538808,-2.269219,-0.824203,0.35107,-3.759059,...,0.371121,-0.32229,-0.549856,-0.520629,1.37821,0.564714,0.553255,0.4024,0.68,1.0
18809,29785,0.923764,0.344048,-2.880004,1.72168,-3.019565,-0.639736,-3.801325,1.299096,0.864065,...,0.899931,1.481271,0.725266,0.17696,-1.815638,-0.536517,0.489035,-0.049729,30.3,1.0
20198,30852,-2.830984,0.885657,1.19993,2.861292,0.321669,0.289966,1.76776,-2.45105,0.069736,...,0.546589,0.334971,0.172106,0.62359,-0.527114,-0.079215,-2.532445,0.311177,104.81,1.0
23308,32686,0.287953,1.728735,-1.652173,3.813544,-1.090927,-0.984745,-2.202318,0.555088,-2.033892,...,0.262202,-0.633528,0.092891,0.187613,0.368708,-0.132474,0.576561,0.309843,0.0,1.0
23422,32745,-2.179135,0.020218,-2.182733,2.572046,-3.663733,0.081568,0.268049,0.660437,-2.374027,...,1.026421,0.299614,1.6568,0.328433,0.106457,0.691775,0.196779,0.241085,717.15,1.0


In [146]:
new_df.shape

(176, 31)

In [147]:
new_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,18600.159091,-0.2894,0.135669,0.779812,0.135618,0.065514,0.327273,-0.011638,0.107303,0.281033,...,0.042615,-0.023564,-0.088484,-0.042011,-0.120948,0.136099,0.045778,0.031872,0.026154,77.709091
1.0,17935.875,-8.613716,6.376169,-12.221731,6.231847,-6.027247,-2.48708,-8.308784,4.351326,-2.987199,...,0.714069,0.539387,-0.381823,-0.350615,-0.25297,0.346695,0.17976,0.856336,0.100578,100.01


##**Splitting the data in features and targets**

In [148]:
X = new_df.drop(columns='Class', axis=1)
Y = new_df['Class']

In [149]:
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
2291,1835,-4.174416,2.748653,-0.776032,-2.024336,-1.549750,0.011858,0.564681,0.680609,2.448067,...,1.108400,-0.639720,-0.401210,0.034804,-0.310505,0.341340,0.777287,1.070616,0.459016,153.76
11506,19902,1.442948,-0.529280,0.398502,-0.366609,-0.879701,-0.495975,-0.749817,-0.139545,1.074936,...,-0.072876,-0.225115,-0.357382,-0.067422,-0.209998,0.564233,-0.245127,-0.021123,-0.002161,5.00
7524,10285,1.132864,0.379761,0.678570,1.140004,-0.036583,-0.030695,-0.123680,0.006221,0.997185,...,-0.160497,-0.292208,-0.597130,0.120025,-0.067999,0.256047,-0.659405,0.012030,0.012008,10.69
15560,26954,-0.393170,1.061871,1.208159,-0.290506,0.615807,0.083701,0.692449,0.061387,-0.512063,...,0.164125,-0.242669,-0.542630,-0.108975,-0.755168,-0.117059,0.149146,0.287971,0.111417,3.57
23004,32538,0.997286,-0.090812,0.204585,1.261164,0.001744,0.361995,0.064309,0.129303,0.034339,...,-0.008367,-0.024581,-0.039185,-0.183244,-0.281634,0.651185,-0.318287,0.021624,0.015997,84.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18773,29753,0.269614,3.549755,-5.810353,5.809370,1.538808,-2.269219,-0.824203,0.351070,-3.759059,...,0.310525,0.371121,-0.322290,-0.549856,-0.520629,1.378210,0.564714,0.553255,0.402400,0.68
18809,29785,0.923764,0.344048,-2.880004,1.721680,-3.019565,-0.639736,-3.801325,1.299096,0.864065,...,0.170872,0.899931,1.481271,0.725266,0.176960,-1.815638,-0.536517,0.489035,-0.049729,30.30
20198,30852,-2.830984,0.885657,1.199930,2.861292,0.321669,0.289966,1.767760,-2.451050,0.069736,...,-1.016923,0.546589,0.334971,0.172106,0.623590,-0.527114,-0.079215,-2.532445,0.311177,104.81
23308,32686,0.287953,1.728735,-1.652173,3.813544,-1.090927,-0.984745,-2.202318,0.555088,-2.033892,...,0.265250,0.262202,-0.633528,0.092891,0.187613,0.368708,-0.132474,0.576561,0.309843,0.00


In [150]:
Y

2291     0.0
11506    0.0
7524     0.0
15560    0.0
23004    0.0
        ... 
18773    1.0
18809    1.0
20198    1.0
23308    1.0
23422    1.0
Name: Class, Length: 176, dtype: float64

##**Split the data in train and test data**

In [151]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [152]:
X_train.shape

(140, 30)

In [153]:
X_test.shape

(36, 30)

In [154]:
Y_train.shape

(140,)

In [155]:
Y_test.shape

(36,)

##**Model Training**


###***Logistic Regression***

In [156]:
model = LogisticRegression(max_iter=1000)

In [157]:
#training the logistic regression model with training data
model.fit(X_train, Y_train)

##**Model Evaluation**

###**Calculating Accuracy Score**

In [158]:
#accuracy on train data
X_train_prediction = model.predict(X_train)
accuracy_on_training_data = accuracy_score(X_train_prediction, Y_train)

In [159]:
accuracy_on_training_data

1.0

In [160]:
#accuracy on test data
X_test_prediction = model.predict(X_test)
accuracy_on_test_data = accuracy_score(X_test_prediction, Y_test)

In [161]:
accuracy_on_test_data

0.9722222222222222