# Credit Card Fraud Detection

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
data = pd.read_csv('creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150632 entries, 0 to 150631
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    150632 non-null  int64  
 1   V1      150632 non-null  float64
 2   V2      150632 non-null  float64
 3   V3      150632 non-null  float64
 4   V4      150632 non-null  float64
 5   V5      150632 non-null  float64
 6   V6      150632 non-null  float64
 7   V7      150632 non-null  float64
 8   V8      150631 non-null  float64
 9   V9      150631 non-null  float64
 10  V10     150631 non-null  float64
 11  V11     150631 non-null  float64
 12  V12     150631 non-null  float64
 13  V13     150631 non-null  float64
 14  V14     150631 non-null  float64
 15  V15     150631 non-null  float64
 16  V16     150631 non-null  float64
 17  V17     150631 non-null  float64
 18  V18     150631 non-null  float64
 19  V19     150631 non-null  float64
 20  V20     150631 non-null  float64
 21  V21     15

In [6]:
#checking missing values in each column

data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [7]:
#distribution of legit transaction and fradulent transaction
data['Class'].value_counts()

0.0    150337
1.0       294
Name: Class, dtype: int64

In [8]:
#seperating the data for analysis
legit = data[data.Class == 0]
fraud = data[data.Class == 1]

In [9]:
legit

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150626,93788,-0.754576,0.753291,1.780803,-1.252487,0.655902,0.318362,0.479174,0.027790,1.191168,...,-0.005282,0.174430,-0.463886,0.090478,0.643110,0.656022,-0.061843,0.043862,24.85,0.0
150627,93789,-0.487808,0.524641,-1.976990,0.215200,1.706376,-2.220200,1.011071,-0.252453,0.716540,...,0.488790,1.615337,-0.031607,0.033280,-1.075134,-0.188378,0.434657,0.185544,15.88,0.0
150628,93790,-4.948835,4.463739,-4.315625,-0.876520,-2.068503,-0.587427,-2.512790,3.852223,1.211863,...,-0.302062,-1.216998,0.431651,-1.497677,0.232590,0.210221,-0.204348,-0.024772,9.72,0.0
150629,93798,1.963076,0.761481,-0.695665,3.900855,0.564233,-0.388571,0.243059,-0.305132,0.169108,...,0.067740,0.454708,0.056026,-0.183339,0.121210,0.100941,-0.073290,-0.059967,4.58,0.0


In [11]:
legit.shape

(150337, 31)

In [10]:
fraud

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,406,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1.0
623,472,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1.0
4920,4462,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,239.93,1.0
6108,6986,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.00,1.0
6329,7519,1.234235,3.019740,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149587,91524,1.954852,1.630056,-4.337200,2.378367,2.113348,-1.583851,0.653745,-0.192892,1.217608,...,-0.474437,-0.974625,-0.048155,-0.023524,0.362192,-0.570709,0.025619,0.081880,1.00,1.0
149600,91554,-5.100256,3.633442,-3.843919,0.183208,-1.183997,1.602139,-3.005953,-8.645038,1.285458,...,8.280439,-2.797150,1.090707,-0.159260,0.532156,-0.497126,0.943622,0.553581,261.22,1.0
149869,92092,-1.108478,3.448953,-6.216972,3.021052,-0.529901,-2.551375,-2.001743,1.092432,-0.836098,...,0.825951,1.144170,0.208559,-0.295497,-0.690232,-0.364749,0.229327,0.208830,18.00,1.0
149874,92102,-1.662937,3.253892,-7.040485,2.266456,-4.177649,-0.746925,-0.248337,1.091157,-0.307137,...,0.450381,0.521162,0.308325,-0.318012,-1.255362,-0.691963,0.264878,-0.130445,600.73,1.0


In [12]:
fraud.shape

(294, 31)

In [13]:
#Statistical measures of the data
legit.Amount.describe()

count    150337.000000
mean         89.060438
std         242.885780
min           0.000000
25%           5.800000
50%          22.800000
75%          79.150000
max       19656.530000
Name: Amount, dtype: float64

In [14]:
fraud.Amount.describe()

count     294.000000
mean      120.787653
std       243.689320
min         0.000000
25%         1.000000
50%        10.685000
75%       105.695000
max      1809.680000
Name: Amount, dtype: float64

In [16]:
# compare the values for both transactions
data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,54371.534506,-0.232065,0.032225,0.641204,0.120976,-0.243064,0.074549,-0.092698,0.054327,-0.057038,...,0.040169,-0.040026,-0.113084,-0.029453,0.012249,0.120349,0.022038,0.001146,0.002549,89.060438
1.0,47016.714286,-5.295684,3.789179,-6.914529,4.396624,-3.758181,-1.430356,-5.663365,1.360196,-2.526687,...,0.246076,1.250714,-0.293445,-0.093689,-0.111138,0.194626,0.062231,0.504392,0.094689,120.787653


# Undersampling

 build a sample dataset containing similar diatribution of normal transaction and fradulent transaction

In [17]:
legit_sample= legit.sample(n=492)
#concatenating two dataframes
new_dataset = pd.concat([legit_sample,fraud], axis=0)
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
135378,81226,-0.708327,0.583911,1.884921,0.049618,0.235578,0.124778,0.839703,-0.004919,-0.313102,...,-0.101823,0.068348,-0.034074,0.263861,-0.125126,0.206892,0.153671,-0.054011,51.0,0.0
29129,35333,-1.42793,0.510706,0.415764,0.473698,2.016358,4.557816,-0.535437,1.055101,1.058707,...,-0.475769,-0.757971,0.072431,1.002053,-0.134339,-0.541458,-0.63031,-0.243276,6.58,0.0
102339,68156,1.075865,0.26721,-0.069417,1.152233,0.137196,-0.477157,0.44053,-0.087697,-0.614693,...,0.124274,0.291775,-0.143962,0.23165,0.694702,-0.285472,-0.006268,0.007964,51.45,0.0
126978,78166,-1.6311,-0.14344,0.7784,0.46042,0.64721,0.03191,0.510259,0.420657,-0.13374,...,0.030519,0.261555,0.375016,-0.336522,-0.158449,-0.565638,-0.166431,-0.213542,81.51,0.0
56918,47669,-3.68231,3.17127,-0.724391,-1.836038,-1.462323,-0.891451,-0.692339,1.499636,1.351804,...,-0.337315,-0.367555,0.150649,-0.002149,0.231269,0.6655,-0.167068,-0.606388,2.92,0.0


In [18]:
new_dataset['Class'].value_counts()

0.0    492
1.0    294
Name: Class, dtype: int64

In [19]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,53596.056911,-0.129778,-0.041353,0.557614,0.078431,-0.217178,0.088692,-0.122334,0.039385,-0.167451,...,0.047337,-0.042301,-0.086205,-0.032896,0.018695,0.149269,0.002739,0.008736,-0.012669,92.434675
1.0,47016.714286,-5.295684,3.789179,-6.914529,4.396624,-3.758181,-1.430356,-5.663365,1.360196,-2.526687,...,0.246076,1.250714,-0.293445,-0.093689,-0.111138,0.194626,0.062231,0.504392,0.094689,120.787653


In [20]:
#splitting the data into features and targets
x = new_dataset.drop(columns = 'Class',axis = 1)
y= new_dataset['Class']
x

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
135378,81226,-0.708327,0.583911,1.884921,0.049618,0.235578,0.124778,0.839703,-0.004919,-0.313102,...,0.307556,-0.101823,0.068348,-0.034074,0.263861,-0.125126,0.206892,0.153671,-0.054011,51.00
29129,35333,-1.427930,0.510706,0.415764,0.473698,2.016358,4.557816,-0.535437,1.055101,1.058707,...,0.032285,-0.475769,-0.757971,0.072431,1.002053,-0.134339,-0.541458,-0.630310,-0.243276,6.58
102339,68156,1.075865,0.267210,-0.069417,1.152233,0.137196,-0.477157,0.440530,-0.087697,-0.614693,...,-0.087560,0.124274,0.291775,-0.143962,0.231650,0.694702,-0.285472,-0.006268,0.007964,51.45
126978,78166,-1.631100,-0.143440,0.778400,0.460420,0.647210,0.031910,0.510259,0.420657,-0.133740,...,-0.343337,0.030519,0.261555,0.375016,-0.336522,-0.158449,-0.565638,-0.166431,-0.213542,81.51
56918,47669,-3.682310,3.171270,-0.724391,-1.836038,-1.462323,-0.891451,-0.692339,1.499636,1.351804,...,0.922987,-0.337315,-0.367555,0.150649,-0.002149,0.231269,0.665500,-0.167068,-0.606388,2.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149587,91524,1.954852,1.630056,-4.337200,2.378367,2.113348,-1.583851,0.653745,-0.192892,1.217608,...,-0.230640,-0.474437,-0.974625,-0.048155,-0.023524,0.362192,-0.570709,0.025619,0.081880,1.00
149600,91554,-5.100256,3.633442,-3.843919,0.183208,-1.183997,1.602139,-3.005953,-8.645038,1.285458,...,-2.806302,8.280439,-2.797150,1.090707,-0.159260,0.532156,-0.497126,0.943622,0.553581,261.22
149869,92092,-1.108478,3.448953,-6.216972,3.021052,-0.529901,-2.551375,-2.001743,1.092432,-0.836098,...,-0.068598,0.825951,1.144170,0.208559,-0.295497,-0.690232,-0.364749,0.229327,0.208830,18.00
149874,92102,-1.662937,3.253892,-7.040485,2.266456,-4.177649,-0.746925,-0.248337,1.091157,-0.307137,...,-0.842178,0.450381,0.521162,0.308325,-0.318012,-1.255362,-0.691963,0.264878,-0.130445,600.73


In [21]:
y

135378    0.0
29129     0.0
102339    0.0
126978    0.0
56918     0.0
         ... 
149587    1.0
149600    1.0
149869    1.0
149874    1.0
150601    1.0
Name: Class, Length: 786, dtype: float64

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)
print(x.shape,x_train.shape,x_test.shape)

(786, 30) (628, 30) (158, 30)


In [23]:
#Model training
#logistic regression
model = LogisticRegression()
model.fit(x_train,y_train)

In [24]:
#accuracy on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

In [25]:
print('Accuracy on training data: ',training_data_accuracy)

Accuracy on training data:  0.9267515923566879


In [26]:
#accuracy on test data
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)
print('Accuracy on test data: ',test_data_accuracy)

Accuracy on test data:  0.9367088607594937
