#### Importing dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

##### Loading the dataset to a pandas Dataframe

In [3]:
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [4]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
150627,93789,-0.487808,0.524641,-1.97699,0.2152,1.706376,-2.2202,1.011071,-0.252453,0.71654,...,0.48879,1.615337,-0.031607,0.03328,-1.075134,-0.188378,0.434657,0.185544,15.88,0.0
150628,93790,-4.948835,4.463739,-4.315625,-0.87652,-2.068503,-0.587427,-2.51279,3.852223,1.211863,...,-0.302062,-1.216998,0.431651,-1.497677,0.23259,0.210221,-0.204348,-0.024772,9.72,0.0
150629,93798,1.963076,0.761481,-0.695665,3.900855,0.564233,-0.388571,0.243059,-0.305132,0.169108,...,0.06774,0.454708,0.056026,-0.183339,0.12121,0.100941,-0.07329,-0.059967,4.58,0.0
150630,93798,0.000511,1.064202,-0.403154,-0.629699,1.218252,-0.233102,0.870353,-0.143075,1.567454,...,-0.569978,-1.103569,0.041873,-0.014577,-0.437402,0.065077,0.127707,-0.105725,5.25,0.0
150631,93798,-0.796099,0.85551,-1.547924,0.276334,0.257263,-1.366656,0.124734,,,...,,,,,,,,,,


#### Dataset Information

In [6]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150632 entries, 0 to 150631
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    150632 non-null  int64  
 1   V1      150632 non-null  float64
 2   V2      150632 non-null  float64
 3   V3      150632 non-null  float64
 4   V4      150632 non-null  float64
 5   V5      150632 non-null  float64
 6   V6      150632 non-null  float64
 7   V7      150632 non-null  float64
 8   V8      150631 non-null  float64
 9   V9      150631 non-null  float64
 10  V10     150631 non-null  float64
 11  V11     150631 non-null  float64
 12  V12     150631 non-null  float64
 13  V13     150631 non-null  float64
 14  V14     150631 non-null  float64
 15  V15     150631 non-null  float64
 16  V16     150631 non-null  float64
 17  V17     150631 non-null  float64
 18  V18     150631 non-null  float64
 19  V19     150631 non-null  float64
 20  V20     150631 non-null  float64
 21  V21     15

##### Checking the number of missing values in each column

In [7]:
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

Distribution of legit transactions and fradulent transactions

In [8]:
credit_card_data['Class'].value_counts()

0.0    150337
1.0       294
Name: Class, dtype: int64

This Dataset is highly unbalanced

0 --> Normal Transaction
1 --> fraudulent transaction

##### Seperating the data for analysis

In [10]:
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [11]:
print(legit.shape)
print(fraud.shape)

(150337, 31)
(294, 31)


In [12]:
legit.Amount.describe()

count    150337.000000
mean         89.060438
std         242.885780
min           0.000000
25%           5.800000
50%          22.800000
75%          79.150000
max       19656.530000
Name: Amount, dtype: float64

In [13]:
fraud.Amount.describe()

count     294.000000
mean      120.787653
std       243.689320
min         0.000000
25%         1.000000
50%        10.685000
75%       105.695000
max      1809.680000
Name: Amount, dtype: float64

In [14]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,54371.534506,-0.232065,0.032225,0.641204,0.120976,-0.243064,0.074549,-0.092698,0.054327,-0.057038,...,0.040169,-0.040026,-0.113084,-0.029453,0.012249,0.120349,0.022038,0.001146,0.002549,89.060438
1.0,47016.714286,-5.295684,3.789179,-6.914529,4.396624,-3.758181,-1.430356,-5.663365,1.360196,-2.526687,...,0.246076,1.250714,-0.293445,-0.093689,-0.111138,0.194626,0.062231,0.504392,0.094689,120.787653


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [15]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames



In [16]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [17]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
136524,81746,-1.477225,-0.081141,1.199358,-0.360418,1.890102,-1.369992,0.167767,-0.246464,0.06129,...,-0.391069,-0.841733,-0.361555,-0.476902,-0.227314,-0.047459,0.187405,-0.10469,2.69,0.0
42828,41263,1.23559,0.496827,-0.128144,1.051554,0.274497,-0.710731,0.512086,-0.249315,-0.368801,...,0.036725,0.228147,-0.121928,0.110645,0.799356,-0.268581,0.012415,0.008651,0.89,0.0
105812,69702,-0.8,0.545554,2.044824,0.156834,-0.170901,0.042464,0.632999,0.185602,-0.086758,...,-0.194232,-0.581365,-0.040504,0.052023,0.161375,0.245341,0.012581,0.064053,59.95,0.0
93853,64615,1.036272,0.006169,0.462614,1.361964,-0.278645,0.020213,-0.018274,0.138963,0.096597,...,-0.020772,0.030289,-0.075436,0.218995,0.573448,-0.344346,0.025909,0.012984,44.95,0.0
106550,70007,-1.109787,1.179688,1.143622,1.506532,-0.542649,-0.146282,0.605552,0.341341,-0.604607,...,0.060984,0.429438,-0.06017,0.657554,-0.118874,-0.269815,0.248102,0.073373,93.15,0.0


In [18]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
149587,91524,1.954852,1.630056,-4.3372,2.378367,2.113348,-1.583851,0.653745,-0.192892,1.217608,...,-0.474437,-0.974625,-0.048155,-0.023524,0.362192,-0.570709,0.025619,0.08188,1.0,1.0
149600,91554,-5.100256,3.633442,-3.843919,0.183208,-1.183997,1.602139,-3.005953,-8.645038,1.285458,...,8.280439,-2.79715,1.090707,-0.15926,0.532156,-0.497126,0.943622,0.553581,261.22,1.0
149869,92092,-1.108478,3.448953,-6.216972,3.021052,-0.529901,-2.551375,-2.001743,1.092432,-0.836098,...,0.825951,1.14417,0.208559,-0.295497,-0.690232,-0.364749,0.229327,0.20883,18.0,1.0
149874,92102,-1.662937,3.253892,-7.040485,2.266456,-4.177649,-0.746925,-0.248337,1.091157,-0.307137,...,0.450381,0.521162,0.308325,-0.318012,-1.255362,-0.691963,0.264878,-0.130445,600.73,1.0
150601,93742,-3.291125,4.401194,-8.394212,4.45358,-4.790055,-4.240182,-9.219001,1.97403,-2.912943,...,2.102343,0.59737,-0.328086,0.445752,0.585281,-0.399005,2.116004,1.050744,1.0,1.0


In [19]:
new_dataset['Class'].value_counts()

0.0    492
1.0    294
Name: Class, dtype: int64

In [20]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,51901.896341,-0.493867,0.037236,0.648659,0.174006,-0.141039,-0.033332,-0.105549,0.09749,-0.022384,...,0.073995,-0.0376,-0.14778,-0.032312,-0.005453,0.107209,0.016463,0.020716,0.021983,91.025386
1.0,47016.714286,-5.295684,3.789179,-6.914529,4.396624,-3.758181,-1.430356,-5.663365,1.360196,-2.526687,...,0.246076,1.250714,-0.293445,-0.093689,-0.111138,0.194626,0.062231,0.504392,0.094689,120.787653


Splitting the data into Features & Targets

In [23]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [24]:
print(X)

         Time        V1        V2        V3        V4        V5        V6  \
136524  81746 -1.477225 -0.081141  1.199358 -0.360418  1.890102 -1.369992   
42828   41263  1.235590  0.496827 -0.128144  1.051554  0.274497 -0.710731   
105812  69702 -0.800000  0.545554  2.044824  0.156834 -0.170901  0.042464   
93853   64615  1.036272  0.006169  0.462614  1.361964 -0.278645  0.020213   
106550  70007 -1.109787  1.179688  1.143622  1.506532 -0.542649 -0.146282   
...       ...       ...       ...       ...       ...       ...       ...   
149587  91524  1.954852  1.630056 -4.337200  2.378367  2.113348 -1.583851   
149600  91554 -5.100256  3.633442 -3.843919  0.183208 -1.183997  1.602139   
149869  92092 -1.108478  3.448953 -6.216972  3.021052 -0.529901 -2.551375   
149874  92102 -1.662937  3.253892 -7.040485  2.266456 -4.177649 -0.746925   
150601  93742 -3.291125  4.401194 -8.394212  4.453580 -4.790055 -4.240182   

              V7        V8        V9  ...       V20       V21       V22  \


In [25]:
print(Y)

136524    0.0
42828     0.0
105812    0.0
93853     0.0
106550    0.0
         ... 
149587    1.0
149600    1.0
149869    1.0
149874    1.0
150601    1.0
Name: Class, Length: 786, dtype: float64


Split the data into Training data & Testing Data

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [27]:
print(X.shape, X_train.shape, X_test.shape)

(786, 30) (628, 30) (158, 30)


Model Training

Logistic Regression

In [34]:
model = LogisticRegression()

In [None]:
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [36]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [37]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9506369426751592


In [38]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [39]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.930379746835443
