In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [3]:
# loading the dataset to a pandas DataFrame
credit_card_data = pd.read_csv('/content/credit_data.csv')

In [4]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [5]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
43659,41598,-0.538686,-0.635328,-0.041569,-3.022158,0.771776,3.556774,-1.41882,1.371999,-2.475053,...,-0.090409,-0.310254,0.16851,0.955413,-0.427495,-0.375891,0.05101,0.072416,60.01,0.0
43660,41599,-0.349615,-2.860571,0.297766,0.801212,-1.523995,1.16426,0.037484,0.32522,0.862825,...,0.31537,-0.503099,-0.514747,-0.19389,-0.327091,0.87582,-0.144908,0.125368,745.56,0.0
43661,41599,-3.001222,2.899766,0.726874,-0.729992,-0.312792,-0.735557,0.960944,-0.276141,2.128747,...,-0.64554,-0.613956,0.111663,0.338567,0.268357,0.076981,1.184716,0.491066,8.99,0.0
43662,41599,1.042342,-0.390001,-0.46368,-1.737393,0.450611,0.604354,0.068099,0.27803,0.555053,...,-0.137137,-0.286887,0.143336,-0.98681,0.135599,-0.185344,0.03559,-0.008304,48.0,0.0
43663,41600,1.193909,0.067328,0.609239,0.825453,-0.332038,-0.101522,-0.130782,-0.033697,0.424588,...,-0.04995,0.213908,-0.06875,0.182281,0.557841,0.4,,,,


In [6]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43664 entries, 0 to 43663
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    43664 non-null  int64  
 1   V1      43664 non-null  float64
 2   V2      43664 non-null  float64
 3   V3      43664 non-null  float64
 4   V4      43664 non-null  float64
 5   V5      43664 non-null  float64
 6   V6      43664 non-null  float64
 7   V7      43664 non-null  float64
 8   V8      43664 non-null  float64
 9   V9      43664 non-null  float64
 10  V10     43664 non-null  float64
 11  V11     43664 non-null  float64
 12  V12     43664 non-null  float64
 13  V13     43664 non-null  float64
 14  V14     43664 non-null  float64
 15  V15     43664 non-null  float64
 16  V16     43664 non-null  float64
 17  V17     43664 non-null  float64
 18  V18     43664 non-null  float64
 19  V19     43664 non-null  float64
 20  V20     43664 non-null  float64
 21  V21     43664 non-null  float64
 22

In [7]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [8]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,43529
1.0,134


In [9]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [10]:
print(legit.shape)
print(fraud.shape)

(43529, 31)
(134, 31)


In [11]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,43529.0
mean,90.3028
std,238.465356
min,0.0
25%,7.58
50%,24.9
75%,81.4
max,7879.42


In [12]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,134.0
mean,93.928433
std,231.551248
min,0.0
25%,1.0
50%,6.455
75%,99.99
max,1809.68


In [13]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,26892.92961,-0.210136,0.014404,0.735522,0.173494,-0.226584,0.106817,-0.092752,0.039222,0.185126,...,0.046501,-0.02974,-0.109589,-0.039132,0.010036,0.135999,0.022175,0.004776,0.004927,90.3028
1.0,25259.246269,-7.759814,5.619386,-10.672832,6.111351,-5.77614,-2.360976,-8.192938,4.04,-3.571525,...,0.50715,0.931331,-0.270332,-0.315606,-0.096976,0.279383,0.168436,0.687503,0.063472,93.928433


Under-Sampling

Build a sample dataset containing similar distribution of normal transaction and Fraudulent Transactions

Number of Fraudulent Transaction --> 492

In [14]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [15]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [16]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
20963,31384,-3.151183,-1.110894,2.131186,0.53628,0.048715,-0.400256,-0.098679,0.440111,0.319844,...,-0.088252,0.470594,0.758572,0.309622,0.70958,-0.440694,0.008291,-0.177852,100.55,0.0
26302,33995,-0.74399,0.809811,1.892754,-0.512528,0.212477,-0.245269,0.861728,-0.020905,-0.213684,...,-0.255141,-0.672184,-0.268116,-0.111745,0.449079,0.251057,-0.012686,0.052801,27.61,0.0
32721,36938,-0.302422,0.735146,1.878776,1.843682,-0.427169,-0.089387,0.292397,0.105966,-0.055048,...,0.01365,0.213171,-0.18152,0.367244,-0.047155,-0.081612,0.163321,0.1315,23.36,0.0
24898,33433,-1.734894,-4.9945,-0.447054,1.825128,-2.124752,1.518201,1.021934,0.161125,0.704053,...,0.763435,-0.808833,-1.160351,-0.187855,-0.469967,0.264872,-0.237203,0.256371,1450.16,0.0
34224,37569,-1.986349,1.693153,0.60065,0.33007,0.690256,0.208071,1.169273,-0.772293,0.449984,...,0.124676,0.512424,-0.13738,-0.307074,-0.328152,-0.518352,-1.498672,0.033573,19.61,0.0


In [17]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
43061,41353,-15.020981,8.07524,-16.298091,5.66482,-11.918153,-4.246957,-14.716668,9.435084,-6.795398,...,2.525115,-0.832074,-0.186117,0.429781,0.697103,0.056031,-1.310888,-0.707403,34.12,1.0
43160,41397,-14.970346,8.401421,-16.867238,8.252334,-13.56513,-2.782438,-14.263735,9.643419,-7.701499,...,2.714045,-0.101355,-0.439666,0.519514,0.789328,0.064357,-1.621386,-1.104819,273.01,1.0
43204,41413,-15.14045,7.378042,-16.356367,9.194935,-13.466163,-2.958431,-16.165539,10.075254,-7.901821,...,3.147428,0.341678,-1.150162,0.79519,-0.194542,0.145964,-2.45868,-1.189888,106.55,1.0
43428,41505,-16.526507,8.584972,-18.649853,9.505594,-13.793819,-2.832404,-16.701694,7.517344,-8.507059,...,1.190739,-1.12767,-2.358579,0.673461,-1.4137,-0.462762,-2.018575,-1.042804,364.19,1.0
43624,41582,-1.048005,1.300219,-0.180401,2.589843,-1.164794,0.031823,-2.175778,0.699072,-1.140208,...,0.549014,0.624321,-0.136663,0.131738,0.030921,-0.176701,0.504898,0.069882,39.45,1.0


In [18]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,492
1.0,134


In [19]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,26120.926829,-0.203916,-0.008875,0.717271,0.17841,-0.2942,0.068005,-0.165879,0.08421,0.165042,...,0.011393,-0.068688,-0.09403,-0.050116,0.042149,0.156692,0.005695,0.004891,-0.008525,82.302398
1.0,25259.246269,-7.759814,5.619386,-10.672832,6.111351,-5.77614,-2.360976,-8.192938,4.04,-3.571525,...,0.50715,0.931331,-0.270332,-0.315606,-0.096976,0.279383,0.168436,0.687503,0.063472,93.928433


Splitting the data into Featrues & Targets

In [20]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [21]:
print(X)

        Time         V1        V2         V3        V4         V5        V6  \
20963  31384  -3.151183 -1.110894   2.131186  0.536280   0.048715 -0.400256   
26302  33995  -0.743990  0.809811   1.892754 -0.512528   0.212477 -0.245269   
32721  36938  -0.302422  0.735146   1.878776  1.843682  -0.427169 -0.089387   
24898  33433  -1.734894 -4.994500  -0.447054  1.825128  -2.124752  1.518201   
34224  37569  -1.986349  1.693153   0.600650  0.330070   0.690256  0.208071   
...      ...        ...       ...        ...       ...        ...       ...   
43061  41353 -15.020981  8.075240 -16.298091  5.664820 -11.918153 -4.246957   
43160  41397 -14.970346  8.401421 -16.867238  8.252334 -13.565130 -2.782438   
43204  41413 -15.140450  7.378042 -16.356367  9.194935 -13.466163 -2.958431   
43428  41505 -16.526507  8.584972 -18.649853  9.505594 -13.793819 -2.832404   
43624  41582  -1.048005  1.300219  -0.180401  2.589843  -1.164794  0.031823   

              V7         V8        V9  ...       V2

In [22]:
print(Y)

20963    0.0
26302    0.0
32721    0.0
24898    0.0
34224    0.0
        ... 
43061    1.0
43160    1.0
43204    1.0
43428    1.0
43624    1.0
Name: Class, Length: 626, dtype: float64


In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [25]:
print(X.shape, X_train.shape, X_test.shape)

(626, 30) (500, 30) (126, 30)


Model Training

Logistic Regression

In [26]:
model = LogisticRegression()

In [27]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [28]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [29]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.988


In [30]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [31]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9682539682539683
