In [1]:
# Import the dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading the dataset to pandas dataframe
credit_card_data = pd.read_csv('./creditcard.csv')

In [3]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# distribution of legit trasaction & fraudulent transaction
credit_card_data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [6]:
# This dataset is highly unbalanced
# 0 --> normal transaction
# 1 --> fraudulent transaction



# separating the data for analysis
legit= credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [7]:
legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [8]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [9]:
# compare the value for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [24]:
# Under sampling
# Build a sample dataset containing similar distribution of normal transaction and fraudulent transaction
# Number of Fraudulent Transaction --> 492

legit_sample = legit.sample(n=492)

In [25]:
# Concatenating to data frame
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [26]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
40973,40483.0,1.489742,-0.980588,0.171558,-1.66254,-0.877379,0.186702,-1.043378,0.025035,-2.210737,...,-0.171215,-0.040991,-0.041426,-0.652126,0.411169,-0.15478,0.0488,0.002005,10.0,0
275584,166617.0,-1.607295,2.457863,-0.420932,-0.056082,0.445393,0.176219,-0.694881,-4.839829,-0.090757,...,4.332506,-2.063545,0.693695,0.940209,-0.661992,0.292137,0.609901,0.333037,4.99,0
127701,78431.0,1.184085,-0.074912,0.403109,0.046704,-0.684018,-0.968438,-0.056359,-0.074789,-0.040167,...,-0.161136,-0.611954,0.116996,0.552016,0.066608,0.774023,-0.098756,0.000443,29.95,0
20063,30759.0,-0.655723,1.024772,0.322892,1.459715,2.875499,4.17833,0.031661,1.087042,-1.195325,...,-0.11939,-0.384497,-0.363255,0.92388,0.61947,0.226355,0.073639,0.095845,18.59,0
141296,84221.0,-1.206284,0.466352,2.619789,0.296151,-1.020314,0.944852,-0.608231,0.974434,0.306397,...,0.221922,0.847249,-0.276942,0.322134,0.331871,0.725996,-0.050289,-0.011769,30.75,0


In [27]:
new_dataset['Class'].value_counts()

0    492
1    492
Name: Class, dtype: int64

In [28]:
# Spliting the data into features and target
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [30]:
# Model training
model = LogisticRegression()

In [31]:
# training the logistic regression model
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [32]:
# Model evaluation
# Accuracy score
training_data_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, training_data_prediction)
print("Training data accuracy: ", training_data_accuracy)

Training data accuracy:  0.9440914866581956


In [33]:
test_data_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, test_data_prediction)
print("Test data accuracy: ", test_data_accuracy)

Test data accuracy:  0.9035532994923858
