## Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Read the credit-card transactions CSV into a pandas DataFrame.
credit_card_data = pd.read_csv(filepath_or_buffer='./data/creditcard.csv')

In [2]:
# Preview the last five rows of the dataset
# (.sample(5) or .head() can be used instead for a random / leading preview).
credit_card_data.tail(n=5)

NameError: name 'credit_card_data' is not defined

In [5]:
# 'Time' column: seconds elapsed between each transaction and the first transaction in the dataset
# V1 to V28: anonymised features describing each individual transaction
# the original card details cannot be published because they are very sensitive,
# so the dataset provider converted all the features through PCA (Principal Component Analysis) into numerical values, which will be used in this model
# 'Amount' column gives the transaction amount in USD
# 'Class' column describes whether the transaction is legit or fraudulent: 0 - legit ; 1 - fraudulent

In [6]:
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes.
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [8]:
# Count the number of missing values in each column.
credit_card_data.isnull().sum()
# Every column reports 0, so the dataset has no missing values.

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [9]:
# Distribution of legit (0) vs. fraudulent (1) transactions.
credit_card_data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

### This dataset is highly imbalanced
#### 0 ---> Normal transaction
#### 1 ---> Fraudulent transaction
#### Since we have very few data points for fraudulent transactions, a model trained on the raw data would predict most new transactions as normal because of the imbalance, so we need to handle the imbalanced dataset before training

In [10]:
# Separate the transactions by class so each group can be analysed on its own:
# Class == 1 -> fraudulent, Class == 0 -> legit.
fraud = credit_card_data[credit_card_data['Class'] == 1]
legit = credit_card_data[credit_card_data['Class'] == 0]

In [11]:
# Show (rows, columns) of the legit subset, then of the fraud subset, one per line.
print(legit.shape, fraud.shape, sep="\n")

(284315, 31)
(492, 31)


In [12]:
# Summary statistics of the transaction amount for legit transactions.
legit['Amount'].describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [13]:
# Summary statistics of the transaction amount for fraudulent transactions.
fraud['Amount'].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [14]:
# Compare the per-class mean of every column (legit = 0, fraud = 1).
credit_card_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


#### Dealing with imbalanced data
#### Under-sampling
##### Build a sample dataset containing a similar distribution of normal (legit) transactions & fraudulent transactions
#### Number of fraudulent transactions - 492

In [15]:
# Random under-sampling of the majority class: draw as many legit rows as
# there are fraud rows so the two classes end up balanced.
# - n is derived from len(fraud) instead of a hard-coded 492, so the cell
#   stays correct if the dataset changes.
# - random_state pins the draw so Restart & Run All reproduces the same
#   sample (2 matches the seed already used for train_test_split).
legit_sample = legit.sample(n=len(fraud), random_state=2)

In [16]:
# Display the sampled legit transactions (the rendering of a long frame is truncated).
legit_sample

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
107753,70597.0,-0.802727,0.383081,-1.223697,-2.664248,1.835880,3.029579,-0.396046,1.232311,-1.197629,...,-0.392712,-0.646692,0.062147,1.030286,-0.360020,0.859768,0.333820,0.211123,15.00,0
21572,31743.0,-0.603392,-0.096769,1.684518,-1.883595,-0.478288,-0.353147,0.163191,0.049772,-1.504190,...,0.513802,1.204478,-0.233402,0.044849,0.247987,-0.179587,0.057017,0.095721,69.56,0
124452,77292.0,-0.607790,0.786043,1.095003,-0.200869,0.935844,-0.257239,0.769997,0.070761,-0.705499,...,0.140208,0.272980,-0.327529,-0.543442,0.192448,-0.470450,0.094062,0.113432,8.93,0
152838,97479.0,2.321452,-0.297117,-2.161787,-0.850726,0.395098,-1.379737,0.251433,-0.736328,0.202428,...,0.165164,0.790165,-0.156525,-0.622710,0.551755,0.078236,-0.100463,-0.091678,15.95,0
13079,22955.0,-1.561472,0.600965,1.996487,-0.812322,1.380493,-1.072513,0.614452,-0.239384,1.175410,...,-0.482633,-0.968963,-0.302949,-0.035102,0.265334,-0.174210,-0.011199,-0.140410,1.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217504,140915.0,-0.752958,-0.109235,0.474591,-0.649422,-0.328499,-0.425052,1.729860,-0.162955,0.262422,...,0.056863,-0.178236,0.425963,-0.131293,-0.093562,-0.114670,0.023793,0.162614,256.98,0
204563,135335.0,-3.682848,-4.270768,1.774809,-2.481456,1.228043,0.851209,-0.742637,0.007746,-0.828942,...,-0.716430,0.512329,2.045924,-0.604090,0.796179,-0.050149,0.513025,-0.690627,142.70,0
278331,168164.0,-2.492263,-1.198893,1.352726,1.555412,1.099981,-0.286940,-0.578888,0.812218,-0.343905,...,-0.225792,-1.461585,-0.002978,-0.477503,0.371141,-0.815962,-0.010024,-0.303994,89.35,0
191671,129363.0,0.856395,0.280863,-3.319194,0.337390,2.838186,3.720570,-0.682561,-2.041054,-0.862928,...,-1.015186,0.781382,0.020595,0.701666,0.304759,-0.421835,0.064979,0.174236,144.00,0


### Concatenating two dataframes

In [17]:
# Stack the sampled legit rows on top of the fraud rows (row-wise concatenation;
# each row keeps its original index label).
new_dataset = pd.concat(objs=[legit_sample, fraud], axis='index')
new_dataset

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
107753,70597.0,-0.802727,0.383081,-1.223697,-2.664248,1.835880,3.029579,-0.396046,1.232311,-1.197629,...,-0.392712,-0.646692,0.062147,1.030286,-0.360020,0.859768,0.333820,0.211123,15.00,0
21572,31743.0,-0.603392,-0.096769,1.684518,-1.883595,-0.478288,-0.353147,0.163191,0.049772,-1.504190,...,0.513802,1.204478,-0.233402,0.044849,0.247987,-0.179587,0.057017,0.095721,69.56,0
124452,77292.0,-0.607790,0.786043,1.095003,-0.200869,0.935844,-0.257239,0.769997,0.070761,-0.705499,...,0.140208,0.272980,-0.327529,-0.543442,0.192448,-0.470450,0.094062,0.113432,8.93,0
152838,97479.0,2.321452,-0.297117,-2.161787,-0.850726,0.395098,-1.379737,0.251433,-0.736328,0.202428,...,0.165164,0.790165,-0.156525,-0.622710,0.551755,0.078236,-0.100463,-0.091678,15.95,0
13079,22955.0,-1.561472,0.600965,1.996487,-0.812322,1.380493,-1.072513,0.614452,-0.239384,1.175410,...,-0.482633,-0.968963,-0.302949,-0.035102,0.265334,-0.174210,-0.011199,-0.140410,1.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00,1
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00,1


In [18]:
# Confirm the under-sampled dataset holds the same number of rows for each class.
new_dataset['Class'].value_counts()

0    492
1    492
Name: Class, dtype: int64

In [19]:
# Per-class column means of the under-sampled dataset, for comparison with
# the per-class means of the full dataset.
new_dataset.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,92402.644309,0.057971,0.043101,0.069073,-0.009991,-0.055733,0.031733,-0.014596,0.06486,-0.052065,...,-0.056498,0.014968,-0.012024,0.034231,-0.020359,-0.003184,0.021843,0.011623,-0.001044,76.891911
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


##### It can be seen that the nature of the dataset has not changed much after undersampling, as evident from the fact that the per-class means are similar to the corresponding means of the data before undersampling.
##### It is important to check whether the undersampled data is a good sample or a bad sample


##### Splitting the data into Features and Targets

In [20]:
# Target: the 'Class' label (0 = legit, 1 = fraudulent).
Y = new_dataset['Class']
# Features: every remaining column.
X = new_dataset.drop(columns=['Class'])

In [21]:
# Inspect the feature matrix (all columns except 'Class').
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
107753,70597.0,-0.802727,0.383081,-1.223697,-2.664248,1.835880,3.029579,-0.396046,1.232311,-1.197629,...,-0.285041,-0.392712,-0.646692,0.062147,1.030286,-0.360020,0.859768,0.333820,0.211123,15.00
21572,31743.0,-0.603392,-0.096769,1.684518,-1.883595,-0.478288,-0.353147,0.163191,0.049772,-1.504190,...,0.253109,0.513802,1.204478,-0.233402,0.044849,0.247987,-0.179587,0.057017,0.095721,69.56
124452,77292.0,-0.607790,0.786043,1.095003,-0.200869,0.935844,-0.257239,0.769997,0.070761,-0.705499,...,-0.088592,0.140208,0.272980,-0.327529,-0.543442,0.192448,-0.470450,0.094062,0.113432,8.93
152838,97479.0,2.321452,-0.297117,-2.161787,-0.850726,0.395098,-1.379737,0.251433,-0.736328,0.202428,...,-0.006440,0.165164,0.790165,-0.156525,-0.622710,0.551755,0.078236,-0.100463,-0.091678,15.95
13079,22955.0,-1.561472,0.600965,1.996487,-0.812322,1.380493,-1.072513,0.614452,-0.239384,1.175410,...,-0.179725,-0.482633,-0.968963,-0.302949,-0.035102,0.265334,-0.174210,-0.011199,-0.140410,1.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,1.252967,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.226138,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.247968,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.306271,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00


In [22]:
# Inspect the target labels.
Y

107753    0
21572     0
124452    0
152838    0
13079     0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64

### Split the data into training data & test data

In [23]:
# Hold out 20% of the rows for testing.
# stratify=Y keeps the proportion of 0s and 1s the same in the training and
# test splits; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [24]:
# Shapes of the full feature matrix, the training split and the test split.
print(*[part.shape for part in (X, X_train, X_test)])

(984, 30) (787, 30) (197, 30)


### MODEL TRAINING 
### Logistic Regression

In [25]:
# Instantiate a logistic-regression classifier with scikit-learn's default hyper-parameters.
# NOTE(review): the features are not scaled before fitting ('Time' and 'Amount' are on a
# much larger scale than V1..V28); the default solver may need scaled inputs or more
# iterations to converge -- TODO confirm by checking for a ConvergenceWarning on fit.
model = LogisticRegression()

In [31]:
# Train the logistic regression model on the training split.
model.fit(X_train , Y_train)
# fit() estimates the model parameters from X_train / Y_train.

### Model Evaluation
#### Accuracy Score

In [27]:
# Accuracy on the training data.
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the model's predictions second.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [28]:
# Report the accuracy measured on the training split.
print("Accuracy of Training data:" , training_data_accuracy)

Accuracy of Training data: 0.9491740787801779


In [29]:
# Accuracy on the held-out test data.
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the model's predictions second.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [30]:
# Report the accuracy measured on the held-out test split.
print("Accuracy of test data:" , test_data_accuracy)

Accuracy of test data: 0.9137055837563451


#### Since the accuracy on the test data is close to the accuracy on the training data, and both are good, we can consider this a reasonably well-generalised model