Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
# Read the transactions CSV from Google Drive into a pandas DataFrame.
# The path is under /content/drive, the mount point used by drive.mount in this notebook.
# NOTE(review): the mount cell sits below this cell even though its execution
# count is lower -- on a fresh "Run All" this read would be reached first.
credit_card_data = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/Fraud.csv')

In [5]:
# Mount Google Drive at /content/drive (Colab-only helper).
# The dataset read by pd.read_csv in this notebook is located under
# /content/drive/MyDrive, i.e. inside this mount point.
# NOTE(review): this import is separate from the notebook's main import cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# preview the first 5 rows of the dataset
credit_card_data.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [8]:
# preview the last 5 rows of the dataset
credit_card_data.tail(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [9]:
# Dataset overview: prints the index range, each column's dtype and the
# approximate memory footprint of the frame.
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [10]:
# count the missing values per column (isna is the same check as isnull)
credit_card_data.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [11]:
# how many transactions fall into each isFraud class (0 = legit, 1 = fraud)
credit_card_data.isFraud.value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

The given dataset is highly imbalanced: only 8,213 of the 6,362,620 transactions (about 0.13%) are fraudulent.

0 --> normal transaction

1 --> fraudulent transaction

In [12]:
# separating the data for analysis: one frame per isFraud class
legit = credit_card_data.loc[credit_card_data['isFraud'].eq(0)]
fraud = credit_card_data.loc[credit_card_data['isFraud'].eq(1)]

In [13]:
# (rows, columns) of the legit frame, then of the fraud frame, one per line
print(legit.shape, fraud.shape, sep='\n')

(6354407, 11)
(8213, 11)


In [14]:
# summary statistics of the transaction amount for legit transactions
legit['amount'].describe()

count    6.354407e+06
mean     1.781970e+05
std      5.962370e+05
min      1.000000e-02
25%      1.336840e+04
50%      7.468472e+04
75%      2.083648e+05
max      9.244552e+07
Name: amount, dtype: float64

In [15]:
# summary statistics of the transaction amount for fraudulent transactions
fraud['amount'].describe()

count    8.213000e+03
mean     1.467967e+06
std      2.404253e+06
min      0.000000e+00
25%      1.270913e+05
50%      4.414234e+05
75%      1.517771e+06
max      1.000000e+07
Name: amount, dtype: float64

In [16]:
# comparing the per-class means of the numeric columns for both transaction types
# numeric_only=True is explicit because the frame also holds object columns
# (type, nameOrig, nameDest) that cannot be averaged; relying on the old
# default emits a warning on older pandas and raises TypeError on pandas >= 2.0.
credit_card_data.groupby('isFraud').mean(numeric_only=True)

  credit_card_data.groupby('isFraud').mean()


Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,243.235663,178197.0,832828.7,855970.228109,1101421.0,1224926.0,0.0
1,368.413856,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


Under Sampling

Build a sample dataset containing a similar distribution of normal transactions and fraudulent transactions.

no. of fraudulent transactions --> 8213

In [17]:
# Under-sample the legit class down to the size of the fraud class.
# len(fraud) replaces the hard-coded 8213 so the classes stay balanced if the
# data changes; random_state pins the draw so re-running gives the same sample.
legit_sample = legit.sample(n=len(fraud), random_state=2)

Concatenating the two dataframes

In [18]:
# stack the sampled legit rows on top of all fraud rows (row-wise concat)
new_dataset = pd.concat(objs=[legit_sample, fraud], axis='index')

In [19]:
# first 5 rows of the balanced dataset (these come from the legit sample)
new_dataset.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
603760,34,CASH_IN,233716.5,C443837392,13595258.21,13828974.71,C1512598168,805569.05,571852.55,0,0
4412910,322,TRANSFER,796438.35,C2070331335,15978.0,0.0,C1120263376,586916.49,1383354.84,0,0
110753,11,PAYMENT,10746.18,C1215744186,10076.0,0.0,M988756116,0.0,0.0,0,0
3793750,281,PAYMENT,11243.35,C1801666687,0.0,0.0,M670362701,0.0,0.0,0,0
2561077,206,CASH_IN,92703.79,C1866918954,58482.0,151185.79,C808263860,0.0,0.0,0,0


In [20]:
# last 5 rows of the balanced dataset (these come from the fraud rows)
new_dataset.tail(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [21]:
# confirm that both classes now have the same number of rows
new_dataset.isFraud.value_counts()

0    8213
1    8213
Name: isFraud, dtype: int64

In [22]:
# per-class means of the numeric columns in the balanced sample
# numeric_only=True is explicit because the frame also holds object columns
# (type, nameOrig, nameDest) that cannot be averaged; relying on the old
# default emits a warning on older pandas and raises TypeError on pandas >= 2.0.
new_dataset.groupby('isFraud').mean(numeric_only=True)

  new_dataset.groupby('isFraud').mean()


Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,243.939121,180055.6,900063.6,924923.28236,1132180.0,1257323.0,0.0
1,368.413856,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


Splitting the data into features & target

In [45]:
# features: every column except the target, the string columns and isFlaggedFraud
X = new_dataset.drop(
    columns=[
        'isFraud',
        'type',
        'nameOrig',
        'nameDest',
        'isFlaggedFraud',
    ]
)
# target: the fraud label
Y = new_dataset.loc[:, 'isFraud']

In [46]:
# show the feature frame via the notebook's rich display instead of print(),
# which falls back to a wrapped plain-text table
X

         step      amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
603760     34   233716.50    13595258.21     13828974.71       805569.05   
4412910   322   796438.35       15978.00            0.00       586916.49   
110753     11    10746.18       10076.00            0.00            0.00   
3793750   281    11243.35           0.00            0.00            0.00   
2561077   206    92703.79       58482.00       151185.79            0.00   
...       ...         ...            ...             ...             ...   
6362615   743   339682.13      339682.13            0.00            0.00   
6362616   743  6311409.28     6311409.28            0.00            0.00   
6362617   743  6311409.28     6311409.28            0.00        68488.84   
6362618   743   850002.52      850002.52            0.00            0.00   
6362619   743   850002.52      850002.52            0.00      6510099.11   

         newbalanceDest  
603760        571852.55  
4412910      1383354.84  
110753   

In [47]:
# show the target series as the cell's output value rather than via print()
Y

603760     0
4412910    0
110753     0
3793750    0
2561077    0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 16426, dtype: int64


split the data into Training data and Testing data

In [50]:
# hold out 20% of the balanced sample for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [51]:
# shapes of the full feature frame, the training part and the test part
print(*(frame.shape for frame in (X, X_train, X_test)))

(16426, 6) (13140, 6) (3286, 6)


Model Training

Logistic Regression

In [52]:
# Logistic-regression classifier with the library's default settings.
# NOTE(review): the features are unscaled and span very different magnitudes
# (step vs. balances in the millions) -- if fitting reports a convergence
# warning, consider standardising the inputs or raising max_iter.
model = LogisticRegression()

In [53]:
# fit the logistic regression model on the training features and labels
model.fit(X=X_train, y=Y_train)

Model Evaluation

Accuracy Score

In [55]:
# accuracy on the training data
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [56]:
# report the fraction of training rows the model classified correctly
print("The accuracy of the training data : {}".format(training_data_accuracy))

The accuracy of the training data : 0.9063165905631659


In [57]:
# accuracy on the test data
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [58]:
# report the fraction of held-out test rows the model classified correctly
print("The accuracy of the test data : {}".format(test_data_accuracy))

The accuracy of the test data : 0.8998782714546562


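**The training accuracy (≈0.906) and the test accuracy (≈0.900) are close, which suggests the model generalises to unseen data: it shows no sign of overfitting the training set, and roughly 90% accuracy on the balanced sample suggests it is not badly underfitting either.**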