<a href="https://colab.research.google.com/github/kunalnischal7/Predict-Fraudulent-Transactions/blob/main/PredictFraudulentTransactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

## Import Libraries

In [1]:
import tensorflow as tf
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import numpy as np

## Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Data

In [3]:
file_path = '/content/drive/My Drive/Fraud.csv'

In [4]:
df = pd.read_csv(file_path)

# Data Preparation

## Data Shuffling

In [5]:
from sklearn.utils import shuffle
df = shuffle(df, random_state = 44)

In [6]:
df = df.drop('isFlaggedFraud', axis=1)
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
584771,33,CASH_OUT,128143.84,C25914186,22007.00,0.00,C1450282055,140130.89,268274.73,0
1111643,130,PAYMENT,16789.22,C1518672148,130499.39,113710.17,M293128500,0.00,0.00,0
3611497,273,PAYMENT,38065.38,C811511306,0.00,0.00,M642413460,0.00,0.00,0
4491117,324,CASH_OUT,261515.07,C807077025,19203.00,0.00,C1492756391,284637.24,546152.31,0
4071959,300,CASH_IN,348783.73,C354564231,118174.91,466958.64,C762408345,792084.69,443300.96,0
...,...,...,...,...,...,...,...,...,...,...
2146875,184,CASH_OUT,75288.74,C1661227955,10541.00,0.00,C1844272985,603046.41,678335.15,0
2253997,187,CASH_OUT,8896.32,C2120413395,11166.00,2269.68,C32755964,4654929.08,4663825.40,0
5499633,380,PAYMENT,1802.06,C1674048218,0.00,0.00,M569596054,0.00,0.00,0
3870115,283,CASH_OUT,305475.67,C2140358791,0.00,0.00,C707361236,5305573.30,5611048.97,0


In [7]:
df = df.drop('nameOrig',axis=1)
df

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
584771,33,CASH_OUT,128143.84,22007.00,0.00,C1450282055,140130.89,268274.73,0
1111643,130,PAYMENT,16789.22,130499.39,113710.17,M293128500,0.00,0.00,0
3611497,273,PAYMENT,38065.38,0.00,0.00,M642413460,0.00,0.00,0
4491117,324,CASH_OUT,261515.07,19203.00,0.00,C1492756391,284637.24,546152.31,0
4071959,300,CASH_IN,348783.73,118174.91,466958.64,C762408345,792084.69,443300.96,0
...,...,...,...,...,...,...,...,...,...
2146875,184,CASH_OUT,75288.74,10541.00,0.00,C1844272985,603046.41,678335.15,0
2253997,187,CASH_OUT,8896.32,11166.00,2269.68,C32755964,4654929.08,4663825.40,0
5499633,380,PAYMENT,1802.06,0.00,0.00,M569596054,0.00,0.00,0
3870115,283,CASH_OUT,305475.67,0.00,0.00,C707361236,5305573.30,5611048.97,0


In [8]:
df = df.drop('nameDest', axis=1)
df

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
584771,33,CASH_OUT,128143.84,22007.00,0.00,140130.89,268274.73,0
1111643,130,PAYMENT,16789.22,130499.39,113710.17,0.00,0.00,0
3611497,273,PAYMENT,38065.38,0.00,0.00,0.00,0.00,0
4491117,324,CASH_OUT,261515.07,19203.00,0.00,284637.24,546152.31,0
4071959,300,CASH_IN,348783.73,118174.91,466958.64,792084.69,443300.96,0
...,...,...,...,...,...,...,...,...
2146875,184,CASH_OUT,75288.74,10541.00,0.00,603046.41,678335.15,0
2253997,187,CASH_OUT,8896.32,11166.00,2269.68,4654929.08,4663825.40,0
5499633,380,PAYMENT,1802.06,0.00,0.00,0.00,0.00,0
3870115,283,CASH_OUT,305475.67,0.00,0.00,5305573.30,5611048.97,0


# Feature Engineering

## One hot encoding

In [9]:
encoded_df = pd.get_dummies(df, columns=['type'])

In [10]:
encoded_df

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
584771,33,128143.84,22007.00,0.00,140130.89,268274.73,0,0,1,0,0,0
1111643,130,16789.22,130499.39,113710.17,0.00,0.00,0,0,0,0,1,0
3611497,273,38065.38,0.00,0.00,0.00,0.00,0,0,0,0,1,0
4491117,324,261515.07,19203.00,0.00,284637.24,546152.31,0,0,1,0,0,0
4071959,300,348783.73,118174.91,466958.64,792084.69,443300.96,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2146875,184,75288.74,10541.00,0.00,603046.41,678335.15,0,0,1,0,0,0
2253997,187,8896.32,11166.00,2269.68,4654929.08,4663825.40,0,0,1,0,0,0
5499633,380,1802.06,0.00,0.00,0.00,0.00,0,0,0,0,1,0
3870115,283,305475.67,0.00,0.00,5305573.30,5611048.97,0,0,1,0,0,0


# Preparing Data as X and Y

In [11]:
X = encoded_df.drop('isFraud', axis=1)
X

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
584771,33,128143.84,22007.00,0.00,140130.89,268274.73,0,1,0,0,0
1111643,130,16789.22,130499.39,113710.17,0.00,0.00,0,0,0,1,0
3611497,273,38065.38,0.00,0.00,0.00,0.00,0,0,0,1,0
4491117,324,261515.07,19203.00,0.00,284637.24,546152.31,0,1,0,0,0
4071959,300,348783.73,118174.91,466958.64,792084.69,443300.96,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2146875,184,75288.74,10541.00,0.00,603046.41,678335.15,0,1,0,0,0
2253997,187,8896.32,11166.00,2269.68,4654929.08,4663825.40,0,1,0,0,0
5499633,380,1802.06,0.00,0.00,0.00,0.00,0,0,0,1,0
3870115,283,305475.67,0.00,0.00,5305573.30,5611048.97,0,1,0,0,0


In [12]:
y = encoded_df ['isFraud']
y

584771     0
1111643    0
3611497    0
4491117    0
4071959    0
          ..
2146875    0
2253997    0
5499633    0
3870115    0
3684116    0
Name: isFraud, Length: 6362620, dtype: int64

### Checking for missing values in the data

In [13]:
encoded_df.isnull().sum()

step              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
type_CASH_IN      0
type_CASH_OUT     0
type_DEBIT        0
type_PAYMENT      0
type_TRANSFER     0
dtype: int64

In [14]:
encoded_df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

## Splitting the data into Training and Testing

# Model Building

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
encoded_df = LogisticRegression()

In [18]:
encoded_df.fit(X_train, y_train)
y_pred = encoded_df.predict(X_test)

### Evaluate the model

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9982672232507992


### Confusion Matrix

In [20]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[1269630    1249]
 [    956     689]]


### Classification Report

In [21]:
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270879
           1       0.36      0.42      0.38      1645

    accuracy                           1.00   1272524
   macro avg       0.68      0.71      0.69   1272524
weighted avg       1.00      1.00      1.00   1272524



## Tuning the hyperparameters

In [22]:
df_encoded = LogisticRegression(solver='lbfgs', C=0.1)
df_encoded.fit(X_train, y_train)
y_pred = df_encoded.predict(X_test)

## Interpretation

In [23]:
coefficients = df_encoded.coef_
print("Coefficients:", coefficients)

Coefficients: [[-2.74040534e-02 -1.93676270e-04  2.12383594e-04 -2.31778858e-04
   6.92829351e-06 -8.26172572e-06 -9.66974300e-08 -9.14146782e-06
  -1.60516074e-06 -1.02451554e-04 -1.87628462e-07]]
