<a href="https://colab.research.google.com/github/kunalnischal7/Predict-Fraudulent-Transactions/blob/main/PredictFraudulentTransactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

## Import Libraries

In [19]:
import tensorflow as tf
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import numpy as np

## Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Data

In [3]:
file_path = '/content/drive/My Drive/Fraud.csv'

In [5]:
df = pd.read_csv(file_path)

# Data Preparation

## Data Shuffling

In [7]:
from sklearn.utils import shuffle
df = shuffle(df, random_state = 44)

In [13]:
df = df.drop('isFlaggedFraud', axis=1)
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
584771,33,CASH_OUT,128143.84,C25914186,22007.00,0.00,C1450282055,140130.89,268274.73,0
1111643,130,PAYMENT,16789.22,C1518672148,130499.39,113710.17,M293128500,0.00,0.00,0
3611497,273,PAYMENT,38065.38,C811511306,0.00,0.00,M642413460,0.00,0.00,0
4491117,324,CASH_OUT,261515.07,C807077025,19203.00,0.00,C1492756391,284637.24,546152.31,0
4071959,300,CASH_IN,348783.73,C354564231,118174.91,466958.64,C762408345,792084.69,443300.96,0
...,...,...,...,...,...,...,...,...,...,...
2146875,184,CASH_OUT,75288.74,C1661227955,10541.00,0.00,C1844272985,603046.41,678335.15,0
2253997,187,CASH_OUT,8896.32,C2120413395,11166.00,2269.68,C32755964,4654929.08,4663825.40,0
5499633,380,PAYMENT,1802.06,C1674048218,0.00,0.00,M569596054,0.00,0.00,0
3870115,283,CASH_OUT,305475.67,C2140358791,0.00,0.00,C707361236,5305573.30,5611048.97,0


In [14]:
X = df.drop('isFraud' , axis=1)
X

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest
584771,33,CASH_OUT,128143.84,C25914186,22007.00,0.00,C1450282055,140130.89,268274.73
1111643,130,PAYMENT,16789.22,C1518672148,130499.39,113710.17,M293128500,0.00,0.00
3611497,273,PAYMENT,38065.38,C811511306,0.00,0.00,M642413460,0.00,0.00
4491117,324,CASH_OUT,261515.07,C807077025,19203.00,0.00,C1492756391,284637.24,546152.31
4071959,300,CASH_IN,348783.73,C354564231,118174.91,466958.64,C762408345,792084.69,443300.96
...,...,...,...,...,...,...,...,...,...
2146875,184,CASH_OUT,75288.74,C1661227955,10541.00,0.00,C1844272985,603046.41,678335.15
2253997,187,CASH_OUT,8896.32,C2120413395,11166.00,2269.68,C32755964,4654929.08,4663825.40
5499633,380,PAYMENT,1802.06,C1674048218,0.00,0.00,M569596054,0.00,0.00
3870115,283,CASH_OUT,305475.67,C2140358791,0.00,0.00,C707361236,5305573.30,5611048.97


# Feature Engineering

## One hot encoding

In [15]:
encoded_df = pd.get_dummies(df, columns=['type'])

In [16]:
encoded_df

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
584771,33,128143.84,C25914186,22007.00,0.00,C1450282055,140130.89,268274.73,0,0,1,0,0,0
1111643,130,16789.22,C1518672148,130499.39,113710.17,M293128500,0.00,0.00,0,0,0,0,1,0
3611497,273,38065.38,C811511306,0.00,0.00,M642413460,0.00,0.00,0,0,0,0,1,0
4491117,324,261515.07,C807077025,19203.00,0.00,C1492756391,284637.24,546152.31,0,0,1,0,0,0
4071959,300,348783.73,C354564231,118174.91,466958.64,C762408345,792084.69,443300.96,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2146875,184,75288.74,C1661227955,10541.00,0.00,C1844272985,603046.41,678335.15,0,0,1,0,0,0
2253997,187,8896.32,C2120413395,11166.00,2269.68,C32755964,4654929.08,4663825.40,0,0,1,0,0,0
5499633,380,1802.06,C1674048218,0.00,0.00,M569596054,0.00,0.00,0,0,0,0,1,0
3870115,283,305475.67,C2140358791,0.00,0.00,C707361236,5305573.30,5611048.97,0,0,1,0,0,0


# Preparing Data as X and Y

In [17]:
y = encoded_df ['isFraud']
y

584771     0
1111643    0
3611497    0
4491117    0
4071959    0
          ..
2146875    0
2253997    0
5499633    0
3870115    0
3684116    0
Name: isFraud, Length: 6362620, dtype: int64

### Checking for missing values in the data

In [21]:
encoded_df.isnull().sum()

step              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
type_CASH_IN      0
type_CASH_OUT     0
type_DEBIT        0
type_PAYMENT      0
type_TRANSFER     0
dtype: int64

In [22]:
encoded_df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

## Splitting the data into Training and Testing

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Building

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
X = np.random.rand(100, 2)
y = np.random.randint(2, size=100)

In [39]:
df_encoded = LogisticRegression()
df_encoded.fit(X_train, y_train)
y_pred = df_encoded.predict(X_test)

### Evaluate the model

In [41]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6


### Confusion Matrix

In [42]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[5 6]
 [2 7]]


### Classification Report

In [43]:
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.45      0.56        11
           1       0.54      0.78      0.64         9

    accuracy                           0.60        20
   macro avg       0.63      0.62      0.60        20
weighted avg       0.64      0.60      0.59        20



## Tuning the hyperparameters

In [44]:
df_encoded = LogisticRegression(solver='lbfgs', C=0.1)
df_encoded.fit(X_train, y_train)
y_pred = df_encoded.predict(X_test)

## Interpretation

In [46]:
coefficients = df_encoded.coef_
print("Coefficients:", coefficients)

Coefficients: [[-0.11650901  0.15058505]]
