In [1]:
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F 

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
df = pd.read_csv("fraud_detection_dataset.csv")
df.head(5)

Unnamed: 0,timestamp,user_id,amount,location,device_type,is_fraud,age,income,debt,credit_score
0,2023-12-15 02:48:24,3cbf0b8a-c799-4c0e-9782-89272ab8fa2b,998.99,Grantfurt,Mobile,0,56,42524.98,8394.05,655
1,2023-10-30 11:08:37,3675ed98-6863-4bae-b089-6f0db2df006b,241.39,Kimberlychester,Tablet,0,52,69884.04,28434.06,395
2,2023-11-21 05:19:35,4be1043f-20ee-4ac5-a689-b25c6a1de50c,836.42,Gutierrezville,Desktop,0,58,126953.62,39121.78,496
3,2023-11-07 00:03:25,cad0b3c9-099f-479b-8043-6d5ecb4e1467,612.74,Markside,Desktop,0,19,128650.7,39652.48,612
4,2023-04-02 14:12:23,b6ebea7d-ba62-41d7-b042-3978a23fef8b,135.55,Anthonyshire,Tablet,0,59,102020.39,7439.81,302


In [3]:
df.shape

(2000000, 10)

In [4]:
df.user_id.nunique()

2000000

In [5]:
df = df.drop(['timestamp', 'user_id', 'location'], axis =1)
df.head(5)

Unnamed: 0,amount,device_type,is_fraud,age,income,debt,credit_score
0,998.99,Mobile,0,56,42524.98,8394.05,655
1,241.39,Tablet,0,52,69884.04,28434.06,395
2,836.42,Desktop,0,58,126953.62,39121.78,496
3,612.74,Desktop,0,19,128650.7,39652.48,612
4,135.55,Tablet,0,59,102020.39,7439.81,302


In [6]:
df.head(5)

Unnamed: 0,amount,device_type,is_fraud,age,income,debt,credit_score
0,998.99,Mobile,0,56,42524.98,8394.05,655
1,241.39,Tablet,0,52,69884.04,28434.06,395
2,836.42,Desktop,0,58,126953.62,39121.78,496
3,612.74,Desktop,0,19,128650.7,39652.48,612
4,135.55,Tablet,0,59,102020.39,7439.81,302


In [7]:
def convert_device_type(df, col_name):
    le = LabelEncoder()
    df[col_name] = le.fit_transform(df[col_name])

    return df

In [8]:
df = convert_device_type(df, 'device_type')
df = convert_device_type(df, 'is_fraud')
df.device_type.unique(), df.is_fraud.unique()

(array([1, 2, 0]), array([0, 1]))

In [9]:
df.is_fraud.value_counts()

0    1000000
1    1000000
Name: is_fraud, dtype: int64

In [10]:
y = df['is_fraud']
x = df.drop(['is_fraud'], axis =1)


In [11]:
scaler = StandardScaler()
x_std = scaler.fit_transform(x)

## 1. Input output and model defining

In [12]:
X_train, X_test, y_train, y_test = train_test_split(x_std, y, test_size = 0.3, random_state = 42, shuffle=True)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1400000, 6), (600000, 6), (1400000,), (600000,))

In [13]:
x = torch.tensor(np.array(X_train), dtype= torch.float32, requires_grad=True)
y = torch.tensor(np.array(y_train).reshape(-1,1), dtype = torch.float32)

In [14]:
y.shape

torch.Size([1400000, 1])

In [15]:
torch.unique(y)

tensor([0., 1.])

In [16]:
#Model 

class Classification(nn.Module):
    def __init__(self, input_size):
        super(Classification,self).__init__()

        self.layer1 = nn.Linear(input_size, 16)
        self.layer2 = nn.Linear(16,8)
        self.layer3 = nn.Linear(8, 4)
        self.layer4 = nn.Linear(4,2)
        self.layer5 = nn.Linear(2,1)

    def forward(self, x):
        x = self.layer1(x)
        x = F.relu(self.layer2(x))
        x = F.relu(self.layer3(x))
        x = F.relu(self.layer4(x))
        x = torch.sigmoid(self.layer5(x))

        return x
    

model = Classification(X_train.shape[1])
print(model)


Classification(
  (layer1): Linear(in_features=6, out_features=16, bias=True)
  (layer2): Linear(in_features=16, out_features=8, bias=True)
  (layer3): Linear(in_features=8, out_features=4, bias=True)
  (layer4): Linear(in_features=4, out_features=2, bias=True)
  (layer5): Linear(in_features=2, out_features=1, bias=True)
)


## 2.Loss, epochs, learning rate, optimizer

In [17]:
epochs = 200
lerning_rate = 0.003
loss = nn.BCELoss()
sgd_optim = torch.optim.Adam(model.parameters(), lr = lerning_rate)

## 3. Model run forward pass, backward pass and update parameters

In [18]:
loss_list = []
for epoch in range(epochs):
    #Initialize the optimizer
    sgd_optim.zero_grad()

    #forward pass the model
    y_pred = model(x)

    #Loss
    l = loss(y_pred, y)

    #backward pass the losses
    l.backward()

    #Set the optimizer to reset and update parameters
    sgd_optim.step()

    if (epoch%10==0):
        print(f'Epoch : {epoch}, loss : {l.item()}')


Epoch : 0, loss : 0.6935584545135498
Epoch : 10, loss : 0.6902232766151428
Epoch : 20, loss : 0.6765246391296387
Epoch : 30, loss : 0.6508973240852356
Epoch : 40, loss : 0.6392704844474792
Epoch : 50, loss : 0.6274430155754089
Epoch : 60, loss : 0.6158627867698669
Epoch : 70, loss : 0.6039554476737976
Epoch : 80, loss : 0.5913461446762085
Epoch : 90, loss : 0.5779817700386047
Epoch : 100, loss : 0.5641186237335205
Epoch : 110, loss : 0.5500268340110779
Epoch : 120, loss : 0.5355876088142395
Epoch : 130, loss : 0.5208609104156494
Epoch : 140, loss : 0.505916178226471
Epoch : 150, loss : 0.4908166527748108
Epoch : 160, loss : 0.47560837864875793
Epoch : 170, loss : 0.460361123085022
Epoch : 180, loss : 0.4451153576374054
Epoch : 190, loss : 0.4299200773239136


In [19]:
torch.unique(y)

tensor([0., 1.])

In [20]:
X_test

array([[ 7.10782047e-01, -1.22538248e+00,  1.11100880e+00,
         6.63970612e-01,  1.20426611e-01, -8.15625936e-01],
       [ 8.79720221e-01, -1.10124098e-03,  5.22678894e-01,
        -2.06491167e-01, -1.44312521e+00, -1.60117703e+00],
       [-1.04690361e+00, -1.22538248e+00, -1.17694084e+00,
         1.13349078e+00, -9.39273626e-01, -1.11727756e+00],
       ...,
       [ 1.44140103e-01, -1.10124098e-03,  5.22678894e-01,
         5.66834564e-01,  3.87019256e-01,  9.87999382e-01],
       [-8.53147592e-01, -1.10124098e-03,  1.43785875e+00,
        -5.04221117e-01,  1.44240095e+00, -6.14524855e-01],
       [ 4.85407048e-01, -1.10124098e-03,  1.43785875e+00,
         9.76960608e-01,  7.47885839e-01, -1.30580982e+00]])

In [21]:
outputs = model(torch.tensor(np.array(X_test),dtype=torch.float32,requires_grad=True))
__, predicted = torch.max(outputs, 1)
print(torch.unique(predicted))

tensor([0])


In [22]:
def accuracy(out, labels):
    _,pred = torch.max(out, dim=1)
    return torch.sum(pred==labels).item()

In [23]:
y_train_pred = model(torch.tensor(np.array(X_train),dtype=torch.float32,requires_grad=True))
y_test_pred = model(torch.tensor(np.array(X_test),dtype=torch.float32))

#convert to numpy array
#y_train_pred = y_train_pred.detach().numpy()
#y_test_pred = y_test_pred.detach().numpy()

y_test = torch.tensor(np.array(y_test).reshape(-1,1), dtype = torch.float32)
y_train = torch.tensor(np.array(y_train).reshape(-1,1), dtype = torch.float32)

In [25]:
test_accuracy=accuracy(y_test_pred[:100], y_test[:100])
#train_accuracy=accuracy(y_train_pred, y_train)
#print(train_accuracy)
print(test_accuracy)

5100


In [28]:
y_test_np = y_test.detach().numpy()
y_test_pred_np = y_test_pred.detach().numpy()

# Threshold predicted probabilities to obtain binary predictions (0 or 1)
y_test_pred_binary = (y_test_pred_np >= 0.5).astype(int)

# Compare binary predictions with actual labels to compute accuracy
accuracy = np.mean(y_test_pred_binary == y_test_np)
print("Accuracy:", accuracy)

Accuracy: 0.9949716666666667


In [30]:
print(classification_report(y_test_pred_binary, y_test_np))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    303045
           1       0.99      1.00      0.99    296955

    accuracy                           0.99    600000
   macro avg       0.99      1.00      0.99    600000
weighted avg       1.00      0.99      0.99    600000

