In [None]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [127]:
# read csv, split data and target

df = pd.read_csv('transactions.csv')

X = df.drop('Class', axis=1)
y = df['Class']

count_1 = (df['Class'] == 1).sum()
print(f'Number of class 1 entries: {count_1}')

Number of class 1 entries: 394


In [128]:
# Count features and rows of data for general picture

n_samples, n_features = df.shape
n_features -= 1
print(f'number of samples: {n_samples}, number of features: {n_features}')

number of samples: 227845, number of features: 30


In [129]:
# split data in test and training data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [130]:
# scale data

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [131]:
# convert to tensors

X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

In [132]:
# reshape y tensors

y_train = y_train.view(y_train.shape[0], 1)
y_test = y_test.view(y_test.shape[0], 1)

print(len(y_train))

159491


In [133]:
# Create model
# f = wx + b, sigmoid at the end
class LogisticRegression(nn.Module):

    def __init__(self, n_input_features):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(n_input_features, 1)

    def forward(self, x):
        y_predicted = self.linear(x)
        return y_predicted
    
model = LogisticRegression(n_features)

In [134]:
# Calculate class weights:
num_neg = 200_000
num_pos = 300
pos_weight = torch.tensor([num_neg / num_pos])  # ≈ 666.67

In [135]:
# Loss and optimizer
learning_rate = 0.01
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [136]:
# training loop
num_epochs = 100

for epoch in range(num_epochs):
    # forward pass and loss
    y_predicted = model(X_train)
    loss = criterion(y_predicted, y_train)
    
    # backward pass
    loss.backward()
    
    # updates
    optimizer.step()
    
    # zero gradients
    optimizer.zero_grad()
    
    if (epoch+1) % 10 == 0:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')

epoch: 10, loss = 0.9526
epoch: 20, loss = 0.9005
epoch: 30, loss = 0.8659
epoch: 40, loss = 0.8380
epoch: 50, loss = 0.8138
epoch: 60, loss = 0.7922
epoch: 70, loss = 0.7726
epoch: 80, loss = 0.7544
epoch: 90, loss = 0.7376
epoch: 100, loss = 0.7219


In [137]:
with torch.no_grad():
    y_pred = torch.sigmoid(model(X_test))  # because logits output
    y_pred_cls = y_pred > 0.5
    print(classification_report(y_test.numpy(), y_pred_cls.numpy()))

NameError: name 'classification_report' is not defined