In [None]:
# Data Preprocessing:
import numpy as np
import pandas as pd

# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# GAN Model:
import torch
from torch import nn
from torch.utils.data import Dataset

from collections import namedtuple

# EDA

In [2]:
# Set global batch size
BATCH_SIZE = 32

# Set global PyTorch seed
TORCH_SEED = 111

In [3]:
#Load dataset and drop duplicates
fraud_data = pd.read_csv("fraud_data.csv")
fraud_data.drop_duplicates(inplace=True)
fraud_data.Class.value_counts()

Class
0    21204
1      346
Name: count, dtype: int64

In [4]:
#Display feature statistics
fraud_data.describe().T[["mean", "std", "min", "50%", "max"]]

Unnamed: 0,mean,std,min,50%,max
V1,-0.036547,2.09217,-41.928738,-0.007161,2.451888
V2,0.044165,1.681791,-40.803981,0.076209,21.467203
V3,-0.079662,1.850424,-31.103685,0.182884,4.069865
V4,0.056684,1.538158,-4.848504,-0.013552,12.114672
V5,-0.037196,1.521778,-32.092129,-0.066487,29.162172
V6,-0.030833,1.330158,-20.367836,-0.282499,21.393069
V7,-0.066622,1.562575,-41.506796,0.032245,34.303177
V8,0.006927,1.318763,-38.987263,0.022768,20.007208
V9,-0.044095,1.156803,-13.434066,-0.074955,9.125535
V10,-0.086981,1.340121,-24.403185,-0.099292,12.701539


In [None]:
# Define a namedtuple to conveniently hold processed data components
PdToTensor = namedtuple("PdToTensor", ["df", "sample", "data", "labels"])

class CustomPandasTorch(Dataset):
    """
    Custom PyTorch Dataset for loading and managing tabular data from a pandas DataFrame.

    This dataset:
    - Takes a pandas DataFrame.
    - Splits it into raw, train, and test versions with upsampling to balance classes.
    - Provides easy switching between raw, train, and test splits.
    - Prepares tensors for PyTorch models (separates features and labels).

    Args:
        df (pandas.DataFrame): Input DataFrame where the last column is assumed to be the label.

    Attributes:
        raw (PdToTensor): Full dataset transformed to tensors.
        train (PdToTensor): Training dataset (75% split with stratified sampling).
        test (PdToTensor): Testing dataset (25% split with stratified sampling).
        active (PdToTensor): Currently active dataset (used for loading via __getitem__).
    """

    def __init__(self, df):
        # Transform the entire dataframe into tensors
        self.raw = CustomPandasTorch.transform_data(df)
        # Set the initial active dataset to raw
        self.active = self.raw

        # Split the dataframe into train and test sets with stratification on 'Class'
        df_train, df_test = train_test_split(df, test_size=0.25, stratify=df.Class, random_state=6)

        # Transform both train and test sets into tensors
        self.train = CustomPandasTorch.transform_data(df_train)
        self.test = CustomPandasTorch.transform_data(df_test)

    @staticmethod
    def transform_data(df_input):
        """
        Transform a DataFrame into tensors, upsampling each class to balance the dataset.

        Args:
            df_input (pandas.DataFrame): The input DataFrame.

        Returns:
            PdToTensor: A named tuple containing the original dataframe, the upsampled dataframe,
                        feature tensors (data), and label tensors (labels).
        """
        # Compute upsampled sample size (next multiple of batch size or at least 16000)
        sample_size = max((df_input.shape[0] // BATCH_SIZE + 1) * BATCH_SIZE, 16000)

        # Upsample each class separately and shuffle the result
        df = (df_input
                .groupby('Class')[df_input.columns]
                .apply(lambda x: x.sample(sample_size, random_state=6, replace=True))
                .sample(frac=1, random_state=6)  # Shuffle the full dataset
                .reset_index(drop=True)
        )

        # Convert the dataframe to a float32 tensor for PyTorch compatability
        t = torch.from_numpy(df.astype('float32').values)

        # Return the dataframe, upsampled dataframe, features (all but last column), and labels (last column)
        return PdToTensor(df_input, df, t[:, :-1], t[:, -1:])

    def set_raw(self):
        """
        Switch the active dataset to the raw (full) data.
        """
        self.active = self.raw

    def set_train(self):
        """
        Switch the active dataset to the training split.
        """
        self.active = self.train

    def set_test(self):
        """
        Switch the active dataset to the testing split.
        """
        self.active = self.test

    def __len__(self):
        """
        Return the number of samples in the currently active dataset.

        Returns:
            int: Number of samples.
        """
        return self.active.sample.shape[0]

    def __getitem__(self, idx):
        """
        Retrieve a feature-label pair from the active dataset by index.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            tuple: (features tensor, label tensor)
        """
        return self.active.data[idx], self.active.labels[idx]

# Modeling

In [6]:
# Discriminator class: tries to distinguish between real and fake (generated) samples
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        # A simple feedforward neural network that outputs a probability (real or fake)
        self.model = nn.Sequential(
            nn.Linear(29, 256),    # Input layer (29 features) → hidden layer with 256 units
            nn.ReLU(),             # Activation function
            nn.Dropout(0.3),        # Dropout for regularization
            nn.Linear(256, 64),     # Hidden layer (256 → 64 units)
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 16),      # Hidden layer (64 → 16 units)
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(16, 1),       # Output layer (single unit for binary classification)
            nn.Sigmoid(),           # Sigmoid activation to output probability between 0 and 1
        )

    def forward(self, x):
        # Defines the forward pass through the discriminator
        output = self.model(x)
        return output


# Generator class: tries to generate synthetic (fake) samples that resemble real data
class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        # A simple feedforward neural network that maps random noise to synthetic data
        self.model = nn.Sequential(
            nn.Linear(100, 256),    # Input: random noise vector (100 dimensions) → 256 units
            nn.ReLU(),
            nn.Linear(256, 128),    # Hidden layer (256 → 128 units)
            nn.ReLU(),
            nn.Linear(128, 64),     # Hidden layer (128 → 64 units)
            nn.ReLU(),
            nn.Linear(64, 29),      # Output: fake sample with 29 features (matching real data)
            nn.Tanh(),              # Tanh activation to output values between -1 and 1
        )

    def forward(self, x):
        # Defines the forward pass through the generator
        output = self.model(x)
        return output

In [7]:
def train_discriminator(train_loader):
    """
    Trains a Discriminator neural network using binary cross-entropy loss.

    Args:
        train_loader (DataLoader): A PyTorch DataLoader providing batches of (real_samples, real_labels),
                                   where real_samples are input data and real_labels are corresponding labels
                                   (typically 0 or 1 for binary classification).

    Returns:
        tuple: (trained_discriminator, None) — returns the trained discriminator model.
    """

    # Set initial learning rate
    lr = 0.0001

    # Define number of epochs for training
    num_epochs = 50

    # Define loss function: Binary Cross Entropy, commonly used for binary classification tasks
    loss_function = nn.BCELoss()

    # Instantiate the Discriminator model
    discriminator = Discriminator()

    # Define optimizer: Adam optimizer used for updating discriminator weights
    optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)

    # Loop over the number of epochs
    for epoch in range(num_epochs):
        # Loop over each batch from the training DataLoader
        for n, (real_samples, real_labels) in enumerate(train_loader):
            # Zero the gradients accumulated in the discriminator from previous steps
            discriminator.zero_grad()

            # Forward pass: compute the discriminator's predictions on real samples
            predictions = discriminator(real_samples)

            # Compute the loss between the predictions and true labels
            loss_discriminator = loss_function(predictions, real_labels)

            # Backward pass: compute gradients
            loss_discriminator.backward()

            # Update the discriminator's parameters based on computed gradients
            optimizer_discriminator.step()
            
            # Print the loss once per epoch, at the last batch (where n == BATCH_SIZE - 1)
            # if n == BATCH_SIZE - 1:
            #     print(f"Epoch: {epoch} Loss D.: {loss_discriminator}")

    # Return the trained discriminator model; the second return value is None to sync with train_gan function output
    return discriminator, None

def train_gan(train_loader):
    """
    Trains a simple Generative Adversarial Network (GAN) composed of a Generator and Discriminator.

    The training alternates between:
    - Training the Discriminator to distinguish real samples from fake (generated) samples.
    - Training the Generator to produce samples that can "fool" the Discriminator.

    Args:
        train_loader (DataLoader): A PyTorch DataLoader providing batches of (real_samples, real_labels),
                                   where real_samples are true data examples and real_labels are the corresponding labels (typically ones).

    Returns:
        tuple: (trained_discriminator, trained_generator) — the trained Discriminator and Generator models.
    """

    # Set initial learning rate
    lr = 0.0001

    # Define number of epochs for training
    num_epochs = 50

    # Define the loss function: Binary Cross Entropy (BCE) Loss, used for binary classification tasks
    loss_function = nn.BCELoss()

    # Instantiate Discriminator and Generator models
    discriminator = Discriminator()
    generator = Generator()
    
    # Define optimizers for both discriminator and generator using Adam optimizer
    optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)
    optimizer_generator = torch.optim.Adam(generator.parameters(), lr=lr)

    # Loop over the number of epochs
    for epoch in range(num_epochs):
        # Loop over each batch from the training DataLoader
        for n, (real_samples, real_labels) in enumerate(train_loader):
            # ------------------------
            # Train the Discriminator
            # ------------------------

            # Generate fake samples using random noise input
            gen_samples = generator(torch.randn((BATCH_SIZE, 100)))

            # Create fake labels (zeros) for generated samples
            gen_labels = torch.zeros((BATCH_SIZE, 1))

            # Combine real and generated samples
            all_samples = torch.cat((real_samples, gen_samples))

            # Combine real and fake labels
            all_labels = torch.cat((real_labels, gen_labels))

            # Zero the discriminator's gradient buffers
            discriminator.zero_grad()

            # Compute discriminator loss on both real and fake samples
            loss_discriminator = loss_function(discriminator(all_samples), all_labels)

            # Backpropagation and optimizer step for discriminator
            loss_discriminator.backward()
            optimizer_discriminator.step()
            
            # ------------------------
            # Train the Generator
            # ------------------------

            # Zero the generator's gradient buffers
            generator.zero_grad()

            # Generate new fake samples for generator training
            gen_samples = generator(torch.randn((BATCH_SIZE, 100)))

            # Attempt to "fool" the discriminator: generator wants discriminator to predict real labels (ones)
            loss_generator = loss_function(discriminator(gen_samples), real_labels)

            # Backpropagation and optimizer step for generator
            loss_generator.backward()
            optimizer_generator.step()

            # ------------------------
            # Logging
            # ------------------------

            # Print loss at the last batch of the epoch
            # if n == BATCH_SIZE - 1:
            #     print(f"Epoch: {epoch} Loss D.: {loss_discriminator.item()}")
            #     print(f"Epoch: {epoch} Loss G.: {loss_generator.item()}")

    # Return both trained models
    return discriminator, generator

def recall_calc(y_pred, y_test):
    """
    Calculates Recall for binary classification.
    Note: this is preferred over accuracy for predicting fraudulent credit card transactions because:
                1. There's so few fraudulent record, accuracy can be high even if it incorrectly returns all negatives
                2. False negatives are far higher priority than false positives (i.e., more repercussions when incorrectly predicting fraudulent records to be valid)

    Recall = (True Positives) / (All Fraudulent Records)

    Args:
        y_pred (Tensor): Predicted labels (0 for non-fraud, 1 for fraud).
        y_test (Tensor): True labels (ground truth; 0 for non-fraud, 1 for fraud).

    Returns:
        Tensor: Recall value as a float tensor (scalar).
    """

    # Calculate number of true positives:
    # - y_pred == 1 identifies predicted positives
    # - y_test == 1 identifies fraudulent records
    # - Their element-wise multiplication gives True Positives (correctly predicted frauds)
    true_positives = ((y_pred == 1) * (y_test == 1)).to(torch.float32).sum()

    # Calculate total number of actual fraud cases
    actual_fraud = (y_test == 1).to(torch.float32).sum()

    # Recall is the ratio of true positives to actual fraud cases
    recall = true_positives / actual_fraud

    return recall


In [8]:
torch.manual_seed(TORCH_SEED)
DataGroups = namedtuple("DataGroups", ["all", "fraud", "valid"])
dg = DataGroups(CustomPandasTorch(fraud_data), CustomPandasTorch(fraud_data.query("Class == 1")), CustomPandasTorch(fraud_data.query("Class == 0")))

#Loop 1:
#   records: all (21,550)
#   training: discriminator only
#Loop 2:
#   records: fraudulent only (346)
#   training: discriminator+generator (GAN)
#Within each loop:
#   Recall is calculated for:
#        1. all records (loop 1: 21,550 | loop 2: 346)
#        2. 25% of the records after 75% (loop 1: 16,162 | loop 2: 259) were used for training 

#This cell takes ~2.5-4mins to run

result_lst = []

for i, (t, func) in enumerate(zip([dg.all, dg.fraud], [train_discriminator, train_gan])):
    print(i)

    #Set active dataset to raw
    t.set_raw()

    #All records recall score
    train_loader = torch.utils.data.DataLoader(t, batch_size=BATCH_SIZE, shuffle=True) 
    discriminator, generator = func(train_loader)
    result_lst.append([recall_calc(discriminator(t.active.data), t.active.labels), t.active.sample.shape[0], t.active.df.shape[0], discriminator, generator, "all"])
    
    #Set active dataset to train
    t.set_train() 

    #25% test records recall score
    train_loader = torch.utils.data.DataLoader(t, batch_size=BATCH_SIZE, shuffle=True)
    discriminator, generator = func(train_loader)
    result_lst.append([recall_calc(discriminator(t.test.data), t.test.labels), t.active.sample.shape[0], t.active.df.shape[0], discriminator, generator, "train"])
    # break

0
1


# Final Results

In [None]:
df_results = pd.DataFrame(result_lst, columns = ["recall", "sample_size", "df_size", "disc", "gen", "df_type"])
df_results

Unnamed: 0,recall,sample_size,df_size,disc,gen,df_type
0,tensor(0.7940),43136,21550,Discriminator(\n (model): Sequential(\n (0...,,all
1,tensor(0.8040),32384,16162,Discriminator(\n (model): Sequential(\n (0...,,train
2,tensor(0.9944),16000,346,Discriminator(\n (model): Sequential(\n (0...,Generator(\n (model): Sequential(\n (0): L...,all
3,tensor(0.9625),16000,259,Discriminator(\n (model): Sequential(\n (0...,Generator(\n (model): Sequential(\n (0): L...,train


In [14]:
#Training for best discriminator:
#1. 75% of all fraudulent records were used to train (other 25% used later for testing along with 100% of valid records)
#2. Used resampling to upsample fraudulent training records to 16,000 (up from 259)
#3. Used GAN model to generate extra fraduluent training records for improved recall
best_discriminator = df_results["disc"][3]
best_discriminator

Discriminator(
  (model): Sequential(
    (0): Linear(in_features=29, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=64, out_features=16, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.3, inplace=False)
    (9): Linear(in_features=16, out_features=1, bias=True)
    (10): Sigmoid()
  )
)

In [None]:
#Testing best discriminator:
#1. Test set includes 25% of the fraudulent records (not used for training) and all valid records
#2. Final Test Data Recall Calculation with best Discriminator below (i.e., best_discriminator)
torch.manual_seed(TORCH_SEED)
train_idx = dg.fraud.train.df.index
X_test = torch.from_numpy(dg.all.raw.df.query("index not in @train_idx").astype('float32').values)[:,:-1]
y_test = torch.from_numpy(dg.all.raw.df.query("index not in @train_idx").astype('float32').values)[:,-1:]
recall_result = recall_calc(best_discriminator(X_test), y_test).item()
print(f"Final test set recall: {recall_result:.3}")

Final test set recall: 0.954


# Further Work

In [None]:
#Further Work TODO:

#1. Standardize Amount feature
#2. Increase Number of Epochs
#3. Analyze ROC and recall-precision curves
#4. Hyperparametrization