# Case Study 4: Credit Card Fraud Detection
Group Members: Benjamin Ang, Harvey Felipe, Enika Maninang, Jeremy Tan

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


In [2]:
# File path
file_path = "creditcardfraud.csv"

# Load the data into a pandas DataFrame
df = pd.read_csv(file_path, header=None)

# Number of features
num_features = df.shape[1]

# Create column names: x1, x2, ..., x29
column_names = [f"x{i}" for i in range(1, num_features )]

# Add the target column name 'y'
column_names.append("y")

# Rename the columns in the DataFrame
df.columns = column_names

df.head()


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x21,x22,x23,x24,x25,x26,x27,x28,x29,y
0,0.935192,0.76649,0.881365,0.313023,0.763439,0.267669,0.266815,0.786444,0.475312,0.5106,...,0.561184,0.522992,0.663793,0.391253,0.585122,0.394557,0.418976,0.312697,0.005824,1
1,0.978542,0.770067,0.840298,0.271796,0.76612,0.262192,0.264875,0.786298,0.453981,0.505267,...,0.55784,0.480237,0.666938,0.33644,0.58729,0.446013,0.416345,0.313423,0.000105,1
2,0.935217,0.753118,0.868141,0.268766,0.762329,0.281122,0.270177,0.788042,0.410603,0.513018,...,0.565477,0.54603,0.678939,0.289354,0.559515,0.402727,0.415489,0.311911,0.014739,1
3,0.941878,0.765304,0.868484,0.213661,0.765647,0.275559,0.266803,0.789434,0.414999,0.507585,...,0.559734,0.510277,0.662607,0.223826,0.614245,0.389197,0.417669,0.314371,0.004807,1
4,0.938617,0.77652,0.864251,0.269796,0.762975,0.263984,0.268968,0.782484,0.49095,0.524303,...,0.561327,0.547271,0.663392,0.40127,0.566343,0.507497,0.420561,0.31749,0.002724,1


In [3]:
df["y"].value_counts()

# 1 are the normal, -1 are the anomaly

y
 1    284315
-1       492
Name: count, dtype: int64

In [4]:
# y = 1
df_normal = df[df["y"] == 1]

# y = -1
df_anomaly = df[df["y"] == -1]

In [5]:
# Testing Set 
# 492 1's and 492 -1's

df_test_ones = df_normal.sample(492)
df_test = pd.concat([df_test_ones, df_anomaly])

In [6]:
indices_to_remove = df_test.index.tolist()
df_train = df.drop(indices_to_remove)


In [7]:
df_train["y"].value_counts()

y
1    283823
Name: count, dtype: int64

In [8]:
df["y"].value_counts()

y
 1    284315
-1       492
Name: count, dtype: int64

In [9]:
df_train_x = df_train.drop("y", axis = 1)
df_train_y = df_train["y"]
df_test_x = df_test.drop("y", axis = 1)
df_test_y = df_test["y"]


In [10]:

class CustomDataset(Dataset):
    def __init__(self, df_x, df_y=None):
        self.data_x = torch.tensor(df_x.values, dtype=torch.float32)
        self.data_y = torch.tensor(df_y.values, dtype=torch.float32) if df_y is not None else None

    def __len__(self):
        return self.data_x.shape[0]

    def __getitem__(self, idx):
        if self.data_y is not None:
            return self.data_x[idx], self.data_y[idx]
        return self.data_x[idx]
    
def dataset_to_tensor(dataset):
    """
    Convert a dataset to a single tensor.
    
    Args:
    dataset: A PyTorch Dataset object
    
    Returns:
    A tensor containing all the features from the dataset
    """
    # Create a DataLoader with batch_size equal to the dataset size
    loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)
    
    # Get the single batch from the DataLoader
    data = next(iter(loader))
    
    # The first element of data is the features (X)
    features = data[0]
    
    return features

In [11]:
train_dataset = CustomDataset(df_train_x, df_train_y)
test_dataset = CustomDataset(df_test_x, df_test_y)

batch_size = 256  # You can adjust this value
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# We need to preserve the order of the labels in the test set
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

test_tensor = dataset_to_tensor(test_dataset)


In [12]:
class Autoencoder(nn.Module):
    def __init__(self, in_dim, layer_1, layer_2):
        super(Autoencoder, self).__init__()
        self.encoder_a = nn.Linear(in_dim, layer_1)
        self.encoder_b = nn.Linear(layer_1, layer_2)
        self.decoder_a = nn.Linear(layer_2, layer_1)
        self.decoder_b = nn.Linear(layer_1, in_dim)
        self.ReLU = nn.ReLU()
    
    def encode(self, x):
        x = self.encoder_a(x)
        x = self.ReLU(x)
        x = self.encoder_b(x)
        x = self.ReLU(x)
        return x
    
    def decode(self, x):
        x = self.decoder_a(x)
        x = self.ReLU(x)
        x = self.decoder_b(x)
        return x  # Remove ReLU here to allow negative values
    
    def forward(self, x):
        x = self.encode(x)
        x = self.decode(x)
        return x
    
    def compute_reconstruction_error(self, x):
        with torch.no_grad():
            x_pred = self(x)
            return torch.norm(x_pred - x, dim=1)
    
    def detect_anomaly(self, x, threshold):
        with torch.no_grad():
            reconstruction_errors = self.compute_reconstruction_error(x)
            # if the error is greater than t, anomaly is a True boolean value
            return reconstruction_errors > threshold

def train_autoencoder(model, train_loader, lr, num_epochs=5):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_values = []
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        
        for x_batch, _ in train_loader:  # Ignore y values for autoencoder
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, x_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        avg_loss = running_loss / len(train_loader)
        loss_values.append(avg_loss)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    
    return loss_values


def set_threshold(model, train_loader, percentile=95):
    model.eval()
    reconstruction_errors = []
    
    with torch.no_grad():
        for x_batch, _ in train_loader:
            errors = model.compute_reconstruction_error(x_batch)
            reconstruction_errors.extend(errors.cpu().numpy())
    
    threshold = np.percentile(reconstruction_errors, percentile)
    return threshold

in_dim = df_train_x.shape[1]  # 29 features
layer_1 = 15
layer_2 = 10
lr = 0.001
model = Autoencoder(in_dim, layer_1, layer_2)

# Train the model
loss_values = train_autoencoder(model, train_loader, lr)

# Set the threshold
threshold = set_threshold(model, train_loader, percentile=95)
print(f"Threshold set to: {threshold:.4f}")

# Detect anomalies
model.eval()
anomalies = model.detect_anomaly(test_tensor, threshold)

# Calculate accuracy
test_labels = torch.tensor(df_test_y.values, dtype=torch.long)
true_labels = test_labels.numpy()  # Assuming -1 for anomalies, 1 for normal
predicted_labels = (~anomalies.cpu().numpy()).astype(int) * 2 - 1  # Convert bool to -1/1
accuracy = (true_labels == predicted_labels).mean()
print(f"Accuracy: {accuracy:.4f}")

Epoch 1/5, Loss: 0.0155
Epoch 2/5, Loss: 0.0016
Epoch 3/5, Loss: 0.0014
Epoch 4/5, Loss: 0.0013
Epoch 5/5, Loss: 0.0011
Threshold set to: 0.2581
Accuracy: 0.9075


## Hyperparameter Tuning

In [17]:
from itertools import product

layer_depths = [
    [in_dim, 20], # number of dimensions in each layer
    [in_dim, 20, 15]  # number of dimensions in each layer
]
latent_space_dims = [5, 10]  
thresholds = [0.1, 0.2, 0.4] 
activations = [nn.ReLU(), nn.Sigmoid()]  # Add this line

results = []

# Perform grid search
for layers, latent_dim, threshold, activation in product(layer_depths, latent_space_dims, thresholds, activations):
    print(f"Training Autoencoder with layers={layers}, latent_dim={latent_dim}, threshold={threshold}, activation={activation.__class__.__name__}")
    class Autoencoder(nn.Module):
        def __init__(self, layers, latent_dim, activation):

            # the autoencoder has a depth of 2 layers
            if len(layers) == 2:
                super(Autoencoder, self).__init__()
                self.encoder = nn.Sequential(
                    nn.Linear(layers[0], layers[1]),
                    activation,
                    nn.Linear(layers[1], latent_dim),
                    activation)
                
                self.decoder = nn.Sequential(
                nn.Linear(latent_dim, layers[1]),
                activation,
                nn.Linear(layers[1], layers[0]))

            # the autoencoder has a depth of 3 layers
            elif len(layers) == 3:
                super(Autoencoder, self).__init__()
                self.encoder = nn.Sequential(
                    nn.Linear(layers[0], layers[1]),
                    activation,
                    nn.Linear(layers[1], layers[2]),
                    activation,
                    nn.Linear(layers[2], latent_dim),
                    activation
            )
                self.decoder = nn.Sequential(
                nn.Linear(latent_dim, layers[2]),
                activation,
                nn.Linear(layers[2], layers[1]),
                activation, 
                nn.Linear(layers[1], layers[0]))
                    
        def forward(self, x):
            encoded = self.encoder(x)
            decoded = self.decoder(encoded)
            return decoded
        
        def compute_reconstruction_error(self, x):
            with torch.no_grad():
                x_pred = self(x)
                return torch.norm(x_pred - x, dim=1)  
    
        def detect_anomaly(self, x, threshold):
            with torch.no_grad():
                reconstruction_errors = self.compute_reconstruction_error(x)
                return reconstruction_errors > threshold

    model = Autoencoder(layers, latent_dim, activation)
    
    loss_values = train_autoencoder(model, train_loader, lr)
    
    model.eval()
    anomalies = model.detect_anomaly(test_tensor, threshold)
    
    predicted_labels = (~anomalies.cpu().numpy()).astype(int) * 2 - 1  # Convert bool to -1/1
    accuracy = (true_labels == predicted_labels).mean()
    
    results.append({
        'layers': layers,
        'latent_dim': latent_dim,
        'threshold': threshold,
        'activation': activation.__class__.__name__,
        'accuracy': accuracy
    })

results_df = pd.DataFrame(results)
print(results_df)


Training Autoencoder with layers=[29, 20], latent_dim=5, threshold=0.1, activation=ReLU
Epoch 1/5, Loss: 0.0155
Epoch 2/5, Loss: 0.0016
Epoch 3/5, Loss: 0.0016
Epoch 4/5, Loss: 0.0015
Epoch 5/5, Loss: 0.0015
Training Autoencoder with layers=[29, 20], latent_dim=5, threshold=0.1, activation=Sigmoid
Epoch 1/5, Loss: 0.0133
Epoch 2/5, Loss: 0.0017
Epoch 3/5, Loss: 0.0017
Epoch 4/5, Loss: 0.0017
Epoch 5/5, Loss: 0.0015
Training Autoencoder with layers=[29, 20], latent_dim=5, threshold=0.2, activation=ReLU
Epoch 1/5, Loss: 0.0195
Epoch 2/5, Loss: 0.0017
Epoch 3/5, Loss: 0.0017
Epoch 4/5, Loss: 0.0017
Epoch 5/5, Loss: 0.0017
Training Autoencoder with layers=[29, 20], latent_dim=5, threshold=0.2, activation=Sigmoid
Epoch 1/5, Loss: 0.0114
Epoch 2/5, Loss: 0.0017
Epoch 3/5, Loss: 0.0017
Epoch 4/5, Loss: 0.0017
Epoch 5/5, Loss: 0.0016
Training Autoencoder with layers=[29, 20], latent_dim=5, threshold=0.5, activation=ReLU
Epoch 1/5, Loss: 0.0163
Epoch 2/5, Loss: 0.0015
Epoch 3/5, Loss: 0.0014
Ep

In [19]:
results_df

Unnamed: 0,layers,latent_dim,threshold,activation,accuracy
0,"[29, 20]",5,0.1,ReLU,0.504065
1,"[29, 20]",5,0.1,Sigmoid,0.518293
2,"[29, 20]",5,0.2,ReLU,0.732724
3,"[29, 20]",5,0.2,Sigmoid,0.772358
4,"[29, 20]",5,0.5,ReLU,0.793699
5,"[29, 20]",5,0.5,Sigmoid,0.802846
6,"[29, 20]",10,0.1,ReLU,0.529472
7,"[29, 20]",10,0.1,Sigmoid,0.515244
8,"[29, 20]",10,0.2,ReLU,0.885163
9,"[29, 20]",10,0.2,Sigmoid,0.807927


In [27]:
best_model = results_df.loc[results_df['accuracy'].idxmax()]
best_model

layers        [29, 20]
latent_dim          10
threshold          0.2
activation        ReLU
accuracy      0.885163
Name: 8, dtype: object

## Guide Questions:
1. What was the optimal t value in your group's case? (example: was optimal t more conservative compared to the other t values. If so, why do you think this is the case for something like credit card fraud detection?)
2. 
What was the optimal deepness for an autoencoder in your case? Why do you think this value worked well?
3. 
What was the optimal latent space dimensionality in your case compared to the other latent space dimensionality values? Why do you think this value worked well?


## Answers:

In [21]:
results_df.groupby('threshold')['accuracy'].mean()

threshold
0.1    0.519309
0.2    0.777820
0.5    0.797129
Name: accuracy, dtype: float64

1. When taking the average accuracy for all the autoencoders, the models with a threshold of 0.2 and 0.5 have a significantly higher average accuracy than the models with a threshold of 0.1. While the average accuracy of the models with a threshold of 0.5 is slightly higher than the models with a threshold of 0.2, the best model had a threshold of 0.2. This was because a moderatively conservative threshold such as 0.2 balances the likelihood of the model classifying false positives and false negatives. A less conservative threshold such as 0.1 will result in the model classifying too many legitimate credit card transactions as fraudulent (false positives).

In [30]:
results_df['layers_str'] = results_df['layers'].apply(str)
results_df.groupby('layers_str')['accuracy'].mean()

layers_str
[29, 20, 15]    0.693259
[29, 20]        0.702913
Name: accuracy, dtype: float64

2. On average, the shallow autoencoder with two hidden layers, one in the encoder and decoder respectively, had a higher average accuracy score compared to the deeper autoencoder. The best performing model also had only two hidden layers. Since the model has fewer parameters and layers, the autoencoder is less likely to overfit. Moreover, the dataset used might not require complex feature extraction or dimensional reductionability so a simpler model was able to achieve better results.

In [31]:
results_df.groupby('latent_dim')['accuracy'].mean()

latent_dim
5     0.690888
10    0.705285
Name: accuracy, dtype: float64

3. A latent space of 10 resulted in a higher average accuracy compared to a latent space of 5. For the dataset used in the context of anomaly detection, the data may be better represented in 10 dimensions compared to being overcompressed in 5 dimensions. Moreover, less information is lost when the autoencoder compresses the data from 20 or 15 dimensions to 10 dimensions as compared to when the data is compressed directly to 5 dimensions before it is subsequently reconstructed.