In [1]:
import pandas as pd

# Location of the raw CSV file.
file_path = "creditcardfraud.csv"

# header=None: the first line of the file is read as data, not as column names.
df = pd.read_csv(file_path, header=None)

# Total number of columns in the frame (every feature column plus the target column).
num_features = len(df.columns)

# Feature columns are labelled x1, x2, ...; the last column is the target 'y'.
column_names = ["x" + str(i) for i in range(1, num_features)] + ["y"]

# Apply the new labels to the frame.
df.columns = column_names

df.head()


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x21,x22,x23,x24,x25,x26,x27,x28,x29,y
0,0.935192,0.76649,0.881365,0.313023,0.763439,0.267669,0.266815,0.786444,0.475312,0.5106,...,0.561184,0.522992,0.663793,0.391253,0.585122,0.394557,0.418976,0.312697,0.005824,1
1,0.978542,0.770067,0.840298,0.271796,0.76612,0.262192,0.264875,0.786298,0.453981,0.505267,...,0.55784,0.480237,0.666938,0.33644,0.58729,0.446013,0.416345,0.313423,0.000105,1
2,0.935217,0.753118,0.868141,0.268766,0.762329,0.281122,0.270177,0.788042,0.410603,0.513018,...,0.565477,0.54603,0.678939,0.289354,0.559515,0.402727,0.415489,0.311911,0.014739,1
3,0.941878,0.765304,0.868484,0.213661,0.765647,0.275559,0.266803,0.789434,0.414999,0.507585,...,0.559734,0.510277,0.662607,0.223826,0.614245,0.389197,0.417669,0.314371,0.004807,1
4,0.938617,0.77652,0.864251,0.269796,0.762975,0.263984,0.268968,0.782484,0.49095,0.524303,...,0.561327,0.547271,0.663392,0.40127,0.566343,0.507497,0.420561,0.31749,0.002724,1


In [2]:
# Class balance of the target column.
df["y"].value_counts()

# Label convention: y == 1 marks a normal row, y == -1 marks an anomaly.

y
 1    284315
-1       492
Name: count, dtype: int64

In [3]:
# Rows labelled normal (y == 1).
df_normal = df.loc[df["y"].eq(1)]

# Rows labelled anomalous (y == -1).
df_anomaly = df.loc[df["y"].eq(-1)]

In [5]:
# Testing Set
# Balanced evaluation set: every anomalous row plus an equally sized random
# draw of normal rows (492 of each in the recorded run).
#
# The draw size is taken from len(df_anomaly) rather than hard-coded, and
# random_state pins the draw so the train/test split is the same on every
# Restart & Run All.
df_test_ones = df_normal.sample(n=len(df_anomaly), random_state=42)
df_test = pd.concat([df_test_ones, df_anomaly])

In [7]:
# Index labels of every row that went into the test set.
indices_to_remove = df_test.index.to_list()
# The training frame is the full frame minus those rows.
df_train = df.drop(index=indices_to_remove)


In [8]:
# Label counts left in the training frame after the test rows were dropped.
df_train["y"].value_counts()

y
1    283823
Name: count, dtype: int64

In [9]:
# Label counts of the complete frame `df`, for comparison with the training frame.
df["y"].value_counts()

y
 1    284315
-1       492
Name: count, dtype: int64

In [10]:
# Separate the feature columns from the target column 'y' for each split.
df_train_x, df_train_y = df_train.drop(columns="y"), df_train["y"]
df_test_x, df_test_y = df_test.drop(columns="y"), df_test["y"]


In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

In [12]:
# Custom PyTorch Dataset
class CustomDataset(Dataset):
    """Features-only dataset backed by one float32 tensor.

    NOTE: a later cell in this notebook defines another ``CustomDataset``
    that also carries labels; once that cell runs, it replaces this class.
    """

    def __init__(self, df_train):
        # Copy the frame's values into a float32 tensor, one row per sample.
        values = df_train.values
        self.data_x = torch.tensor(values, dtype=torch.float32)

    def __len__(self):
        # Number of samples == number of rows.
        return self.data_x.size(0)

    def __getitem__(self, idx):
        # A single sample is one row of the feature tensor.
        row = self.data_x[idx]
        return row

In [15]:
class CustomDataset(Dataset):
    """Dataset over a feature frame with an optional label series.

    Items are ``(features, label)`` pairs when labels were supplied and a
    bare feature tensor otherwise. Both tensors are stored as float32.
    """

    def __init__(self, df_x, df_y=None):
        self.data_x = torch.tensor(df_x.values, dtype=torch.float32)
        if df_y is None:
            self.data_y = None
        else:
            self.data_y = torch.tensor(df_y.values, dtype=torch.float32)

    def __len__(self):
        # One sample per feature row.
        return self.data_x.size(0)

    def __getitem__(self, idx):
        features = self.data_x[idx]
        if self.data_y is None:
            return features
        return features, self.data_y[idx]

In [16]:
def dataset_to_tensor(dataset):
    """
    Convert a dataset to a single tensor of features.

    Args:
    dataset: A map-style PyTorch Dataset (any object with __len__ and
        __getitem__). Items may be a bare feature tensor or a
        (features, label, ...) sequence.

    Returns:
    A tensor containing all the features from the dataset, stacked along
    dim 0 in dataset order.

    Raises:
    ValueError: if the dataset is empty.
    """
    num_samples = len(dataset)
    if num_samples == 0:
        # DataLoader rejects batch_size=0 with a ValueError as well; fail
        # here with a message that points at the real cause.
        raise ValueError("dataset_to_tensor() requires a non-empty dataset")

    # One batch that spans the whole dataset, in order.
    loader = DataLoader(dataset, batch_size=num_samples, shuffle=False)
    data = next(iter(loader))

    # Labelled datasets collate to a [features, labels, ...] sequence;
    # unlabelled ones collate straight to the feature tensor. Indexing the
    # latter with [0] would return only the first row.
    if isinstance(data, (list, tuple)):
        return data[0]
    return data

In [17]:
# Wrap the train/test frames as datasets that yield (features, label) pairs.
train_dataset = CustomDataset(df_train_x, df_train_y)
test_dataset = CustomDataset(df_test_x, df_test_y)

batch_size = 256  # Mini-batch size shared by both loaders; adjust as needed
# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# We need to preserve the order of the labels in the test set
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# All test features as one tensor, in dataset order (the helper loads with shuffle=False).
test_tensor = dataset_to_tensor(test_dataset)


In [22]:
import torch
import torch.nn as nn
import torch.optim as optim

class Autoencoder(nn.Module):
    """Fully connected autoencoder: in_dim -> layer_1 -> layer_2 -> layer_1 -> in_dim.

    ReLU follows every linear layer except the final decoder layer, so the
    reconstruction itself is not clamped.
    """

    def __init__(self, in_dim, layer_1, layer_2):
        super().__init__()
        self.encoder_a = nn.Linear(in_dim, layer_1)
        self.encoder_b = nn.Linear(layer_1, layer_2)
        self.decoder_a = nn.Linear(layer_2, layer_1)
        self.decoder_b = nn.Linear(layer_1, in_dim)
        self.ReLU = nn.ReLU()

    def encode(self, x):
        """Map inputs to the (non-negative) latent code."""
        hidden = self.ReLU(self.encoder_a(x))
        return self.ReLU(self.encoder_b(hidden))

    def decode(self, x):
        """Map a latent code back to input space; no activation on the output."""
        hidden = self.ReLU(self.decoder_a(x))
        return self.decoder_b(hidden)

    def forward(self, x):
        """Reconstruct x by encoding and then decoding it."""
        return self.decode(self.encode(x))

    @torch.no_grad()
    def detect_anomaly(self, x, threshold):
        """Return a boolean tensor, True where the row-wise L2 reconstruction
        error of x exceeds threshold. Runs without gradient tracking."""
        residual = self(x) - x
        return torch.norm(residual, dim=1) > threshold

def train_autoencoder(model, train_loader, lr, num_epochs=5):
    """Fit `model` to reconstruct its own inputs with Adam and MSE loss.

    Args:
        model: module whose output is compared against its input.
        train_loader: sized iterable of (features, labels) batches; the
            labels are ignored.
        lr: learning rate passed to Adam.
        num_epochs: number of passes over train_loader.

    Returns:
        A list with one float per epoch: the batch losses of that epoch
        summed and divided by len(train_loader).
    """
    mse = nn.MSELoss()
    adam = optim.Adam(model.parameters(), lr=lr)
    loss_values = []

    for _epoch in range(num_epochs):
        model.train()
        epoch_total = 0.0

        for features, _labels in train_loader:
            adam.zero_grad()
            batch_loss = mse(model(features), features)
            batch_loss.backward()
            adam.step()
            epoch_total += batch_loss.item()

        loss_values.append(epoch_total / len(train_loader))

    return loss_values


# Settings for the first experiment; train_loader and test_tensor come from
# the dataset cell above.
in_dim = df_train_x.shape[1]
layer_1, layer_2 = 8, 4
lr = 0.001
threshold = 0.5  # Example threshold value

model = Autoencoder(in_dim=in_dim, layer_1=layer_1, layer_2=layer_2)

# Fit the autoencoder; one average loss per epoch is kept.
loss_values = train_autoencoder(model, train_loader, lr=lr)

# Boolean flag per test row: True where the reconstruction error exceeds the threshold.
anomalies = model.detect_anomaly(test_tensor, threshold=threshold)



tensor([0.2058, 0.2252, 0.3257, 0.1648, 0.2052, 0.2247, 0.1791, 0.1340, 0.1963,
        0.1903, 0.1480, 0.1994, 0.2263, 0.2221, 0.2250, 0.2245, 0.2143, 0.1351,
        0.1301, 0.1677, 0.1971, 0.1378, 0.2518, 0.2006, 0.1570, 0.2777, 0.1616,
        0.1305, 0.1554, 0.3124, 0.2238, 0.1934, 0.2126, 0.2255, 0.2092, 0.2218,
        0.1662, 0.1839, 0.1796, 0.1936, 0.3354, 0.1447, 0.2547, 0.1745, 0.2093,
        0.2254, 0.2209, 0.1388, 0.2657, 0.3009, 0.3363, 0.2039, 0.2794, 0.1328,
        0.1917, 0.1272, 0.2933, 0.1203, 0.2263, 0.2499, 0.2737, 0.2551, 0.2710,
        0.2703, 0.2425, 0.2600, 0.2109, 0.2306, 0.2715, 0.4298, 0.2924, 0.2265,
        0.1985, 0.1819, 0.1723, 0.1865, 0.1902, 0.1992, 0.2328, 0.3629, 0.1995,
        0.1529, 0.2016, 0.2146, 0.1607, 0.1716, 0.1920, 0.2458, 0.2480, 0.1868,
        0.2856, 0.1917, 0.2047, 0.3246, 0.1571, 0.2578, 0.2559, 0.3903, 0.1657,
        0.1932, 0.1867, 0.1792, 0.3621, 0.2292, 0.2812, 0.1717, 0.1533, 0.2665,
        0.3203, 0.3466, 0.2517, 0.1523, 

In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class Autoencoder(nn.Module):
    """Fully connected autoencoder: in_dim -> layer_1 -> layer_2 -> layer_1 -> in_dim.

    ReLU follows every linear layer except the final decoder layer, which
    lets the reconstruction take negative values.
    """

    def __init__(self, in_dim, layer_1, layer_2):
        super().__init__()
        self.encoder_a = nn.Linear(in_dim, layer_1)
        self.encoder_b = nn.Linear(layer_1, layer_2)
        self.decoder_a = nn.Linear(layer_2, layer_1)
        self.decoder_b = nn.Linear(layer_1, in_dim)
        self.ReLU = nn.ReLU()

    def encode(self, x):
        """Map inputs to the (non-negative) latent code."""
        hidden = self.ReLU(self.encoder_a(x))
        return self.ReLU(self.encoder_b(hidden))

    def decode(self, x):
        """Map a latent code back to input space."""
        hidden = self.ReLU(self.decoder_a(x))
        # No ReLU on the output layer so negative values can be reconstructed.
        return self.decoder_b(hidden)

    def forward(self, x):
        """Reconstruct x by encoding and then decoding it."""
        return self.decode(self.encode(x))

    @torch.no_grad()
    def compute_reconstruction_error(self, x):
        """Row-wise L2 norm of (reconstruction - x), computed without gradients."""
        residual = self(x) - x
        return torch.norm(residual, dim=1)

    @torch.no_grad()
    def detect_anomaly(self, x, threshold):
        """Boolean tensor: True where a row's reconstruction error exceeds threshold."""
        errors = self.compute_reconstruction_error(x)
        return errors > threshold

def train_autoencoder(model, train_loader, lr, num_epochs=5):
    """Fit `model` to reconstruct its own inputs with Adam and MSE loss.

    Prints the average loss after every epoch.

    Args:
        model: module whose output is compared against its input.
        train_loader: sized iterable of (features, labels) batches; the
            labels are ignored.
        lr: learning rate passed to Adam.
        num_epochs: number of passes over train_loader.

    Returns:
        A list with one float per epoch: the batch losses of that epoch
        summed and divided by len(train_loader).
    """
    mse = nn.MSELoss()
    adam = optim.Adam(model.parameters(), lr=lr)
    loss_values = []

    for epoch in range(num_epochs):
        model.train()
        epoch_total = 0.0

        for features, _labels in train_loader:
            adam.zero_grad()
            batch_loss = mse(model(features), features)
            batch_loss.backward()
            adam.step()
            epoch_total += batch_loss.item()

        avg_loss = epoch_total / len(train_loader)
        loss_values.append(avg_loss)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    return loss_values

def set_threshold(model, train_loader, percentile=95):
    """Choose an anomaly threshold from the reconstruction errors of a loader.

    Puts the model in eval mode, collects the per-row reconstruction error
    of every batch in train_loader and returns the requested percentile.

    Args:
        model: object exposing eval() and compute_reconstruction_error(batch).
        train_loader: iterable of (features, labels) batches; labels are ignored.
        percentile: percentile of the collected errors to return.

    Returns:
        The value of np.percentile over all collected errors.
    """
    model.eval()

    with torch.no_grad():
        reconstruction_errors = [
            row_error
            for x_batch, _labels in train_loader
            for row_error in model.compute_reconstruction_error(x_batch).cpu().numpy()
        ]

    return np.percentile(reconstruction_errors, percentile)

# Second experiment: wider hidden layers and a data-driven threshold.
in_dim = df_train_x.shape[1]  # 29 features
layer_1, layer_2 = 15, 10
lr = 0.001
model = Autoencoder(in_dim, layer_1, layer_2)

# Fit the autoencoder; one average loss per epoch is kept.
loss_values = train_autoencoder(model, train_loader, lr)

# Threshold = 95th percentile of the reconstruction errors over the training loader.
threshold = set_threshold(model, train_loader, percentile=95)
print(f"Threshold set to: {threshold:.4f}")

# Flag the test rows whose reconstruction error exceeds the threshold.
model.eval()
anomalies = model.detect_anomaly(test_tensor, threshold)

# Accuracy against the -1 (anomaly) / 1 (normal) labels.
test_labels = torch.tensor(df_test_y.values, dtype=torch.long)
true_labels = test_labels.numpy()
# Flagged rows become -1, all other rows become 1.
predicted_labels = np.where(anomalies.cpu().numpy(), -1, 1)
accuracy = np.mean(true_labels == predicted_labels)
print(f"Accuracy: {accuracy:.4f}")

Epoch 1/5, Loss: 0.0339
Epoch 2/5, Loss: 0.0016
Epoch 3/5, Loss: 0.0014
Epoch 4/5, Loss: 0.0014
Epoch 5/5, Loss: 0.0014
Threshold set to: 0.2949
Accuracy: 0.8943


## Hyperparameter Tuning

In [None]:
# Three hyperparameters to tune:
# 1. the reconstruction-error threshold above which a row is flagged as an anomaly
# 2. the depth of the encoder (number of hidden layers)
# 3. the dimensionality of the latent space
#
# NOTE: hyperparameter_tuning() in the next cell defines its own candidate
# lists and does not read these variables.
thresholds = [0.2, 0.3, 0.4]
encoder_depths = [1, 2, 3]
latent_dims = [2, 4, 8, 16]


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from itertools import product
import numpy as np

class Autoencoder(nn.Module):
    """Autoencoder with a configurable stack of hidden layers.

    The encoder maps in_dim -> hidden_dims[0] -> ... -> latent_dim and the
    decoder mirrors it back to in_dim. ReLU follows every hidden layer;
    the latent layer and the output layer are purely linear.

    Args:
        in_dim: number of input features; the reconstruction has the same width.
        hidden_dims: hidden layer widths of the encoder, in order (may be
            empty). The decoder uses them in reverse.
        latent_dim: width of the bottleneck.
    """

    def __init__(self, in_dim, hidden_dims, latent_dim):
        super(Autoencoder, self).__init__()
        hidden_dims = list(hidden_dims)

        # Encoder. A separate running width is used so that in_dim still
        # holds the input width when the decoder's output layer is built;
        # reusing in_dim here would make the reconstruction as wide as the
        # last hidden layer instead of the input.
        encoder_layers = []
        prev_dim = in_dim
        for h_dim in hidden_dims:
            encoder_layers.append(nn.Linear(prev_dim, h_dim))
            encoder_layers.append(nn.ReLU())
            prev_dim = h_dim
        encoder_layers.append(nn.Linear(prev_dim, latent_dim))
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: mirror image of the encoder, ending at the input width.
        decoder_layers = []
        prev_dim = latent_dim
        for h_dim in reversed(hidden_dims):
            decoder_layers.append(nn.Linear(prev_dim, h_dim))
            decoder_layers.append(nn.ReLU())
            prev_dim = h_dim
        decoder_layers.append(nn.Linear(prev_dim, in_dim))
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of x."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def compute_reconstruction_error(self, x):
        """Row-wise L2 norm of (reconstruction - x), computed without gradients."""
        with torch.no_grad():
            x_pred = self(x)
            return torch.norm(x_pred - x, dim=1)

def train_autoencoder(model, train_loader, val_loader, lr=0.001, num_epochs=10):
    """Train `model` as an autoencoder and restore its best-validation weights.

    After every epoch the mean MSE over val_loader is computed; the weights
    of the epoch with the lowest validation loss are loaded back into the
    model before it is returned.

    Args:
        model: module whose output is compared against its input.
        train_loader: iterable of (features, labels) batches; labels are ignored.
        val_loader: sized iterable of (features, labels) batches used for
            model selection; labels are ignored.
        lr: learning rate passed to Adam.
        num_epochs: number of passes over train_loader.

    Returns:
        The same `model` object, in eval mode when at least one epoch ran,
        carrying the best-validation weights. If no epoch produced a usable
        validation loss (for example num_epochs == 0) the model is returned
        with its current weights.
    """
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val_loss = float('inf')
    best_model = None

    for epoch in range(num_epochs):
        model.train()
        for x_batch, _ in train_loader:
            optimizer.zero_grad()
            output = model(x_batch)
            loss = criterion(output, x_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x_batch, _ in val_loader:
                output = model(x_batch)
                val_loss += criterion(output, x_batch).item()
        val_loss /= len(val_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter
            # tensors, and dict.copy() is shallow, so every tensor has to be
            # cloned. Otherwise later optimizer steps overwrite the snapshot
            # and the "best" weights are simply the last ones.
            best_model = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    if best_model is not None:
        model.load_state_dict(best_model)
    return model

def evaluate_model(model, test_loader, threshold):
    """Accuracy of threshold-based anomaly detection on a labelled loader.

    A row is predicted anomalous when its reconstruction error exceeds
    `threshold`. Predictions are encoded like this notebook's labels,
    -1 for an anomaly and 1 for a normal row, so they can be compared with
    the labels directly.

    Args:
        model: object exposing eval() and compute_reconstruction_error(batch).
        test_loader: iterable of (features, labels) batches with labels in {-1, 1}.
        threshold: reconstruction-error cut-off.

    Returns:
        Fraction of rows whose predicted label equals the true label.
    """
    model.eval()
    true_labels = []
    pred_labels = []

    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            errors = model.compute_reconstruction_error(x_batch)
            is_anomaly = errors > threshold
            # True -> -1.0 (anomaly), False -> 1.0 (normal). A plain
            # .float() cast would give 1/0, which never lines up with the
            # -1/1 labels.
            predictions = 1.0 - 2.0 * is_anomaly.float()
            true_labels.extend(y_batch.cpu().numpy())
            pred_labels.extend(predictions.cpu().numpy())

    true_labels = np.array(true_labels)
    pred_labels = np.array(pred_labels)

    accuracy = (true_labels == pred_labels).mean()
    return accuracy

def hyperparameter_tuning(X_train, y_train, X_test, y_test,
                          batch_size=256, val_fraction=0.1, random_state=0):
    """Grid-search threshold, encoder depth and latent size for the autoencoder.

    The data loaders are built from the arguments, so the function does not
    depend on loaders defined elsewhere in the notebook. A validation split
    is carved out of the training rows for train_autoencoder's model
    selection.

    Each architecture (depth, latent_dim) is trained once and then scored
    against every threshold, because the threshold has no influence on
    training.

    NOTE(review): accuracy is measured on the test split, so the selected
    configuration is tuned to that split. Use a separate labelled
    validation set if an unbiased final score is needed.

    Args:
        X_train: training features, array-like of shape (rows, features),
            e.g. a DataFrame.
        y_train: training labels, array-like of length rows. They are only
            carried along so batches are (features, label) pairs.
        X_test: test features, array-like with the same number of columns.
        y_test: test labels in the encoding evaluate_model expects.
        batch_size: mini-batch size for all loaders.
        val_fraction: share of the training rows held out for validation.
        random_state: seed for the train/validation split.

    Returns:
        (threshold, depth, latent_dim) of the best configuration, or None
        if no configuration produced a comparable accuracy.

    Raises:
        ValueError: if X_train has fewer than two rows.
    """
    from torch.utils.data import DataLoader, TensorDataset

    def _as_float_tensor(data):
        # Accepts DataFrames/Series, numpy arrays and CPU tensors alike.
        return torch.as_tensor(np.asarray(data), dtype=torch.float32)

    X_train_t, y_train_t = _as_float_tensor(X_train), _as_float_tensor(y_train)
    X_test_t, y_test_t = _as_float_tensor(X_test), _as_float_tensor(y_test)

    # Seeded random train/validation split of the training rows.
    num_rows = X_train_t.shape[0]
    num_val = max(1, int(num_rows * val_fraction))
    if num_rows - num_val < 1:
        raise ValueError("X_train needs at least two rows to carve out a validation split")
    split_rng = torch.Generator().manual_seed(random_state)
    order = torch.randperm(num_rows, generator=split_rng)
    val_idx, train_idx = order[:num_val], order[num_val:]

    train_loader = DataLoader(
        TensorDataset(X_train_t[train_idx], y_train_t[train_idx]),
        batch_size=batch_size, shuffle=True,
    )
    val_loader = DataLoader(
        TensorDataset(X_train_t[val_idx], y_train_t[val_idx]),
        batch_size=batch_size, shuffle=False,
    )
    test_loader = DataLoader(
        TensorDataset(X_test_t, y_test_t),
        batch_size=batch_size, shuffle=False,
    )

    # Define hyperparameter ranges
    thresholds = [0.1, 0.5, 1.0, 1.5, 2.0]
    encoder_depths = [1, 2, 3]
    latent_dims = [2, 4, 8, 16]

    # Start below any reachable accuracy so the first valid result is always kept.
    best_accuracy = float("-inf")
    best_params = None
    in_dim = X_train_t.shape[1]

    for depth, latent_dim in product(encoder_depths, latent_dims):
        hidden_dims = [64] * depth  # You can adjust this based on your needs
        model = Autoencoder(in_dim, hidden_dims, latent_dim)
        trained_model = train_autoencoder(model, train_loader, val_loader)

        for threshold in thresholds:
            print(f"Testing: threshold={threshold}, depth={depth}, latent_dim={latent_dim}")
            accuracy = evaluate_model(trained_model, test_loader, threshold)
            print(f"Accuracy: {accuracy}")

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = (threshold, depth, latent_dim)

    if best_params is None:
        print("No configuration produced a comparable accuracy.")
        return None

    print(f"Best parameters: threshold={best_params[0]}, depth={best_params[1]}, latent_dim={best_params[2]}")
    print(f"Best accuracy: {best_accuracy}")

    return best_params

# Usage
# Run the search on this notebook's own splits: df_train_x / df_train_y hold
# the training rows and df_test_x / df_test_y the balanced test rows.
best_params = hyperparameter_tuning(df_train_x, df_train_y, df_test_x, df_test_y)