# BT4012 Fraud Analytics Project

In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import torch
from torch_geometric.data import Data

## Importing Data

In [8]:
# Import the training dataset
train = pd.read_csv("fraudTrain.csv", index_col=0)

# Import the testing dataset
test = pd.read_csv("fraudTest.csv", index_col=0)

validation = train[len(train) - round(len(train)*0.1):]
train = train[:len(train) - round(len(train)*0.1)]

In [9]:
all_nodes = pd.concat([train["cc_num"], train["merchant"], test["cc_num"], test["merchant"], validation["cc_num"], validation["merchant"]]).unique()
mapping = {node: i for i, node in enumerate(all_nodes)}
num_nodes = len(mapping)

In [10]:
train["transaction_dt"] = pd.to_datetime(train["trans_date_trans_time"])
test["transaction_dt"] = pd.to_datetime(test["trans_date_trans_time"])
validation["transaction_dt"] = pd.to_datetime(validation["trans_date_trans_time"])

train["transaction_hour"] = train["transaction_dt"].dt.hour
validation["transaction_hour"] = validation["transaction_dt"].dt.hour
test["transaction_hour"] = test["transaction_dt"].dt.hour

In [11]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import torch
from torch_geometric.data import Data

# Fit encoders on the training data
category_encoder = OneHotEncoder(sparse=False).fit(train[['category']])
state_encoder = OneHotEncoder(sparse=False).fit(train[['state']])

def process_data(df, mapping, category_encoder, state_encoder):
    df["from"] = df["cc_num"].map(mapping)
    df["to"] = df["merchant"].map(mapping)
    edge_index = torch.tensor([df["from"].values, df["to"].values], dtype=torch.long)
    
    # Transform 'category' and 'state' using the fitted encoders
    category_features = category_encoder.transform(df[['category']])
    state_features = state_encoder.transform(df[['state']])
    
    # Normalize 'transaction_hour'
    transaction_hours = df['transaction_hour'].values / 24.0  # Normalize hours to [0, 1]
    transaction_hours = transaction_hours.reshape(-1, 1)

    edge_features = torch.tensor(df[['amt']].values, dtype=torch.float)
    
    # Concatenate all features to create node features
    node_features = np.concatenate((category_features, state_features, transaction_hours), axis=1)
    node_features_tensor = torch.tensor(node_features, dtype=torch.float)
    
    # Create the data object
    data = Data(x=node_features_tensor, edge_index=edge_index, edge_attr=edge_features)
    return data

# Create a node mapping for all nodes
all_nodes = pd.concat([train["cc_num"], train["merchant"], test["cc_num"], test["merchant"], validation["cc_num"], validation["merchant"]]).unique()
mapping = {node: i for i, node in enumerate(all_nodes)}

# Identify the indices of the fraud and non-fraud instances
fraud_indices = train[train['is_fraud'] == 1].index
non_fraud_indices = train[train['is_fraud'] == 0].index

# Randomly sample the non-fraud instances to match the number of fraud instances
undersampled_non_fraud_indices = np.random.choice(non_fraud_indices, size=len(fraud_indices), replace=False)
undersampled_indices = np.concatenate([fraud_indices, undersampled_non_fraud_indices])

# Create the undersampled training set
undersampled_train = train.loc[undersampled_indices]

data_train = process_data(undersampled_train, mapping, category_encoder, state_encoder)
data_val = process_data(validation, mapping, category_encoder, state_encoder)
data_test = process_data(test, mapping, category_encoder, state_encoder)




In [12]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

data_train.y = torch.tensor(undersampled_train['is_fraud'].values, dtype=torch.long)
data_val.y = torch.tensor(validation['is_fraud'].values, dtype=torch.long)
data_test.y = torch.tensor(test['is_fraud'].values, dtype=torch.long)

# Creating masks for all nodes in train and validation sets
data_train.train_mask = torch.tensor([True] * len(undersampled_train), dtype=torch.bool)
data_val.val_mask = torch.tensor([True] * data_val.num_nodes, dtype=torch.bool) 
data_test.test_mask = torch.tensor([True] * data_test.num_nodes, dtype=torch.bool) 

import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Define the GCN model architecture
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Initialize the model and optimizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(num_node_features=data_train.num_node_features, num_classes=2).to(device)

labels = undersampled_train['is_fraud'].values
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float, device=device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

# Function to calculate metrics
def compute_metrics(output, labels, zero_division=1):
    _, predictions = torch.max(output, dim=1)
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()
    
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, zero_division=zero_division),
        "precision": precision_score(labels, predictions, zero_division=zero_division),
        "recall": recall_score(labels, predictions, zero_division=zero_division),
        "roc_auc": roc_auc_score(labels, predictions)
    }


# Adjust the training and evaluation functions to utilize the masks
def train_and_evaluate(data_train, data_val, model, optimizer, epochs=200):
    best_val_auc = 0
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data_train)
        # Use the train mask to calculate the loss
        loss = F.nll_loss(out[data_train.train_mask], data_train.y[data_train.train_mask], weight=class_weights)
        # loss = criterion(out[data_train.train_mask], data_train.y[data_train.train_mask]) 
        loss.backward()
        optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            # Use the val mask to evaluate performance
            val_out = model(data_val)
            val_loss = F.nll_loss(val_out[data_val.val_mask], data_val.y[data_val.val_mask])
            val_metrics = compute_metrics(val_out[data_val.val_mask], data_val.y[data_val.val_mask])
            
            # Print validation metrics
            print(f'Epoch {epoch+1}, Loss: {loss.item()}, Val Loss: {val_loss.item()}, Val AUC: {val_metrics["roc_auc"]}')
            
            # Save the best model state based on validation AUC
            if val_metrics['roc_auc'] > best_val_auc:
                best_val_auc = val_metrics['roc_auc']
                best_model_state = model.state_dict()

    # Load the best model state after training
    model.load_state_dict(best_model_state)
    return model

# Train and validate the model
best_model = train_and_evaluate(data_train.to(device), data_val.to(device), model, optimizer)

# Evaluate on the test set
def evaluate_test_set(best_model, data_test):
    best_model.eval()
    with torch.no_grad():
        # Use the test mask to evaluate performance
        test_out = best_model(data_test)
        test_metrics = compute_metrics(test_out[data_test.test_mask], data_test.y[data_test.test_mask])
        
        # Print test metrics
        print("Test Set Evaluation:")
        print(f'Accuracy: {test_metrics["accuracy"]}')
        print(f'F1 Score: {test_metrics["f1"]}')
        print(f'Precision: {test_metrics["precision"]}')
        print(f'Recall: {test_metrics["recall"]}')
        print(f'ROC AUC: {test_metrics["roc_auc"]}')

# Call the function to evaluate the model on the test set
evaluate_test_set(best_model, data_test.to(device))


Epoch 1, Loss: 0.6985903382301331, Val Loss: 0.6697142124176025, Val AUC: 0.5897166970770233
Epoch 2, Loss: 0.6804229021072388, Val Loss: 0.697489857673645, Val AUC: 0.5954144968439303
Epoch 3, Loss: 0.6788742542266846, Val Loss: 0.7110375761985779, Val AUC: 0.619621479125519
Epoch 4, Loss: 0.6650028228759766, Val Loss: 0.7126448154449463, Val AUC: 0.6317489646945312
Epoch 5, Loss: 0.6597253084182739, Val Loss: 0.7038187384605408, Val AUC: 0.6388463150528877
Epoch 6, Loss: 0.6493579745292664, Val Loss: 0.6892430782318115, Val AUC: 0.6475338466971725
Epoch 7, Loss: 0.641473114490509, Val Loss: 0.6709657311439514, Val AUC: 0.6548558789409076
Epoch 8, Loss: 0.6381662487983704, Val Loss: 0.6509443521499634, Val AUC: 0.6583313580532913
Epoch 9, Loss: 0.6301102638244629, Val Loss: 0.6307270526885986, Val AUC: 0.6665229550925804
Epoch 10, Loss: 0.6218758225440979, Val Loss: 0.6121864914894104, Val AUC: 0.6731768936417676
Epoch 11, Loss: 0.6195500493049622, Val Loss: 0.5972059369087219, Val AU

In [13]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

data_train.y = torch.tensor(undersampled_train['is_fraud'].values, dtype=torch.long)
data_val.y = torch.tensor(validation['is_fraud'].values, dtype=torch.long)
data_test.y = torch.tensor(test['is_fraud'].values, dtype=torch.long)

# Creating masks for all nodes in train and validation sets
data_train.train_mask = torch.tensor([True] * len(undersampled_train), dtype=torch.bool)
data_val.val_mask = torch.tensor([True] * data_val.num_nodes, dtype=torch.bool) 
data_test.test_mask = torch.tensor([True] * data_test.num_nodes, dtype=torch.bool)

import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Define the GCN model architecture
class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 32)  
        self.conv2 = GCNConv(32, 64)  
        self.conv3 = GCNConv(64, num_classes) 

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.conv2(x, edge_index))  # Additional layer with ReLU
        x = F.dropout(x, training=self.training)
        x = self.conv3(x, edge_index)  # Output layer

        return F.log_softmax(x, dim=1)

# Initialize the model and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(num_node_features=data_train.num_node_features, num_classes=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)

# Function to calculate metrics
def compute_metrics(output, labels, zero_division=1):
    _, predictions = torch.max(output, dim=1)
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()
    
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, zero_division=zero_division),
        "precision": precision_score(labels, predictions, zero_division=zero_division),
        "recall": recall_score(labels, predictions, zero_division=zero_division),
        "roc_auc": roc_auc_score(labels, predictions)
    }

labels = undersampled_train['is_fraud'].values
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float, device=device)


# Adjust the training and evaluation functions to utilize the masks
def train_and_evaluate(data_train, data_val, model, optimizer, epochs=200):
    best_val_auc = 0
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data_train)
        # Use the train mask to calculate the loss
        loss = F.nll_loss(out[data_train.train_mask], data_train.y[data_train.train_mask], weight=class_weights)
        loss.backward()
        optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            # Use the val mask to evaluate performance
            val_out = model(data_val)
            val_loss = F.nll_loss(val_out[data_val.val_mask], data_val.y[data_val.val_mask])
            val_metrics = compute_metrics(val_out[data_val.val_mask], data_val.y[data_val.val_mask])
            
            # Print validation metrics
            print(f'Epoch {epoch+1}, Loss: {loss.item()}, Val Loss: {val_loss.item()}, Val AUC: {val_metrics["roc_auc"]}')
            
            # Save the best model state based on validation AUC
            if val_metrics['roc_auc'] > best_val_auc:
                best_val_auc = val_metrics['roc_auc']
                best_model_state = model.state_dict()

    # Load the best model state after training
    model.load_state_dict(best_model_state)
    return model

# Train and validate the model
best_model = train_and_evaluate(data_train.to(device), data_val.to(device), model, optimizer)

# Evaluate on the test set
def evaluate_test_set(best_model, data_test):
    best_model.eval()
    with torch.no_grad():
        # Use the test mask to evaluate performance
        test_out = best_model(data_test)
        test_metrics = compute_metrics(test_out[data_test.test_mask], data_test.y[data_test.test_mask])
        
        # Print test metrics
        print("Test Set Evaluation:")
        print(f'Accuracy: {test_metrics["accuracy"]}')
        print(f'F1 Score: {test_metrics["f1"]}')
        print(f'Precision: {test_metrics["precision"]}')
        print(f'Recall: {test_metrics["recall"]}')
        print(f'ROC AUC: {test_metrics["roc_auc"]}')

# Call the function to evaluate the model on the test set
evaluate_test_set(best_model, data_test.to(device))


Epoch 1, Loss: 0.7339551448822021, Val Loss: 0.8597514629364014, Val AUC: 0.5051275869202827
Epoch 2, Loss: 0.6940368413925171, Val Loss: 0.9365063309669495, Val AUC: 0.5001319506970101
Epoch 3, Loss: 0.6934372186660767, Val Loss: 0.9105685353279114, Val AUC: 0.5002561395883138
Epoch 4, Loss: 0.6831748485565186, Val Loss: 0.8512023687362671, Val AUC: 0.5029814476424411
Epoch 5, Loss: 0.6720594167709351, Val Loss: 0.7895699739456177, Val AUC: 0.5326597193032525
Epoch 6, Loss: 0.6660507321357727, Val Loss: 0.7371712327003479, Val AUC: 0.5874377115092055
Epoch 7, Loss: 0.655001699924469, Val Loss: 0.6981844305992126, Val AUC: 0.6321887568154625
Epoch 8, Loss: 0.6523035168647766, Val Loss: 0.6700224876403809, Val AUC: 0.6431571211880579
Epoch 9, Loss: 0.6442570686340332, Val Loss: 0.651323139667511, Val AUC: 0.6584497815648755
Epoch 10, Loss: 0.6388890147209167, Val Loss: 0.6412959098815918, Val AUC: 0.6626838465777601
Epoch 11, Loss: 0.6288988590240479, Val Loss: 0.638067364692688, Val AU