## 1. Load the Data 📥

In [48]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Load the dataset CSV (path is relative to the notebook's working directory).
dataset_df = pd.read_csv("../data/generic_changed_dataset.csv")

# Integer-encode the "type" column: every distinct type string maps to one int.
le = LabelEncoder()
type_encoded = le.fit_transform(dataset_df["type"])

# The number of fitted classes is the output width of the multi-class head.
num_types = le.classes_.size
print(num_types)

9


In [49]:
# Reuse the embeddings that were previously saved to disk instead of
# recomputing them from dataset_df; print each matrix shape as a sanity check.
X_distil_bert = np.load("distil_bert_embeddings.npy")
print(np.shape(X_distil_bert))

X_sbert = np.load("sbert_embeddings.npy")
print(np.shape(X_sbert))

(2000, 768)
(2000, 384)


In [50]:
def _split_train_val_test(features, labels, types, seed=42):
    """Split features and both targets into train / validation / test parts.

    The first split holds out 60% of the rows; the second split cuts that
    hold-out in half, giving validation and test sets. Both splits use the
    same ``random_state``.

    Returns:
        tuple: (X_train, X_val, X_test,
                y1_train, y1_val, y1_test,
                y2_train, y2_val, y2_test)
    """
    X_train, X_rest, y1_train, y1_rest, y2_train, y2_rest = train_test_split(
        features, labels, types, test_size=0.6, random_state=seed
    )
    X_val, X_test, y1_val, y1_test, y2_val, y2_test = train_test_split(
        X_rest, y1_rest, y2_rest, test_size=0.5, random_state=seed
    )
    return (
        X_train, X_val, X_test,
        y1_train, y1_val, y1_test,
        y2_train, y2_val, y2_test,
    )


# DistilBERT embeddings
(
    X_BERT_train, X_BERT_val, X_BERT_test,
    y1_BERT_train, y1_BERT_val, y1_BERT_test,
    y2_BERT_train, y2_BERT_val, y2_BERT_test,
) = _split_train_val_test(X_distil_bert, dataset_df["labels"], type_encoded)

# SBERT embeddings (same seed and sizes as above)
(
    X_SBERT_train, X_SBERT_val, X_SBERT_test,
    y1_SBERT_train, y1_SBERT_val, y1_SBERT_test,
    y2_SBERT_train, y2_SBERT_val, y2_SBERT_test,
) = _split_train_val_test(X_sbert, dataset_df["labels"], type_encoded)


# Put the shapes into a table for easy comparison
pd.DataFrame(
    {
        "Embedding Type": ["DistilBERT", "SBERT"],
        "Train Shape": [X_BERT_train.shape, X_SBERT_train.shape],
        "Val Shape": [X_BERT_val.shape, X_SBERT_val.shape],
        "Test Shape": [X_BERT_test.shape, X_SBERT_test.shape],
    }
)

Unnamed: 0,Embedding Type,Train Shape,Val Shape,Test Shape
0,DistilBERT,"(800, 768)","(600, 768)","(600, 768)"
1,SBERT,"(800, 384)","(600, 384)","(600, 384)"


In [51]:
# Registry of every split, keyed by embedding name, so later cells can loop
# over the embeddings instead of repeating code per variable name.
# y1_* are the binary "labels" targets, y2_* the integer-encoded "type" targets.
embedding_data = {
    "DistilBERT": dict(
        X_train=X_BERT_train,
        X_val=X_BERT_val,
        X_test=X_BERT_test,
        y1_train=y1_BERT_train,
        y1_val=y1_BERT_val,
        y1_test=y1_BERT_test,
        y2_train=y2_BERT_train,
        y2_val=y2_BERT_val,
        y2_test=y2_BERT_test,
    ),
    "SBERT": dict(
        X_train=X_SBERT_train,
        X_val=X_SBERT_val,
        X_test=X_SBERT_test,
        y1_train=y1_SBERT_train,
        y1_val=y1_SBERT_val,
        y1_test=y1_SBERT_test,
        y2_train=y2_SBERT_train,
        y2_val=y2_SBERT_val,
        y2_test=y2_SBERT_test,
    ),
}

In [52]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
import scipy


class MultiTaskDataset(Dataset):
    """Torch dataset yielding ``(embedding, binary_label, type_id)`` triples.

    Parameters:
        embedding (array-like): Feature matrix, one row per sample.
        labels (array-like): Binary targets (pandas Series, numpy array,
            list or CPU tensor). Stored as float32, the dtype expected by
            ``BCEWithLogitsLoss``.
        types (array-like): Integer-encoded class targets. Stored as int64,
            the dtype expected by ``CrossEntropyLoss``.

    Raises:
        ValueError: If the three inputs do not have the same number of rows.
    """

    def __init__(self, embedding, labels, types):
        self.X = torch.tensor(embedding, dtype=torch.float32)
        # np.asarray handles Series, ndarrays, lists and CPU tensors alike.
        # Probing for a ``.values`` attribute is unsafe because tensors and
        # dicts expose ``.values`` as a *method*, not as the underlying data.
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.float32)
        self.types = torch.tensor(np.asarray(types), dtype=torch.long)

        # __len__ is driven by X alone, so a mismatch would otherwise either
        # be silently truncated or surface later as an IndexError mid-epoch.
        if not (len(self.X) == len(self.labels) == len(self.types)):
            raise ValueError(
                "embedding, labels and types must have the same length, got "
                f"{len(self.X)}, {len(self.labels)} and {len(self.types)}"
            )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.labels[idx], self.types[idx]

In [53]:
class MultiTaskNet(nn.Module):
    """Shared-trunk network with a binary head and a multi-class head.

    Parameters:
        feature_dimension (int): Size of each input embedding vector.
        num_types (int): Number of classes for the type head.
        hidden_dim (int): Width of the shared hidden layer (default 512).
        dropout (float): Dropout probability applied after the shared layer
            (default 0.2).
    """

    def __init__(self, feature_dimension, num_types, hidden_dim=512, dropout=0.2):
        super().__init__()
        # Shared layers: both tasks are predicted from this representation.
        self.shared = nn.Sequential(
            nn.Linear(feature_dimension, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),  # Regularization for preventing overfitting
        )
        # Head for binary scam label prediction: a single raw logit.
        self.label_head = nn.Linear(hidden_dim, 1)

        # Head for multi-class scam type prediction: one logit per class.
        self.type_head = nn.Linear(hidden_dim, num_types)

    def forward(self, x):
        """Return ``(label_logits, type_logits)`` for a batch ``x``.

        Both outputs are raw logits: no sigmoid/softmax is applied here
        because the losses used for training (BCEWithLogitsLoss and
        CrossEntropyLoss) expect logits.
        """
        shared_rep = self.shared(x)
        label_logits = self.label_head(shared_rep)
        type_logits = self.type_head(shared_rep)
        return label_logits, type_logits

In [54]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary
from torch.utils.data import DataLoader


class MutltiTaskModel:
    """Training / inference wrapper around ``MultiTaskNet``.

    Owns the network, optimizer, the two loss functions and the training
    DataLoader, and offers helpers for training, prediction, metric
    computation and plotting.

    NOTE: the class name is misspelled ("Mutlti"); it is kept as-is because
    other cells refer to it by this name.

    Parameters:
        X_train (array-like): Dense training features, shape (n_samples, n_features).
        y1_train (array-like): Binary targets for the label head.
        y2_train (array-like): Integer-encoded targets for the type head.
        num_types (int): Number of classes of the type head.
        batch_size (int): Mini-batch size of the training DataLoader.
        lr (float): Adam learning rate.
    """

    def __init__(self, X_train, y1_train, y2_train, num_types, batch_size=32, lr=1e-3):
        self.model = MultiTaskNet(X_train.shape[1], num_types)

        # Determine device (cuda, mps, or cpu). ``torch.backends.mps`` does not
        # exist on older torch builds, so look it up defensively.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif mps_backend is not None and mps_backend.is_available():
            self.device = torch.device("mps")
        else:
            self.device = torch.device("cpu")
        self.model.to(self.device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion_label = nn.BCEWithLogitsLoss()
        self.criterion_type = nn.CrossEntropyLoss()
        self.batch_size = batch_size
        self.input_dim = X_train.shape[1]

        # Create DataLoader for training
        self.train_dataset = MultiTaskDataset(X_train, y1_train, y2_train)
        self.train_loader = DataLoader(
            self.train_dataset, batch_size=batch_size, shuffle=True
        )

        # Per-epoch average losses, kept for plotting
        self.train_label_losses = []
        self.train_type_losses = []

        # Best (lowest) summed training loss seen so far and a snapshot of the
        # weights that produced it. Kept on the instance so that repeated
        # ``train()`` calls keep tracking the overall best epoch.
        self.best_loss = float("inf")
        self.best_model_weights = None

    def summary(self):
        """
        Prints the model summary.

        torchsummary defaults to ``device="cuda"``; the model is moved to the
        CPU for the summary, so the device is passed explicitly. The batch
        size goes through ``batch_size`` rather than ``input_size``, which
        describes a single sample.
        """
        self.model.to("cpu")
        try:
            summary(
                self.model,
                (self.input_dim,),
                batch_size=self.batch_size,
                device="cpu",
            )
        finally:
            # Always put the model back where training/prediction expects it.
            self.model.to(self.device)

    def train(self, num_epochs):
        """
        Trains the model for ``num_epochs`` epochs on the training DataLoader.

        The optimized loss is the unweighted sum of the binary (label) loss
        and the multi-class (type) loss. Per-epoch average losses are appended
        to ``train_label_losses`` / ``train_type_losses``. Whenever an epoch
        reaches a new lowest summed average loss, a copy of the weights is
        stored in ``best_model_weights``.

        Parameters:
            num_epochs (int): Number of passes over the training data.
        """
        for epoch in range(num_epochs):
            self.model.train()
            total_loss_label = 0.0
            total_loss_type = 0.0
            num_batches = 0

            for inputs, labels, types in self.train_loader:
                inputs, labels, types = (
                    inputs.to(self.device),
                    labels.to(self.device),
                    types.to(self.device),
                )

                self.optimizer.zero_grad()
                label_logits, type_logits = self.model(inputs)

                # The label head outputs (batch, 1); flatten to match labels.
                loss_label = self.criterion_label(label_logits.view(-1), labels)
                loss_type = self.criterion_type(type_logits, types)

                loss = loss_label + loss_type
                loss.backward()
                self.optimizer.step()

                total_loss_label += loss_label.item()
                total_loss_type += loss_type.item()
                num_batches += 1

            # Calculate average loss for the epoch
            avg_loss_label = total_loss_label / num_batches
            avg_loss_type = total_loss_type / num_batches
            total_loss = avg_loss_label + avg_loss_type

            # Store losses for plotting
            self.train_label_losses.append(avg_loss_label)
            self.train_type_losses.append(avg_loss_type)

            if epoch % 10 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}")
                print(f"Average Loss (Label): {avg_loss_label:.4f}")
                print(f"Average Loss (Type): {avg_loss_type:.4f}")
                print(f"Total Loss: {total_loss:.4f}\n")

            # Save best model weights. state_dict() returns references to the
            # live parameter tensors, so they must be cloned; otherwise the
            # "best" snapshot is overwritten by every later optimizer step.
            if total_loss < self.best_loss:
                self.best_loss = total_loss
                self.best_model_weights = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }

        print("Training complete.")

    def plot_losses(self):
        """
        Plots the losses for label and type predictions during training.
        """
        plt.figure(figsize=(10, 5))
        plt.plot(self.train_label_losses, label="Label Loss")
        plt.plot(self.train_type_losses, label="Type Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Training Losses")
        plt.legend()
        plt.show()

    def predict(self, X):
        """
        Makes predictions for the given inputs X.

        Parameters:
            X (numpy array or tensor): Dense input features.

        Returns:
            tuple: (predicted_label, predicted_type) where predicted_label is
                   an int array of 0/1 values with shape (n_samples, 1)
                   (sigmoid output thresholded at 0.5) and predicted_type is
                   an array of shape (n_samples,) holding the arg-max class
                   index of the type head.
        """
        self.model.eval()
        with torch.no_grad():
            # as_tensor accepts both arrays and tensors without the
            # copy-construct warning torch.tensor(tensor) would emit.
            inputs = torch.as_tensor(X, dtype=torch.float32).to(self.device)
            label_logits, type_logits = self.model(inputs)
            label_pred = (
                (torch.sigmoid(label_logits) >= 0.5).cpu().numpy().astype(int)
            )  # Convert to 0 or 1 instead of floating point from sigmoid
            type_pred = torch.argmax(type_logits, dim=1).cpu().numpy()

        return label_pred, type_pred

    def evaluate(self, X, y1, y2):
        """
        Evaluates the model on the provided dataset and computes various performance metrics.

        Parameters:
            X (array-like): Input features.
            y1 (array-like): True labels for binary classification.
            y2 (array-like): True labels for multi-class classification.

        Returns:
            tuple: ``(label_metrics, type_metrics)``; each element is a
            5-tuple ``(accuracy, precision, recall, f1, conf_matrix)``.
            The type metrics use weighted averaging over classes;
            undefined precision/recall/F1 values are reported as 0.
        """
        y1_pred, y2_pred = self.predict(X)

        # Binary classification metrics
        accuracy_label = accuracy_score(y1, y1_pred)
        precision_label = precision_score(y1, y1_pred, zero_division=0)
        recall_label = recall_score(y1, y1_pred, zero_division=0)
        f1_label = f1_score(y1, y1_pred, zero_division=0)
        conf_matrix_label = confusion_matrix(y1, y1_pred)

        # Multi-class classification metrics
        accuracy_type = accuracy_score(y2, y2_pred)
        precision_type = precision_score(
            y2, y2_pred, average="weighted", zero_division=0
        )
        recall_type = recall_score(y2, y2_pred, average="weighted", zero_division=0)
        f1_type = f1_score(y2, y2_pred, average="weighted", zero_division=0)
        conf_matrix_type = confusion_matrix(y2, y2_pred)

        return (
            accuracy_label,
            precision_label,
            recall_label,
            f1_label,
            conf_matrix_label,
        ), (accuracy_type, precision_type, recall_type, f1_type, conf_matrix_type)

    def plot_confusion_matrix(self, X, y1, y2, labels):
        """
        Plots the confusion matrices for both tasks side by side.

        Parameters:
            X (array-like): Input features.
            y1 (array-like): True labels for binary classification.
            y2 (array-like): True labels for multi-class classification.
            labels (list): Tick labels for the multi-class matrix axes.
                NOTE(review): presumably ordered like the encoded class ids
                and covering every class present in ``y2`` / the predictions
                - confirm against the caller.
        """
        y1_pred, y2_pred = self.predict(X)

        _, (ax_label, ax_type) = plt.subplots(1, 2, figsize=(12, 5))

        sns.heatmap(
            confusion_matrix(y1, y1_pred),
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=["Not Scam", "Scam"],
            yticklabels=["Not Scam", "Scam"],
            ax=ax_label,
        )
        ax_label.set_xlabel("Predicted Labels")
        ax_label.set_ylabel("True Labels")
        ax_label.set_title("Binary Classification")

        sns.heatmap(
            confusion_matrix(y2, y2_pred),
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=labels,
            yticklabels=labels,
            ax=ax_type,
        )
        ax_type.set_xlabel("Predicted Labels")
        ax_type.set_ylabel("True Labels")
        ax_type.set_title("Multi-Class Classification")

        plt.tight_layout()
        plt.show()

In [55]:

# Build one model wrapper per embedding type. This cell only constructs the
# wrappers (network, optimizer, DataLoader); it does not call .train().
multi_task_models = {}

for embedding, data in embedding_data.items():
    print(f"Training Multi-Task Model for {embedding}...")

    # The network needs a dense matrix, so densify sparse features if needed.
    X_train = data["X_train"]
    if scipy.sparse.issparse(X_train):
        X_train = X_train.toarray()

    multi_task_model = MutltiTaskModel(
        X_train,
        data["y1_train"],
        data["y2_train"],
        num_types=len(le.classes_),
    )

    # Register the wrapper under its embedding name.
    multi_task_models[embedding] = multi_task_model

Training Multi-Task Model for DistilBERT...
Training Multi-Task Model for SBERT...


In [56]:
from ray import tune


def train_model(config, embedding):
    """Ray Tune trainable: train one model and report validation F1 per epoch.

    Parameters:
        config (dict): Sampled hyperparameters; reads the keys
            "batch_size", "lr" and "num_epochs".
        embedding (str): Key into ``embedding_data`` selecting which
            embedding's splits to train and validate on.

    After every epoch the mean of the binary-label F1 and the weighted
    type F1 on the validation split is reported to Tune as "combined_f1".
    """

    def _dense(matrix):
        # The model needs dense input; convert sparse matrices if needed.
        return matrix.toarray() if scipy.sparse.issparse(matrix) else matrix

    split = embedding_data[embedding]
    X_train = _dense(split["X_train"])
    X_val = _dense(split["X_val"])

    # Instantiate the model with hyperparameters from config.
    model = MutltiTaskModel(
        X_train,
        split["y1_train"],
        split["y2_train"],
        num_types=len(le.classes_),
        batch_size=config["batch_size"],
        lr=config["lr"],
    )

    for _ in range(config["num_epochs"]):
        # Train a single epoch so a metric can be reported after each one.
        model.train(num_epochs=1)

        # evaluate() returns (label_metrics, type_metrics); F1 is at index 3.
        label_metrics, type_metrics = model.evaluate(
            X_val, split["y1_val"], split["y2_val"]
        )
        f1_label = label_metrics[3]
        f1_type = type_metrics[3]

        # Equal-weight average of the two task scores. Adjust weights if necessary.
        combined_f1 = (f1_label + f1_type) / 2

        tune.report({"combined_f1": combined_f1})



# Define the hyperparameter search space.
config = {
    "lr": tune.loguniform(1e-5, 1e-2),
    "batch_size": tune.choice([16, 32, 64]),
    "num_epochs": tune.choice([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]),
}

# Run the tuning experiment for each embedding and keep every winner, so the
# best configuration per embedding stays available after the loop (the
# `analysis` variable itself is overwritten on each iteration).
best_configs = {}

for embedding in embedding_data.keys():
    analysis = tune.run(
        # Bind the embedding name explicitly instead of through a lambda that
        # reads the loop variable; trials are then named after train_model.
        tune.with_parameters(train_model, embedding=embedding),
        config=config,
        metric="combined_f1",
        mode="max",
        num_samples=10,  # Increase this number for a broader search
    )

    best_configs[embedding] = analysis.get_best_config(
        metric="combined_f1", mode="max"
    )

    print(f"\nBest hyperparameters for {embedding}:")
    print(best_configs[embedding])

2025-03-07 22:10:26,258	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2025-03-07 22:10:57
Running for:,00:00:31.50
Memory:,10.3/16.0 GiB

Trial name,status,loc,batch_size,lr,num_epochs,iter,total time (s),combined_f1
lambda_eae00_00000,TERMINATED,127.0.0.1:28531,32,0.00385852,40,40,10.1791,0.969099
lambda_eae00_00001,TERMINATED,127.0.0.1:28534,64,0.000198153,70,70,10.4035,0.945395
lambda_eae00_00002,TERMINATED,127.0.0.1:28533,16,0.000318458,80,80,22.2735,0.963178
lambda_eae00_00003,TERMINATED,127.0.0.1:28532,32,0.00105317,40,40,10.1131,0.96835
lambda_eae00_00004,TERMINATED,127.0.0.1:28535,32,0.00119516,20,20,6.14591,0.958058
lambda_eae00_00005,TERMINATED,127.0.0.1:28536,64,2.34847e-05,90,90,11.6371,0.740575
lambda_eae00_00006,TERMINATED,127.0.0.1:28537,64,0.00250959,60,60,9.03338,0.96975
lambda_eae00_00007,TERMINATED,127.0.0.1:28538,64,1.33953e-05,60,60,8.89883,0.617266
lambda_eae00_00008,TERMINATED,127.0.0.1:28579,64,0.000335611,60,60,4.69758,0.95468
lambda_eae00_00009,TERMINATED,127.0.0.1:28591,16,0.000254012,70,70,14.2119,0.966424


Trial name,combined_f1
lambda_eae00_00000,0.969099
lambda_eae00_00001,0.945395
lambda_eae00_00002,0.963178
lambda_eae00_00003,0.96835
lambda_eae00_00004,0.958058
lambda_eae00_00005,0.740575
lambda_eae00_00006,0.96975
lambda_eae00_00007,0.617266
lambda_eae00_00008,0.95468
lambda_eae00_00009,0.966424


2025-03-07 22:10:57,793	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/choonkeatling/ray_results/lambda_2025-03-07_22-10-26' in 0.0038s.
2025-03-07 22:10:57,796	INFO tune.py:1041 -- Total run time: 31.54 seconds (31.50 seconds for the tuning loop).
2025-03-07 22:10:57,812	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949



Best hyperparameters for DistilBERT:
{'lr': 0.0025095931971985054, 'batch_size': 64, 'num_epochs': 60}


0,1
Current time:,2025-03-07 22:11:32
Running for:,00:00:34.97
Memory:,10.1/16.0 GiB

Trial name,status,loc,batch_size,lr,num_epochs,iter,total time (s),combined_f1
lambda_fdaf1_00000,TERMINATED,127.0.0.1:28630,16,3.49922e-05,80,80,23.5591,0.881303
lambda_fdaf1_00001,TERMINATED,127.0.0.1:28627,32,0.000767218,50,50,11.3269,0.966105
lambda_fdaf1_00002,TERMINATED,127.0.0.1:28628,64,8.00534e-05,50,50,7.10104,0.789879
lambda_fdaf1_00003,TERMINATED,127.0.0.1:28629,16,0.00615873,10,10,5.33175,0.973078
lambda_fdaf1_00004,TERMINATED,127.0.0.1:28634,32,0.00225477,70,70,14.3072,0.97297
lambda_fdaf1_00005,TERMINATED,127.0.0.1:28633,16,0.000165867,30,30,12.5082,0.945111
lambda_fdaf1_00006,TERMINATED,127.0.0.1:28631,32,3.66624e-05,40,40,9.01927,0.703782
lambda_fdaf1_00007,TERMINATED,127.0.0.1:28632,16,0.00490586,80,80,23.7803,0.971829
lambda_fdaf1_00008,TERMINATED,127.0.0.1:28659,32,0.000827938,70,70,10.377,0.970229
lambda_fdaf1_00009,TERMINATED,127.0.0.1:28670,16,2.40694e-05,90,90,19.8695,0.820583


Trial name,combined_f1
lambda_fdaf1_00000,0.881303
lambda_fdaf1_00001,0.966105
lambda_fdaf1_00002,0.789879
lambda_fdaf1_00003,0.973078
lambda_fdaf1_00004,0.97297
lambda_fdaf1_00005,0.945111
lambda_fdaf1_00006,0.703782
lambda_fdaf1_00007,0.971829
lambda_fdaf1_00008,0.970229
lambda_fdaf1_00009,0.820583


2025-03-07 22:11:32,817	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/choonkeatling/ray_results/lambda_2025-03-07_22-10-57' in 0.0040s.
2025-03-07 22:11:32,820	INFO tune.py:1041 -- Total run time: 35.01 seconds (34.97 seconds for the tuning loop).



Best hyperparameters for SBERT:
{'lr': 0.006158726672413405, 'batch_size': 16, 'num_epochs': 10}


In [57]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Wrap SVC in a OneVsRestClassifier for multi-class problems
ovr_svc = OneVsRestClassifier(SVC())

# Parameter grid; the 'estimator__' prefix routes each parameter to the wrapped
# SVC. `gamma` has no effect on the linear kernel, so it is searched for the
# RBF kernel only; a single flat grid would fit every linear candidate twice.
param_grid = [
    {
        "estimator__kernel": ["linear"],
        "estimator__C": [0.1, 1, 10],
    },
    {
        "estimator__kernel": ["rbf"],
        "estimator__C": [0.1, 1, 10],
        "estimator__gamma": ["scale", "auto"],
    },
]

# Initialize GridSearchCV with the one-vs-rest classifier
grid_search = GridSearchCV(
    estimator=ovr_svc, param_grid=param_grid, cv=5, scoring="f1_weighted"
)

# Fit grid search on the SBERT training split, predicting the encoded type
grid_search.fit(embedding_data["SBERT"]["X_train"], embedding_data["SBERT"]["y2_train"])

# Get the best parameters and the corresponding mean cross-validated
# weighted-F1 score (the metric selected via `scoring` above)
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best F1 Score: {best_score}")

Best parameters: {'estimator__C': 10, 'estimator__gamma': 'scale', 'estimator__kernel': 'rbf'}
Best F1 Score: 0.9720999880689141
