## 1. Load the Data and Model

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Read the raw dataset from disk.
dataset_df = pd.read_csv("../data/generic_changed_dataset.csv")

# Integer-encode the "type" column: every distinct type string maps to one int.
le = LabelEncoder()
type_encoded = le.fit_transform(dataset_df["type"])

# Number of distinct types discovered by the encoder.
num_types = le.classes_.shape[0]
print(num_types)

9


In [2]:
# Reuse the embeddings cached on disk as .npy files instead of recomputing
# them from dataset_df.
X_distil_bert = np.load("distil_bert_embeddings.npy")
X_sbert = np.load("sbert_embeddings.npy")

# Report the shape of each embedding matrix (DistilBERT first, then SBERT).
for cached_embeddings in (X_distil_bert, X_sbert):
    print(cached_embeddings.shape)

(2000, 768)
(2000, 384)


In [3]:
def _split_train_val_test(features):
    """Split one embedding matrix, plus both label sets, into train/val/test.

    The first split keeps 40% for training (test_size=0.6); the held-out 60%
    is then halved into validation and test. Both splits use random_state=42,
    so every embedding matrix is partitioned along the same rows.

    Returns:
        tuple: (X_train, X_val, X_test,
                y1_train, y1_val, y1_test,
                y2_train, y2_val, y2_test)
    """
    X_train, X_rest, y1_train, y1_rest, y2_train, y2_rest = train_test_split(
        features, dataset_df["labels"], type_encoded, test_size=0.6, random_state=42
    )
    X_val, X_test, y1_val, y1_test, y2_val, y2_test = train_test_split(
        X_rest, y1_rest, y2_rest, test_size=0.5, random_state=42
    )
    return (
        X_train, X_val, X_test,
        y1_train, y1_val, y1_test,
        y2_train, y2_val, y2_test,
    )


(
    X_BERT_train, X_BERT_val, X_BERT_test,
    y1_BERT_train, y1_BERT_val, y1_BERT_test,
    y2_BERT_train, y2_BERT_val, y2_BERT_test,
) = _split_train_val_test(X_distil_bert)

(
    X_SBERT_train, X_SBERT_val, X_SBERT_test,
    y1_SBERT_train, y1_SBERT_val, y1_SBERT_test,
    y2_SBERT_train, y2_SBERT_val, y2_SBERT_test,
) = _split_train_val_test(X_sbert)


# Put the shapes into a table for easy comparison
_shape_table = {
    "Embedding Type": ["DistilBERT", "SBERT"],
    "Train Shape": [X_BERT_train.shape, X_SBERT_train.shape],
    "Val Shape": [X_BERT_val.shape, X_SBERT_val.shape],
    "Test Shape": [X_BERT_test.shape, X_SBERT_test.shape],
}
pd.DataFrame(_shape_table)

Unnamed: 0,Embedding Type,Train Shape,Val Shape,Test Shape
0,DistilBERT,"(800, 768)","(600, 768)","(600, 768)"
1,SBERT,"(800, 384)","(600, 384)","(600, 384)"


In [4]:
# Key layout shared by every embedding entry: features first, then the binary
# labels (y1), then the encoded types (y2), each as train/val/test.
_SPLIT_KEYS = (
    "X_train", "X_val", "X_test",
    "y1_train", "y1_val", "y1_test",
    "y2_train", "y2_val", "y2_test",
)

_bert_splits = (
    X_BERT_train, X_BERT_val, X_BERT_test,
    y1_BERT_train, y1_BERT_val, y1_BERT_test,
    y2_BERT_train, y2_BERT_val, y2_BERT_test,
)
_sbert_splits = (
    X_SBERT_train, X_SBERT_val, X_SBERT_test,
    y1_SBERT_train, y1_SBERT_val, y1_SBERT_test,
    y2_SBERT_train, y2_SBERT_val, y2_SBERT_test,
)

# Lookup table: embedding name -> {split key -> array/series}.
embedding_data = {
    "DistilBERT": dict(zip(_SPLIT_KEYS, _bert_splits)),
    "SBERT": dict(zip(_SPLIT_KEYS, _sbert_splits)),
}

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
import scipy


class MultiTaskDataset(Dataset):
    """Dataset yielding (features, binary label, type id) triples.

    Parameters:
        embedding (array-like): Feature matrix, stored as float32.
        labels (array-like or pandas object): Binary targets, stored as float32.
        types (array-like or pandas object): Integer-encoded type targets,
            stored as long.

    Inputs exposing a ``.values`` attribute (e.g. a pandas Series) are
    unwrapped through it before conversion.
    """

    def __init__(self, embedding, labels, types):
        self.X = torch.tensor(embedding, dtype=torch.float32)

        # The binary task uses float targets.
        label_array = getattr(labels, "values", labels)
        self.labels = torch.tensor(label_array, dtype=torch.float32)

        # The multi-class task uses integer (long) targets.
        type_array = getattr(types, "values", types)
        self.types = torch.tensor(type_array, dtype=torch.long)

    def __len__(self):
        """Number of samples (rows of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the (features, label, type) triple stored at ``idx``."""
        features = self.X[idx]
        label = self.labels[idx]
        type_id = self.types[idx]
        return features, label, type_id

In [6]:
class MultiTaskNet(nn.Module):
    """Shared trunk with two linear heads.

    Parameters:
        feature_dimension (int): Width of the input feature vectors.
        num_types (int): Number of classes predicted by the type head.

    ``forward`` returns raw logits from both heads; no sigmoid or softmax is
    applied inside the network.
    """

    def __init__(self, feature_dimension, num_types):
        super().__init__()
        hidden_width = 512

        # Trunk shared by both tasks; dropout acts as regularization.
        trunk_layers = [
            nn.Linear(feature_dimension, hidden_width),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]
        self.shared = nn.Sequential(*trunk_layers)

        # Binary head: a single logit per sample.
        self.label_head = nn.Linear(hidden_width, 1)

        # Multi-class head: one logit per type.
        self.type_head = nn.Linear(hidden_width, num_types)

    def forward(self, x):
        """Return ``(label_logits, type_logits)`` for the batch ``x``."""
        hidden = self.shared(x)
        # Raw logits only: the losses used with this network
        # (BCEWithLogitsLoss / CrossEntropyLoss) expect un-normalized scores.
        return self.label_head(hidden), self.type_head(hidden)

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary
from torch.utils.data import DataLoader


class MutltiTaskModel:
    """Training / evaluation wrapper around ``MultiTaskNet``.

    Bundles the network, an Adam optimizer, the two task losses (binary label
    and multi-class type) and a shuffled training ``DataLoader``.

    Parameters:
        X_train (array-like): Training features; ``X_train.shape[1]`` sets the
            network input width.
        y1_train (array-like): Binary labels for the training rows.
        y2_train (array-like): Integer-encoded types for the training rows.
        num_types (int): Number of classes for the type head.
        batch_size (int): Mini-batch size of the training loader.
        lr (float): Learning rate passed to Adam.
    """

    def __init__(self, X_train, y1_train, y2_train, num_types, batch_size=32, lr=1e-3):
        self.model = MultiTaskNet(X_train.shape[1], num_types)

        # Determine device (cuda, mps, or cpu)
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            self.device = torch.device("mps")
        else:
            self.device = torch.device("cpu")
        self.model.to(self.device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion_label = nn.BCEWithLogitsLoss()
        self.criterion_type = nn.CrossEntropyLoss()
        self.batch_size = batch_size
        self.input_dim = X_train.shape[1]

        # Create DataLoader for training
        self.train_dataset = MultiTaskDataset(X_train, y1_train, y2_train)
        self.train_loader = DataLoader(
            self.train_dataset, batch_size=batch_size, shuffle=True
        )

        # Lists to store losses for plotting
        self.train_label_losses = []
        self.train_type_losses = []

        # Snapshot of the weights from the epoch with the lowest training loss
        self.best_model_weights = None

    def summary(self):
        """
        Prints the model summary.

        The model is moved to the CPU for the summary and moved back to
        ``self.device`` afterwards, even if the summary call raises.
        """
        self.model.to("cpu")
        try:
            # torchsummary expects the per-sample input shape (without the
            # batch dimension) and defaults to device="cuda"; since the model
            # was just moved to the CPU, request the CPU explicitly.
            summary(
                self.model,
                (self.input_dim,),
                batch_size=self.batch_size,
                device="cpu",
            )
        finally:
            self.model.to(self.device)

    def train(self, num_epochs):
        """
        Trains the model for ``num_epochs`` epochs on the training loader.

        The optimized loss is the sum of the binary-label loss and the type
        loss. Per-epoch average losses are appended to
        ``self.train_label_losses`` / ``self.train_type_losses``, and a copy
        of the weights from the epoch with the lowest average total training
        loss (within this call) is kept in ``self.best_model_weights``.

        Parameters:
            num_epochs (int): Number of passes over the training data.
        """
        best_loss = float("inf")

        for epoch in range(num_epochs):
            self.model.train()
            total_loss_label = 0.0
            total_loss_type = 0.0
            num_batches = 0

            for inputs, labels, types in self.train_loader:
                inputs, labels, types = (
                    inputs.to(self.device),
                    labels.to(self.device),
                    types.to(self.device),
                )

                self.optimizer.zero_grad()
                label_logits, type_logits = self.model(inputs)

                # Flatten the label logits so they line up with the 1-D targets.
                loss_label = self.criterion_label(label_logits.view(-1), labels)
                loss_type = self.criterion_type(type_logits, types)

                loss = loss_label + loss_type
                loss.backward()
                self.optimizer.step()

                total_loss_label += loss_label.item()
                total_loss_type += loss_type.item()
                num_batches += 1

            # Calculate average loss for the epoch
            avg_loss_label = total_loss_label / num_batches
            avg_loss_type = total_loss_type / num_batches
            total_loss = avg_loss_label + avg_loss_type

            # Store losses for plotting
            self.train_label_losses.append(avg_loss_label)
            self.train_type_losses.append(avg_loss_type)

            if epoch % 10 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}")
                print(f"Average Loss (Label): {avg_loss_label:.4f}")
                print(f"Average Loss (Type): {avg_loss_type:.4f}")
                print(f"Total Loss: {total_loss:.4f}\n")

            # Save best model weights. state_dict() returns tensors that share
            # storage with the live parameters, so each one is cloned;
            # otherwise the "best" snapshot would keep changing with every
            # later optimizer step.
            if total_loss < best_loss:
                best_loss = total_loss
                self.best_model_weights = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }

        print("Training complete.")

    def plot_losses(self):
        """
        Plots the losses for label and type predictions during training.
        """
        plt.figure(figsize=(10, 5))
        plt.plot(self.train_label_losses, label="Label Loss")
        plt.plot(self.train_type_losses, label="Type Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Training Losses")
        plt.legend()
        plt.show()

    def predict(self, X):
        """
        Makes predictions for the given inputs X.

        Parameters:
            X (numpy array or tensor): Input features.

        Returns:
            tuple: (predicted_label, predicted_type) where predicted_label is the binary prediction (0 or 1),
                   shaped like the label head output (one column per sample), and predicted_type is the
                   predicted class index for the multi-class task.
        """
        self.model.eval()
        with torch.no_grad():
            # as_tensor accepts both numpy arrays and tensors without the
            # copy-construct warning torch.tensor emits for tensor inputs.
            inputs = torch.as_tensor(X, dtype=torch.float32).to(self.device)
            label_logits, type_logits = self.model(inputs)
            label_pred = (
                (torch.sigmoid(label_logits) >= 0.5).cpu().numpy().astype(int)
            )  # Convert to 0 or 1 instead of floating point from sigmoid
            type_pred = torch.argmax(type_logits, dim=1).cpu().numpy()

        return label_pred, type_pred

    def evaluate(self, X, y1, y2):
        """
        Evaluates the model on the provided dataset and computes various performance metrics.

        Parameters:
            X (array-like): Input features.
            y1 (array-like): True labels for binary classification.
            y2 (array-like): True labels for multi-class classification.

        Returns:
            tuple: Two 5-tuples, the first for the binary label task and the
            second for the multi-class type task (type metrics are
            weighted averages over classes). Each contains:
                - accuracy (float)
                - precision (float)
                - recall (float)
                - f1 (float)
                - conf_matrix (array-like): Confusion matrix.
        """
        y1_pred, y2_pred = self.predict(X)

        # Binary classification metrics
        accuracy_label = accuracy_score(y1, y1_pred)
        precision_label = precision_score(y1, y1_pred, zero_division=0)
        recall_label = recall_score(y1, y1_pred, zero_division=0)
        f1_label = f1_score(y1, y1_pred, zero_division=0)
        conf_matrix_label = confusion_matrix(y1, y1_pred)

        # Multi-class classification metrics
        accuracy_type = accuracy_score(y2, y2_pred)
        precision_type = precision_score(
            y2, y2_pred, average="weighted", zero_division=0
        )
        recall_type = recall_score(y2, y2_pred, average="weighted", zero_division=0)
        f1_type = f1_score(y2, y2_pred, average="weighted", zero_division=0)
        conf_matrix_type = confusion_matrix(y2, y2_pred)

        return (
            accuracy_label,
            precision_label,
            recall_label,
            f1_label,
            conf_matrix_label,
        ), (accuracy_type, precision_type, recall_type, f1_type, conf_matrix_type)

    def plot_confusion_matrix(self, X, y1, y2, labels):
        """
        Plots the confusion matrix for the model's predictions.

        The binary-task matrix is drawn on the left and the multi-class
        matrix on the right.

        Parameters:
            X (array-like): Input features.
            y1 (array-like): True labels for binary classification.
            y2 (array-like): True labels for multi-class classification.
            labels (list): List of label names to be used in the multi-class plot axes.
        """
        y1_pred, y2_pred = self.predict(X)

        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        sns.heatmap(
            confusion_matrix(y1, y1_pred),
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=["Not Scam", "Scam"],
            yticklabels=["Not Scam", "Scam"],
        )
        plt.xlabel("Predicted Labels")
        plt.ylabel("True Labels")
        plt.title("Binary Classification")

        plt.subplot(1, 2, 2)
        sns.heatmap(
            confusion_matrix(y2, y2_pred),
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=labels,
            yticklabels=labels,
        )
        plt.xlabel("Predicted Labels")
        plt.ylabel("True Labels")
        plt.title("Multi-Class Classification")

        plt.tight_layout()
        plt.show()

# Tune the hyperparameters of the model to get the best performance.

In [8]:
# Column layout of the comparison table: three identifying columns followed by
# accuracy / precision / recall / F1 for the binary task (y1) and the type
# task (y2).
_result_columns = ["Algorithm", "Embedding", "Type"]
for _task in ("y1", "y2"):
    for _metric in ("Accuracy", "Precision", "Recall", "F1_Score"):
        _result_columns.append(f"{_task}_{_metric}")

# Empty frame; rows are appended by later cells.
result = pd.DataFrame(columns=_result_columns)

In [13]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

# Store all results
all_combinations_results = []

param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"],
}

# Every (C, kernel, gamma) combination, in the same order nested loops over
# param_grid would visit them.
# NOTE(review): gamma is a kernel coefficient for non-linear kernels, so the
# two gamma values presumably give duplicate fits when kernel="linear" -
# confirm before pruning them.
param_combinations = [
    (C, kernel, gamma)
    for C in param_grid["C"]
    for kernel in param_grid["kernel"]
    for gamma in param_grid["gamma"]
]

# NOTE(review): the combinations are compared on the *test* split; the
# validation split in embedding_data is not used here. Consider selecting
# hyperparameters on X_val and reporting the test score once.
sbert_data = embedding_data["SBERT"]

for C, kernel, gamma in param_combinations:
    params = {"C": C, "kernel": kernel, "gamma": gamma}

    # Create SVM model with current parameters
    base_svm = SVC(**params)
    model = OneVsRestClassifier(base_svm)

    current_result = {
        "Algorithm": "SVM",
        "Embedding": "SBERT",
        "Type": "Test Set",
        "C": C,
        "kernel": kernel,
        "gamma": gamma,
    }

    # Fit and score the same estimator on each target: y1 (binary label), then y2 (type).
    for i in range(2):
        task = f"y{i+1}"
        print(f"\nTesting SVM for y{i+1} with parameters: C={C}, kernel={kernel}, gamma={gamma}")

        # Train model
        model.fit(sbert_data["X_train"], sbert_data[f"{task}_train"])

        # Make predictions
        test_pred = model.predict(sbert_data["X_test"])

        # Calculate metrics. zero_division=0 yields the same numbers as the
        # default ("warn" also scores undefined metrics as 0) but without
        # flooding the cell output with UndefinedMetricWarning messages.
        test_report = classification_report(
            sbert_data[f"{task}_test"],
            test_pred,
            output_dict=True,
            zero_division=0,
        )

        # Store metrics for current task
        current_result[f"{task}_Accuracy"] = test_report['accuracy']
        current_result[f"{task}_F1_Score"] = test_report['weighted avg']['f1-score']

    all_combinations_results.append(current_result)

# Convert all results to DataFrame
final_results = pd.DataFrame(all_combinations_results)

# Display all results sorted by y1_F1_Score
print("\nAll Results sorted by y1_F1_Score:")
print(final_results.sort_values('y1_F1_Score', ascending=False))

print("\nAll Results sorted by y2_F1_Score:")
print(final_results.sort_values('y2_F1_Score', ascending=False))

# Save all results to CSV
final_results.to_csv("svm_all_results.csv", index=False)

# Display summary statistics
print("\nSummary Statistics:")
print(final_results[['y1_Accuracy', 'y1_F1_Score', 'y2_Accuracy', 'y2_F1_Score']].describe())


Testing SVM for y1 with parameters: C=0.1, kernel=linear, gamma=scale

Testing SVM for y2 with parameters: C=0.1, kernel=linear, gamma=scale

Testing SVM for y1 with parameters: C=0.1, kernel=linear, gamma=auto

Testing SVM for y2 with parameters: C=0.1, kernel=linear, gamma=auto

Testing SVM for y1 with parameters: C=0.1, kernel=rbf, gamma=scale

Testing SVM for y2 with parameters: C=0.1, kernel=rbf, gamma=scale

Testing SVM for y1 with parameters: C=0.1, kernel=rbf, gamma=auto


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Testing SVM for y2 with parameters: C=0.1, kernel=rbf, gamma=auto


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Testing SVM for y1 with parameters: C=1, kernel=linear, gamma=scale

Testing SVM for y2 with parameters: C=1, kernel=linear, gamma=scale

Testing SVM for y1 with parameters: C=1, kernel=linear, gamma=auto

Testing SVM for y2 with parameters: C=1, kernel=linear, gamma=auto

Testing SVM for y1 with parameters: C=1, kernel=rbf, gamma=scale

Testing SVM for y2 with parameters: C=1, kernel=rbf, gamma=scale

Testing SVM for y1 with parameters: C=1, kernel=rbf, gamma=auto


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Testing SVM for y2 with parameters: C=1, kernel=rbf, gamma=auto


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Testing SVM for y1 with parameters: C=10, kernel=linear, gamma=scale

Testing SVM for y2 with parameters: C=10, kernel=linear, gamma=scale

Testing SVM for y1 with parameters: C=10, kernel=linear, gamma=auto

Testing SVM for y2 with parameters: C=10, kernel=linear, gamma=auto

Testing SVM for y1 with parameters: C=10, kernel=rbf, gamma=scale

Testing SVM for y2 with parameters: C=10, kernel=rbf, gamma=scale

Testing SVM for y1 with parameters: C=10, kernel=rbf, gamma=auto

Testing SVM for y2 with parameters: C=10, kernel=rbf, gamma=auto

All Results sorted by y1_F1_Score:
   Algorithm Embedding      Type     C  kernel  gamma  y1_Accuracy  \
10       SVM     SBERT  Test Set  10.0     rbf  scale     0.985000   
6        SVM     SBERT  Test Set   1.0     rbf  scale     0.983333   
8        SVM     SBERT  Test Set  10.0  linear  scale     0.976667   
9        SVM     SBERT  Test Set  10.0  linear   auto     0.976667   
4        SVM     SBERT  Test Set   1.0  linear  scale     0.966667   


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# Build one (not yet trained) multi-task model per embedding, keyed by
# embedding name. Nothing is fitted in this cell; the message below is only
# printed while the models are being constructed.
multi_task_models = {}

for embedding, data in embedding_data.items():
    print(f"Training Multi-Task Model for {embedding}...")

    # Densify the training features when they arrive as a scipy sparse matrix.
    X_train = data["X_train"]
    if scipy.sparse.issparse(X_train):
        X_train = X_train.toarray()

    multi_task_model = MutltiTaskModel(
        X_train,
        data["y1_train"],
        data["y2_train"],
        num_types=len(le.classes_),
    )

    # Register the model under its embedding name.
    multi_task_models[embedding] = multi_task_model

Training Multi-Task Model for DistilBERT...
Training Multi-Task Model for SBERT...


In [11]:
from ray import tune


def train_model(config, embedding):
    """Tune trainable: train one multi-task model and report validation F1.

    Parameters:
        config (dict): Hyperparameters; reads "batch_size", "lr" and
            "num_epochs".
        embedding (str): Key into ``embedding_data`` selecting which
            embedding's splits to use.

    After every epoch the model is evaluated on the validation split and the
    unweighted mean of the label F1 and the type F1 is reported to Tune under
    the key "combined_f1".
    """
    splits = embedding_data[embedding]

    def _densify(matrix):
        # scipy sparse matrices are converted to dense arrays; anything else
        # is passed through untouched.
        return matrix.toarray() if scipy.sparse.issparse(matrix) else matrix

    X_train = _densify(splits["X_train"])
    X_val = _densify(splits["X_val"])

    # Build the model from the sampled hyperparameters.
    model = MutltiTaskModel(
        X_train,
        splits["y1_train"],
        splits["y2_train"],
        num_types=len(le.classes_),
        batch_size=config["batch_size"],
        lr=config["lr"],
    )

    for _ in range(config["num_epochs"]):
        # One epoch per iteration so a metric is reported after every epoch.
        model.train(num_epochs=1)

        label_metrics, type_metrics = model.evaluate(
            X_val, splits["y1_val"], splits["y2_val"]
        )
        # Each metrics tuple is (accuracy, precision, recall, f1, conf_matrix).
        _, _, _, f1_label, _ = label_metrics
        _, _, _, f1_type, _ = type_metrics

        # Equal-weight combination of both tasks' F1 scores.
        tune.report({"combined_f1": (f1_label + f1_type) / 2})


# Define the hyperparameter search space.
config = {
    "lr": tune.loguniform(1e-5, 1e-2),
    "batch_size": tune.choice([16, 32, 64]),
    "num_epochs": tune.choice([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]),
}

# Run the tuning experiment for each embedding, retrain with the best
# configuration, and score the retrained model on the test split.
# Rows are collected in a list and concatenated once after the loop instead of
# growing the DataFrame on every iteration.
mtl_rows = []

for embedding in embedding_data.keys():
    analysis = tune.run(
        lambda config: train_model(config, embedding),
        config=config,
        metric="combined_f1",
        mode="max",
        num_samples=10,  # Increase this number for a broader search
    )

    # Look the best configuration up once and reuse it below.
    best_config = analysis.get_best_config(metric="combined_f1", mode="max")

    print(f"\nBest hyperparameters for {embedding}:")
    print(best_config)

    # Retrain a fresh model on the training split with the best hyperparameters
    splits = embedding_data[embedding]
    model = MutltiTaskModel(
        splits["X_train"],
        splits["y1_train"],
        splits["y2_train"],
        num_types=len(le.classes_),
        batch_size=best_config["batch_size"],
        lr=best_config["lr"],
    )

    model.train(num_epochs=best_config["num_epochs"])

    mtl_result = model.evaluate(
        splits["X_test"],
        splits["y1_test"],
        splits["y2_test"],
    )

    print(f"\nEvaluation results for {embedding}: {mtl_result}")

    # mtl_result is ((acc, prec, rec, f1, conf_matrix) for y1, same for y2);
    # the confusion matrices are not stored in the results table.
    label_metrics, type_metrics = mtl_result
    mtl_rows.append(
        {
            "Algorithm": "Multi-Task Learning",
            "Embedding": embedding,
            "Type": "Test Set",
            "y1_Accuracy": label_metrics[0],
            "y1_Precision": label_metrics[1],
            "y1_Recall": label_metrics[2],
            "y1_F1_Score": label_metrics[3],
            "y2_Accuracy": type_metrics[0],
            "y2_Precision": type_metrics[1],
            "y2_Recall": type_metrics[2],
            "y2_F1_Score": type_metrics[3],
        }
    )

# Append all new rows to the results DataFrame in one step
result = pd.concat([result, pd.DataFrame(mtl_rows)], ignore_index=True)

2025-03-10 20:15:33,550	INFO worker.py:1841 -- Started a local Ray instance.
2025-03-10 20:15:34,076	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2025-03-10 20:15:34,078	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
2025-03-10 20:15:34,104	INFO tensorboardx.py:193 -- pip install "ray[tune]" to see TensorBoard files.


0,1
Current time:,2025-03-10 20:16:02
Running for:,00:00:28.69
Memory:,10.4/16.0 GiB

Trial name,status,loc,batch_size,lr,num_epochs,iter,total time (s),combined_f1
lambda_5e0dd_00000,TERMINATED,127.0.0.1:49351,16,0.00218759,90,90,24.696,0.968847
lambda_5e0dd_00001,TERMINATED,127.0.0.1:49358,32,0.000786052,40,40,9.50231,0.963778
lambda_5e0dd_00002,TERMINATED,127.0.0.1:49355,32,0.00119674,60,60,12.2965,0.966345
lambda_5e0dd_00003,TERMINATED,127.0.0.1:49356,64,1.10158e-05,80,80,10.5472,0.609781
lambda_5e0dd_00004,TERMINATED,127.0.0.1:49352,32,0.00119505,30,30,7.4678,0.966367
lambda_5e0dd_00005,TERMINATED,127.0.0.1:49357,16,0.00168293,10,10,5.01506,0.942076
lambda_5e0dd_00006,TERMINATED,127.0.0.1:49353,32,0.000610464,40,40,9.59968,0.965788
lambda_5e0dd_00007,TERMINATED,127.0.0.1:49354,32,0.00112484,100,100,17.3383,0.970567
lambda_5e0dd_00008,TERMINATED,127.0.0.1:49386,32,7.63098e-05,40,40,5.65983,0.855997
lambda_5e0dd_00009,TERMINATED,127.0.0.1:49400,32,0.000271322,100,100,12.3451,0.96828


[36m(<lambda> pid=49354)[0m Epoch 1/1
[36m(<lambda> pid=49354)[0m Average Loss (Label): 0.6043
[36m(<lambda> pid=49354)[0m Average Loss (Type): 1.5488
[36m(<lambda> pid=49354)[0m Total Loss: 2.1532
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49354)[0m Training complete.


Trial name,combined_f1
lambda_5e0dd_00000,0.968847
lambda_5e0dd_00001,0.963778
lambda_5e0dd_00002,0.966345
lambda_5e0dd_00003,0.609781
lambda_5e0dd_00004,0.966367
lambda_5e0dd_00005,0.942076
lambda_5e0dd_00006,0.965788
lambda_5e0dd_00007,0.970567
lambda_5e0dd_00008,0.855997
lambda_5e0dd_00009,0.96828


[36m(<lambda> pid=49351)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49353)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49352)[0m 
[36m(<lambda> pid=49358)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49357)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49353)[0m 
[36m(<lambda> pid=49352)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49358)[0m 
[36m(<lambda> pid=49351)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49353)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49352)[0m 
[36m(<lambda> pid=49358)[0m 
[36m(<lambda> pid=49352)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49358)[0m 
[36m(<lambda> pid=49357)[0m 
[36m(<lambda> pid=49353)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49351)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<l

[36m(<lambda> pid=49353)[0m *** SIGSEGV received at time=1741608947 ***
[36m(<lambda> pid=49353)[0m PC: @        0x112bbb8c0  (unknown)  ray::rpc::GcsRpcClient::AddTaskEventData()
[36m(<lambda> pid=49353)[0m     @        0x113587e54  (unknown)  absl::lts_20230802::AbslFailureSignalHandler()
[36m(<lambda> pid=49353)[0m     @        0x192966de4  (unknown)  _sigtramp
[36m(<lambda> pid=49353)[0m     @        0x112bbb784  (unknown)  ray::gcs::TaskInfoAccessor::AsyncAddTaskEventData()
[36m(<lambda> pid=49353)[0m     @        0x112bbb784  (unknown)  ray::gcs::TaskInfoAccessor::AsyncAddTaskEventData()
[36m(<lambda> pid=49353)[0m     @        0x112af5894  (unknown)  ray::core::worker::TaskEventBufferImpl::FlushEvents()
[36m(<lambda> pid=49353)[0m     @        0x1129be7cc  (unknown)  ray::core::CoreWorker::Disconnect()
[36m(<lambda> pid=49353)[0m     @        0x1129bfd08  (unknown)  ray::core::CoreWorker::ForceExit()
[36m(<lambda> pid=49353)[0m     @        0x1129e1070  (unkn

[36m(<lambda> pid=49351)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49351)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49386)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49386)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49351)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49386)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49386)[0m 
[36m(<lambda> pid=49351)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<lambda> pid=49354)[0m 
[36m(<lambda> pid=49356)[0m 
[36m(<lambda> pid=49355)[0m 
[36m(<l

2025-03-10 20:16:02,802	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/choonkeatling/ray_results/lambda_2025-03-10_20-15-34' in 0.0046s.
2025-03-10 20:16:02,805	INFO tune.py:1041 -- Total run time: 28.73 seconds (28.69 seconds for the tuning loop).



Best hyperparameters for DistilBERT:
{'lr': 0.001124840994193108, 'batch_size': 32, 'num_epochs': 100}
[36m(<lambda> pid=49351)[0m 
Epoch 1/100
Average Loss (Label): 0.6298
Average Loss (Type): 1.5635
Total Loss: 2.1933

Epoch 11/100
Average Loss (Label): 0.0813
Average Loss (Type): 0.2384
Total Loss: 0.3197

Epoch 21/100
Average Loss (Label): 0.0302
Average Loss (Type): 0.0760
Total Loss: 0.1062

Epoch 31/100
Average Loss (Label): 0.0219
Average Loss (Type): 0.0352
Total Loss: 0.0571

Epoch 41/100
Average Loss (Label): 0.0101
Average Loss (Type): 0.0173
Total Loss: 0.0274

Epoch 51/100
Average Loss (Label): 0.0078
Average Loss (Type): 0.0108
Total Loss: 0.0186

Epoch 61/100
Average Loss (Label): 0.0047
Average Loss (Type): 0.0064
Total Loss: 0.0111

Epoch 71/100
Average Loss (Label): 0.0031
Average Loss (Type): 0.0046
Total Loss: 0.0077

Epoch 81/100
Average Loss (Label): 0.0021
Average Loss (Type): 0.0025
Total Loss: 0.0046

Epoch 91/100
Average Loss (Label): 0.0015
Average Loss (

2025-03-10 20:16:11,363	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


Training complete.

Evaluation results for DistilBERT: ((0.9683333333333334, 0.9838187702265372, 0.9559748427672956, 0.9696969696969697, array([[277,   5],
       [ 14, 304]])), (0.955, 0.9555052857868447, 0.955, 0.9548564125362895, array([[ 35,   1,   0,   1,   0,   0,   0,   0,   0],
       [  2,  18,   0,   0,   1,   0,   0,   0,   0],
       [  0,   0,  47,   2,   0,   0,   0,   0,   0],
       [  0,   0,   0, 279,   1,   0,   2,   0,   0],
       [  0,   0,   0,   1,  28,   0,   0,   1,   0],
       [  0,   0,   0,   2,   0,  22,   0,   0,   0],
       [  1,   0,   0,   4,   0,   1,  86,   2,   0],
       [  0,   0,   0,   0,   0,   0,   2,  15,   0],
       [  0,   0,   0,   2,   0,   0,   1,   0,  43]])))


0,1
Current time:,2025-03-10 20:16:47
Running for:,00:00:35.87
Memory:,10.5/16.0 GiB

Trial name,status,loc,batch_size,lr,num_epochs,iter,total time (s),combined_f1
lambda_74464_00000,TERMINATED,127.0.0.1:49465,64,4.08643e-05,100,100,12.3619,0.783776
lambda_74464_00001,TERMINATED,127.0.0.1:49464,16,7.38242e-05,50,50,16.652,0.918784
lambda_74464_00002,TERMINATED,127.0.0.1:49466,64,0.000527343,60,60,7.96054,0.966951
lambda_74464_00003,TERMINATED,127.0.0.1:49469,32,1.1372e-05,60,60,12.757,0.60866
lambda_74464_00004,TERMINATED,127.0.0.1:49467,32,0.000102706,30,30,7.21534,0.821306
lambda_74464_00005,TERMINATED,127.0.0.1:49470,32,0.000129673,60,60,12.1114,0.943172
lambda_74464_00006,TERMINATED,127.0.0.1:49468,32,0.00889349,70,70,14.0936,0.972013
lambda_74464_00007,TERMINATED,127.0.0.1:49471,32,3.15883e-05,100,100,16.9876,0.827388
lambda_74464_00008,TERMINATED,127.0.0.1:49500,16,0.00266899,100,100,19.6698,0.970511
lambda_74464_00009,TERMINATED,127.0.0.1:49506,16,0.000425288,70,70,13.9727,0.968575


[36m(<lambda> pid=49351)[0m Epoch 1/1[32m [repeated 30x across cluster][0m
[36m(<lambda> pid=49351)[0m Average Loss (Label): 0.0002[32m [repeated 30x across cluster][0m
[36m(<lambda> pid=49351)[0m Average Loss (Type): 0.0013[32m [repeated 30x across cluster][0m
[36m(<lambda> pid=49351)[0m Total Loss: 0.0016[32m [repeated 30x across cluster][0m
[36m(<lambda> pid=49351)[0m Training complete.[32m [repeated 30x across cluster][0m


Trial name,combined_f1
lambda_74464_00000,0.783776
lambda_74464_00001,0.918784
lambda_74464_00002,0.966951
lambda_74464_00003,0.60866
lambda_74464_00004,0.821306
lambda_74464_00005,0.943172
lambda_74464_00006,0.972013
lambda_74464_00007,0.827388
lambda_74464_00008,0.970511
lambda_74464_00009,0.968575


[36m(<lambda> pid=49465)[0m 
[36m(<lambda> pid=49465)[0m Epoch 1/1
[36m(<lambda> pid=49465)[0m Average Loss (Label): 0.6940
[36m(<lambda> pid=49465)[0m Average Loss (Type): 2.2073
[36m(<lambda> pid=49465)[0m Total Loss: 2.9013
[36m(<lambda> pid=49465)[0m Training complete.
[36m(<lambda> pid=49468)[0m Epoch 1/1
[36m(<lambda> pid=49468)[0m Average Loss (Label): 0.3144
[36m(<lambda> pid=49468)[0m Average Loss (Type): 1.0088
[36m(<lambda> pid=49468)[0m Total Loss: 1.3232
[36m(<lambda> pid=49468)[0m 
[36m(<lambda> pid=49468)[0m Training complete.
[36m(<lambda> pid=49470)[0m 
[36m(<lambda> pid=49467)[0m 
[36m(<lambda> pid=49469)[0m 
[36m(<lambda> pid=49466)[0m 
[36m(<lambda> pid=49465)[0m 
[36m(<lambda> pid=49466)[0m 
[36m(<lambda> pid=49464)[0m 
[36m(<lambda> pid=49465)[0m 
[36m(<lambda> pid=49471)[0m 
[36m(<lambda> pid=49468)[0m 
[36m(<lambda> pid=49470)[0m 
[36m(<lambda> pid=49467)[0m 
[36m(<lambda> pid=49469)[0m 
[36m(<lambda> pid=49466)

2025-03-10 20:16:47,253	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/choonkeatling/ray_results/lambda_2025-03-10_20-16-11' in 0.0047s.
2025-03-10 20:16:47,256	INFO tune.py:1041 -- Total run time: 35.89 seconds (35.86 seconds for the tuning loop).



Best hyperparameters for SBERT:
{'lr': 0.00889349277524112, 'batch_size': 32, 'num_epochs': 70}
[36m(<lambda> pid=49500)[0m 
Epoch 1/70
Average Loss (Label): 0.3390
Average Loss (Type): 1.0752
Total Loss: 1.4142

Epoch 11/70
Average Loss (Label): 0.0011
Average Loss (Type): 0.0027
Total Loss: 0.0038

Epoch 21/70
Average Loss (Label): 0.0003
Average Loss (Type): 0.0009
Total Loss: 0.0012

Epoch 31/70
Average Loss (Label): 0.0002
Average Loss (Type): 0.0004
Total Loss: 0.0005

Epoch 41/70
Average Loss (Label): 0.0001
Average Loss (Type): 0.0002
Total Loss: 0.0003

Epoch 51/70
Average Loss (Label): 0.0001
Average Loss (Type): 0.0002
Total Loss: 0.0002

Epoch 61/70
Average Loss (Label): 0.0000
Average Loss (Type): 0.0001
Total Loss: 0.0001

Training complete.

Evaluation results for SBERT: ((0.985, 0.9904761904761905, 0.9811320754716981, 0.985781990521327, array([[279,   3],
       [  6, 312]])), (0.9766666666666667, 0.9773654440980499, 0.9766666666666667, 0.9763848540364887, array([[ 3

In [12]:
# Preview the first rows of `result` (rendered as the cell's rich output).
result.head()

Unnamed: 0,Algorithm,Embedding,Type,y1_Accuracy,y1_Precision,y1_Recall,y1_F1_Score,y2_Accuracy,y2_Precision,y2_Recall,y2_F1_Score
0,SVM,SBERT,Test Set,0.985,0.99361,0.977987,0.985737,0.98,0.98034,0.98,0.979739
1,Multi-Task Learning,DistilBERT,Test Set,0.968333,0.983819,0.955975,0.969697,0.955,0.955505,0.955,0.954856
2,Multi-Task Learning,SBERT,Test Set,0.985,0.990476,0.981132,0.985782,0.976667,0.977365,0.976667,0.976385


In [13]:
# Load the previously saved results and keep only the test-set rows for
# SVM + SBERT and Multi-Task Learning + {SBERT, DistilBERT}.
# All conditions are combined into one boolean mask and applied in a single
# `.loc` selection.
original_result = pd.read_csv("results.csv").loc[
    lambda df: df["Type"].eq("Test Set")
    & df["Algorithm"].isin(["SVM", "Multi-Task Learning"])
    & df["Embedding"].isin(["SBERT", "DistilBERT"])
    # Remove SVM with DistilBERT
    & ~(df["Algorithm"].eq("SVM") & df["Embedding"].eq("DistilBERT"))
]

original_result.head()

Unnamed: 0,Algorithm,Embedding,Type,y1_Accuracy,y1_Precision,y1_Recall,y1_F1_Score,y2_Accuracy,y2_Precision,y2_Recall,y2_F1_Score
19,SVM,SBERT,Test Set,0.9833,0.9904,0.978,0.9842,0.9717,0.9725,0.9717,0.9713
58,Multi-Task Learning,DistilBERT,Test Set,0.975,0.9749,0.978,0.9765,0.9517,0.9541,0.9517,0.9522
59,Multi-Task Learning,SBERT,Test Set,0.9783,0.9811,0.978,0.9795,0.9783,0.9791,0.9783,0.9781


In [14]:
# Show the column dtypes of both frames (baseline first, then the new run).
print(original_result.dtypes, result.dtypes, sep="\n")

# Replace both indexes with a fresh 0..n-1 range, discarding the old labels.
original_result, result = (
    original_result.reset_index(drop=True),
    result.reset_index(drop=True),
)

Algorithm        object
Embedding        object
Type             object
y1_Accuracy     float64
y1_Precision    float64
y1_Recall       float64
y1_F1_Score     float64
y2_Accuracy     float64
y2_Precision    float64
y2_Recall       float64
y2_F1_Score     float64
dtype: object
Algorithm        object
Embedding        object
Type             object
y1_Accuracy     float64
y1_Precision    float64
y1_Recall       float64
y1_F1_Score     float64
y2_Accuracy     float64
y2_Precision    float64
y2_Recall       float64
y2_F1_Score     float64
dtype: object


In [19]:
# Calculate percentage differences between the new run (`result`) and the
# baseline (`original_result`) for every metric, per (Algorithm, Embedding).

# List of metrics to compare
metrics = [
    "y1_Accuracy",
    "y1_Precision",
    "y1_Recall",
    "y1_F1_Score",
    "y2_Accuracy",
    "y2_Precision",
    "y2_Recall",
    "y2_F1_Score",
]

# Pair rows on their identifying columns rather than on positional index:
# positional pairing silently compares the wrong models if either frame is
# re-ordered or gains/loses a row. `how="left"` keeps the row order of `result`
# and leaves NaN (rendered as "N/A" below) where the baseline has no match;
# `validate="one_to_one"` raises if a key is duplicated on either side.
key_cols = ["Algorithm", "Embedding"]
paired = result[key_cols + metrics].merge(
    original_result[key_cols + metrics],
    on=key_cols,
    how="left",
    suffixes=("", "_orig"),
    validate="one_to_one",
)

diff_df = paired[key_cols].copy()

for metric in metrics:
    baseline = paired[f"{metric}_orig"]
    # Relative change versus the baseline, in percent, rounded to 2 decimals.
    diff_df[f"{metric}_diff"] = ((paired[metric] - baseline) / baseline * 100).round(2)

# Add '%' symbol and format the output
for col in diff_df.columns:
    if col.endswith("_diff"):
        diff_df[col] = diff_df[col].apply(
            lambda x: f"{x:+.2f}%" if not pd.isna(x) else "N/A"
        )


diff_df.to_csv("diff_results.csv", index=False)

diff_df.head()

Unnamed: 0,Algorithm,Embedding,y1_Accuracy_diff,y1_Precision_diff,y1_Recall_diff,y1_F1_Score_diff,y2_Accuracy_diff,y2_Precision_diff,y2_Recall_diff,y2_F1_Score_diff
0,SVM,SBERT,+0.17%,+0.32%,-0.00%,+0.16%,+0.85%,+0.81%,+0.85%,+0.87%
1,Multi-Task Learning,DistilBERT,-0.68%,+0.91%,-2.25%,-0.70%,+0.35%,+0.15%,+0.35%,+0.28%
2,Multi-Task Learning,SBERT,+0.68%,+0.96%,+0.32%,+0.64%,-0.17%,-0.18%,-0.17%,-0.18%
