In [1]:
# Login to Weights & Biases for experiment tracking
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mqtra0027[0m ([33mailecs-lab-students[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# Initialize a new Weights & Biases run for experiment tracking
run = wandb.init(
    project='Using BERT to classify illicit content on online marketplace (multiclass classification)', 
    job_type="training", 
    resume="allow"
)

In [71]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    default_data_collator,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model

In [4]:
# Configuration settings
SEED = 500
FILE_PATH = "DUTA10K_final.jsonl"
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 128
TEST_SET_SIZE = 0.1
VALIDATION_SET_SIZE = 0.1

In [5]:
# Set Random Seeds
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [6]:
# Load the dataset from a JSONL file
try:
    df = pd.read_json("DUTA10K_final.jsonl", lines=True)
except FileNotFoundError:
    print(f"Error: The file {FILE_PATH} was not found. Please check the path.")
    exit()
except ValueError as e:
    print(f"Error reading JSONL file: {e}. Ensure it's a valid JSONL format.")
    exit()

print(f"Loaded {len(df)} records.")

df = df.dropna(subset=['text', 'category'])
print(f"Using {len(df)} records after dropping NA from text/category.")

# Encode Labels (Multi-class)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category']) # Use 'label' as the standard column name for Trainer
num_labels = len(label_encoder.classes_)
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}

print(f"Number of unique classes: {num_labels}")

Loaded 4178 records.
Using 4178 records after dropping NA from text/category.
Number of unique classes: 40


In [7]:
# Initialize BERT Tokenizer 
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

# Split Data into Train, Validation, and Test DataFrames 
# Stratify by the new integer 'label' column
train_val_df, test_df = train_test_split(
    df,
    test_size=TEST_SET_SIZE,
    random_state=SEED,
    # stratify=df['label'] # Stratify based on the encoded labels
)

train_df, eval_df = train_test_split(
    train_val_df,
    test_size=VALIDATION_SET_SIZE / (1 - TEST_SET_SIZE),
    random_state=SEED,
    # stratify=train_val_df['label'] # Stratify based on the encoded labels
)

# Reset indices of the split DataFrames
train_df = train_df.reset_index(drop=True)
eval_df = eval_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(eval_df)}")
print(f"Test samples: {len(test_df)}")

Training samples: 3342
Validation samples: 418
Test samples: 418


In [8]:
# Convert DataFrames to Hugging Face Dataset objects 
# Ensure 'label' column (the encoded one) is used
train_dataset_hf = Dataset.from_pandas(train_df[['text', 'label']])
eval_dataset_hf = Dataset.from_pandas(eval_df[['text', 'label']])
test_dataset_hf = Dataset.from_pandas(test_df[['text', 'label']])

# Tokenize Datasets
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=MAX_LEN)

tokenized_train_dataset = train_dataset_hf.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset_hf.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset_hf.map(tokenize_function, batched=True)

tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(["text"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"])

tokenized_train_dataset.set_format("torch")
tokenized_eval_dataset.set_format("torch")
tokenized_test_dataset.set_format("torch")

Map:   0%|          | 0/3342 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

In [46]:
# Create model using the custom implementation
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels, # Set to the number of unique categories
    id2label=id2label,     # Pass mapping
    label2id=label2id,     # Pass mapping
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
# Identify explicit target modules for LoRA adaptation within the BERT model's encoder
explicit_target_modules = []
for name, module in model.bert.named_modules():
    if isinstance(module, nn.Linear):
        if 'pooler.dense' not in name:
            explicit_target_modules.append(name)

# Remove duplicates that might arise if a module is listed multiple times
explicit_target_modules = sorted(list(set(explicit_target_modules)))

if not explicit_target_modules:
    print("Warning: No nn.Linear target modules found in model.bert. Falling back to generic list.")
    # Fallback to a generic list if the dynamic search fails (less precise)
    explicit_target_modules = ["query", "key", "value", "dense"]
else:
    print(f"Refined LoRA target modules for BERT (full names within model.bert): {explicit_target_modules}")

Refined LoRA target modules for BERT (full names within model.bert): ['encoder.layer.0.attention.output.dense', 'encoder.layer.0.attention.self.key', 'encoder.layer.0.attention.self.query', 'encoder.layer.0.attention.self.value', 'encoder.layer.0.intermediate.dense', 'encoder.layer.0.output.dense', 'encoder.layer.1.attention.output.dense', 'encoder.layer.1.attention.self.key', 'encoder.layer.1.attention.self.query', 'encoder.layer.1.attention.self.value', 'encoder.layer.1.intermediate.dense', 'encoder.layer.1.output.dense', 'encoder.layer.10.attention.output.dense', 'encoder.layer.10.attention.self.key', 'encoder.layer.10.attention.self.query', 'encoder.layer.10.attention.self.value', 'encoder.layer.10.intermediate.dense', 'encoder.layer.10.output.dense', 'encoder.layer.11.attention.output.dense', 'encoder.layer.11.attention.self.key', 'encoder.layer.11.attention.self.query', 'encoder.layer.11.attention.self.value', 'encoder.layer.11.intermediate.dense', 'encoder.layer.11.output.dense'

In [74]:
# Configure LoRA (Low-Rank Adaptation)
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=explicit_target_modules, # Use the refined list of full module names
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print("Trainable parameters after LoRA adaptation:")
model.print_trainable_parameters()

# Enable gradient checkpointing to save memory during training
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()

Trainable parameters after LoRA adaptation:
trainable params: 5,339,176 || all params: 114,852,176 || trainable%: 4.6487




In [75]:
# Custom Trainer with Weighted Loss 
class WeightedLossTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): # Added num_items_in_batch to signature
        # num_items_in_batch is now an accepted argument from the Trainer call, can be ignored.
        # Also, pop it from the inputs dictionary if it's there as a key, to prevent passing to model's forward.
        inputs.pop("num_items_in_batch", None)

        labels = inputs.pop("labels")
        
        # Prepare inputs for the model, ensuring only expected keys are passed
        # These are the standard arguments for BERT-like models.
        model_input_args = {
            "input_ids": inputs.get("input_ids"),
            "attention_mask": inputs.get("attention_mask"),
        }
        # Add token_type_ids if it exists in inputs and is not None (BERT uses it)
        if "token_type_ids" in inputs and inputs.get("token_type_ids") is not None:
            model_input_args["token_type_ids"] = inputs["token_type_ids"]
        
        # Filter out any keys with None values before passing to model
        model_input_args = {k: v for k, v in model_input_args.items() if v is not None}

        outputs = model(**model_input_args)
        logits = outputs.get("logits")
        
        # Move class_weights to the same device as logits
        weights = self.class_weights.to(logits.device) if self.class_weights is not None else None
        loss_fct = CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [76]:
# Define Metrics Computation Function (Multi-class)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # For multi-class, use 'weighted' or 'macro' average for precision, recall, f1
    # 'weighted' accounts for label imbalance.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1, # Explicitly name it
        'precision': precision,
        'recall': recall
    }

In [79]:
# Update training arguments
training_args = TrainingArguments(
    output_dir="bert_multiclass_v1",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    num_train_epochs=8,
    warmup_ratio=0.1,
    weight_decay=0.01,
    eval_strategy="epoch",
    # eval_steps=100,
    save_strategy="epoch",
    # save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to=["wandb"],
    remove_unused_columns=True,
    label_names=["labels"],
)

# Create trainer with minimal configuration
trainer = WeightedLossTrainer( # Use the custom trainer
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    # callbacks=[early_stop],
    class_weights=class_weights # Pass the computed class weights
)

  super().__init__(*args, **kwargs)


In [80]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.698274,0.521531,0.432743,0.377232,0.521531
2,11.939400,1.596963,0.564593,0.494131,0.506601,0.564593
3,11.034500,1.530189,0.578947,0.515968,0.519949,0.578947
4,10.402600,1.503819,0.593301,0.534353,0.518898,0.593301
5,9.906800,1.496084,0.590909,0.526658,0.507992,0.590909
6,9.452000,1.476042,0.62201,0.57137,0.573222,0.62201
7,9.358200,1.457472,0.617225,0.56555,0.560102,0.617225


TrainOutput(global_step=3336, training_loss=10.239503416797811, metrics={'train_runtime': 2912.7684, 'train_samples_per_second': 9.179, 'train_steps_per_second': 1.145, 'total_flos': 1864531098525696.0, 'train_loss': 10.239503416797811, 'epoch': 7.981448234590066})

In [81]:
# Evaluate the Model on the Test Set 
print("\nEvaluating the fine-tuned LoRA model on the TEST set...")
test_predictions_output = trainer.predict(tokenized_test_dataset)
test_metrics = compute_metrics((test_predictions_output.predictions, test_predictions_output.label_ids))

print("\n=== Test Set Evaluation Results (LoRA Multi-class) ===")
for key, value in test_metrics.items():
    print(f"  {key}: {value:.4f}")

print("\n=== Detailed Classification Report on Test Set (LoRA Multi-class) ===")
y_test_preds = np.argmax(test_predictions_output.predictions, axis=-1)
y_test_true = test_predictions_output.label_ids

# Get unique labels present in the test set predictions and true labels
present_labels = np.unique(np.concatenate((y_test_true, y_test_preds)))
# Filter target_names to only include names for labels present in the test set
# Ensure they are sorted according to present_labels for correct mapping
target_names_for_report = [id2label[label_idx] for label_idx in sorted(present_labels)]
# The 'labels' parameter in classification_report should be the sorted unique labels that correspond to the order of target_names_for_report.
labels_for_report = sorted(present_labels)

print(f"Number of unique labels in test set results: {len(labels_for_report)}")
print(f"Number of target names for report: {len(target_names_for_report)}")


print(classification_report(y_test_true, y_test_preds, labels=labels_for_report, target_names=target_names_for_report, digits=4, zero_division=0))


Evaluating the fine-tuned LoRA model on the TEST set...



=== Test Set Evaluation Results (LoRA Multi-class) ===
  accuracy: 0.6794
  f1: 0.6254
  precision: 0.5975
  recall: 0.6794

=== Detailed Classification Report on Test Set (LoRA Multi-class) ===
Number of unique labels in test set results: 31
Number of target names for report: 31
                                              precision    recall  f1-score   support

                                   Art_Music     0.0000    0.0000    0.0000         1
                             Casino_Gambling     0.0000    0.0000    0.0000         2
                    Counterfeit Credit-Cards     0.6800    1.0000    0.8095        17
                           Counterfeit Money     1.0000    0.5000    0.6667         4
Counterfeit Personal-Identification_Passport     1.0000    0.8000    0.8889         5
                              Cryptocurrency     0.8689    0.9464    0.9060        56
                                Cryptolocker     1.0000    0.9583    0.9787        24
                             

In [82]:
# Finish the Weights & Biases run
wandb.finish()
model.config.use_cache = True

0,1
eval/accuracy,▁▁▂▂▂▂▂▂▂▁▁▃▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▆▇▇▇▇▇▇▇▇████
eval/f1,▁▁▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▅▆▆▆▇▇▆▇▇█▇██
eval/loss,█▇▇▆▆▆▆▆▆█▇▇▆▆▆▆▆▆█████▇▇▇▇▃▃▂▂▂▂▂▁▁▁▁▁▁
eval/precision,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▅▆▆▆▆▆▆▇▇▇▇██
eval/recall,▁▁▂▂▂▂▂▂▂▁▁▃▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▆▇▇▇▇▇▇▇▇████
eval/runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█████████████
eval/samples_per_second,███████████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/steps_per_second,▅▄▃▄▃▄▄▂▄▅▄██▇▇█▅▅▅▄▆▇▆█▃▄▆▃▃▃▅▂▁▇▅▆▆▆▃▆
test/accuracy,▁▁▂█
test/f1,▁▁▂█

0,1
eval/accuracy,0.61722
eval/f1,0.56555
eval/loss,1.45747
eval/precision,0.5601
eval/recall,0.61722
eval/runtime,11.5659
eval/samples_per_second,36.141
eval/steps_per_second,36.141
test/accuracy,0.67943
test/f1,0.62542


In [83]:
# Save trained model and tokenizer
trainer.save_model("bert_multiclass_v1")
tokenizer.save_pretrained("bert_multiclass_v1")

('bert_multiclass_v1/tokenizer_config.json',
 'bert_multiclass_v1/special_tokens_map.json',
 'bert_multiclass_v1/vocab.txt',
 'bert_multiclass_v1/added_tokens.json')