In [1]:
# Log in to Weights & Biases (W&B) for experiment tracking.
# The value returned by wandb.login() is the last expression of the cell,
# so it is displayed as the cell output.
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mqtra0027[0m ([33mailecs-lab-students[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [21]:
# Open the Weights & Biases run that this notebook's training is tracked under.
# `resume="allow"` is passed through to wandb.init unchanged.
run = wandb.init(
    resume="allow",
    job_type="training",
    project='Using BERT to classify illicit content on online marketplace ver 1 (binary classification)',
)

In [4]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    default_data_collator,
    EarlyStoppingCallback
)

In [6]:
# Configuration settings
# Seed shared by the RNG seeding cell, both train/test splits and TrainingArguments
SEED = 500
# Path of the JSONL dataset file
FILE_PATH = "DUTA10K_final.jsonl"
# Pretrained checkpoint name used for both the tokenizer and the model
MODEL_NAME = 'bert-base-uncased'
# Token length every text is padded/truncated to during tokenization
MAX_LEN = 128
# Fractions of the full dataset held out for the test and validation splits
TEST_SET_SIZE = 0.1
VALIDATION_SET_SIZE = 0.1

In [7]:
# Seed Python's, NumPy's and PyTorch's random number generators with SEED
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# When a GPU is present, seed every CUDA device as well
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [8]:
# Load the dataset from the JSONL file configured in FILE_PATH.
# Failures are re-raised with a readable message instead of calling exit(),
# which would shut down the notebook kernel rather than just stop the cell.
try:
    raw_df = pd.read_json(FILE_PATH, lines=True)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Error: The file {FILE_PATH} was not found. Please check the path."
    ) from err
except ValueError as err:
    raise ValueError(
        f"Error reading JSONL file: {err}. Ensure it's a valid JSONL format."
    ) from err

print(f"Loaded {len(raw_df)} records.")

# Build `df` from `raw_df` without mutating it, so re-running this cell
# always starts from the freshly loaded data.
df = (
    raw_df
    .dropna(subset=['text', 'label'])                           # rows need both text and label
    .assign(label=lambda frame: frame['label'].astype(int))     # labels as plain integers
)
print(f"Using {len(df)} records after dropping NA.")

Loaded 4178 records.
Using 4178 records after dropping NA.


In [9]:
# BERT tokenizer matching the pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

# --- Stratified train / validation / test split ---
# Step 1: carve the test set off the full frame.
train_val_df, test_df = train_test_split(
    df,
    stratify=df['label'],
    test_size=TEST_SET_SIZE,
    random_state=SEED,
)

# Step 2: carve the validation set off what is left. VALIDATION_SET_SIZE is a
# fraction of the full dataset, so it is rescaled to a fraction of the remainder.
validation_share_of_remainder = VALIDATION_SET_SIZE / (1 - TEST_SET_SIZE)
train_df, eval_df = train_test_split(
    train_val_df,
    stratify=train_val_df['label'],
    test_size=validation_share_of_remainder,
    random_state=SEED,
)

# Give each split a fresh 0..n-1 index
train_df, eval_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, eval_df, test_df)
)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(eval_df)}")
print(f"Test samples: {len(test_df)}")

# Wrap the text/label columns of each split in a Hugging Face Dataset
model_columns = ['text', 'label']
train_dataset_hf = Dataset.from_pandas(train_df[model_columns])
eval_dataset_hf = Dataset.from_pandas(eval_df[model_columns])
test_dataset_hf = Dataset.from_pandas(test_df[model_columns])

Training samples: 3342
Validation samples: 418
Test samples: 418


In [10]:
# Tokenize Datasets
def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padding/truncating to MAX_LEN tokens."""
    return tokenizer(
        examples["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
    )


def _tokenize_split(split_dataset):
    """Tokenize one split, drop its raw "text" column and emit torch tensors."""
    encoded = split_dataset.map(tokenize_function, batched=True)
    encoded = encoded.remove_columns(["text"])
    encoded.set_format("torch")
    return encoded


tokenized_train_dataset = _tokenize_split(train_dataset_hf)
tokenized_eval_dataset = _tokenize_split(eval_dataset_hf)
tokenized_test_dataset = _tokenize_split(test_dataset_hf)

Map:   0%|          | 0/3342 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

In [11]:
# Load the pretrained BERT checkpoint with a sequence-classification head.
# The head is sized for two labels (binary classification); attention maps and
# hidden states are not requested in the model outputs.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    output_hidden_states=False,
    output_attentions=False,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# LoRA target modules for BERT.
# The list is chosen explicitly (common practice for BERT models in PEFT)
# rather than derived from the model.
target_modules = ["query", "key", "value", "dense"]

# Sanity check: every requested name must be the leaf name of at least one
# nn.Linear layer in the model, so a typo fails here instead of later.
linear_leaf_names = {
    name.split('.')[-1]
    for name, module in model.named_modules()
    if isinstance(module, nn.Linear)
}
missing_targets = sorted(set(target_modules) - linear_leaf_names)
assert not missing_targets, f"LoRA target modules not found in model: {missing_targets}"

print(f"Explicitly selected LoRA target modules for BERT: {target_modules}")

Explicitly selected LoRA target modules for BERT: ['query', 'key', 'value', 'dense']


In [13]:
# Configure LoRA (Low-Rank Adaptation).
# `TaskType` lives in `peft` and must be imported next to `LoraConfig` in the
# imports cell; without that import this cell raises NameError on a fresh kernel.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,     # sequence classification
    target_modules=target_modules,  # explicit list chosen for BERT in the previous cell
    r=64,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the LoRA configuration to obtain the PEFT model.
# NOTE(review): `model` is rebound here, so re-running this cell without
# re-running the model-initialization cell wraps an already-wrapped model.
model = get_peft_model(model, lora_config)
print("Trainable parameters after LoRA adaptation:")
model.print_trainable_parameters()

# Enable gradient checkpointing to save memory, when the wrapped model supports it
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()

Trainable parameters after LoRA adaptation:
trainable params: 10,716,674 || all params: 120,200,452 || trainable%: 8.9157


In [14]:
# Metrics callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall from a (logits, labels) pair.

    The predicted class is the argmax over the last axis of the logits.
    Precision/recall/F1 use binary averaging and are reported as 0 where
    they would otherwise be undefined (zero_division=0).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    prf_scores = precision_recall_fscore_support(
        labels, predicted_classes, average='binary', zero_division=0
    )
    precision, recall, f1 = prf_scores[:3]  # fourth element (support) is unused

    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [15]:
# Training configuration for the Hugging Face Trainer
training_args = TrainingArguments(
    output_dir="bert_binary_ver1",
    seed=SEED,
    # Optimisation: per-device batches of 1 accumulated over 8 steps
    # (8 samples per optimizer update on each device)
    learning_rate=2e-5,
    num_train_epochs=8,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    # Mixed precision only when a CUDA device is available
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    # NOTE(review): the best checkpoint is picked by accuracy; the classes are
    # imbalanced, so confirm accuracy (rather than e.g. F1) is the intended criterion.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging
    logging_steps=50,
    report_to=["wandb"],
)

In [16]:
# Instantiate the Trainer with the LoRA-wrapped model, the tokenized train and
# validation splits, and the metrics callback.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in recent transformers releases and emits a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    processing_class=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [17]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4088,0.460736,0.811005,0.0,0.0,0.0
2,0.371,0.417362,0.84689,0.319149,1.0,0.189873
3,0.3776,0.406269,0.849282,0.336842,1.0,0.202532
4,0.3694,0.393243,0.839713,0.323232,0.8,0.202532
5,0.3762,0.390453,0.830144,0.31068,0.666667,0.202532
6,0.3422,0.389488,0.832536,0.326923,0.68,0.21519
7,0.379,0.389597,0.832536,0.326923,0.68,0.21519




TrainOutput(global_step=3336, training_loss=0.4091828100972896, metrics={'train_runtime': 1138.7582, 'train_samples_per_second': 23.478, 'train_steps_per_second': 2.93, 'total_flos': 1974093912496128.0, 'train_loss': 0.4091828100972896, 'epoch': 7.981448234590066})

In [18]:
# Finish the Weights & Biases run (the run history/summary is printed as cell output)
wandb.finish()
# NOTE(review): sets `use_cache` on the model config after training; its effect
# on this classification model is not visible in this notebook — confirm it is needed.
model.config.use_cache = True

0,1
eval/accuracy,▁██▆▄▅▅▅
eval/f1,▁███▇███
eval/loss,█▄▃▁▁▁▁▁
eval/precision,▁██▇▆▆▆▆
eval/recall,▁▇██████
eval/runtime,▃█▁▆▇▆▄▅
eval/samples_per_second,▆▁█▃▂▃▅▄
eval/steps_per_second,▆▁█▃▂▃▅▄
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇████

0,1
eval/accuracy,0.83254
eval/f1,0.32692
eval/loss,0.3896
eval/precision,0.68
eval/recall,0.21519
eval/runtime,11.027
eval/samples_per_second,37.907
eval/steps_per_second,37.907
total_flos,1974093912496128.0
train/epoch,7.98145


In [19]:
# Write the trained model and the tokenizer into the same directory.
# The tokenizer call stays last so its return value (the written file paths)
# is displayed as the cell output.
save_dir = "bert_binary_ver1"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

('bert_binary_ver1/tokenizer_config.json',
 'bert_binary_ver1/special_tokens_map.json',
 'bert_binary_ver1/vocab.txt',
 'bert_binary_ver1/added_tokens.json')

In [23]:
# Evaluate the fine-tuned LoRA model on the held-out test set
print("\nEvaluating the fine-tuned LoRA model on the TEST set...")
test_predictions_output = trainer.predict(tokenized_test_dataset)

# Ground-truth labels and predicted classes (argmax over the logits)
y_test_true = test_predictions_output.label_ids
y_test_preds = np.argmax(test_predictions_output.predictions, axis=-1)

# Same summary metrics as used during training-time evaluation
test_metrics = compute_metrics((test_predictions_output.predictions, y_test_true))

# Header says "Binary": this model has two labels (it previously read "Multi-class")
print("\n=== Test Set Evaluation Results (LoRA Binary) ===")
for key, value in test_metrics.items():
    print(f"  {key}: {value:.4f}")

# Label ids are listed explicitly so report rows and confusion-matrix rows/columns
# always line up with `target_names`, even if a class were absent from the predictions.
class_ids = [0, 1]
target_names = ['non-illicit', 'illicit']

print("\n=== Detailed Classification Report on Test Set (LoRA) ===")
print(classification_report(
    y_test_true,
    y_test_preds,
    labels=class_ids,
    target_names=target_names,
    digits=4,
    zero_division=0,
))

# Rows are true classes, columns are predicted classes, both in `class_ids` order
print("\n=== Confusion Matrix on Test Set (LoRA) ===")
print(confusion_matrix(y_test_true, y_test_preds, labels=class_ids))


Evaluating the fine-tuned LoRA model on the TEST set...



=== Test Set Evaluation Results (LoRA Multi-class) ===
  accuracy: 0.8373
  f1: 0.2444
  precision: 1.0000
  recall: 0.1392

=== Detailed Classification Report on Test Set (LoRA) ===
              precision    recall  f1-score   support

 non-illicit     0.8329    1.0000    0.9088       339
     illicit     1.0000    0.1392    0.2444        79

    accuracy                         0.8373       418
   macro avg     0.9165    0.5696    0.5766       418
weighted avg     0.8645    0.8373    0.7833       418


=== Confusion Matrix on Test Set (LoRA) ===
[[339   0]
 [ 68  11]]
