In [None]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

In [1]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mqtra0027[0m ([33mailecs-lab-students[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
run = wandb.init(
    project='Using Llama3.2 to classify illicit content on online marketplace_ver 5', 
    job_type="training", 
    anonymous="allow"
)

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import evaluate
import torch
import torch.nn as nn
import transformers
from datasets import Dataset, DatasetDict
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (LlamaForSequenceClassification, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          DataCollatorWithPadding, 
                          EarlyStoppingCallback)
from sklearn.metrics import (accuracy_score, 
                            precision_score, 
                            recall_score, 
                            f1_score, 
                            classification_report, 
                            confusion_matrix,
                            precision_recall_fscore_support)
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from transformers import pipeline, Trainer
from collections import Counter

In [4]:
# Load JSONL file
file_path = "DUTA10K_final.jsonl"
df = pd.read_json(file_path, lines=True)

In [5]:
# 1a) get sorted list of unique categories
categories = sorted(df["category"].unique())
num_labels = len(categories)

# 1b) build id2label / label2id
id2label = {i: cat for i, cat in enumerate(categories)}
label2id = {cat: i for i, cat in enumerate(categories)}

print(f"{num_labels} categories, e.g.: {categories[:5]} → {label2id[categories[0]]}")

40 categories, e.g.: ['Art_Music', 'Casino_Gambling', 'Counterfeit Credit-Cards', 'Counterfeit Money', 'Counterfeit Personal-Identification_Driving-Licence'] → 0


In [6]:
# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Define split sizes
train_size = 0.8
eval_size = 0.1

# Calculate split indices
train_end = int(train_size * len(df))
eval_end = train_end + int(eval_size * len(df))

# Split the data
df_train = df[:train_end].copy()
df_eval = df[train_end:eval_end].copy()
df_test = df[eval_end:].copy()

In [7]:
ds = DatasetDict({
    "train": Dataset.from_pandas(df_train.reset_index(drop=True)),
    "eval":  Dataset.from_pandas(df_eval.reset_index(drop=True)),
    "test":  Dataset.from_pandas(df_test.reset_index(drop=True)),
})

# add a new “label_id” column
def encode_label(ex):
    return {"labels": label2id[ex["category"]]}
ds = ds.map(encode_label, batched=False)

Map:   0%|          | 0/3342 [00:00<?, ? examples/s]

Map:   0%|          | 0/417 [00:00<?, ? examples/s]

Map:   0%|          | 0/419 [00:00<?, ? examples/s]

In [8]:
from transformers import DefaultDataCollator
# Load model directly
base_model_name = "meta-llama/Llama-3.2-3B"

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token

MAX_LEN = 10000

# Preprocess function to tokenize the text data
def preprocess(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )

# Apply preprocessing to the datasets and remove original columns
ds = ds.remove_columns(["source", "lang", "label", "category"])  # drop old binary label if present
tokenized = ds.map(
    preprocess,
    batched=True,
    remove_columns=["text"]      # now only input_ids, attention_mask, labels remain
)

# Set the format of the tokenized datasets to PyTorch tensors
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator = DefaultDataCollator()

Map:   0%|          | 0/3342 [00:00<?, ? examples/s]

Map:   0%|          | 0/417 [00:00<?, ? examples/s]

Map:   0%|          | 0/419 [00:00<?, ? examples/s]

In [9]:
# compute weights inversely proportional to class frequency
y_train = np.array(ds["train"]["labels"])
cw = compute_class_weight("balanced", classes=np.arange(num_labels), y=y_train)
class_weights = torch.tensor(cw, dtype=torch.float)  

In [10]:
# Store the original forward method reference FIRST
_orig_forward = LlamaForSequenceClassification.forward

# Define the wrapper function that USES the _orig_forward reference
def actual_forward_wrapper(
    self,                   # Instance of the model
    input_ids=None,
    attention_mask=None,
    labels=None,            # This will capture the labels passed by the Trainer
    **kwargs
):
    # Remove Trainer’s extra argument if present
    kwargs.pop("num_items_in_batch", None)

    # Call the *original* forward method (using the captured _orig_forward)
    outputs = _orig_forward(
        self,
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=None,  # Explicitly set labels to None for the base model call
        **kwargs      # Pass any other captured kwargs
    )
    logits = outputs.logits  # (batch_size, num_labels)

    loss = None
    if labels is not None: # Use the 'labels' captured by *this* function's signature
        # Ensure class weights are on the same device as logits
        # Make sure 'class_weights' is accessible in this scope (defined outside)
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.config.num_labels),
                        labels.view(-1))

    # Return a full SequenceClassifierOutput
    return SequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

# Replace the class's forward method with the wrapper
LlamaForSequenceClassification.forward = actual_forward_wrapper

In [11]:
# Configure BitsAndBytes for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the LlamaForSequenceClassification model with quantization
model = LlamaForSequenceClassification.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    device_map="auto",
    trust_remote_code=True
)

# Set the padding token ID for the model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Prepare the model for k-bit training (LoRA compatible)
model = prepare_model_for_kbit_training(model)

# Enable gradient checkpointing to save memory during training
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Ensure class weights are on the same device as the model parameters
class_weights = class_weights.to(next(model.parameters()).device)

In [13]:
# Identify target modules for LoRA adaptation (all linear layers)
target_modules = [n.split(".")[-1]
                  for n, m in model.named_modules()
                  if isinstance(m, torch.nn.Linear)]

# Configure LoRA (Low-Rank Adaptation)
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

# Get the PEFT (Parameter-Efficient Fine-Tuning) model
model = get_peft_model(model, lora_config)

# Print the number of trainable parameters
model.print_trainable_parameters()

trainable params: 48,750,592 || all params: 3,261,623,296 || trainable%: 1.4947


In [14]:
# Function to compute evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )

    return {
        "accuracy":  acc,
        "precision": prec,
        "recall":    rec,
        "f1":        f1,
    }

# Configure training arguments
training_args = TrainingArguments(
    output_dir="llama3_multi_v1",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    warmup_ratio=0.1,
    num_train_epochs=8,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
    logging_steps=50,
    report_to=["wandb"],
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
# Train the model
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
200,26.0148,4.220622,0.29976,0.275701,0.29976,0.270639
400,20.5353,3.528178,0.417266,0.376999,0.417266,0.387599
600,14.4576,3.157916,0.541966,0.571509,0.541966,0.530455
800,11.129,2.43907,0.613909,0.647024,0.613909,0.617243
1000,8.1344,2.141205,0.669065,0.684437,0.669065,0.666314
1200,7.7347,1.923116,0.71223,0.729562,0.71223,0.713446
1400,3.9852,1.969856,0.702638,0.708352,0.702638,0.701232
1600,4.0723,1.808722,0.709832,0.738096,0.709832,0.712802
1800,1.6784,1.796435,0.745803,0.765428,0.745803,0.746754
2000,1.9872,1.830853,0.755396,0.76103,0.755396,0.751125


TrainOutput(global_step=3336, training_loss=6.98902466628763, metrics={'train_runtime': 116744.3564, 'train_samples_per_second': 0.229, 'train_steps_per_second': 0.029, 'total_flos': 4.58945513558016e+18, 'train_loss': 6.98902466628763, 'epoch': 7.981448234590066})

In [16]:
# Function to evaluate the model on the test set and print classification report and confusion matrix
def evaluate_on_test(trainer, test_dataset, id2label):
    # Get predictions
    preds_output = trainer.predict(test_dataset)
    y_true = preds_output.label_ids
    y_pred = np.argmax(preds_output.predictions, axis=-1)

    # Which labels actually appear in the test set?
    present_labels = sorted(set(y_true.tolist()))
    present_names  = [id2label[i] for i in present_labels]

    # Print report for only those classes
    print("=== Classification Report ===")
    print(classification_report(
        y_true,
        y_pred,
        labels=present_labels,
        target_names=present_names,
        zero_division=0,
        digits=4
    ))

    # Confusion matrix (same subset of labels)
    print("=== Confusion Matrix ===")
    print(confusion_matrix(
        y_true,
        y_pred,
        labels=present_labels
    ))

In [17]:
# Perform evaluation on the test dataset
evaluate_on_test(trainer, tokenized["test"], id2label)

=== Classification Report ===
                                              precision    recall  f1-score   support

                                   Art_Music     1.0000    0.5000    0.6667         2
                             Casino_Gambling     0.7500    0.6000    0.6667         5
                    Counterfeit Credit-Cards     0.9048    0.9500    0.9268        20
                           Counterfeit Money     1.0000    1.0000    1.0000         2
Counterfeit Personal-Identification_Passport     1.0000    0.7500    0.8571         4
                              Cryptocurrency     0.9623    0.9623    0.9623        53
                                Cryptolocker     0.9412    1.0000    0.9697        16
                               Drugs_Illegal     0.7586    0.8800    0.8148        25
                               Forum_Illegal     0.5000    0.5000    0.5000         2
                                 Forum_Legal     0.5455    0.6667    0.6000         9
                       

In [18]:
# Finish the Weights & Biases run
wandb.finish()
model.config.use_cache = True

0,1
eval/accuracy,▁▃▅▆▆▇▇▇████████
eval/f1,▁▃▅▆▇▇▇▇████████
eval/loss,█▆▅▃▂▁▂▁▁▁▁▁▁▁▁▁
eval/precision,▁▂▅▆▇▇▇▇████████
eval/recall,▁▃▅▆▆▇▇▇████████
eval/runtime,▃▂▆▃▅▁▁▁▁█▇█▆█▇▇
eval/samples_per_second,▆█▃▆▄████▁▁▁▃▁▁▁
eval/steps_per_second,██▁█▁████▁▁▁▁▁▁▁
test/accuracy,▁
test/f1,▁

0,1
eval/accuracy,0.77218
eval/f1,0.76853
eval/loss,1.92418
eval/precision,0.7761
eval/recall,0.77218
eval/runtime,448.1879
eval/samples_per_second,0.93
eval/steps_per_second,0.118
test/accuracy,0.73508
test/f1,0.73227


In [19]:
# Save trained model and tokenizer
trainer.save_model("llama-3.2-fine-tuned-model_ver5")
tokenizer.save_pretrained("llama-3.2-fine-tuned-model_ver5")

('llama-3.2-fine-tuned-model_ver5/tokenizer_config.json',
 'llama-3.2-fine-tuned-model_ver5/special_tokens_map.json',
 'llama-3.2-fine-tuned-model_ver5/tokenizer.json')

In [20]:
from huggingface_hub import login
from peft import PeftModel

# 🔐 Login to HuggingFace
from getpass import getpass
hf_token = getpass("Enter your HuggingFace token: ")
login(token=hf_token)

In [21]:
# 🧠 Base and fine-tuned model paths
base_model = "meta-llama/Llama-3.2-3B"  # You used this in your training code
fine_tuned_model = "llama-3.2-fine-tuned-model_ver5"  # Your output dir from training

# 🔁 Reload tokenizer and base model
print("🔄 Loading base tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

🔄 Loading base tokenizer and model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# 📎 Merge adapter
print("🔗 Merging LoRA adapter with base model...")
model = PeftModel.from_pretrained(base_model_reload, fine_tuned_model)
model = model.merge_and_unload()

# 💾 Save locally and push to HF Hub
model_dir = "llama-3.2-fine-tuned-model_ver5"
os.makedirs(model_dir, exist_ok=True)
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# ☁️ Push to Hugging Face
model.push_to_hub(model_dir, use_temp_dir=False)
tokenizer.push_to_hub(model_dir, use_temp_dir=False)

🔗 Merging LoRA adapter with base model...


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kevintran0310/llama-3.2-fine-tuned-model_ver5/commit/724514901e104dc957ae3b6c7456ea771b15372b', commit_message='Upload tokenizer', commit_description='', oid='724514901e104dc957ae3b6c7456ea771b15372b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kevintran0310/llama-3.2-fine-tuned-model_ver5', endpoint='https://huggingface.co', repo_type='model', repo_id='kevintran0310/llama-3.2-fine-tuned-model_ver5'), pr_revision=None, pr_num=None)