This version improve the matching code from the answer to fuzzy

In [1]:
# Login to Weights & Biases for experiment tracking
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mqtra0027[0m ([33mailecs-lab-students[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# Initialize a new Weights & Biases run
run = wandb.init(
    project='Using Llma3.2 to classify illicit content on online marketplace ver 2 (binary classification)', 
    job_type="training", 
    resume="allow"
)

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import evaluate
import torch
import torch.nn as nn
import transformers
from datasets import Dataset, DatasetDict
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import (LlamaForSequenceClassification, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          DataCollatorWithPadding, 
                          EarlyStoppingCallback)
from sklearn.metrics import (accuracy_score, 
                            precision_score, 
                            recall_score, 
                            f1_score, 
                            classification_report, 
                            confusion_matrix)
from sklearn.model_selection import train_test_split
from transformers import pipeline, Trainer

In [4]:
# Load JSONL file
file_path = "DUTA10K_final.jsonl"
df = pd.read_json(file_path, lines=True)

In [5]:
# Define split sizes and split the DataFrame
n = len(df)
train_end = int(0.8 * n)
eval_end  = train_end + int(0.1 * n)

df_train = df.iloc[:train_end]
df_eval  = df.iloc[train_end:eval_end]
df_test  = df.iloc[eval_end:]

print(f"Train={len(df_train)} | Eval={len(df_eval)} | Test={len(df_test)}")

Train=3342 | Eval=417 | Test=419


In [6]:
# Create a DatasetDict from the pandas DataFrames
ds = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "eval":  Dataset.from_pandas(df_eval),
    "test":  Dataset.from_pandas(df_test),
})

In [7]:
# Load the base model tokenizer
base_model_name = "meta-llama/Llama-3.2-3B"

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token

MAX_LEN = 10000  # or 1,024 if you have the headroom

# Preprocess function to tokenize the text data
def preprocess(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length"   # pad shorter examples up to exactly MAX_LEN
    )

# Apply preprocessing to the datasets
tokenized = ds.map(preprocess, batched=True)

Map:   0%|          | 0/3342 [00:00<?, ? examples/s]

Map:   0%|          | 0/417 [00:00<?, ? examples/s]

Map:   0%|          | 0/419 [00:00<?, ? examples/s]

In [8]:
# Prepare Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
# Metrics functions
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
def compute_metrics(eval_pred):
    logits, labels = eval_pred # eval_pred contains predictions and labels
    preds = np.argmax(logits, axis=-1)# Get predicted class IDs
    return metric.compute(predictions=preds, references=labels)

In [10]:
# Configure BitsAndBytes for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load GEMMA‑3 for sequence classification
model = LlamaForSequenceClassification.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    num_labels=2,
    id2label={0:"non-illicit",1:"illicit"},
    label2id={"non-illicit":0,"illicit":1},
    device_map="auto",
    trust_remote_code=True,
)

# Prepare the model for k-bit training (LoRA compatible)
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()# Enable gradient checkpointing to save memory during training

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Identify target modules for LoRA adaptation (all linear layers)
SUPPORTED = (nn.Linear,)
target_modules = [
    name.split(".")[-1]
    for name, m in model.named_modules()
    if isinstance(m, SUPPORTED)
]
print("LoRA will adapt:", target_modules)

LoRA will adapt: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_p

In [12]:
# Configure LoRA (Low-Rank Adaptation)
lora_config = LoraConfig(
    r=64, # LoRA rank
    lora_alpha=32, # LoRA scaling factor
    target_modules=target_modules, # Modules to apply LoRA to
    lora_dropout=0.1, # Dropout probability for LoRA layers
    bias="none", # Do not apply bias to LoRA weights
    task_type="SEQ_CLS", # Sequence Classification task
)

# Get the PEFT (Parameter-Efficient Fine-Tuning) model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters() # Print the number of trainable parameters

# Configure training arguments
training_args = TrainingArguments(
    output_dir="llama3_binary_ver2", # Output directory for checkpoints and logs
    per_device_train_batch_size=1, # Batch size per GPU for training
    per_device_eval_batch_size=1, # Batch size per GPU for evaluation
    gradient_accumulation_steps=8, # Number of updates steps to accumulate before performing a backward/update pass
    learning_rate=2e-5, # Initial learning rate for AdamW optimizer
    num_train_epochs=8, # Total number of training epochs
    eval_strategy="epoch", # Evaluation is done at the end of each epoch
    save_strategy="epoch", # Model is saved at the end of each epoch
    load_best_model_at_end=True, # Load the best model at the end of training
    metric_for_best_model="accuracy", # Metric to use to compare models
    fp16=True, # Enable mixed precision training
    logging_steps=50, # Log training loss and learning rate every 50 steps
    report_to=["wandb"], # Report metrics to Weights & Biases
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    # callbacks=[early_stop],
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 97,261,568 || all params: 3,310,017,536 || trainable%: 2.9384


In [13]:
# Train the model
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4134,0.377155,0.846523,0.418182,0.638889,0.310811
2,0.2392,0.387714,0.868106,0.580153,0.666667,0.513514
3,0.2322,0.367212,0.8753,0.675,0.627907,0.72973
4,0.1522,0.403477,0.894484,0.671642,0.75,0.608108
5,0.0426,0.498085,0.884892,0.641791,0.716667,0.581081
6,0.055,0.576772,0.889688,0.666667,0.71875,0.621622
7,0.0004,0.650193,0.889688,0.676056,0.705882,0.648649


TrainOutput(global_step=3336, training_loss=0.16215479258242413, metrics={'train_runtime': 113593.5562, 'train_samples_per_second': 0.235, 'train_steps_per_second': 0.029, 'total_flos': 4.66690721304576e+18, 'train_loss': 0.16215479258242413, 'epoch': 7.981448234590066})

In [14]:
# Save trained model and tokenizer
trainer.save_model("llama3_binary_ver2")
tokenizer.save_pretrained("llama3_binary_ver2")

('llama3_binary_ver2/tokenizer_config.json',
 'llama3_binary_ver2/special_tokens_map.json',
 'llama3_binary_ver2/tokenizer.json')

In [15]:
# Make sure model & tokenizer are on the right device
device = next(model.parameters()).device
model.to(device)
model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )


In [16]:
def predict(text: str):
    # tokenize + move to device
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding="max_length"
    ).to(device)
    # forward
    with torch.no_grad():
        logits = model(**inputs).logits  # shape (1,2)
    probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    idx   = int(np.argmax(probs))
    return {
        "label":    id2label[idx],
        "score":    float(probs[idx]),
        "all_probs": { id2label[i]: float(probs[i]) for i in range(len(probs)) }
    }

In [17]:
# Get predictions on the test set
preds_output = trainer.predict(tokenized["test"])
y_true = preds_output.label_ids # True labels
y_pred = np.argmax(preds_output.predictions, axis=-1) # Predicted labels

In [18]:
# 3. Compute & print metrics
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec  = recall_score(y_true, y_pred, zero_division=0)
f1   = f1_score(y_true, y_pred, zero_division=0)

In [19]:
print(f"Test Accuracy:  {acc:.4f}")
print(f"Test Precision: {prec:.4f}")
print(f"Test Recall:    {rec:.4f}")
print(f"Test F1:        {f1:.4f}")

# Full classification report + confusion matrix
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=["non‑illicit","illicit"], zero_division=0))

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

Test Accuracy:  0.8902
Test Precision: 0.8393
Test Recall:    0.5595
Test F1:        0.6714

Classification Report:
              precision    recall  f1-score   support

 non‑illicit       0.90      0.97      0.93       335
     illicit       0.84      0.56      0.67        84

    accuracy                           0.89       419
   macro avg       0.87      0.77      0.80       419
weighted avg       0.89      0.89      0.88       419

Confusion Matrix:
[[326   9]
 [ 37  47]]


In [20]:
# Finish the Weights & Biases run
wandb.finish()
model.config.use_cache = True # Set use_cache to True for inference after training

0,1
eval/accuracy,▁▄▅█▇▇▇▇
eval/f1,▁▅██▇███
eval/loss,▁▂▁▂▄▆██
eval/precision,▂▃▁█▆▆▅▅
eval/recall,▁▄█▆▆▆▆▇
eval/runtime,▂▃▄▇▁▅▂█
eval/samples_per_second,▄▄▄▄█▄▄▁
eval/steps_per_second,▄▄▄▄█▄▄▁
test/accuracy,▁
test/f1,▁

0,1
eval/accuracy,0.88969
eval/f1,0.67606
eval/loss,0.65019
eval/precision,0.70588
eval/recall,0.64865
eval/runtime,459.0015
eval/samples_per_second,0.908
eval/steps_per_second,0.908
test/accuracy,0.89021
test/f1,0.67143


In [21]:
from huggingface_hub import login
from transformers import AutoTokenizer, LlamaForSequenceClassification
from peft import PeftModel

# 🔐 Login to HuggingFace
from getpass import getpass
hf_token = getpass("Enter your HuggingFace token: ")
login(token=hf_token)

In [22]:
# 🧠 Base and fine-tuned model paths
base_model = "meta-llama/Llama-3.2-3B"  # You used this in your training code
fine_tuned_model = "llama3_binary_ver2"  # Your output dir from training

# 🔁 Reload tokenizer and base model
print("🔄 Loading base tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = LlamaForSequenceClassification.from_pretrained(
    base_model,
    return_dict=True,
    torch_dtype=torch.float32,
    device_map="auto",
    trust_remote_code=True,
)

🔄 Loading base tokenizer and model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# 📎 Merge adapter
print("🔗 Merging LoRA adapter with base model...")
model = PeftModel.from_pretrained(base_model_reload, fine_tuned_model)
model = model.merge_and_unload()

# 💾 Save locally and push to HF Hub
model_dir = "llama3_binary_ver2"
os.makedirs(model_dir, exist_ok=True)
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# ☁️ Push to Hugging Face
model.push_to_hub(model_dir, use_temp_dir=False)
tokenizer.push_to_hub(model_dir, use_temp_dir=False)

🔗 Merging LoRA adapter with base model...


model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kevintran0310/llama3_binary_ver2/commit/47bc44c5a8840854c9dd1c9e4d46fb5812a58b8f', commit_message='Upload tokenizer', commit_description='', oid='47bc44c5a8840854c9dd1c9e4d46fb5812a58b8f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kevintran0310/llama3_binary_ver2', endpoint='https://huggingface.co', repo_type='model', repo_id='kevintran0310/llama3_binary_ver2'), pr_revision=None, pr_num=None)