This version improve the matching code from the answer to fuzzy

In [1]:
# Login to Weights & Biases for experiment tracking
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mqtra0027[0m ([33mailecs-lab-students[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# Initialize a new Weights & Biases run for experiment tracking
run = wandb.init(
    project='Using Gemma3_4b to classify illicit content on online marketplace (multiclass classification)_ver2', 
    job_type="training", 
    anonymous="allow"
)

In [23]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModelForSequenceClassification,
    PeftConfig,
)
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

In [9]:
# Set TOKENIZERS_PARALLELISM to avoid warnings when forking processes
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Set PYTORCH_CUDA_ALLOC_CONF for potentially better memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
graphic_card = '0' # choose which graphic card
gpu_device   = 'cuda:0'
os.environ["CUDA_VISIBLE_DEVICES"] = graphic_card
os.environ["CUDA_DEVICE_ORDER"]    = "PCI_BUS_ID"
device       = torch.device(f"cuda:{graphic_card}" if torch.cuda.is_available() else "cpu")

torch.cuda.set_device(0)  
print(torch.cuda.device_count())

1


In [5]:
# Load & inspect data
file_path = "DUTA10K_final.jsonl"
df = pd.read_json(file_path, lines=True)                                
df.dropna(subset=['category', 'text'], inplace=True)
df = df[df['text'].str.strip() != ''] 
df.reset_index(drop=True, inplace=True) 

# Map categories to integer labels
categories = sorted(df["category"].unique())
label2id   = {cat: idx for idx, cat in enumerate(categories)}
id2label   = {idx: cat for cat, idx in label2id.items()}
df["category_id"] = df["category"].map(label2id)

# Build a HuggingFace DatasetDict
ds = Dataset.from_pandas(
    df[["text", "category_id"]]
      .rename(columns={"category_id": "label"})
)

In [6]:
# 80/10/10 split
split1 = ds.train_test_split(test_size=0.2, seed=42)
split2 = split1["test"].train_test_split(test_size=0.5, seed=42)
ds = DatasetDict({
    "train": split1["train"],
    "eval":  split2["train"],
    "test":  split2["test"],
})

In [7]:
# Tokenize the dataset
base_model = "google/gemma-3-4b-it"
tokenizer  = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

MAX_LEN = 512

# Preprocessing function to tokenize text and add labels
def preprocess(examples):
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length",
    )
    tokens["labels"] = examples["label"]
    return tokens

# Apply preprocessing to the datasets and remove the original 'text' column
tokenized = ds.map(preprocess, batched=True, remove_columns=["text"])
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/3342 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

In [10]:
# Load Gemma-3 as a CausalLM & swap in a classifier head
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = Gemma3ForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Replace the language modeling head (lm_head) with a new linear layer for classification
model.lm_head = torch.nn.Linear(
    model.config.hidden_size,
    len(categories),
    bias=True
).to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Prepare for k-bit tuning & inject LoRA
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

lora_cfg = LoraConfig(
    r=32, # LoRA rank
    lora_alpha=32, # LoRA scaling factor
    target_modules=["gate_proj","down_proj","v_proj","k_proj","q_proj","o_proj","up_proj"], # Modules to apply LoRA to
    lora_dropout=0.1, # Dropout probability for LoRA layers
    bias="none", # Do not apply bias to LoRA weights
    task_type="SEQ_CLS", # Sequence Classification task
)

# Get the PEFT (Parameter-Efficient Fine-Tuning) model
model = get_peft_model(model, lora_cfg)

In [14]:
# Wrap in a SequenceClassification PEFT model
class GEMMA3SeqClassifier(PeftModelForSequenceClassification):
    def __init__(self, peft_config: PeftConfig, base_model: torch.nn.Module):
        super().__init__(base_model, peft_config)
        self.num_labels = len(categories)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        # take the last non-pad token’s logits
        seq_len = attention_mask.sum(dim=1) - 1
        batch_ix = torch.arange(len(seq_len), device=seq_len.device)
        logits = outputs.logits[batch_ix, seq_len, :]
        loss = None
        if labels is not None:
            loss = torch.nn.CrossEntropyLoss()(logits, labels)
        return SequenceClassifierOutput(loss=loss, logits=logits)

# Instantiate the custom wrapped model
model = GEMMA3SeqClassifier(lora_cfg, model)



In [15]:
# Metrics & Trainer setup
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labs  = p.label_ids
    prec, rec, f1, _ = precision_recall_fscore_support(labs, preds, average="weighted", zero_division=0)
    return {
        "accuracy": accuracy_score(labs, preds),
        "precision": prec,
        "recall": rec,
        "f1": f1,
    }

In [17]:
# Training arguments configuration
training_args = TrainingArguments(
    output_dir="gemma3_multiclass_ver2",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    num_train_epochs=8,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="wandb",
)

# Initialize the Hugging Face Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(
No label_names provided for model class `GEMMA3SeqClassifier`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [18]:
# Train the model
trainer.train()

It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.792346,0.449761,0.451158,0.449761,0.387327
2,21.336200,1.185785,0.696172,0.705153,0.696172,0.675173
3,9.352200,1.186292,0.715311,0.703673,0.715311,0.701803
4,5.217700,1.898075,0.708134,0.700239,0.708134,0.696537
5,3.678100,2.17512,0.720096,0.723718,0.720096,0.710796
6,3.486200,2.176732,0.720096,0.719665,0.720096,0.710441
7,3.554100,2.117707,0.732057,0.730424,0.732057,0.723018


TrainOutput(global_step=3336, training_loss=7.34927185204961, metrics={'train_runtime': 16853.9598, 'train_samples_per_second': 1.586, 'train_steps_per_second': 0.198, 'total_flos': 2.6784702607220736e+17, 'train_loss': 7.34927185204961, 'epoch': 7.981448234590066})

In [19]:
# Function to evaluate the model on the test set and print classification report and confusion matrix
def evaluate_on_test(trainer, test_dataset, id2label):
    # Get predictions
    preds_output = trainer.predict(test_dataset)
    y_true = preds_output.label_ids
    y_pred = np.argmax(preds_output.predictions, axis=-1)

    # Which labels actually appear in the test set?
    present_labels = sorted(set(y_true.tolist()))
    present_names  = [id2label[i] for i in present_labels]

    # Print report for only those classes
    print("=== Classification Report ===")
    print(classification_report(
        y_true,
        y_pred,
        labels=present_labels,
        target_names=present_names,
        zero_division=0,
        digits=4
    ))

    # Confusion matrix (same subset of labels)
    print("=== Confusion Matrix ===")
    print(confusion_matrix(
        y_true,
        y_pred,
        labels=present_labels
    ))

In [24]:
# Perform evaluation on the test dataset
evaluate_on_test(trainer, tokenized["test"], id2label)

=== Classification Report ===
                                              precision    recall  f1-score   support

                                   Art_Music     1.0000    0.5000    0.6667         2
                             Casino_Gambling     1.0000    1.0000    1.0000         2
                    Counterfeit Credit-Cards     0.8800    0.9565    0.9167        23
                           Counterfeit Money     1.0000    0.8571    0.9231         7
Counterfeit Personal-Identification_Passport     1.0000    0.5000    0.6667         4
                              Cryptocurrency     0.9149    0.8269    0.8687        52
                                Cryptolocker     0.8889    0.6667    0.7619        12
                               Drugs_Illegal     0.7692    0.8696    0.8163        23
                               Forum_Illegal     0.6667    0.4000    0.5000         5
                                 Forum_Legal     0.5000    0.3750    0.4286         8
                       

In [25]:
# Finish the Weights & Biases run
wandb.finish()
model.config.use_cache = True # Set use_cache to True for optimized inference after training

0,1
eval/accuracy,▁▇█▇████
eval/f1,▁▇█▇████
eval/loss,▅▁▁▆██▇█
eval/precision,▁▇▇▇████
eval/recall,▁▇█▇████
eval/runtime,▁▂▁▂██▄▄
eval/samples_per_second,█▆█▇▁▁▅▅
eval/steps_per_second,█▆█▇▁▁▅▅
test/accuracy,▁▁▁
test/f1,▁▁▁

0,1
eval/accuracy,0.73206
eval/f1,0.72302
eval/loss,2.11771
eval/precision,0.73042
eval/recall,0.73206
eval/runtime,74.022
eval/samples_per_second,5.647
eval/steps_per_second,0.716
test/accuracy,0.67943
test/f1,0.66276


In [26]:
# Save trained model and tokenizer
trainer.save_model("gemma3_multiclass_ver2")
tokenizer.save_pretrained("gemma3_multiclass_ver2")

('gemma3_multiclass_ver2/tokenizer_config.json',
 'gemma3_multiclass_ver2/special_tokens_map.json',
 'gemma3_multiclass_ver2/chat_template.jinja',
 'gemma3_multiclass_ver2/tokenizer.model',
 'gemma3_multiclass_ver2/added_tokens.json',
 'gemma3_multiclass_ver2/tokenizer.json')

In [None]:
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel

# 🔐 Login to HuggingFace
from getpass import getpass
hf_token = getpass("Enter your HuggingFace token: ")
login(token=hf_token)

In [None]:
# 🧠 Base and fine-tuned model paths
base_model = "google/gemma-3-4b-it"
fine_tuned_model = "gemma3_multiclass_ver2" 

# 🔁 Reload tokenizer and base model
print("🔄 Loading base tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = GemmaForSequenceClassification.from_pretrained(
    base_model,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

In [None]:
# 📎 Merge adapter
print("🔗 Merging LoRA adapter with base model...")
model = PeftModel.from_pretrained(base_model_reload, fine_tuned_model)
model = model.merge_and_unload()

# 💾 Save locally and push to HF Hub
model_dir = "gemma3_multiclass_ver2"
os.makedirs(model_dir, exist_ok=True)
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# ☁️ Push to Hugging Face
model.push_to_hub(model_dir, use_temp_dir=False)
tokenizer.push_to_hub(model_dir, use_temp_dir=False)