In [2]:
!pip install -q bitsandbytes accelerate transformers datasets  peft tqdm evaluate scikit-learn huggingface_hub

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [25]:
# Record the installed transformers version alongside the run for reproducibility.
import transformers
print(transformers.__version__)

4.55.4


In [8]:
from huggingface_hub import notebook_login

In [9]:
# Authenticate with the Hugging Face Hub through the notebook login widget.
# NOTE(review): presumably required because the checkpoint in MODEL is gated — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
import numpy as np, torch
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          BitsAndBytesConfig, Trainer, TrainingArguments)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.nn.functional as F

# Hub id of the base checkpoint; used for both the tokenizer and the classifier.
MODEL = "meta-llama/Llama-3.2-1B-Instruct"
# When True, inverse-frequency class weights are computed and WeightedTrainer
# applies them in a weighted cross-entropy loss; when False the default loss is used.
USE_CLASS_WEIGHTS = False  # flip to True if your labels are imbalanced

In [11]:
# 1) Data
# The hub dataset provides train and test splits only, so 10% of train is held
# out (stratified on the label, fixed seed) to serve as the validation split.
ds = load_dataset("ag_news")
train_val = ds["train"].train_test_split(
    test_size=0.1,
    seed=42,
    stratify_by_column="label",
)
train_ds = train_val["train"]
val_ds = train_val["test"]
test_ds = ds["test"]
num_labels = ds["train"].features["label"].num_classes  # 4

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [15]:
# Show the (text, label) pair of one arbitrary training example.
tuple(ds["train"][column][1234] for column in ("text", "label"))

('Yahoo to Sell Domain Names (AP) AP - Yahoo Inc. plans to start selling Internet domain names Tuesday as part of its expanding services for small businesses.',
 3)

In [18]:
# Inspect the schema of the train split, including the label id -> class name mapping.
print(ds["train"].features)

{'text': Value('string'), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'])}


In [19]:
# 2) Tokenizer
tok = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
# Reuse the end-of-sequence token (both its string and its id) as the padding token.
tok.pad_token, tok.pad_token_id = tok.eos_token, tok.eos_token_id

def preprocess(ex):
    """Tokenize the ``"text"`` field of ``ex`` with the notebook-level tokenizer.

    Sequences longer than 512 tokens are truncated; no padding is applied here.
    Returns whatever the tokenizer call returns for the given text(s).
    """
    texts = ex["text"]
    encoded = tok(texts, truncation=True, max_length=512)
    return encoded

def to_torch(dataset):
    """Tokenize ``dataset`` in batches and expose the model columns as torch tensors.

    The input dataset is not reformatted; the formatting is applied to the
    dataset returned by ``map``.
    """
    model_columns = ["input_ids","attention_mask","label"]
    tokenized = dataset.map(preprocess, batched=True)
    tokenized.set_format(type="torch", columns=model_columns)
    return tokenized

# Tokenized, torch-formatted versions of the three splits.
train_t = to_torch(train_ds)
val_t = to_torch(val_ds)
test_t = to_torch(test_ds)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Map:   0%|          | 0/108000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [20]:
# 3) Model + LoRA (8-bit base)
# No compute dtype is passed: BitsAndBytesConfig only exposes a compute dtype for
# the 4-bit path (`bnb_4bit_compute_dtype`); a `bnb_8bit_compute_dtype` kwarg is
# not a recognised option and would be silently ignored.
quant = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, num_labels=num_labels, quantization_config=quant, device_map="auto"
)
# LoRA adapters on the attention projections; the freshly initialised
# classification head ("score") is trained in full and saved with the adapter.
lora = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="SEQ_CLS",
    target_modules=["q_proj","k_proj","v_proj","o_proj"], modules_to_save=["score"]
)
# prepare_model_for_kbit_training enables gradient checkpointing by default.
# Pass the flag explicitly so the model matches the TrainingArguments cell, which
# sets gradient_checkpointing=False for speed. Flip both if memory is tight.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)
model = get_peft_model(model, lora)
# The sequence-classification head needs the pad id to locate the last real token.
model.config.pad_token_id = tok.pad_token_id


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# 4) Optional class weights (only if imbalanced)
# class_weights stays None unless USE_CLASS_WEIGHTS is enabled; otherwise it is a
# float32 tensor with one inverse-frequency weight per class.
class_weights = None
if USE_CLASS_WEIGHTS:
    # Read the labels from the unformatted train split and materialise them as a
    # plain list. Depending on the datasets version, column access on the
    # torch-formatted split returns a tensor or a lazy column object, and only
    # the former has `.numpy()`.
    label_ids = np.asarray(list(train_ds["label"]), dtype=np.int64)
    counts = np.bincount(label_ids, minlength=num_labels)
    # Clamp empty classes to a count of 1 so their weight stays finite instead of
    # becoming inf through a division by zero.
    safe_counts = np.maximum(counts, 1)
    w = (counts.sum() / (num_labels * safe_counts)).astype(np.float32)  # inverse-freq normalized
    class_weights = torch.tensor(w, dtype=torch.float32)

# 5) Metrics
def compute_metrics(eval_pred):
    """Compute accuracy plus support-weighted precision, recall and F1.

    ``eval_pred`` must expose ``predictions`` (per-class scores, argmax taken
    over axis 1) and ``label_ids`` (reference class ids). Returns a dict with
    the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    predicted = np.argmax(eval_pred.predictions, axis=1)
    reference = eval_pred.label_ids
    # zero_division=0 keeps classes that were never predicted from raising warnings.
    precision, recall, f1_score, _support = precision_recall_fscore_support(
        reference, predicted, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(reference, predicted),
        "f1": f1_score,
        "precision": precision,
        "recall": recall,
    }

# 6) Trainer (override loss only if using weights)
class WeightedTrainer(Trainer):
    """Trainer that swaps in class-weighted cross-entropy when USE_CLASS_WEIGHTS is set.

    With the flag off it behaves exactly like the parent Trainer.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the training loss, optionally together with the model outputs.

        Args:
            model: the model being trained; called with the batch minus its labels.
            inputs: mapping holding the batch tensors, including a ``"labels"`` entry.
            return_outputs: when True, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: extra arguments from the Trainer; forwarded to the parent
                implementation on the unweighted path and unused otherwise.

        Raises:
            RuntimeError: if USE_CLASS_WEIGHTS is True but ``class_weights`` has
                not been computed.
            KeyError: if the batch has no ``"labels"`` entry on the weighted path.
        """
        # if not using weights, fall back to default
        if not USE_CLASS_WEIGHTS:
            return super().compute_loss(model, inputs, return_outputs=return_outputs, **kwargs)

        # Fail with an actionable message instead of `None.to(...)` when the flag
        # was flipped without re-running the class-weight cell.
        if class_weights is None:
            raise RuntimeError(
                "USE_CLASS_WEIGHTS is True but class_weights is None; "
                "re-run the class-weight cell after enabling the flag."
            )

        # Split the labels off without mutating the batch dict owned by the caller.
        # The model is called without labels, so it returns logits only.
        labels = inputs["labels"]
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}

        # forward pass
        outputs = model(**model_inputs)

        # Compute the loss in float32 so that logits and weights share a dtype
        # even when the forward pass produced reduced-precision logits.
        logits = outputs.logits.float()
        weights = class_weights.to(device=logits.device, dtype=logits.dtype)

        # weighted loss
        loss = F.cross_entropy(logits, labels, weight=weights)

        return (loss, outputs) if return_outputs else loss


# Choose the mixed-precision mode from the hardware instead of hard-coding bf16,
# which TrainingArguments rejects on devices without bf16 support. Falls back to
# fp16 on other CUDA devices and to full precision when no GPU is present.
_cuda_available = torch.cuda.is_available()
_use_bf16 = _cuda_available and torch.cuda.is_bf16_supported()

args = TrainingArguments(
    output_dir="ag_news_lora",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    num_train_epochs=3,               # lower for a quick smoke test, raise up to ~5 for longer runs
    logging_steps=50,
    eval_strategy="epoch",            # evaluate and checkpoint once per epoch ...
    save_strategy="epoch",
    load_best_model_at_end=True,      # ... so the best checkpoint can be restored at the end
    report_to="none",
    bf16=_use_bf16,
    fp16=_cuda_available and not _use_bf16,
    gradient_checkpointing=False, # disable for speed unless memory is tight
    warmup_ratio=0.1
)

# Wire the model, tokenized train/validation splits, tokenizer and metrics together.
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_t,
    eval_dataset=val_t,
    processing_class=tok,
    compute_metrics=compute_metrics,
)

In [31]:
# Launch fine-tuning with the trainer configured above.
# NOTE: the saved output below shows this run was stopped by a KeyboardInterrupt
# before any epoch metrics were logged, so the notebook contains no training results.
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 