In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
import evaluate
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType

# Load AG News; the class names stored on the "label" feature determine how
# many outputs the classification head needs.
dataset = load_dataset("ag_news")

model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(dataset["train"].features["label"].names)

# No device_map here: Trainer moves the model to the training device itself.
# device_map="auto" asks Accelerate to spread the layers over every visible
# device, which on a multi-GPU machine makes Trainer treat the model as
# model-parallel instead of running one replica per GPU.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Wrap the base model with LoRA adapters on the attention projections.
lora_config = LoraConfig(
    r=8,  # rank of the low-rank update matrices
    lora_alpha=16,  # LoRA scaling factor (alpha / r = 2)
    target_modules=["query_proj", "key_proj", "value_proj"],
    lora_dropout=0.05,
    bias="none",  # bias terms are left untrained
    task_type=TaskType.SEQ_CLS,
)
model = get_peft_model(model, lora_config)

def preprocess(ex):
    """Tokenize the "text" field of a batch for the classifier.

    Args:
        ex: Mapping with a "text" entry (a batch of articles when called
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for those texts, truncated and padded so every
        sequence is exactly 128 tokens long.
    """
    # truncation caps long texts at max_length; padding="max_length" fills
    # shorter ones up to it, giving fixed-length sequences.
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(ex["text"], **tokenizer_options)

# Tokenize each split in batches, then rename the target column from "label"
# to "labels".
encoded_train = dataset["train"].map(preprocess, batched=True).rename_column("label", "labels")
encoded_test = dataset["test"].map(preprocess, batched=True).rename_column("label", "labels")

# set_format modifies the dataset in place: only these columns are exposed,
# and they are returned as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
for encoded_split in (encoded_train, encoded_test):
    encoded_split.set_format(type="torch", columns=tensor_columns)

# Accuracy metric object, loaded once and reused on every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a (logits, labels) evaluation pair.

    Args:
        eval_pred: Two-element iterable of model logits and reference labels.

    Returns:
        The result of ``metric.compute`` for the argmax predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

# Hold out a slice of the training split for validation. Periodic evaluation
# and best-checkpoint selection (load_best_model_at_end) then never see the
# test split, so the final test accuracy is not biased by model selection.
VALIDATION_FRACTION = 0.05
train_val_split = encoded_train.train_test_split(
    test_size=VALIDATION_FRACTION,
    seed=42,  # fixed seed so the same rows are held out on every run
)
train_split = train_val_split["train"]
validation_split = train_val_split["test"]

training_args = TrainingArguments(
    output_dir="./deberta-ag-news-lora",
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    # load_best_model_at_end needs checkpoints to line up with evaluations:
    # save_steps must be a multiple of eval_steps.
    save_steps=1000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none",  # no external experiment trackers
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# NOTE(review): newer transformers releases deprecate `tokenizer=` in favor of
# `processing_class=` — confirm against the installed version before renaming.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final, one-off report on the untouched test split.
print(trainer.evaluate(eval_dataset=encoded_test))


In [None]:
# ClassLabel(num_classes=4, names=['World', 'Sports', 'Business', 'Sci/Tech'])

# DatasetDict({
#     train: Dataset({
#         features: {
#             text:  string               # news article text
#             label: ClassLabel           # target label (0‒3)
#         },
#         num_rows: 120000
#     }),
#     test: Dataset({
#         features: {
#             text:  string
#             label: ClassLabel
#         },
#         num_rows: 7600
#     })
# })

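# Explanation of preprocessing (dataset.map(preprocess, batched=True)):
# (a) map() passes the dataset to preprocess in slices of 1000 rows (the default batch_size) → temporary batch dict {"text": [...1000], "label": [...1000]}
# (b) preprocess(ex) → returns the tokenized dict for that slice: {"input_ids": [1000×128], "attention_mask": [1000×128], ...}
# (c) each key returned by the tokenizer (input_ids, attention_mask, ...) is added as a new column → the dataset still has 120,000 rows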

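# Example (after preprocessing; token ids are illustrative):
# index    text         label   input_ids (128)         attention_mask (128)
#   0      "..."        2       [1, 7592, ...]           [1, 1, ..., 0, 0]
#   1      "..."        1       [1, 2423, ...]           [1, 1, ..., 0, 0]
# (each sequence starts with the [CLS] id, which is 1 for the DeBERTa-v3 tokenizer; 101 is BERT's)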

# input_ids length is always 128: truncation=True cuts longer texts down to max_length=128, and padding="max_length" pads shorter ones up to it

# Preprocessing (tokenizer stage):
# [1000 articles] → tokenizer → [input_ids (1000 × 128)]

# ↓ (after full tokenization)
# Final dataset has 120,000 rows

# At training time:
# [tokenized training rows] → batch size = 16 (per_device_train_batch_size)
# → [16 rows] passed to the model on each device (GPU) per training step