In [None]:
!pip install -U datasets
!pip install evaluate bitsandbytes

Collecting evaluate
  Using cached evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Using cached nvidia_c

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
import evaluate
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType

# Load the AG News dataset (provides "train" and "test" splits).
dataset = load_dataset("ag_news")

model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Derive the number of classes from the ClassLabel feature instead of hard-coding it.
num_labels = len(dataset["train"].features["label"].names)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    device_map="auto"
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    # LoRA adapters are attached to the attention projections only.
    target_modules=["query_proj", "key_proj", "value_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    # The base checkpoint has no classification head, so `classifier` AND
    # `pooler.dense` are randomly initialised at load time.  For SEQ_CLS, PEFT
    # only un-freezes modules named `classifier`/`score` by default, which
    # would leave the pooler frozen at its random values.  Listing it here
    # makes it trainable and saves it alongside the adapter weights.
    modules_to_save=["classifier", "pooler"],
)
model = get_peft_model(model, lora_config)

def preprocess(ex):
    """Tokenize ``ex["text"]`` into fixed-length model inputs.

    Every sequence is truncated or padded to exactly 128 tokens, and the
    tokenizer's output is returned unchanged.
    """
    tokenize_kwargs = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(ex["text"], **tokenize_kwargs)

# Columns exposed as torch tensors once a split has been tokenized.
model_columns = ["input_ids", "attention_mask", "labels"]


def _tokenize_split(split):
    """Tokenize one split, rename ``label`` to ``labels`` and set the torch format."""
    encoded = split.map(preprocess, batched=True).rename_column("label", "labels")
    encoded.set_format(type="torch", columns=model_columns)
    return encoded


encoded_train = _tokenize_split(dataset["train"])
encoded_test = _tokenize_split(dataset["test"])

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the arg-max over the last logits axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

# Hold out part of the training data for checkpoint selection.  Evaluating on
# the test split during training while `load_best_model_at_end=True` would
# choose the checkpoint using test accuracy, making the final test score
# optimistically biased.  The hold-out is sized like the test split so the
# periodic evaluation cost stays the same.
holdout = encoded_train.train_test_split(test_size=len(encoded_test), seed=42)
train_split, validation_split = holdout["train"], holdout["test"]

training_args = TrainingArguments(
    output_dir="./deberta-ag-news-lora",
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    # Keep this a multiple of eval_steps: load_best_model_at_end needs every
    # saved checkpoint to coincide with an evaluation.
    save_steps=1000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # PeftModel hides the base model's forward signature, so Trainer cannot
    # infer the label column on its own; name it explicitly.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    # `tokenizer=` is deprecated in Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
# Report once on the untouched test split, using the best checkpoint restored
# by load_best_model_at_end.
print(trainer.evaluate(eval_dataset=encoded_test))

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Accuracy
500,1.3679,1.356997,0.397368
1000,0.6891,0.674039,0.696842
1500,0.6345,0.595483,0.701842
2000,0.5336,0.575323,0.704342
2500,0.563,0.557439,0.783158
3000,0.5816,0.540642,0.838947
3500,0.5604,0.53152,0.853816
4000,0.4567,0.426971,0.870132
4500,0.3611,0.376831,0.881447
5000,0.3299,0.36033,0.893553


{'eval_loss': 0.27855920791625977, 'eval_accuracy': 0.9140789473684211, 'eval_runtime': 11.2245, 'eval_samples_per_second': 677.091, 'eval_steps_per_second': 21.204, 'epoch': 3.0}


In [None]:
# ClassLabel(num_classes=4, names=['World', 'Sports', 'Business', 'Sci/Tech'])

# DatasetDict({
#     train: Dataset({
#         features: {
#             text:  string               # news article text
#             label: ClassLabel           # target label (0‒3)
#         },
#         num_rows: 120000
#     }),
#     test: Dataset({
#         features: {
#             text:  string
#             label: ClassLabel
#         },
#         num_rows: 7600
#     })
# })

# Explanation of preprocessing:
# (a) With batched=True, map() slices the dataset into batches of 1000 rows (its default batch_size) → temporary batch dict {"text": [...1000], "label":[...]}
# (b) preprocess(ex) → returns tokenized dict {"input_ids": [1000×128], "attention_mask": [...]}
# (c) input_ids and attention_mask are added as new columns → dataset remains with 120,000 rows

# Example (after preprocessing; the token IDs shown are illustrative only):
# index    text         label   input_ids (128)         attention_mask (128)
#   0      "..."        2       [101, 7592, ...]         [1, 1, ..., 0, 0]
#   1      "..."        1       [101, 2423, ...]         [1, 1, ..., 0, 0]

# input_ids length is always 128: padding="max_length" pads shorter texts and truncation=True cuts longer ones to max_length=128

# Preprocessing (tokenizer stage):
# [1000 articles] → tokenizer → [input_ids (1000 × 128)]

# ↓ (after full tokenization)
# Final dataset has 120,000 rows

# At training time:
# [120,000 rows] → per_device_train_batch_size = 16
# → [16 rows] passed to the model per device at each training step

In [None]:
%%bash
# Copy the notebook into the llm-engineering repository and push it to GitHub.
# The `%%bash` cell magic is required: without it the Python kernel would try
# to parse these shell commands as Python and fail with a SyntaxError.
set -euo pipefail  # stop at the first failing command so a failed copy is never committed

# 1. Clone the GitHub repository (skipped when the checkout already exists)
[ -d llm-engineering ] || git clone https://github.com/hoonyy24/llm-engineering.git

# 2. Move into the cloned repository
cd llm-engineering

# 3. Check the path of the source file you want to copy
ls /home/lee/Deploying

# 4. Copy the notebook into the target subdirectory
# NOTE(review): "Classifcation" is kept exactly as written; confirm it matches
# the directory name used in the repository.
cp "/home/lee/Deploying/Deberta Text classification.ipynb" "Sentence Classifcation/"

# 5. Stage, commit, and push to GitHub
git add "Sentence Classifcation/Deberta Text classification.ipynb"
git commit -m "Add DeBERTa notebook"  # -m stands for "message"
git push origin main

In [None]:
# Command                      Meaning
# mkdir -p folder/subfolder    -p stands for "parents" – it creates all necessary parent directories in the path if they don't exist
# cp                           Stands for "copy" – used to copy files or directories
# cd ..                        Moves to the parent directory (one level up)
# ls                           Lists files and directories in the current or specified path