# Starter Notebook

Install and import required libraries

In [1]:
!pip install --upgrade pip
!pip install transformers bitsandbytes torch tf-keras
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3
!pip install scikit-learn
!pip install sentencepiece



In [2]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

  from .autonotebook import tqdm as notebook_tqdm
2025-04-16 13:05:18.551092: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-16 13:05:18.559296: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744823118.568839  992321 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744823118.571446  992321 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744823118.579819  992321 computation_placer.cc:177] computation placer already r

## Load Tokenizer and Preprocess Data

In [3]:
import torch
print(torch.cuda.is_available())
print(torch.version.cuda)
print(torch.cuda.get_device_name(0))

True
12.4
NVIDIA RTX A2000 12GB


In [4]:
from transformers import MarianMTModel, MarianTokenizer
import random

base_model = 'roberta-base'

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,                  # or load_in_8bit=True for 8-bit
#     bnb_4bit_compute_dtype=torch.float16, 
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
# )

# # Load the model in 4-bit precision
# model = RobertaForSequenceClassification.from_pretrained(
#     base_model,
#     quantization_config=bnb_config,
#     device_map="auto",  # auto device placement across GPUs/CPU
#     # id2label=id2label,  # if you have a custom id2label
# )

# peft_config = LoraConfig(
#     r=8,                         # Example rank
#     lora_alpha=32,              # LoRA scaling factor
#     lora_dropout=0.05,
#     bias="none",
#     target_modules=["query","key","value"],
#     task_type="SEQ_CLS",
# )

# lora_model = get_peft_model(model, peft_config)
# lora_model.print_trainable_parameters()


dataset = load_dataset('ag_news', split='train')



src_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
src_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")

tgt_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-de-en")
tgt_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-de-en")

def backtranslate(text):
    # English -> German
    inputs = src_tokenizer(text, return_tensors="pt")
    german_tokens = src_model.generate(**inputs)
    german_text = src_tokenizer.decode(german_tokens[0], skip_special_tokens=True)
    
    # German -> English
    inputs = tgt_tokenizer(german_text, return_tensors="pt")
    english_tokens = tgt_model.generate(**inputs)
    english_text = tgt_tokenizer.decode(english_tokens[0], skip_special_tokens=True)
    
    return english_text

def augment_with_backtranslation(examples, prob=0.3):
    # We'll do back-translation on ~30% of samples as an example
    augmented_texts = []
    for txt in examples["text"]:
        if random.random() < prob:
            new_txt = backtranslate(txt)
            augmented_texts.append(new_txt)
        else:
            augmented_texts.append(txt)
    return {
        "text": augmented_texts,
        "label": examples["label"],
    }

tokenizer = RobertaTokenizer.from_pretrained(base_model)

augmented_dataset = dataset.map(augment_with_backtranslation, batched=True)


def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Map: 100%|██████████| 120000/120000 [14:36:57<00:00,  2.28 examples/s]  


In [5]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [6]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Anything from here on can be modified

In [7]:
# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=1280, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

print("Number of train samples:", len(train_dataset))
print("Number of eval samples:", len(eval_dataset))

Number of train samples: 118720
Number of eval samples: 1280


In [8]:
import torch
from torch.quantization import quantize_dynamic

# Suppose 'model' is your RobertaForSequenceClassification (already fine-tuned)
quantized_model = quantize_dynamic(
    model, 
    {torch.nn.Linear},  # Which layers to quantize (usually linear layers)
    dtype=torch.qint8
)

# Now you have a model with int8 weights (on CPU).
# Evaluate or do inference:
quantized_model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (key): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (value): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
              (dropout):

## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [9]:
# # PEFT Config
# peft_config = LoraConfig(
#     r=2,
#     lora_alpha=4,
#     lora_dropout=0.05,
#     bias = 'none',
#     target_modules = ['query'],
#     task_type="SEQ_CLS",
# )

# Configure LoRA
peft_config = LoraConfig(
    r=10,  # LoRA rank
    lora_alpha=20,  # Alpha parameter for scaling
    lora_dropout=0.05, # Dropout probability for LoRA layers
    target_modules=["query", "key"], # Apply LoRA to these layers
    bias="none",  # Don't train bias parameters
    task_type="SEQ_CLS", # Specify the task type
)

In [10]:
import bitsandbytes as bnb
print(bnb)

dir(bnb)

<module 'bitsandbytes' from '/home/joey/sp25-dl/project2/env/lib/python3.11/site-packages/bitsandbytes/__init__.py'>


['MatmulLtState',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__pdoc__',
 '__spec__',
 '__version__',
 'adam',
 'autograd',
 'bmm_cublas',
 'cextension',
 'consts',
 'cuda_specs',
 'functional',
 'matmul',
 'matmul_4bit',
 'matmul_cublas',
 'mm_cublas',
 'modules',
 'nn',
 'optim',
 'research',
 'triton',
 'utils']

In [11]:
peft_model = get_peft_model(model, peft_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Mo

In [12]:
# print("Trainable parameters:")
# for name, param in peft_model.named_parameters():
#     if param.requires_grad:
#         print(name)

In [13]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 962,308 || all params: 125,611,016 || trainable%: 0.7661


## Training Setup

In [14]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate metrics
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [15]:
# Setup Training args
output_dir = "results"

training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy="steps",
    logging_steps=100,
    learning_rate=1e-5,
    max_steps=2400,
    num_train_epochs=1,
    use_cpu=False,
    dataloader_num_workers=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64, # or 128
    optim="adamw_torch",
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant': True},
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

def get_trainer(model):
    return Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

### Start Training

In [16]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

result = peft_lora_finetuning_trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Accuracy
100,1.3862,1.378577,0.419531
200,1.3759,1.369483,0.370312
300,1.3683,1.358985,0.496094
400,1.356,1.343863,0.6375
500,1.3348,1.319851,0.745313
600,1.2979,1.278865,0.841406
700,1.2445,1.216538,0.824219
800,1.1792,1.117253,0.870313
900,1.0638,0.990896,0.86875
1000,0.9375,0.850001,0.867188


In [17]:
import torch
from torch.quantization import quantize_dynamic

# Suppose 'model' is your RobertaForSequenceClassification (already fine-tuned)
quantized_model = quantize_dynamic(
    peft_model, 
    {torch.nn.Linear},  # Which layers to quantize (usually linear layers)
    dtype=torch.qint8
)

# Now you have a model with int8 weights (on CPU).
# Evaluate or do inference:
quantized_model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): DynamicQuantizedLinear(in_features=768, out_features=768, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=Fals

### Run Inference on eval_dataset

In [18]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [19]:
import torch.utils.data as data_utils

# Check evaluation accuracy
testset = load_dataset('ag_news', split='test')

tokenized_testset = testset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_testset = tokenized_testset.rename_column("label", "labels")
indices = torch.arange(1280)
tokenized_testset_sub = data_utils.Subset(tokenized_testset, indices)

_, _ = evaluate_model(peft_model, tokenized_testset_sub, True, 64, data_collator)

100%|██████████| 20/20 [00:13<00:00,  1.53it/s]

Evaluation Metric: {'accuracy': 0.87421875}





### Run Inference on unlabelled dataset

In [20]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map: 100%|██████████| 8000/8000 [00:02<00:00, 3508.01 examples/s]


Dataset({
    features: ['text'],
    num_rows: 8000
})

In [21]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [01:43<00:00,  9.67it/s]

Inference complete. Predictions saved to inference_output.csv



