# Starter Notebook

Install and import required libraries

In [1]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3
!pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

  from .autonotebook import tqdm as notebook_tqdm


## Load Tokenizer and Preprocess Data

In [3]:
base_model = 'roberta-base'

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [4]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Make train and eval split

In [5]:
# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=1280, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

print("Number of train samples:", len(train_dataset))
print("Number of eval samples:", len(eval_dataset))

Number of train samples: 118720
Number of eval samples: 1280


## Training Setup

In [6]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate metrics
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [7]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

## Design Space Exploration

### Config

In [8]:
output_base_dir = "dse_results" # base directory for all DSE runs
os.makedirs(output_base_dir, exist_ok=True)

# hyperparameter ranges for DSE
learning_rates = [1e-5, 2.5e-5, 5e-5]
lora_ranks = [4, 5, 6, 7] 
lora_alpha_scaling = [2, 3, 4]


### Design Space Exploration Loop

In [None]:
import gc # Garbage collector for potentially clearing GPU memory

results = []

for lr in learning_rates:
    for rank in lora_ranks:
        for alpha_scale in lora_alpha_scaling:
            alpha = rank * alpha_scale
            run_name = f"lr_{lr}_rank_{rank}_alpha_{alpha}"
            print(f"\n--- Starting Run: {run_name} ---")

            # Define output directory for this specific run
            current_output_dir = os.path.join(output_base_dir, run_name)
            os.makedirs(current_output_dir, exist_ok=True)

            # 1. Load Base Model (Load fresh for each run)
            print("Loading base model...")
            
            model = RobertaForSequenceClassification.from_pretrained(
                base_model,
                id2label=id2label)

            # Move model to GPU if possible
            if torch.cuda.is_available():
                model.to('cuda')

            # Configure LoRA
            print(f"Configuring LoRA with r={rank}, alpha={alpha}")
            peft_config = LoraConfig(
                r=rank,  # LoRA rank
                lora_alpha=alpha,  # Alpha parameter for scaling
                lora_dropout=0.05, # Dropout probability for LoRA layers
                target_modules=["query", "key", "value"], # Apply LoRA to these layers
                bias="none",  # Don't train bias parameters
                task_type="SEQ_CLS", # Specify the task type
            )

            peft_model = get_peft_model(model, peft_config)

            print("PEFT Model Configured:")
            peft_model.print_trainable_parameters()

            training_args = TrainingArguments(
                output_dir=current_output_dir,
                report_to=None,
                eval_strategy="steps",
                logging_steps=100,
                learning_rate=lr,
                max_steps=1600,
                num_train_epochs=1,
                use_cpu=False,
                dataloader_num_workers=4,
                per_device_train_batch_size=16,
                per_device_eval_batch_size=64, # or 128
                optim="adamw_torch",
                gradient_checkpointing=False,
                gradient_checkpointing_kwargs={'use_reentrant': True},
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False
            )

            trainer = Trainer(
                model=peft_model,
                args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                data_collator=data_collator,
            )

            # 6. Train the model
            print("Starting training...")
            try:
                train_result = trainer.train()
                print("Training finished.")
                trainer.save_model() # Optionally save the final model for the best run later

                # 7. Evaluate the model after training
                print("Evaluating model on evaluation set...")
                eval_metrics, _ = evaluate_model(
                    peft_model,  # Use the trained model from the trainer
                    eval_dataset,
                    labelled=True,
                    batch_size=training_args.per_device_eval_batch_size,
                    data_collator=data_collator
                )
                final_accuracy = eval_metrics.get('accuracy', float('nan'))

            except Exception as e:
                print(f"!!! ERROR during training/evaluation for {run_name}: {e}")
                final_accuracy = float('nan')  # Record failure

            # 8. Store results
            results.append({
                "learning_rate": lr,
                "lora_rank": rank,
                "lora_alpha": alpha,
                "accuracy": final_accuracy,
                "output_dir": current_output_dir
            })
            print(f"Run {run_name} completed. Accuracy: {final_accuracy:.4f}")

            # 9. Clean up memory (Important!)
            del model
            del peft_model
            del trainer
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Run: lr_1e-05_rank_4_alpha_8 ---
Loading base model...


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Configuring LoRA with r=4, alpha=8
PEFT Model Configured:
trainable params: 814,852 || all params: 125,463,560 || trainable%: 0.6495
Starting training...


Step,Training Loss,Validation Loss


## Post-DSE Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert results to DataFrame for easy viewing/sorting
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="accuracy", ascending=False)

print("Top 5 Results:")
print(results_df.head())

# Save results to CSV
results_csv_path = os.path.join(output_base_dir, "dse_summary.csv")
results_df.to_csv(results_csv_path, index=False)
print(f"\nFull DSE results saved to: {results_csv_path}")


In [None]:
sns.set_theme(style="whitegrid")

# Filter out failed runs (where accuracy is NaN) for plotting
plot_df = results_df.dropna(subset=['accuracy']).copy()

if not plot_df.empty:
    # 1. Heatmaps (Accuracy vs Rank/Alpha for each Learning Rate)
    print("Generating heatmaps...")
    unique_lrs = sorted(plot_df['learning_rate'].unique())
    n_lrs = len(unique_lrs)
    
    # Determine grid size for subplots
    ncols = min(3, n_lrs) # Max 3 columns
    nrows = (n_lrs + ncols - 1) // ncols
    
    fig_heatmap, axes_heatmap = plt.subplots(nrows, ncols, figsize=(ncols * 6, nrows * 5), squeeze=False)
    axes_heatmap = axes_heatmap.flatten() # Flatten to easily iterate

    plot_idx = 0
    for i, lr in enumerate(unique_lrs):
        ax = axes_heatmap[i]
        subset_df = plot_df[plot_df['learning_rate'] == lr]
        
        if not subset_df.empty:
            try:
                # Pivot table for heatmap: index=rows, columns=cols, values=color
                pivot_table = subset_df.pivot_table(index='lora_alpha', columns='lora_rank', values='accuracy')
                
                sns.heatmap(pivot_table, annot=True, fmt=".4f", cmap="viridis", linewidths=.5, ax=ax, cbar_kws={'label': 'Accuracy'})
                ax.set_title(f'Accuracy (LR={lr:.0e})') # Scientific notation for LR
                ax.set_xlabel("LoRA Rank (r)")
                ax.set_ylabel("LoRA Alpha")
                plot_idx += 1
            except Exception as e:
                print(f"Could not generate heatmap for LR={lr}: {e}")
                ax.set_title(f'Heatmap Failed (LR={lr:.0e})')
                ax.text(0.5, 0.5, 'Plotting Error', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)

    # Hide any unused subplots
    for j in range(plot_idx, len(axes_heatmap)):
        fig_heatmap.delaxes(axes_heatmap[j])

    plt.suptitle('DSE Accuracy: LoRA Rank vs. Alpha (per Learning Rate)', fontsize=16, y=1.02)
    plt.tight_layout(rect=[0, 0, 1, 0.98]) # Adjust layout to prevent title overlap

    # Save the heatmap figure
    heatmap_path = os.path.join(output_base_dir, "dse_heatmap_accuracy.png")
    try:
        plt.savefig(heatmap_path, bbox_inches='tight')
        print(f"Heatmaps saved to: {heatmap_path}")
    except Exception as e:
        print(f"Error saving heatmap: {e}")
    # plt.show() # Uncomment to display plots interactively


    # 2. Faceted Line Plots (Accuracy vs Rank, Lines=Alpha, Facets=LR)
    print("Generating faceted line plots...")
    try:
        # Ensure alpha is treated as a category for distinct lines/colors if desired,
        # but seaborn usually handles numeric hues well. Convert if colors seem wrong.
        # plot_df['lora_alpha_cat'] = plot_df['lora_alpha'].astype(str)

        g = sns.relplot(
            data=plot_df,
            x='lora_rank',
            y='accuracy',
            hue='lora_alpha',  # Different color lines for each alpha
            col='learning_rate', # Separate plots for each learning rate
            kind='line',
            marker='o', # Add markers to show the discrete points tested
            palette='viridis', # Use a nice color palette
            col_wrap=min(3, n_lrs), # Wrap columns if many LRs
            height=4,
            aspect=1.2
        )

        g.figure.suptitle('DSE Accuracy vs. LoRA Rank (Lines: Alpha, Facets: LR)', fontsize=16, y=1.03)
        g.set_axis_labels("LoRA Rank (r)", "Validation Accuracy")
        g.set_titles("LR = {col_name:.0e}") # Format titles
        g.tight_layout(rect=[0, 0, 1, 0.97]) # Adjust layout

        # Save the line plot figure
        lineplot_path = os.path.join(output_base_dir, "dse_lineplot_accuracy.png")
        plt.savefig(lineplot_path, bbox_inches='tight')
        print(f"Faceted line plots saved to: {lineplot_path}")
        plt.show() # Uncomment to display plots interactively
        plt.close(fig_heatmap) # Close previous figure explicitly if not showing
        plt.close(g.figure) # Close the relplot figure

    except Exception as e:
        print(f"Could not generate faceted line plot: {e}")

else:
    print("No valid results (NaN accuracies) found in results_df. Skipping visualization.")


### Run Inference on unlabelled dataset

In [None]:
# Load best model from DSE

In [None]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map: 100%|██████████| 8000/8000 [00:02<00:00, 3620.66 examples/s]


Dataset({
    features: ['text'],
    num_rows: 8000
})

In [None]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [01:46<00:00,  9.42it/s]

Inference complete. Predictions saved to inference_output.csv



