# A7: Training Distillation vs LoRA

##  0.Import Necessary Libraries

In [12]:
import torch.nn as nn
import torch
import datasets
import evaluate

from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from tqdm import tqdm 
from peft import PeftModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cuda


 ##  1.Toxic Comment Dataset (1 point)

Find and load a dataset that includes toxic comments or hate speech. This dataset will be used for training and evaluating the models. (1 point)

In [14]:
# Loading Dataset from tweet_eval
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
    "hate": ("text", None) 
}

task_name = "hate"
raw_datasets = datasets.load_dataset("cardiffnlp/tweet_eval", task_name)
print(raw_datasets)
# Expected output: train, validation, test splits with 'text' and 'label' columns



DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2970
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})


In [15]:
# Extract label information
label_list = raw_datasets['train'].features['label'].names  # ['non-hate', 'hate']
label2id = {v: i for i, v in enumerate(label_list)}
id2label = {i: v for v, i in label2id.items()}
label2id



{'non-hate': 0, 'hate': 1}

In [16]:
id2label

{0: 'non-hate', 1: 'hate'}

In [17]:
# Number of labels 
num_labels = len(label_list)  # Should be 2
print(f"Number of labels: {num_labels}")

Number of labels: 2


In [None]:
# Dataset Preprocessing
teacher_id = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(teacher_id)

def tokenize_function(examples):
    sentence1_key, sentence2_key = task_to_keys[task_name]
    args = (
        (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
    )
    result = tokenizer(*args, max_length=128, truncation=True)
    return result

# Apply tokenization
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Remove unnecessary columns ('text') and rename 'label' to 'labels'
column_dataset = [item for item in task_to_keys[task_name] if item is not None]  # ['text']
tokenized_datasets = tokenized_datasets.remove_columns(column_dataset)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Create subsets using full available sizes ## Just use full dataset since it is little
small_train_dataset = tokenized_datasets["train"].shuffle(seed=1150).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=1150).select(range(len(tokenized_datasets["validation"])))
small_test_dataset = tokenized_datasets["test"].shuffle(seed=1150).select(range(len(tokenized_datasets["test"])))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=64, collate_fn=data_collator)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=64, collate_fn=data_collator)
test_dataloader = DataLoader(small_test_dataset, batch_size=64, collate_fn=data_collator)

Map:   0%|          | 0/2970 [00:00<?, ? examples/s]

 ##  2.Odd Layer vs Even Layer Training (2 points)
 Based on the case-studies/distilBERT.ipynb, modify as follows:
1) Train the student model using the odd layers {1, 3, 5, 7, 9, 11} from the 12-layer teacher to the 6-layer student. (1 point)
2) Train the student model using the even layers {2, 4, 6, 8, 10, 12} from the 12-layer teacher to the 6-layer student. (1 point)

In [4]:
# Teacher Model Setup

from transformers import AutoModelForSequenceClassification

teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_id, 
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
teacher_model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Student Initialization Function

from transformers import BertConfig

def initialize_student_model(teacher_model, teacher_layers):
    # Get teacher configuration and halve layers
    configuration = teacher_model.config.to_dict()
    configuration['num_hidden_layers'] = 6
    configuration = BertConfig.from_dict(configuration)
    
    # Create student model
    student_model = type(teacher_model)(configuration)
    
    # Copy embeddings
    student_model.bert.embeddings.load_state_dict(teacher_model.bert.embeddings.state_dict())
    
    # Copy specified teacher layers to student layers
    for student_idx, teacher_idx in enumerate(teacher_layers):
        student_layer = student_model.bert.encoder.layer[student_idx]
        teacher_layer = teacher_model.bert.encoder.layer[teacher_idx]
        student_layer.load_state_dict(teacher_layer.state_dict())
    
    # Copy pooler (if present) and classifier
    if hasattr(teacher_model.bert, 'pooler'):
        student_model.bert.pooler.load_state_dict(teacher_model.bert.pooler.state_dict())
    student_model.classifier.load_state_dict(teacher_model.classifier.state_dict())
    
    return student_model

# Define layer indices  (wrote in 0 based indices)
odd_layers = [0, 2, 4, 6, 8, 10]  # 1-based: 1,3,5,7,9,11
even_layers = [1, 3, 5, 7, 9, 11]  # 1-based: 2,4,6,8,10,12

# Initialize student models
student_odd = initialize_student_model(teacher_model, odd_layers).to(device)
student_even = initialize_student_model(teacher_model, even_layers).to(device)

In [6]:
# Training Loop
from transformers import AdamW, get_scheduler

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr = 5e-5

def train_student(student_model, name):
    optimizer = AdamW(student_model.parameters(), lr=lr)
    lr_scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    progress_bar = tqdm(range(num_training_steps))
    train_losses = []
    
    for epoch in range(num_epochs):
        student_model.train()
        total_loss = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                teacher_outputs = teacher_model(**batch)
            student_outputs = student_model(**batch)
            loss = student_outputs.loss  # Classification loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
        avg_loss = total_loss / len(train_dataloader)
        train_losses.append(avg_loss)
        print(f"{name} - Epoch {epoch+1}: Train Loss {avg_loss:.4f}")
    return train_losses

# Train both models
odd_losses = train_student(student_odd, "Odd Layers")
even_losses = train_student(student_even, "Even Layers")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
 20%|██        | 141/705 [00:38<02:29,  3.76it/s]

Odd Layers - Epoch 1: Train Loss 0.5008


 40%|████      | 282/705 [01:19<01:58,  3.57it/s]

Odd Layers - Epoch 2: Train Loss 0.3481


 60%|██████    | 423/705 [02:00<01:02,  4.52it/s]

Odd Layers - Epoch 3: Train Loss 0.2173


 80%|████████  | 564/705 [02:40<00:31,  4.53it/s]

Odd Layers - Epoch 4: Train Loss 0.1141


100%|██████████| 705/705 [03:20<00:00,  3.51it/s]


Odd Layers - Epoch 5: Train Loss 0.0645


 20%|█▉        | 140/705 [00:39<02:35,  3.63it/s]

Even Layers - Epoch 1: Train Loss 0.5013


 40%|████      | 282/705 [01:18<01:46,  3.96it/s]

Even Layers - Epoch 2: Train Loss 0.3530


 60%|██████    | 423/705 [01:56<01:11,  3.92it/s]

Even Layers - Epoch 3: Train Loss 0.2310


 80%|████████  | 564/705 [02:35<00:31,  4.47it/s]

Even Layers - Epoch 4: Train Loss 0.1311


100%|██████████| 705/705 [03:14<00:00,  3.63it/s]

Even Layers - Epoch 5: Train Loss 0.0835





 ##  3.LoRA (Low-Rank Adaptation) (1 point)
 Implement LoRA to train the 12-layer student model. 

In [None]:
#Setup Lora Model
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification

# Initialize 12-layer model
student_lora = AutoModelForSequenceClassification.from_pretrained(
    teacher_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

# Define LoRA configuration
lora_config = LoraConfig(
    r=16, 
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# Apply LoRA
student_lora = get_peft_model(student_lora, lora_config)

# Training
optimizer = AdamW(student_lora.parameters(), lr=lr)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
progress_bar = tqdm(range(num_training_steps))
lora_losses = []

for epoch in range(num_epochs):
    student_lora.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = student_lora(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    avg_loss = total_loss / len(train_dataloader)
    lora_losses.append(avg_loss)
    print(f"LoRA - Epoch {epoch+1}: Train Loss {avg_loss:.4f}")

# Save model
student_lora.save_pretrained("student_lora")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 141/705 [00:34<02:10,  4.33it/s]

LoRA - Epoch 1: Train Loss 0.6415


 40%|████      | 282/705 [01:09<01:37,  4.36it/s]

LoRA - Epoch 2: Train Loss 0.5701


 60%|██████    | 423/705 [01:44<01:04,  4.36it/s]

LoRA - Epoch 3: Train Loss 0.5315


 80%|████████  | 565/705 [02:19<00:29,  4.75it/s]

LoRA - Epoch 4: Train Loss 0.5064


100%|██████████| 705/705 [02:54<00:00,  4.36it/s]

LoRA - Epoch 5: Train Loss 0.4949


 ##  4.Evaluation and Analysis (1 point)
 1) Evaluate the models on the test set, and analyze the performance of the models trained with Odd Layers, Even Layers, and LoRA. Discuss the differences in performance across the three methods. (0.5 point)
2) Discuss the challenges encountered during the implementation, specifically comparing distillation fine-tuning models (Odd and Even Layer) with LoRA fine-tuning. Propose improvements or modifications to address the challenges. (0.5 point)

In [8]:
#Evaluation Code

def evaluate_model(model, name, test_dataloader, device):
    # Set model to evaluation mode
    model.eval()
    
    # Load metrics fresh for each model
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    
    # Define loss function (CrossEntropyLoss for classification)
    loss_fn = nn.CrossEntropyLoss()
    
    # Initialize variables for loss accumulation
    total_loss = 0.0
    total_samples = 0
    
    # Disable gradient computation for evaluation
    with torch.no_grad():
        for batch in test_dataloader:
            # Move batch to the correct device
            batch = {k: v.to(device) for k, v in batch.items()}
            # Get model outputs
            outputs = model(**batch)
            logits = outputs.logits
            labels = batch["labels"]
            
            # Compute loss for the batch
            loss = loss_fn(logits, labels)
            batch_loss = loss.item() * len(labels)  # Multiply by batch size
            total_loss += batch_loss
            total_samples += len(labels)
            
            # Get predictions (argmax over logits)
            predictions = logits.argmax(dim=-1)
            
            # Add batch results to metrics
            accuracy_metric.add_batch(predictions=predictions, references=labels)
            f1_metric.add_batch(predictions=predictions, references=labels)
    
    # Calculate average loss
    avg_loss = total_loss / total_samples
    
    # Compute final accuracy and F1-score
    accuracy = accuracy_metric.compute()['accuracy']
    f1 = f1_metric.compute()['f1']
    
    # Return metrics
    return avg_loss, accuracy, f1

In [None]:
# Evaluate each model
odd_loss, odd_acc, odd_f1 = evaluate_model(student_odd, "Odd Layers", test_dataloader, device)
even_loss, even_acc, even_f1 = evaluate_model(student_even, "Even Layers", test_dataloader, device)
lora_loss, lora_acc, lora_f1 = evaluate_model(student_lora, "LoRA", test_dataloader, device)

# Print the table
print("| Model Type | Test Set Loss | Test Set Accuracy | F1-Score |")
print("|------------|---------------|-------------------|----------|")
print(f"| Odd Layer  | {odd_loss:.4f}        | {odd_acc*100:.2f}%           | {odd_f1:.2f}    |")
print(f"| Even Layer | {even_loss:.4f}        | {even_acc*100:.2f}%           | {even_f1:.2f}    |")
print(f"| LoRA       | {lora_loss:.4f}        | {lora_acc*100:.2f}%           | {lora_f1:.2f}    |")

| Model Type | Test Set Loss | Test Set Accuracy | F1-Score |
|------------|---------------|-------------------|----------|
| Odd Layer  | 2.7225        | 52.32%           | 0.63    |
| Even Layer | 2.2576        | 51.14%           | 0.63    |
| LoRA       | 0.8840        | 53.77%           | 0.63    |


In [None]:
# Save Model and Data

base_model_save_path = "./lora_base_model"
adapter_save_path = "./lora_adapter"
base_model = student_lora.base_model  

# Save the base model
base_model.save_pretrained(base_model_save_path)

# Save the LoRA adapter weights and configuration
student_lora.save_pretrained(adapter_save_path)

# Save the tokenizer
tokenizer.save_pretrained(base_model_save_path)

('./lora_base_model\\tokenizer_config.json',
 './lora_base_model\\special_tokens_map.json',
 './lora_base_model\\vocab.txt',
 './lora_base_model\\added_tokens.json',
 './lora_base_model\\tokenizer.json')