In [None]:
!pip uninstall -y datasets
!pip install datasets==2.18.0
!pip install evaluate


# Fan, A., Grave, E., & Joulin, A. (2019). "Reducing Transformer Depth on Demand with Structured Dropout".

In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import numpy as np
import random
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import evaluate
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# --- LayerSkip utility for pruning entire transformer blocks ---
class SkipLayer(nn.Module):
    """Identity stand-in for a pruned transformer block.

    The module owns no weights: ``forward`` hands the incoming
    ``hidden_states`` back untouched, so the surrounding stack behaves as if
    the replaced block were absent.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Kept for reference only; forward() never reads it.
        self.hidden_size = hidden_size

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Return ``hidden_states`` unchanged, wrapped like a layer output.

        Every argument other than ``hidden_states`` and ``return_dict`` is
        accepted and ignored.

        Returns:
            ``(hidden_states, None, None)`` when ``return_dict`` is falsy;
            otherwise a ``BaseModelOutputWithPastAndCrossAttentions`` whose
            ``last_hidden_state`` is ``hidden_states`` and whose other fields
            are ``None``.
        """
        if not return_dict:
            return (hidden_states, None, None)

        # Imported on demand, only for the structured-output path.
        from transformers.modeling_outputs import (
            BaseModelOutputWithPastAndCrossAttentions as _LayerOutput,
        )

        empty_fields = dict.fromkeys(
            ("hidden_states", "attentions", "past_key_values", "cross_attentions")
        )
        return _LayerOutput(last_hidden_state=hidden_states, **empty_fields)

# --- Data/Eval helpers ---
def preprocess_function(examples, tok, max_length=128):
    """Tokenize premise/hypothesis pairs to a fixed length.

    Args:
        examples: Mapping with ``'premise'`` and ``'hypothesis'`` entries.
        tok: Tokenizer-like callable taking two text arguments plus the
            ``truncation``, ``padding`` and ``max_length`` keywords.
        max_length: Length each sequence is truncated or padded to.

    Returns:
        Whatever ``tok`` returns for the pair.
    """
    first_text = examples['premise']
    second_text = examples['hypothesis']
    tok_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }
    return tok(first_text, second_text, **tok_options)

def evaluate_model(model, dl, device):
    """Return the accuracy of ``model`` over the batches yielded by ``dl``.

    The model is put into eval mode (and left there) and every batch is run
    with gradient tracking disabled. Predictions are the arg-max over the last
    axis of the logits; they are compared with each batch's ``'labels'`` using
    the ``evaluate`` "accuracy" metric.
    """
    model.eval()
    accuracy_metric = evaluate.load("accuracy")
    predictions, references = [], []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            references.extend(batch['labels'].cpu().numpy())
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predictions.extend(logits.argmax(dim=-1).cpu().numpy())
    scores = accuracy_metric.compute(predictions=predictions, references=references)
    return scores["accuracy"]

# --- Standard fine-tuning ---
def finetune_model(model, train_loader, dev_loader, device, epochs):
    """Fine-tune ``model`` with Adam and a linearly decaying learning rate.

    Mixed precision (autocast plus gradient scaling) is enabled only when
    ``device`` is a CUDA device; otherwise both are disabled and training
    runs in full precision. After every epoch the model is scored on
    ``dev_loader`` and the accuracy is printed.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_loader: Iterable of batches (mappings with those three keys);
            must support ``len()``.
        dev_loader: Batches handed to ``evaluate_model`` after each epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, trained in place.
    """
    use_amp = torch.device(device).type == "cuda"
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=2e-5)
    sched = get_linear_schedule_with_warmup(opt,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader) * epochs)
    scaler = GradScaler(enabled=use_amp)
    for epoch in range(epochs):
        model.train()  # evaluate_model() leaves the model in eval mode
        for b in train_loader:
            opt.zero_grad()
            with autocast(enabled=use_amp):
                out = model(input_ids=b['input_ids'].to(device),
                            attention_mask=b['attention_mask'].to(device),
                            labels=b['labels'].to(device))
            # Backward runs outside autocast, as the AMP documentation recommends.
            scaler.scale(out.loss).backward()
            scale_before = scaler.get_scale()
            scaler.step(opt)
            scaler.update()
            # The scaler lowers its scale when it skipped opt.step() because of
            # inf/NaN gradients; do not advance the schedule for a skipped step.
            if scaler.get_scale() >= scale_before:
                sched.step()
        acc = evaluate_model(model, dev_loader, device)
        print(f"[Epoch {epoch+1}] MNLI Acc: {acc:.4f}")
    return model

# --- LayerDrop-style layer pruning ---
def layerdrop_prune(model, num_prune=4, seed=42):
    """Replace randomly chosen encoder blocks of ``model`` with ``SkipLayer``.

    ``num_prune`` distinct indices into ``model.roberta.encoder.layer`` are
    drawn from a NumPy generator seeded with ``seed``; each selected block is
    overwritten in place by an identity ``SkipLayer``.

    Args:
        model: Model exposing ``roberta.encoder.layer``; mutated in place.
        num_prune: How many blocks to replace.
        seed: Seed for the index draw, making the selection reproducible.

    Returns:
        The replaced layer indices as a sorted list of built-in ``int``.
    """
    layers = model.roberta.encoder.layer
    total_layers = len(layers)
    hidden_size = layers[0].output.dense.out_features

    # Randomly choose layers to drop
    rng = np.random.default_rng(seed)
    chosen = rng.choice(total_layers, size=num_prune, replace=False)
    # Convert NumPy integer scalars to built-in ints so the indices print
    # as plain numbers and can be serialised by callers.
    prune_idxs = sorted(int(i) for i in chosen)
    print(f"LayerDrop: Pruning layers {prune_idxs} out of {total_layers}")

    for idx in prune_idxs:
        # Replace the whole transformer block with identity
        layers[idx] = SkipLayer(hidden_size)
    return prune_idxs

# --- Main ---
def main():
    """Run the MNLI experiment: fine-tune, prune four layers, fine-tune again.

    Seeds the Python, NumPy and torch RNGs with 42, trains RoBERTa-base on a
    shuffled 2000-example MNLI training subset for 6 epochs, replaces four
    encoder blocks via ``layerdrop_prune``, then trains for 5 more epochs.
    Accuracy on the first 1000 ``validation_matched`` examples is printed
    after each stage.
    """
    seed = 42
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load MNLI (subsample for speed)
    train_ds = load_dataset("glue", "mnli", split="train").shuffle(seed).select(range(2000))
    dev_ds = load_dataset("glue", "mnli", split="validation_matched").select(range(1000))

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

    def encode(split):
        # Tokenize, drop the raw text/id columns, expose the target as "labels".
        tokenized = split.map(lambda ex: preprocess_function(ex, tokenizer),
                              batched=True,
                              remove_columns=["premise", "hypothesis", "idx"])
        return tokenized.rename_column("label", "labels")

    train = encode(train_ds)
    dev = encode(dev_ds)

    collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=128)
    train_loader = DataLoader(train, batch_size=8, shuffle=True, collate_fn=collator)
    dev_loader = DataLoader(dev, batch_size=16, shuffle=False, collate_fn=collator)

    # --- Stage 1: Full fine-tuning ---
    print("\n=== Stage 1: Full Fine-Tuning (No Pruning) ===")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)
    model = finetune_model(model.to(device), train_loader, dev_loader, device, epochs=6)
    full_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 6-epoch full fine-tuning: {full_accuracy:.4f}")

    # --- Stage 2: LayerDrop Pruning ---
    print("\n=== Stage 2: LayerDrop Pruning (Remove 4 layers) ===")
    dropped_layers = layerdrop_prune(model, num_prune=4, seed=seed)

    # --- Stage 3: Fine-tune pruned model ---
    print("\n=== Stage 3: Fine-Tune Pruned Model (5 epochs) ===")
    model = finetune_model(model, train_loader, dev_loader, device, epochs=5)
    pruned_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 5-epoch post-pruning fine-tuning: {pruned_accuracy:.4f}")

    print(f"Pruned layer indices: {dropped_layers}")

if __name__ == "__main__":
    main()


In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import numpy as np
import random
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import evaluate
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)




# --- LayerSkip utility for pruning entire transformer blocks ---

class SkipLayer(nn.Module):
    """Identity stand-in for a pruned transformer block.

    The module owns no weights: ``forward`` hands the incoming
    ``hidden_states`` back untouched, so the surrounding stack behaves as if
    the replaced block were absent.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Kept for reference only; forward() never reads it.
        self.hidden_size = hidden_size

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Return ``hidden_states`` unchanged, wrapped like a layer output.

        Every argument other than ``hidden_states`` and ``return_dict`` is
        accepted and ignored.

        Returns:
            ``(hidden_states, None, None)`` when ``return_dict`` is falsy;
            otherwise a ``BaseModelOutputWithPastAndCrossAttentions`` whose
            ``last_hidden_state`` is ``hidden_states`` and whose other fields
            are ``None``.
        """
        if not return_dict:
            return (hidden_states, None, None)

        # Imported on demand, only for the structured-output path.
        from transformers.modeling_outputs import (
            BaseModelOutputWithPastAndCrossAttentions as _LayerOutput,
        )

        empty_fields = dict.fromkeys(
            ("hidden_states", "attentions", "past_key_values", "cross_attentions")
        )
        return _LayerOutput(last_hidden_state=hidden_states, **empty_fields)


# --- Data/Eval helpers ---
def preprocess_function(examples, tok, max_length=64):
    """Tokenize sentence pairs to a fixed length.

    Args:
        examples: Mapping with ``'sentence1'`` and ``'sentence2'`` entries.
        tok: Tokenizer-like callable taking two text arguments plus the
            ``truncation``, ``padding`` and ``max_length`` keywords.
        max_length: Length each sequence is truncated or padded to.

    Returns:
        Whatever ``tok`` returns for the pair.
    """
    first_text = examples['sentence1']
    second_text = examples['sentence2']
    tok_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }
    return tok(first_text, second_text, **tok_options)

def evaluate_model(model, dl, device):
    """Return the accuracy of ``model`` over the batches yielded by ``dl``.

    The model is put into eval mode (and left there) and every batch is run
    with gradient tracking disabled. Predictions are the arg-max over the last
    axis of the logits; they are compared with each batch's ``'labels'`` using
    the ``evaluate`` "accuracy" metric.
    """
    model.eval()
    accuracy_metric = evaluate.load("accuracy")
    predictions, references = [], []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            references.extend(batch['labels'].cpu().numpy())
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predictions.extend(logits.argmax(dim=-1).cpu().numpy())
    scores = accuracy_metric.compute(predictions=predictions, references=references)
    return scores["accuracy"]

# --- Standard fine-tuning ---
def finetune_model(model, train_loader, dev_loader, device, epochs):
    """Fine-tune ``model`` with Adam and a linearly decaying learning rate.

    Mixed precision (autocast plus gradient scaling) is enabled only when
    ``device`` is a CUDA device; otherwise both are disabled and training
    runs in full precision. After every epoch the model is scored on
    ``dev_loader`` and the accuracy is printed.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_loader: Iterable of batches (mappings with those three keys);
            must support ``len()``.
        dev_loader: Batches handed to ``evaluate_model`` after each epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, trained in place.
    """
    use_amp = torch.device(device).type == "cuda"
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=2e-5)
    sched = get_linear_schedule_with_warmup(opt,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader) * epochs)
    scaler = GradScaler(enabled=use_amp)
    for epoch in range(epochs):
        model.train()  # evaluate_model() leaves the model in eval mode
        for b in train_loader:
            opt.zero_grad()
            with autocast(enabled=use_amp):
                out = model(input_ids=b['input_ids'].to(device),
                            attention_mask=b['attention_mask'].to(device),
                            labels=b['labels'].to(device))
            # Backward runs outside autocast, as the AMP documentation recommends.
            scaler.scale(out.loss).backward()
            scale_before = scaler.get_scale()
            scaler.step(opt)
            scaler.update()
            # The scaler lowers its scale when it skipped opt.step() because of
            # inf/NaN gradients; do not advance the schedule for a skipped step.
            if scaler.get_scale() >= scale_before:
                sched.step()
        acc = evaluate_model(model, dev_loader, device)
        print(f"[Epoch {epoch+1}] MRPC Acc: {acc:.4f}")
    return model

# --- LayerDrop-style layer pruning ---
def layerdrop_prune(model, num_prune=4, seed=42):
    """Replace randomly chosen encoder blocks of ``model`` with ``SkipLayer``.

    ``num_prune`` distinct indices into ``model.roberta.encoder.layer`` are
    drawn from a NumPy generator seeded with ``seed``; each selected block is
    overwritten in place by an identity ``SkipLayer``.

    Args:
        model: Model exposing ``roberta.encoder.layer``; mutated in place.
        num_prune: How many blocks to replace.
        seed: Seed for the index draw, making the selection reproducible.

    Returns:
        The replaced layer indices as a sorted list of built-in ``int``.
    """
    layers = model.roberta.encoder.layer
    total_layers = len(layers)
    hidden_size = layers[0].output.dense.out_features

    # Randomly choose layers to drop
    rng = np.random.default_rng(seed)
    chosen = rng.choice(total_layers, size=num_prune, replace=False)
    # Convert NumPy integer scalars to built-in ints so the indices print
    # as plain numbers and can be serialised by callers.
    prune_idxs = sorted(int(i) for i in chosen)
    print(f"LayerDrop: Pruning layers {prune_idxs} out of {total_layers}")

    for idx in prune_idxs:
        # Replace the whole transformer block with identity
        layers[idx] = SkipLayer(hidden_size)
    return prune_idxs

# --- Main ---
def main():
    """Run the MRPC experiment: fine-tune, prune four layers, fine-tune again.

    Seeds the Python, NumPy and torch RNGs with 42, trains RoBERTa-base on a
    shuffled 1000-example MRPC training subset for 6 epochs, replaces four
    encoder blocks via ``layerdrop_prune``, then trains for 5 more epochs.
    Accuracy on the MRPC validation split is printed after each stage.
    """
    seed = 42
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load MRPC (subsample for speed)
    train_ds = load_dataset("glue", "mrpc", split="train").shuffle(seed).select(range(1000))
    dev_ds = load_dataset("glue", "mrpc", split="validation")

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

    def encode(split):
        # Tokenize, drop the raw text/id columns, expose the target as "labels".
        tokenized = split.map(lambda ex: preprocess_function(ex, tokenizer),
                              batched=True,
                              remove_columns=["sentence1", "sentence2", "idx"])
        return tokenized.rename_column("label", "labels")

    train = encode(train_ds)
    dev = encode(dev_ds)

    collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=64)
    train_loader = DataLoader(train, batch_size=8, shuffle=True, collate_fn=collator)
    dev_loader = DataLoader(dev, batch_size=16, shuffle=False, collate_fn=collator)

    # --- Stage 1: Full fine-tuning ---
    print("\n=== Stage 1: Full Fine-Tuning (No Pruning) ===")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
    model = finetune_model(model.to(device), train_loader, dev_loader, device, epochs=6)
    full_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 6-epoch full fine-tuning: {full_accuracy:.4f}")

    # --- Stage 2: LayerDrop Pruning ---
    print("\n=== Stage 2: LayerDrop Pruning (Remove 4 layers) ===")
    dropped_layers = layerdrop_prune(model, num_prune=4, seed=seed)

    # --- Stage 3: Fine-tune pruned model ---
    print("\n=== Stage 3: Fine-Tune Pruned Model (5 epochs) ===")
    model = finetune_model(model, train_loader, dev_loader, device, epochs=5)
    pruned_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 5-epoch post-pruning fine-tuning: {pruned_accuracy:.4f}")

    print(f"Pruned layer indices: {dropped_layers}")

if __name__ == "__main__":
    main()


In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import numpy as np
import random
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import evaluate
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

class SkipLayer(nn.Module):
    """Identity stand-in for a pruned transformer block.

    The module owns no weights: ``forward`` hands the incoming
    ``hidden_states`` back untouched, so the surrounding stack behaves as if
    the replaced block were absent.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Kept for reference only; forward() never reads it.
        self.hidden_size = hidden_size

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Return ``hidden_states`` unchanged, wrapped like a layer output.

        Every argument other than ``hidden_states`` and ``return_dict`` is
        accepted and ignored.

        Returns:
            ``(hidden_states, None, None)`` when ``return_dict`` is falsy;
            otherwise a ``BaseModelOutputWithPastAndCrossAttentions`` whose
            ``last_hidden_state`` is ``hidden_states`` and whose other fields
            are ``None``.
        """
        if not return_dict:
            return (hidden_states, None, None)

        # Imported on demand, only for the structured-output path.
        from transformers.modeling_outputs import (
            BaseModelOutputWithPastAndCrossAttentions as _LayerOutput,
        )

        empty_fields = dict.fromkeys(
            ("hidden_states", "attentions", "past_key_values", "cross_attentions")
        )
        return _LayerOutput(last_hidden_state=hidden_states, **empty_fields)

def preprocess_function(examples, tok, max_length=64):
    """Tokenize single sentences to a fixed length.

    Args:
        examples: Mapping with a ``'sentence'`` entry (the only text field
            this cell's SST-2 data provides).
        tok: Tokenizer-like callable taking one text argument plus the
            ``truncation``, ``padding`` and ``max_length`` keywords.
        max_length: Length each sequence is truncated or padded to.

    Returns:
        Whatever ``tok`` returns for the sentences.
    """
    sentences = examples['sentence']
    tok_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }
    return tok(sentences, **tok_options)

def evaluate_model(model, dl, device):
    """Return the accuracy of ``model`` over the batches yielded by ``dl``.

    The model is put into eval mode (and left there) and every batch is run
    with gradient tracking disabled. Predictions are the arg-max over the last
    axis of the logits; they are compared with each batch's ``'labels'`` using
    the ``evaluate`` "accuracy" metric.
    """
    model.eval()
    accuracy_metric = evaluate.load("accuracy")
    predictions, references = [], []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            references.extend(batch['labels'].cpu().numpy())
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predictions.extend(logits.argmax(dim=-1).cpu().numpy())
    scores = accuracy_metric.compute(predictions=predictions, references=references)
    return scores["accuracy"]

def finetune_model(model, train_loader, dev_loader, device, epochs):
    """Fine-tune ``model`` with Adam and a linearly decaying learning rate.

    Mixed precision (autocast plus gradient scaling) is enabled only when
    ``device`` is a CUDA device; otherwise both are disabled and training
    runs in full precision. After every epoch the model is scored on
    ``dev_loader`` and the accuracy is printed.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_loader: Iterable of batches (mappings with those three keys);
            must support ``len()``.
        dev_loader: Batches handed to ``evaluate_model`` after each epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, trained in place.
    """
    use_amp = torch.device(device).type == "cuda"
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=2e-5)
    sched = get_linear_schedule_with_warmup(opt,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader) * epochs)
    scaler = GradScaler(enabled=use_amp)
    for epoch in range(epochs):
        model.train()  # evaluate_model() leaves the model in eval mode
        for b in train_loader:
            opt.zero_grad()
            with autocast(enabled=use_amp):
                out = model(input_ids=b['input_ids'].to(device),
                            attention_mask=b['attention_mask'].to(device),
                            labels=b['labels'].to(device))
            # Backward runs outside autocast, as the AMP documentation recommends.
            scaler.scale(out.loss).backward()
            scale_before = scaler.get_scale()
            scaler.step(opt)
            scaler.update()
            # The scaler lowers its scale when it skipped opt.step() because of
            # inf/NaN gradients; do not advance the schedule for a skipped step.
            if scaler.get_scale() >= scale_before:
                sched.step()
        acc = evaluate_model(model, dev_loader, device)
        print(f"[Epoch {epoch+1}] SST-2 Acc: {acc:.4f}")
    return model

def layerdrop_prune(model, num_prune=4, seed=42):
    """Replace randomly chosen encoder blocks of ``model`` with ``SkipLayer``.

    ``num_prune`` distinct indices into ``model.roberta.encoder.layer`` are
    drawn from a NumPy generator seeded with ``seed``; each selected block is
    overwritten in place by an identity ``SkipLayer``.

    Args:
        model: Model exposing ``roberta.encoder.layer``; mutated in place.
        num_prune: How many blocks to replace.
        seed: Seed for the index draw, making the selection reproducible.

    Returns:
        The replaced layer indices as a sorted list of built-in ``int``.
    """
    layers = model.roberta.encoder.layer
    total_layers = len(layers)
    hidden_size = layers[0].output.dense.out_features

    rng = np.random.default_rng(seed)
    chosen = rng.choice(total_layers, size=num_prune, replace=False)
    # Convert NumPy integer scalars to built-in ints so the indices print
    # as plain numbers and can be serialised by callers.
    prune_idxs = sorted(int(i) for i in chosen)
    print(f"LayerDrop: Pruning layers {prune_idxs} out of {total_layers}")

    for idx in prune_idxs:
        layers[idx] = SkipLayer(hidden_size)
    return prune_idxs

def main():
    """Run the SST-2 experiment: fine-tune, prune four layers, fine-tune again.

    Seeds the Python, NumPy and torch RNGs with 42, trains RoBERTa-base on a
    shuffled 5000-example SST-2 training subset for 6 epochs, replaces four
    encoder blocks via ``layerdrop_prune``, then trains for 5 more epochs.
    Accuracy on the SST-2 validation split is printed after each stage.
    """
    seed = 42
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # load & preprocess SST-2 subset
    train_ds = load_dataset("glue", "sst2", split="train").shuffle(seed).select(range(5000))
    dev_ds = load_dataset("glue", "sst2", split="validation")

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

    def encode(split):
        # Tokenize, drop the raw text column, expose the target as "labels".
        tokenized = split.map(lambda ex: preprocess_function(ex, tokenizer),
                              batched=True,
                              remove_columns=["sentence"])
        return tokenized.rename_column("label", "labels")

    train = encode(train_ds)
    dev = encode(dev_ds)

    collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=64)
    train_loader = DataLoader(train, batch_size=8, shuffle=True, collate_fn=collator)
    dev_loader = DataLoader(dev, batch_size=16, shuffle=False, collate_fn=collator)

    # --- Stage 1: Full fine-tuning ---
    print("\n=== Stage 1: Full Fine-Tuning (No Pruning) ===")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
    model = finetune_model(model.to(device), train_loader, dev_loader, device, epochs=6)
    full_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 6-epoch full fine-tuning: {full_accuracy:.4f}")

    # --- Stage 2: LayerDrop Pruning ---
    print("\n=== Stage 2: LayerDrop Pruning (Remove 4 layers) ===")
    dropped_layers = layerdrop_prune(model, num_prune=4, seed=seed)

    # --- Stage 3: Fine-tune pruned model ---
    print("\n=== Stage 3: Fine-Tune Pruned Model (5 epochs) ===")
    model = finetune_model(model, train_loader, dev_loader, device, epochs=5)
    pruned_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 5-epoch post-pruning fine-tuning: {pruned_accuracy:.4f}")

    print(f"Pruned layer indices: {dropped_layers}")

if __name__ == "__main__":
    main()


In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import numpy as np
import random
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import evaluate
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

class SkipLayer(nn.Module):
    """Identity stand-in for a pruned transformer block.

    The module owns no weights: ``forward`` hands the incoming
    ``hidden_states`` back untouched, so the surrounding stack behaves as if
    the replaced block were absent.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Kept for reference only; forward() never reads it.
        self.hidden_size = hidden_size

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Return ``hidden_states`` unchanged, wrapped like a layer output.

        Every argument other than ``hidden_states`` and ``return_dict`` is
        accepted and ignored.

        Returns:
            ``(hidden_states, None, None)`` when ``return_dict`` is falsy;
            otherwise a ``BaseModelOutputWithPastAndCrossAttentions`` whose
            ``last_hidden_state`` is ``hidden_states`` and whose other fields
            are ``None``.
        """
        if not return_dict:
            return (hidden_states, None, None)

        # Imported on demand, only for the structured-output path.
        from transformers.modeling_outputs import (
            BaseModelOutputWithPastAndCrossAttentions as _LayerOutput,
        )

        empty_fields = dict.fromkeys(
            ("hidden_states", "attentions", "past_key_values", "cross_attentions")
        )
        return _LayerOutput(last_hidden_state=hidden_states, **empty_fields)

def preprocess_function(examples, tok, max_length=64):
    """Tokenize single sentences to a fixed length.

    Args:
        examples: Mapping with a ``'sentence'`` entry.
        tok: Tokenizer-like callable taking one text argument plus the
            ``truncation``, ``padding`` and ``max_length`` keywords.
        max_length: Length each sequence is truncated or padded to.

    Returns:
        Whatever ``tok`` returns for the sentences.
    """
    sentences = examples['sentence']
    tok_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }
    return tok(sentences, **tok_options)

def evaluate_model(model, dl, device):
    """Return the accuracy of ``model`` over the batches yielded by ``dl``.

    The model is put into eval mode (and left there) and every batch is run
    with gradient tracking disabled. Predictions are the arg-max over the last
    axis of the logits; they are compared with each batch's ``'labels'`` using
    the ``evaluate`` "accuracy" metric.
    """
    model.eval()
    accuracy_metric = evaluate.load("accuracy")
    predictions, references = [], []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            references.extend(batch['labels'].cpu().numpy())
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predictions.extend(logits.argmax(dim=-1).cpu().numpy())
    scores = accuracy_metric.compute(predictions=predictions, references=references)
    return scores["accuracy"]

def finetune_model(model, train_loader, dev_loader, device, epochs):
    """Fine-tune ``model`` with Adam and a linearly decaying learning rate.

    Mixed precision (autocast plus gradient scaling) is enabled only when
    ``device`` is a CUDA device; otherwise both are disabled and training
    runs in full precision. After every epoch the model is scored on
    ``dev_loader`` and the accuracy is printed.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_loader: Iterable of batches (mappings with those three keys);
            must support ``len()``.
        dev_loader: Batches handed to ``evaluate_model`` after each epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, trained in place.
    """
    use_amp = torch.device(device).type == "cuda"
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=2e-5)
    sched = get_linear_schedule_with_warmup(opt,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader) * epochs)
    scaler = GradScaler(enabled=use_amp)
    for epoch in range(epochs):
        model.train()  # evaluate_model() leaves the model in eval mode
        for b in train_loader:
            opt.zero_grad()
            with autocast(enabled=use_amp):
                out = model(input_ids=b['input_ids'].to(device),
                            attention_mask=b['attention_mask'].to(device),
                            labels=b['labels'].to(device))
            # Backward runs outside autocast, as the AMP documentation recommends.
            scaler.scale(out.loss).backward()
            scale_before = scaler.get_scale()
            scaler.step(opt)
            scaler.update()
            # The scaler lowers its scale when it skipped opt.step() because of
            # inf/NaN gradients; do not advance the schedule for a skipped step.
            if scaler.get_scale() >= scale_before:
                sched.step()
        acc = evaluate_model(model, dev_loader, device)
        print(f"[Epoch {epoch+1}] CoLA Acc: {acc:.4f}")
    return model

def layerdrop_prune(model, num_prune=4, seed=42):
    """Replace randomly chosen encoder blocks of ``model`` with ``SkipLayer``.

    ``num_prune`` distinct indices into ``model.roberta.encoder.layer`` are
    drawn from a NumPy generator seeded with ``seed``; each selected block is
    overwritten in place by an identity ``SkipLayer``.

    Args:
        model: Model exposing ``roberta.encoder.layer``; mutated in place.
        num_prune: How many blocks to replace.
        seed: Seed for the index draw, making the selection reproducible.

    Returns:
        The replaced layer indices as a sorted list of built-in ``int``.
    """
    layers = model.roberta.encoder.layer
    total_layers = len(layers)
    hidden_size = layers[0].output.dense.out_features

    rng = np.random.default_rng(seed)
    chosen = rng.choice(total_layers, size=num_prune, replace=False)
    # Convert NumPy integer scalars to built-in ints so the indices print
    # as plain numbers and can be serialised by callers.
    prune_idxs = sorted(int(i) for i in chosen)
    print(f"LayerDrop: Pruning layers {prune_idxs} out of {total_layers}")

    for idx in prune_idxs:
        layers[idx] = SkipLayer(hidden_size)
    return prune_idxs

def main():
    """Run the CoLA experiment: fine-tune, prune four layers, fine-tune again.

    Seeds the Python, NumPy and torch RNGs with 42, trains RoBERTa-base on
    the CoLA training split for 6 epochs, replaces four encoder blocks via
    ``layerdrop_prune``, then trains for 5 more epochs. Accuracy on the CoLA
    validation split is printed after each stage.
    """
    seed = 42
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load CoLA
    train_ds = load_dataset("glue", "cola", split="train")
    dev_ds = load_dataset("glue", "cola", split="validation")

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

    def encode(split):
        # Tokenize, drop the raw text column, expose the target as "labels".
        tokenized = split.map(lambda ex: preprocess_function(ex, tokenizer),
                              batched=True,
                              remove_columns=["sentence"])
        return tokenized.rename_column("label", "labels")

    train = encode(train_ds)
    dev = encode(dev_ds)

    collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=64)
    train_loader = DataLoader(train, batch_size=8, shuffle=True, collate_fn=collator)
    dev_loader = DataLoader(dev, batch_size=16, shuffle=False, collate_fn=collator)

    # --- Stage 1: Full fine-tuning ---
    print("\n=== Stage 1: Full Fine-Tuning (No Pruning) ===")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
    model = finetune_model(model.to(device), train_loader, dev_loader, device, epochs=6)
    full_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 6-epoch full fine-tuning: {full_accuracy:.4f}")

    # --- Stage 2: LayerDrop Pruning ---
    print("\n=== Stage 2: LayerDrop Pruning (Remove 4 layers) ===")
    dropped_layers = layerdrop_prune(model, num_prune=4, seed=seed)

    # --- Stage 3: Fine-tune pruned model ---
    print("\n=== Stage 3: Fine-Tune Pruned Model (5 epochs) ===")
    model = finetune_model(model, train_loader, dev_loader, device, epochs=5)
    pruned_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 5-epoch post-pruning fine-tuning: {pruned_accuracy:.4f}")

    print(f"Pruned layer indices: {dropped_layers}")

if __name__ == "__main__":
    main()


In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import numpy as np
import random
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import evaluate
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

class SkipLayer(nn.Module):
    """Identity stand-in for a pruned transformer block.

    The module owns no weights: ``forward`` hands the incoming
    ``hidden_states`` back untouched, so the surrounding stack behaves as if
    the replaced block were absent.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Kept for reference only; forward() never reads it.
        self.hidden_size = hidden_size

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Return ``hidden_states`` unchanged, wrapped like a layer output.

        Every argument other than ``hidden_states`` and ``return_dict`` is
        accepted and ignored.

        Returns:
            ``(hidden_states, None, None)`` when ``return_dict`` is falsy;
            otherwise a ``BaseModelOutputWithPastAndCrossAttentions`` whose
            ``last_hidden_state`` is ``hidden_states`` and whose other fields
            are ``None``.
        """
        if not return_dict:
            return (hidden_states, None, None)

        # Imported on demand, only for the structured-output path.
        from transformers.modeling_outputs import (
            BaseModelOutputWithPastAndCrossAttentions as _LayerOutput,
        )

        empty_fields = dict.fromkeys(
            ("hidden_states", "attentions", "past_key_values", "cross_attentions")
        )
        return _LayerOutput(last_hidden_state=hidden_states, **empty_fields)

def preprocess_function(examples, tok, max_length=64):
    """Tokenize question/sentence pairs to a fixed length.

    Args:
        examples: Mapping with ``'question'`` and ``'sentence'`` entries.
        tok: Tokenizer-like callable taking two text arguments plus the
            ``truncation``, ``padding`` and ``max_length`` keywords.
        max_length: Length each sequence is truncated or padded to.

    Returns:
        Whatever ``tok`` returns for the pair.
    """
    first_text = examples['question']
    second_text = examples['sentence']
    tok_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }
    return tok(first_text, second_text, **tok_options)

def evaluate_model(model, dl, device):
    """Return the accuracy of ``model`` over the batches yielded by ``dl``.

    The model is put into eval mode (and left there) and every batch is run
    with gradient tracking disabled. Predictions are the arg-max over the last
    axis of the logits; they are compared with each batch's ``'labels'`` using
    the ``evaluate`` "accuracy" metric.
    """
    model.eval()
    accuracy_metric = evaluate.load("accuracy")
    predictions, references = [], []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            references.extend(batch['labels'].cpu().numpy())
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predictions.extend(logits.argmax(dim=-1).cpu().numpy())
    scores = accuracy_metric.compute(predictions=predictions, references=references)
    return scores["accuracy"]

def finetune_model(model, train_loader, dev_loader, device, epochs):
    """Fine-tune ``model`` with Adam and a linearly decaying learning rate.

    Mixed precision (autocast plus gradient scaling) is enabled only when
    ``device`` is a CUDA device; otherwise both are disabled and training
    runs in full precision. After every epoch the model is scored on
    ``dev_loader`` and the accuracy is printed.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_loader: Iterable of batches (mappings with those three keys);
            must support ``len()``.
        dev_loader: Batches handed to ``evaluate_model`` after each epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, trained in place.
    """
    use_amp = torch.device(device).type == "cuda"
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=2e-5)
    sched = get_linear_schedule_with_warmup(opt,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader) * epochs)
    scaler = GradScaler(enabled=use_amp)
    for epoch in range(epochs):
        model.train()  # evaluate_model() leaves the model in eval mode
        for b in train_loader:
            opt.zero_grad()
            with autocast(enabled=use_amp):
                out = model(input_ids=b['input_ids'].to(device),
                            attention_mask=b['attention_mask'].to(device),
                            labels=b['labels'].to(device))
            # Backward runs outside autocast, as the AMP documentation recommends.
            scaler.scale(out.loss).backward()
            scale_before = scaler.get_scale()
            scaler.step(opt)
            scaler.update()
            # The scaler lowers its scale when it skipped opt.step() because of
            # inf/NaN gradients; do not advance the schedule for a skipped step.
            if scaler.get_scale() >= scale_before:
                sched.step()
        acc = evaluate_model(model, dev_loader, device)
        print(f"[Epoch {epoch+1}] QNLI Acc: {acc:.4f}")
    return model

def layerdrop_prune(model, num_prune=4, seed=42):
    """Replace randomly chosen encoder blocks of ``model`` with ``SkipLayer``.

    ``num_prune`` distinct indices into ``model.roberta.encoder.layer`` are
    drawn from a NumPy generator seeded with ``seed``; each selected block is
    overwritten in place by an identity ``SkipLayer``.

    Args:
        model: Model exposing ``roberta.encoder.layer``; mutated in place.
        num_prune: How many blocks to replace.
        seed: Seed for the index draw, making the selection reproducible.

    Returns:
        The replaced layer indices as a sorted list of built-in ``int``.
    """
    layers = model.roberta.encoder.layer
    total_layers = len(layers)
    hidden_size = layers[0].output.dense.out_features

    rng = np.random.default_rng(seed)
    chosen = rng.choice(total_layers, size=num_prune, replace=False)
    # Convert NumPy integer scalars to built-in ints so the indices print
    # as plain numbers and can be serialised by callers.
    prune_idxs = sorted(int(i) for i in chosen)
    print(f"LayerDrop: Pruning layers {prune_idxs} out of {total_layers}")

    for idx in prune_idxs:
        layers[idx] = SkipLayer(hidden_size)
    return prune_idxs

def main():
    """Run the QNLI experiment: fine-tune, prune four layers, fine-tune again.

    Seeds the Python, NumPy and torch RNGs with 42, trains RoBERTa-base on a
    shuffled 5000-example QNLI training subset for 6 epochs, replaces four
    encoder blocks via ``layerdrop_prune``, then trains for 5 more epochs.
    Accuracy on the QNLI validation split is printed after each stage.
    """
    seed = 42
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load QNLI
    train_ds = load_dataset("glue", "qnli", split="train").shuffle(seed).select(range(5000))
    dev_ds = load_dataset("glue", "qnli", split="validation")

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

    def encode(split):
        # Tokenize, drop the raw text columns, expose the target as "labels".
        tokenized = split.map(lambda ex: preprocess_function(ex, tokenizer),
                              batched=True,
                              remove_columns=["question", "sentence"])
        return tokenized.rename_column("label", "labels")

    train = encode(train_ds)
    dev = encode(dev_ds)

    collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=64)
    train_loader = DataLoader(train, batch_size=8, shuffle=True, collate_fn=collator)
    dev_loader = DataLoader(dev, batch_size=16, shuffle=False, collate_fn=collator)

    # --- Stage 1: Full fine-tuning ---
    print("\n=== Stage 1: Full Fine-Tuning (No Pruning) ===")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
    model = finetune_model(model.to(device), train_loader, dev_loader, device, epochs=6)
    full_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 6-epoch full fine-tuning: {full_accuracy:.4f}")

    # --- Stage 2: LayerDrop Pruning ---
    print("\n=== Stage 2: LayerDrop Pruning (Remove 4 layers) ===")
    dropped_layers = layerdrop_prune(model, num_prune=4, seed=seed)

    # --- Stage 3: Fine-tune pruned model ---
    print("\n=== Stage 3: Fine-Tune Pruned Model (5 epochs) ===")
    model = finetune_model(model, train_loader, dev_loader, device, epochs=5)
    pruned_accuracy = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 5-epoch post-pruning fine-tuning: {pruned_accuracy:.4f}")

    print(f"Pruned layer indices: {dropped_layers}")

if __name__ == "__main__":
    main()


In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import numpy as np
import random
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import evaluate
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

class SkipLayer(nn.Module):
    """Identity stand-in for a pruned transformer block.

    The module owns no weights: ``forward`` hands the incoming
    ``hidden_states`` back untouched, so the surrounding stack behaves as if
    the replaced block were absent.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Kept for reference only; forward() never reads it.
        self.hidden_size = hidden_size

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Return ``hidden_states`` unchanged, wrapped like a layer output.

        Every argument other than ``hidden_states`` and ``return_dict`` is
        accepted and ignored.

        Returns:
            ``(hidden_states, None, None)`` when ``return_dict`` is falsy;
            otherwise a ``BaseModelOutputWithPastAndCrossAttentions`` whose
            ``last_hidden_state`` is ``hidden_states`` and whose other fields
            are ``None``.
        """
        if not return_dict:
            return (hidden_states, None, None)

        # Imported on demand, only for the structured-output path.
        from transformers.modeling_outputs import (
            BaseModelOutputWithPastAndCrossAttentions as _LayerOutput,
        )

        empty_fields = dict.fromkeys(
            ("hidden_states", "attentions", "past_key_values", "cross_attentions")
        )
        return _LayerOutput(last_hidden_state=hidden_states, **empty_fields)

def preprocess_function(examples, tok, max_length=64):
    """Tokenize question pairs to a fixed length.

    Args:
        examples: Mapping with ``'question1'`` and ``'question2'`` entries.
        tok: Tokenizer-like callable taking two text arguments plus the
            ``truncation``, ``padding`` and ``max_length`` keywords.
        max_length: Length each sequence is truncated or padded to.

    Returns:
        Whatever ``tok`` returns for the pair.
    """
    first_text = examples['question1']
    second_text = examples['question2']
    tok_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }
    return tok(first_text, second_text, **tok_options)

def evaluate_model(model, dl, device):
    """Return the accuracy of ``model`` over the batches yielded by ``dl``.

    The model is put into eval mode (and left there) and every batch is run
    with gradient tracking disabled. Predictions are the arg-max over the last
    axis of the logits; they are compared with each batch's ``'labels'`` using
    the ``evaluate`` "accuracy" metric.
    """
    model.eval()
    accuracy_metric = evaluate.load("accuracy")
    predictions, references = [], []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            references.extend(batch['labels'].cpu().numpy())
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predictions.extend(logits.argmax(dim=-1).cpu().numpy())
    scores = accuracy_metric.compute(predictions=predictions, references=references)
    return scores["accuracy"]

def finetune_model(model, train_loader, dev_loader, device, epochs):
    """Fine-tune ``model`` with mixed precision and print dev accuracy per epoch.

    Uses Adam (lr=2e-5) with a linear schedule (no warmup) spanning
    ``len(train_loader) * epochs`` steps. After every epoch the model is
    scored with ``evaluate_model`` on ``dev_loader`` and the accuracy printed.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        dev_loader: Batches passed to ``evaluate_model``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, updated in place.
    """
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=2e-5)
    sched = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * epochs,
    )
    scaler = GradScaler()
    for epoch in range(epochs):
        # evaluate_model() switches the model to eval mode, so training mode
        # has to be re-enabled at the start of every epoch.
        model.train()
        for b in train_loader:
            opt.zero_grad()
            # Only the forward pass (which also computes the loss) runs
            # under autocast.
            with autocast():
                out = model(input_ids=b['input_ids'].to(device),
                            attention_mask=b['attention_mask'].to(device),
                            labels=b['labels'].to(device))
            # backward() is issued after leaving the autocast context; the
            # PyTorch AMP docs recommend against running it under autocast.
            scaler.scale(out.loss).backward()
            scaler.step(opt)
            scaler.update()
            sched.step()
        acc = evaluate_model(model, dev_loader, device)
        print(f"[Epoch {epoch+1}] QQP Acc: {acc:.4f}")
    return model

def layerdrop_prune(model, num_prune=4, seed=42):
    """Replace randomly chosen encoder layers of ``model`` with ``SkipLayer``.

    ``num_prune`` distinct indices are drawn without replacement from
    ``model.roberta.encoder.layer`` using a NumPy generator seeded with
    ``seed``, so the selection does not touch the global NumPy random state.
    The model is modified in place.

    Args:
        model: Model exposing ``model.roberta.encoder.layer``.
        num_prune: How many layers to replace.
        seed: Seed for the index selection.

    Returns:
        Sorted list of the replaced layer indices as built-in ints.
    """
    layers = model.roberta.encoder.layer
    total_layers = len(layers)
    hidden_size = layers[0].output.dense.out_features

    rng = np.random.default_rng(seed)
    # .tolist() turns NumPy integer scalars into built-in ints, so neither the
    # log line nor the returned list carries NumPy scalar reprs.
    chosen = rng.choice(total_layers, size=num_prune, replace=False)
    prune_idxs = sorted(chosen.tolist())
    print(f"LayerDrop: Pruning layers {prune_idxs} out of {total_layers}")

    for idx in prune_idxs:
        layers[idx] = SkipLayer(hidden_size)
    return prune_idxs

def main():
    """Run the QQP experiment: fine-tune, prune random layers, fine-tune again.

    Stages:
        1. Fine-tune ``roberta-base`` on a subset of the QQP training split.
        2. Replace randomly chosen encoder layers with ``SkipLayer``.
        3. Fine-tune the pruned model and report validation accuracy.
    """
    seed = 42
    # Experiment settings, named once so the log messages below cannot drift
    # away from the values that are actually used.
    train_subset = 2000
    max_length = 64
    full_epochs = 6
    pruned_epochs = 5
    num_prune = 4

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load QQP. Training is subsampled for speed; drop the .select(...) call
    # to train on the full split. Validation uses the full split.
    train_ds = load_dataset("glue", "qqp", split="train").select(range(train_subset))
    dev_ds   = load_dataset("glue", "qqp", split="validation")

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    raw_columns = ["question1", "question2", "idx"]
    train = train_ds.map(lambda ex: preprocess_function(ex, tokenizer, max_length=max_length),
                         batched=True,
                         remove_columns=raw_columns)\
                    .rename_column("label", "labels")
    dev   = dev_ds.map(lambda ex: preprocess_function(ex, tokenizer, max_length=max_length),
                       batched=True,
                       remove_columns=raw_columns)\
                  .rename_column("label", "labels")

    collator     = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=max_length)
    train_loader = DataLoader(train, batch_size=8, shuffle=True, collate_fn=collator)
    dev_loader   = DataLoader(dev, batch_size=16, shuffle=False, collate_fn=collator)

    # --- Stage 1: Full fine-tuning ---
    print("\n=== Stage 1: Full Fine-Tuning (No Pruning) ===")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)
    model = finetune_model(model, train_loader, dev_loader, device, epochs=full_epochs)
    acc_full = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after {full_epochs}-epoch full fine-tuning: {acc_full:.4f}")

    # --- Stage 2: LayerDrop Pruning ---
    print(f"\n=== Stage 2: LayerDrop Pruning (Remove {num_prune} layers) ===")
    prune_idxs = layerdrop_prune(model, num_prune=num_prune, seed=seed)

    # --- Stage 3: Fine-tune pruned model ---
    print(f"\n=== Stage 3: Fine-Tune Pruned Model ({pruned_epochs} epochs) ===")
    model = finetune_model(model, train_loader, dev_loader, device, epochs=pruned_epochs)
    acc_pruned = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after {pruned_epochs}-epoch post-pruning fine-tuning: {acc_pruned:.4f}")

    print(f"Pruned layer indices: {prune_idxs}")

# Entry point: run the QQP experiment when this code is executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
from datasets import load_dataset
import torch
import torch.nn as nn
import numpy as np
import random
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import evaluate
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# --- LayerSkip utility for pruning entire transformer blocks ---
class SkipLayer(nn.Module):
    """Identity replacement for a pruned encoder layer.

    Holds no weights. ``forward`` mirrors the keyword arguments shown in its
    signature but only uses ``hidden_states`` and ``return_dict``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Recorded for reference; nothing is allocated from it.
        self.hidden_size = hidden_size

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Return ``hidden_states`` unchanged.

        With a falsy ``return_dict`` the result is the tuple
        ``(hidden_states, None, None)``; otherwise it is a
        ``BaseModelOutputWithPastAndCrossAttentions`` with only
        ``last_hidden_state`` set.
        """
        if not return_dict:
            return (hidden_states, None, None)

        from transformers.modeling_outputs import (
            BaseModelOutputWithPastAndCrossAttentions as _IdentityOutput,
        )

        return _IdentityOutput(
            last_hidden_state=hidden_states,
            hidden_states=None,
            attentions=None,
            past_key_values=None,
            cross_attentions=None,
        )

# --- Data/Eval helpers ---
def preprocess_function(examples, tok, max_length=64):
    """Tokenize RTE sentence pairs to a fixed length.

    Calls ``tok`` on the ``sentence1`` / ``sentence2`` entries of ``examples``
    with truncation and ``max_length`` padding enabled, and returns whatever
    ``tok`` returns.
    """
    first, second = examples['sentence1'], examples['sentence2']
    tok_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }
    return tok(first, second, **tok_options)

def evaluate_model(model, dl, device):
    """Compute classification accuracy of ``model`` on the batches of ``dl``.

    The model is put in eval mode and left there. Predicted classes are the
    argmax over the logits; they are compared with each batch's ``labels``
    using the ``evaluate`` accuracy metric.
    """
    model.eval()
    accuracy = evaluate.load("accuracy")
    predicted, expected = [], []
    with torch.no_grad():
        for batch in dl:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            expected.extend(batch['labels'].cpu().numpy())
            result = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted.extend(torch.argmax(result.logits, -1).cpu().numpy())
    report = accuracy.compute(predictions=predicted, references=expected)
    return report["accuracy"]

# --- Standard fine-tuning ---
def finetune_model(model, train_loader, dev_loader, device, epochs):
    """Fine-tune ``model`` with mixed precision and print dev accuracy per epoch.

    Uses Adam (lr=2e-5) with a linear schedule (no warmup) spanning
    ``len(train_loader) * epochs`` steps. After every epoch the model is
    scored with ``evaluate_model`` on ``dev_loader`` and the accuracy printed.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        dev_loader: Batches passed to ``evaluate_model``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, updated in place.
    """
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=2e-5)
    sched = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * epochs,
    )
    scaler = GradScaler()
    for epoch in range(epochs):
        # evaluate_model() switches the model to eval mode, so training mode
        # has to be re-enabled at the start of every epoch.
        model.train()
        for b in train_loader:
            opt.zero_grad()
            # Only the forward pass (which also computes the loss) runs
            # under autocast.
            with autocast():
                out = model(input_ids=b['input_ids'].to(device),
                            attention_mask=b['attention_mask'].to(device),
                            labels=b['labels'].to(device))
            # backward() is issued after leaving the autocast context; the
            # PyTorch AMP docs recommend against running it under autocast.
            scaler.scale(out.loss).backward()
            scaler.step(opt)
            scaler.update()
            sched.step()
        acc = evaluate_model(model, dev_loader, device)
        print(f"[Epoch {epoch+1}] RTE Acc: {acc:.4f}")
    return model

# --- LayerDrop-style layer pruning ---
def layerdrop_prune(model, num_prune=4, seed=42):
    """Replace randomly chosen encoder layers of ``model`` with ``SkipLayer``.

    ``num_prune`` distinct indices are drawn without replacement from
    ``model.roberta.encoder.layer`` using a NumPy generator seeded with
    ``seed``, so the selection does not touch the global NumPy random state.
    The model is modified in place.

    Args:
        model: Model exposing ``model.roberta.encoder.layer``.
        num_prune: How many layers to replace.
        seed: Seed for the index selection.

    Returns:
        Sorted list of the replaced layer indices as built-in ints.
    """
    layers = model.roberta.encoder.layer
    total_layers = len(layers)
    hidden_size = layers[0].output.dense.out_features

    rng = np.random.default_rng(seed)
    # .tolist() turns NumPy integer scalars into built-in ints, so neither the
    # log line nor the returned list carries NumPy scalar reprs.
    chosen = rng.choice(total_layers, size=num_prune, replace=False)
    prune_idxs = sorted(chosen.tolist())
    print(f"LayerDrop: Pruning layers {prune_idxs} out of {total_layers}")

    for idx in prune_idxs:
        layers[idx] = SkipLayer(hidden_size)
    return prune_idxs

# --- Main ---
def main():
    """Run the RTE experiment: fine-tune, prune random layers, fine-tune again.

    Stage 1 fine-tunes ``roberta-base`` on the RTE training split, stage 2
    replaces randomly chosen encoder layers with ``SkipLayer``, and stage 3
    fine-tunes the pruned model. Validation accuracy is printed throughout.
    """
    seed = 42
    for seed_everything in (random.seed, np.random.seed, torch.manual_seed):
        seed_everything(seed)
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    # RTE is small, so both splits are used in full.
    train_ds = load_dataset("glue", "rte", split="train")
    dev_ds   = load_dataset("glue", "rte", split="validation")

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

    def encode(split):
        # Tokenize in batches, drop the raw columns, and expose the target
        # under the "labels" name.
        tokenized = split.map(
            lambda ex: preprocess_function(ex, tokenizer),
            batched=True,
            remove_columns=["sentence1", "sentence2", "idx"],
        )
        return tokenized.rename_column("label", "labels")

    train = encode(train_ds)
    dev = encode(dev_ds)

    collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=64)
    train_loader = DataLoader(train, batch_size=8, shuffle=True, collate_fn=collator)
    dev_loader = DataLoader(dev, batch_size=16, shuffle=False, collate_fn=collator)

    # --- Stage 1: Full fine-tuning ---
    print("\n=== Stage 1: Full Fine-Tuning (No Pruning) ===")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
    model = model.to(device)
    model = finetune_model(model, train_loader, dev_loader, device, epochs=6)
    acc_full = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 6-epoch full fine-tuning: {acc_full:.4f}")

    # --- Stage 2: LayerDrop Pruning ---
    print("\n=== Stage 2: LayerDrop Pruning (Remove 4 layers) ===")
    prune_idxs = layerdrop_prune(model, num_prune=4, seed=seed)

    # --- Stage 3: Fine-tune pruned model ---
    print("\n=== Stage 3: Fine-Tune Pruned Model (5 epochs) ===")
    model = finetune_model(model, train_loader, dev_loader, device, epochs=5)
    acc_pruned = evaluate_model(model, dev_loader, device)
    print(f"\nAccuracy after 5-epoch post-pruning fine-tuning: {acc_pruned:.4f}")

    print(f"Pruned layer indices: {prune_idxs}")

# Entry point: run the RTE experiment when this code is executed as the main module.
if __name__ == "__main__":
    main()


In [None]:
import numpy as np
import random
import math
import warnings

# Monkey-patch numpy.array so that a ``copy=`` keyword is accepted and
# discarded (NumPy 2.0 compatibility shim); every call then falls back to
# np.array's own default copy behaviour.
# The marker attribute makes the patch idempotent: re-running this cell does
# not wrap the wrapper a second time.
if not getattr(np.array, "_copy_arg_ignored", False):
    _np_array = np.array

    def _patched_array(obj, *args, copy=False, **kwargs):
        """Forward to the original ``np.array`` without the ``copy`` argument."""
        return _np_array(obj, *args, **kwargs)

    _patched_array._copy_arg_ignored = True
    np.array = _patched_array

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader
import evaluate

from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)
from datasets import load_dataset

from collections import defaultdict

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# 1) SKIP FFN LAYER (for pruning)
class SkipFF(nn.Module):
    """Drop-in for a layer's output block that bypasses the feed-forward result.

    ``forward`` discards ``hidden_states`` and hands back ``input_tensor``
    (``None`` when it is not supplied).
    """

    def forward(self, hidden_states, input_tensor=None):
        # Only the second argument survives; the first is intentionally unused.
        passthrough = input_tensor
        return passthrough

def prune_fixed_layers(model, prune_idxs=(0, 5, 7, 10)):
    """Disable the feed-forward sub-block of the given encoder layers in place.

    For each index, the layer's ``intermediate.dense`` becomes ``nn.Identity``
    and its ``output`` module becomes ``SkipFF``; the layer's other submodules
    are left untouched.

    Args:
        model: Model exposing ``model.roberta.encoder.layer``.
        prune_idxs: Iterable of layer indices to modify. The default is an
            immutable tuple rather than a shared mutable list.
    """
    # Materialise once so one-shot iterables can be both iterated and logged;
    # the default therefore still prints in list form.
    prune_idxs = list(prune_idxs)
    for idx in prune_idxs:
        layer = model.roberta.encoder.layer[idx]
        layer.intermediate.dense = nn.Identity()
        layer.output = SkipFF()
    print("Pruned layers:", prune_idxs)

# 2) LoRA block (optional, can be removed if not using LoRA)
class LoRA(nn.Module):
    """Low-rank additive update on top of a frozen copy of a weight matrix.

    The effective weight is ``W0 + (alpha / r) * (B @ A)``. ``W0`` is stored
    as a buffer; ``B`` and ``A`` are the trainable factors. ``A`` starts at
    zero, so the effective weight initially equals ``W0``.
    """

    def __init__(self, W0, r=2, alpha=1.0):
        super().__init__()
        frozen_weight = W0.clone().detach()
        self.register_buffer("W0", frozen_weight)
        n_rows, n_cols = W0.shape
        # B gets small random values, A is all zeros.
        self.B = nn.Parameter(torch.randn(n_rows, r) * 0.01)
        self.A = nn.Parameter(torch.zeros(r, n_cols))
        self.scaling = alpha / r

    def forward(self):
        """Return the frozen weight plus the scaled low-rank product."""
        low_rank_delta = self.B @ self.A
        return self.W0 + self.scaling * low_rank_delta

def apply_lora_to_all_layers(model, r=2, alpha=1.0):
    """Route each encoder layer's ``output.dense`` projection through a LoRA weight.

    Layers whose ``output`` has no ``dense`` attribute are skipped. For the
    rest, a ``LoRA`` module is built from the current dense weight and the
    dense module's ``forward`` is replaced by a function that applies
    ``F.linear`` with the LoRA weight and the dense bias.

    Returns:
        Dict mapping layer index to its ``LoRA`` module.
        NOTE(review): the modules are not assigned as submodules here, so
        presumably their parameters must be passed to the optimizer
        separately; confirm with the caller.
    """
    adapters = {}
    encoder_layers = model.roberta.encoder.layer
    for layer_idx, enc_layer in enumerate(encoder_layers):
        if hasattr(enc_layer.output, 'dense'):
            base_weight = enc_layer.output.dense.weight.data
            adapter = LoRA(base_weight, r, alpha).to(base_weight.device)

            # Default arguments bind this iteration's layer and adapter.
            def lora_forward(x, layer=enc_layer, lora=adapter):
                return F.linear(x, lora(), layer.output.dense.bias)

            enc_layer.output.dense.forward = lora_forward
            adapters[layer_idx] = adapter
    return adapters

# 3) STS-B Evaluation
def evaluate_stsb(model, dataloader, device):
    """Score ``model`` on ``dataloader`` with the GLUE STS-B metric.

    Puts the model in eval mode (it is not switched back), collects the
    logits with their last dimension squeezed as predictions and each batch's
    ``labels`` as references, and returns the dict produced by the metric's
    ``compute``.
    """
    model.eval()
    stsb_metric = evaluate.load("glue", "stsb")
    predictions, references = [], []
    with torch.no_grad():
        for batch in dataloader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            # Predictions: a flat list, or a bare scalar that gets wrapped.
            batch_preds = outputs.logits.squeeze(-1).cpu().tolist()
            if not isinstance(batch_preds, list):
                batch_preds = [batch_preds]
            predictions.extend(batch_preds)
            # References may arrive nested (e.g. [[5.0], [4.75], ...]); keep
            # the first element of each nested entry.
            for label in batch["labels"].cpu().tolist():
                nested = isinstance(label, (list, tuple, np.ndarray))
                references.append(float(label[0]) if nested else float(label))
    return stsb_metric.compute(predictions=predictions, references=references)

# 4) Fine-tuning block
def finetune(train_loader, dev_loader, device, model, epochs=6, lr=2e-5):
    """Fine-tune ``model`` with mixed precision and print STS-B scores per epoch.

    Uses AdamW with a linear schedule (no warmup) spanning
    ``len(train_loader) * epochs`` steps. After every epoch the model is
    scored with ``evaluate_stsb`` on ``dev_loader`` and the Pearson and
    Spearman values are printed.

    Args:
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        dev_loader: Batches passed to ``evaluate_stsb``.
        device: Device the batch tensors are moved to.
        model: Model whose forward accepts those three entries and returns
            an object with a ``loss`` attribute.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for AdamW.

    Returns:
        The same ``model`` object, updated in place.
    """
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    sched = get_linear_schedule_with_warmup(
        opt, num_warmup_steps=0, num_training_steps=len(train_loader)*epochs
    )
    scaler = GradScaler()
    for epoch in range(epochs):
        # evaluate_stsb() switches the model to eval mode, so training mode
        # has to be re-enabled at the start of every epoch.
        model.train()
        for batch in train_loader:
            opt.zero_grad()
            # Only the forward pass (which also computes the loss) runs
            # under autocast.
            with autocast():
                out = model(
                    input_ids=batch["input_ids"].to(device),
                    attention_mask=batch["attention_mask"].to(device),
                    labels=batch["labels"].to(device),
                )
            # backward() is issued after leaving the autocast context; the
            # PyTorch AMP docs recommend against running it under autocast.
            scaler.scale(out.loss).backward()
            scaler.step(opt)
            scaler.update()
            sched.step()
        metrics = evaluate_stsb(model, dev_loader, device)
        print(f"[Epoch {epoch+1}] STS-B Pearson: {metrics['pearson']:.4f}, Spearman: {metrics['spearmanr']:.4f}")
    return model

# 5) Main Entrypoint
def main():
    """Run the STS-B experiment: fine-tune, prune fixed layers, fine-tune again.

    Stage 1 fine-tunes ``roberta-base`` as a single-output regressor on
    STS-B, stage 2 calls ``prune_fixed_layers`` for layers 0, 5, 7 and 10,
    and stage 3 fine-tunes the pruned model at a lower learning rate.
    """
    seed = 42
    for seed_everything in (random.seed, np.random.seed, torch.manual_seed):
        seed_everything(seed)
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
    train_ds = load_dataset("glue", "stsb", split="train").shuffle(seed)
    dev_ds   = load_dataset("glue", "stsb", split="validation")

    def preprocess(ex):
        return tokenizer(
            ex["sentence1"], ex["sentence2"],
            truncation=True, padding="max_length", max_length=128
        )

    def to_model_inputs(split):
        # Tokenize, store the score as a float under "labels", drop the raw
        # columns, and expose the remaining columns as torch tensors.
        split = split.map(preprocess, batched=True)
        split = split.map(lambda row: {"labels": float(row["label"])}, batched=False)
        split = split.remove_columns(["sentence1", "sentence2", "label", "idx"])
        split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
        return split

    train_ds = to_model_inputs(train_ds)
    dev_ds = to_model_inputs(dev_ds)

    collator = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=128)
    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=collator)
    dev_loader = DataLoader(dev_ds, batch_size=16, shuffle=False, collate_fn=collator)

    # Stage 1: Full fine-tuning
    print("=== Stage 1: Full Fine-Tuning (No Pruning) ===")
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=1)
    model = model.to(device)
    model.gradient_checkpointing_enable()
    model = finetune(train_loader, dev_loader, device, model, epochs=6, lr=2e-5)
    stage1_scores = evaluate_stsb(model, dev_loader, device)
    print(f"Pearson after 6-epoch fine-tuning: {stage1_scores['pearson']:.4f}")

    # Stage 2: Prune fixed layers
    print("\n=== Stage 2: Prune Layers [0,5,7,10] ===")
    prune_fixed_layers(model, prune_idxs=[0, 5, 7, 10])

    # Stage 3: Fine-tune pruned model
    print("\n=== Stage 3: Fine-Tune Pruned Model (5 epochs) ===")
    model = finetune(train_loader, dev_loader, device, model, epochs=5, lr=1e-5)
    stage3_scores = evaluate_stsb(model, dev_loader, device)
    print(f"Pearson after 5-epoch post-pruning fine-tuning: {stage3_scores['pearson']:.4f}")

# Entry point: run the STS-B experiment when this code is executed as the main module.
if __name__ == "__main__":
    main()
