<a href="https://colab.research.google.com/github/hadywalied/DistillPegasus/blob/main/StudentPegasus_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers datasets sentencepiece
!pip install tensorboard scikit-learn psutil sacrebleu rouge-score tensorflow_datasets pytorch-lightning matplotlib git-python faiss-cpu streamlit elasticsearch nltk pandas datasets fire pytest conllu sentencepiece protobuf
!pip install jax jaxlib
!#pip install torch-lr-finder
!pip install wandb



# *Imports* *and* *drive storage*

In [None]:
# imports 
import logging
from transformers import PegasusTokenizerFast, PegasusForConditionalGeneration,PegasusConfig
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM, Trainer, TrainingArguments, TrainerCallback, TrainerState, TrainerControl, EarlyStoppingCallback
import datasets

import torch
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F

from typing import Callable, Dict, Iterable, List, Tuple, Union
from transformers import EvalPrediction, PreTrainedTokenizer
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import AdamW
import wandb
import gc

import numpy as np

In [None]:
# Configure logging once for the notebook: timestamped records at INFO level and above.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
# Logger named after the current module.
logger = logging.getLogger(__name__)

In [None]:
# Number of decoder layers the student model gets; also used as the suffix of the output folder name.
student_decoders = 4

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Make sure the dataset folder exists on Drive.
directory = f'/content/drive/MyDrive/GP/dataset'
if not os.path.exists(directory):
    os.makedirs(directory)

# Per-student folder on Drive; the decoder-layer count is part of the name.
# Only the path string is built here -- this cell does not create the folder.
PATH = f'/content/drive/MyDrive/GP/student_{student_decoders}'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dataset identifier; it is also interpolated into the checkpoint name below.
data_name = 'gigaword'
# Pegasus checkpoint name derived from the dataset name.
model_name = f'google/pegasus-{data_name}'

# Preparing Tokenizer

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preparing the Student

In [None]:
#Student configuration

import warnings
import torch
from torch import nn
from typing import Optional, Tuple, List, Union
from transformers import PegasusModel, PegasusConfig, PegasusForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, PreTrainedModel
from transformers import SummarizationPipeline

# Lookup of PegasusConfig objects keyed by layer count (as a string); every entry sets
# encoder_layers and decoder_layers to the same value.
# NOTE(review): nothing else in the visible notebook reads this dict -- confirm it is still needed.
students_config_book = {
    '2': PegasusConfig(encoder_layers=2, decoder_layers=2),
    '4': PegasusConfig(encoder_layers=4, decoder_layers=4),
    '6': PegasusConfig(encoder_layers=6, decoder_layers=6),
    '8': PegasusConfig(encoder_layers=8, decoder_layers=8),
    '10': PegasusConfig(encoder_layers=10, decoder_layers=10),
    '12': PegasusConfig(encoder_layers=12, decoder_layers=12),
    '16': PegasusConfig(encoder_layers=16, decoder_layers=16)
}


# Hard-coded layer selection used by pick_layers_to_copy.
# Outer key: number of layers in the teacher.
# Inner key: number of layers in the student -> indices of the teacher layers to copy.
# Every multi-layer entry keeps the first and the last teacher layer.
LAYERS_TO_COPY = {   
    4:{
        1: [0],
        2: [0, 3],
        3: [0, 1, 3],
        4: [0, 1, 2, 3],
    },
    8:{
        1: [0],
        2: [0, 7],
        3: [0, 4, 7],
        4: [0, 3, 6, 7],
        6: [0, 2, 3, 5, 6, 7],
        8: list(range(8)),  
    },    
    12: {
        1: [0],
        2: [0, 11],
        3: [0, 6, 11],
        4: [0, 4, 9, 11],
        6: [0, 2, 5, 8, 10, 11],
        8: [0, 1, 3, 5, 7, 9, 10, 11],
        12: list(range(12)),  
    },
    16: {  # maps  num layers in student -> which teacher layers to copy
        1: [0],
        2: [0, 15],
        3: [0, 8, 15],
        4: [0, 5, 10, 15],
        6: [0, 3, 6, 9, 12, 15],
        8: [0, 2, 4, 6, 8, 10, 12, 15],
        9: [0, 1, 3, 5, 7, 9, 11, 13, 15],
        12: [0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 15],
        16: list(range(16)),
    },}
# Hard-coded table read by get_layers_to_supervise.
# Outer key: number of layers in the teacher.
# NOTE(review): the entries under teacher size 8 only reach index 5, whereas the 12- and
# 16-layer tables end on the last teacher layer (11 / 15) -- confirm these indices are
# really meant for an 8-layer teacher.
LAYERS_TO_SUPERVISE = {
    # maps num layers in student -> which teacher layers to supervise.
    8: {1: [5], 2: [3, 5], 3: [1, 4, 5], 4: [1, 2, 4, 5]},
    12: {1: [11], 2: [5, 11], 3: [3, 7, 11], 4:[1, 3, 7, 11],6: [1, 3, 5, 8, 10, 11], 8:[1,2,3,5,7,8,9,11] },
    16: {1: [15], 4: [4, 9, 12, 15], 8: [1, 3, 5, 7, 9, 11, 13, 15], 12:[1,2,3,5,7,8,9,11,12,13,14,15]},
}


def copy_layers(src_layers: nn.ModuleList, dest_layers: nn.ModuleList, layers_to_copy) -> None:
    """Load the weights of selected source layers into ``dest_layers``.

    Args:
        src_layers: layers to read weights from.
        dest_layers: layers whose weights are overwritten in place.
        layers_to_copy: indices into ``src_layers``, one per destination layer, in order.

    Raises:
        AssertionError: if the number of selected layers differs from ``len(dest_layers)``.
    """
    selected = nn.ModuleList()
    for layer_index in layers_to_copy:
        selected.append(src_layers[layer_index])

    n_dest, n_selected = len(dest_layers), len(selected)
    assert n_dest == n_selected, f"{n_dest} != {n_selected}"

    # Both lists share the same positional key layout, so the state dict maps one-to-one.
    dest_layers.load_state_dict(selected.state_dict())

# Copied from transformers.models.bart.modeling_bart.shift_tokens_right


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Build decoder inputs by shifting ``input_ids`` one position to the right.

    The first column becomes ``decoder_start_token_id``, the last token of every row is
    dropped, and any ``-100`` that ends up in the result is replaced by ``pad_token_id``.
    The input tensor is left unmodified.
    """
    shifted = torch.zeros_like(input_ids)
    shifted[:, 0] = decoder_start_token_id
    shifted[:, 1:] = input_ids[:, :-1]

    assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
    # -100 is the "ignore" marker used in label tensors; it is not a valid token id.
    ignore_positions = shifted == -100
    return shifted.masked_fill_(ignore_positions, pad_token_id)


def pick_layers_to_copy(n_student, n_teacher):
    """Return the teacher layer indices that a student with ``n_student`` layers copies.

    The answer comes from the hard-coded ``LAYERS_TO_COPY`` table. When the table has no
    entry for this teacher/student pair, the first ``n_student`` layers are used instead,
    and a warning is emitted unless both sizes are equal.
    """
    per_teacher = LAYERS_TO_COPY.get(n_teacher)
    if per_teacher is not None and n_student in per_teacher:
        return per_teacher[n_student]

    if n_student != n_teacher:
        warnings.warn(
            f"no hardcoded layers to copy for teacher {n_teacher} -> student {n_student}, defaulting to first {n_student}"
        )
    return list(range(n_student))

def get_layers_to_supervise(n_student, n_teacher) -> List[int]:
    """Used for the --supervise_forward kwarg.

    Returns the teacher layer indices paired with the student's layers:
    every layer when both sizes match, only the last teacher layer for a one-layer
    student, and otherwise the entry from ``LAYERS_TO_SUPERVISE``.

    Raises:
        ValueError: if the student has more layers than the teacher.
        KeyError: if the table has no entry for this teacher/student pair.
    """
    if n_student > n_teacher:
        raise ValueError(f"Cannot perform intermediate supervision for student {n_student} > teacher {n_teacher}")
    if n_student == n_teacher:
        return list(range(n_teacher))
    if n_student == 1:
        return [n_teacher - 1]
    return LAYERS_TO_SUPERVISE[n_teacher][n_student]

def create_student_with_configuration(teacher,
                                      e=None,
                                      d=None,
                                      copy_first_teacher_layers = False,
                                      save_path='./student'):
    """Build a student seq2seq model initialised from ``teacher``.

    Args:
        teacher: the model to shrink; it is switched to eval mode.
        e: number of encoder layers for the student (defaults to the teacher's count).
        d: number of decoder layers for the student (defaults to the teacher's count).
        copy_first_teacher_layers: when true, keep the first ``e`` / ``d`` teacher layers
            that the initial state-dict load already put in place, instead of picking
            layers via ``pick_layers_to_copy``.
        save_path: accepted for compatibility; the student is not written to disk here.

    Returns:
        ``(student, encoder_layer_indices, decoder_layer_indices)`` where the index lists
        name the teacher layers the student's layers were initialised from.
    """
    teacher.eval()
    teacher_cfg = teacher.config
    n_teacher_enc = teacher_cfg.encoder_layers
    n_teacher_dec = teacher_cfg.decoder_layers
    student_kwargs = teacher_cfg.to_diff_dict()

    # Layer counts that were not requested fall back to the teacher's depth.
    e = n_teacher_enc if e is None else e
    d = n_teacher_dec if d is None else d
    student_kwargs["encoder_layers"] = e
    student_kwargs["decoder_layers"] = d

    student = AutoModelForSeq2SeqLM.from_config(teacher.config_class(**student_kwargs))

    # Loading the full teacher state dict fills every student weight; teacher layers the
    # student does not have are ignored because strict=False.
    load_report = student.load_state_dict(teacher.state_dict(), strict=False)
    # Every student key must have had a matching teacher key.
    assert load_report.missing_keys == [], load_report.missing_keys

    if copy_first_teacher_layers:
        # The state-dict load above already copied the leading layers; nothing else to do.
        return student, list(range(e)), list(range(d))

    # Not exactly alternating -- the lookup tries to keep the first and last teacher layer.
    encoder_picks: List[int] = pick_layers_to_copy(e, n_teacher_enc)
    decoder_picks: List[int] = pick_layers_to_copy(d, n_teacher_dec)

    copy_layers(teacher.model.encoder.layers, student.model.encoder.layers, encoder_picks)
    copy_layers(teacher.model.decoder.layers, student.model.decoder.layers, decoder_picks)

    # Record which layers were copied, for easier reproducibility.
    student.config.init_metadata = {
        "teacher_type": teacher.config.model_type,
        "copied_encoder_layers": encoder_picks,
        "copied_decoder_layers": decoder_picks,
    }
    return student, encoder_picks, decoder_picks
#student = create_student_with_configuration(teacher,
#                                      e=4,
#                                      d=4,
#                                      copy_first_teacher_layers = False,
#                                      save_path='./student')
#import gc
#del copy_teacher
gc.collect()


4

In [None]:
# Load the pretrained teacher checkpoint.
teacher = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Build the student with 16 encoder layers and `student_decoders` decoder layers,
# initialised from teacher layers chosen by the lookup table (copy_first_teacher_layers=False).
student, e_layers_list, d_layers_list = create_student_with_configuration(
                                      teacher,
                                      e=16,
                                      d=student_decoders,
                                      copy_first_teacher_layers = False,
                                      save_path=PATH)
# NOTE(review): the device is hard-coded, so this cell needs a GPU runtime.
student.to('cuda')


# The teacher is no longer needed: drop the reference and release cached memory.
del teacher
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Freeze every embedding table (shared, plus the encoder/decoder token and position
# embeddings) so the optimizer leaves them untouched.
embedding_modules = (
    student.model.shared,
    student.model.encoder.embed_tokens,
    student.model.encoder.embed_positions,
    student.model.decoder.embed_tokens,
    student.model.decoder.embed_positions,
)
for embedding_module in embedding_modules:
    for param in embedding_module.parameters():
        param.requires_grad = False

# Data Preparation

In [None]:
class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset over already tokenized source texts and target summaries.

    ``encodings`` and ``labels`` are mappings from field name to a per-example sequence;
    ``labels`` must contain an ``"input_ids"`` field, which defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels["input_ids"])

    def __getitem__(self, idx):
        # One tensor per encoder field, plus the target token ids under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample


def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
    """
    Tokenize the raw text splits for model fine-tuning.

    The tokenizer is loaded from ``model_name``. The training split is always
    tokenized; the validation and test splits are tokenized only when both their texts
    and their labels are given, otherwise ``None`` is returned in their place.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)`` as ``PegasusDataset`` objects
        (or ``None`` for a split that was not supplied).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_pair(texts, labels):
        # Each side is truncated and padded to the longest sequence of its own split.
        return PegasusDataset(
            tokenizer(texts, truncation=True, padding='longest'),
            tokenizer(labels, truncation=True, padding='longest'),
        )

    has_val = val_texts is not None and val_labels is not None
    has_test = test_texts is not None and test_labels is not None

    train_dataset = tokenize_pair(train_texts, train_labels)

    val_dataset = None
    if has_val:
        val_dataset = tokenize_pair(val_texts, val_labels)

    test_dataset = None
    if has_test:
        test_dataset = tokenize_pair(test_texts, test_labels)

    return train_dataset, val_dataset, test_dataset

In [None]:
# Load the dataset and keep a subset: the first 50,000 training pairs and the first
# 1,000 pairs of the validation and test splits.
# NOTE(review): each column is indexed in full before it is sliced -- presumably costly on
# a large split; consider slicing the rows first.
dataset = datasets.load_dataset(data_name)
train_texts, train_labels = dataset['train']['document'][:50000], dataset['train']['summary'][:50000]
valid_texts, valid_labels = dataset['validation']['document'][:1000], dataset['validation']['summary'][:1000]
test_texts, test_labels = dataset['test']['document'][:1000], dataset['test']['summary'][:1000]
# Tokenize the three splits into PegasusDataset objects.
train_dataset, valid_dataset, test_dataset = prepare_data(model_name, train_texts, train_labels,valid_texts, valid_labels,test_texts, test_labels)

# The raw text is no longer needed once the tokenized datasets exist.
del dataset 
del (test_texts, test_labels, valid_texts, valid_labels, train_texts, train_labels)
gc.collect()



0

# Trainer

In [None]:
class MyCallback(TrainerCallback):
    """
    A :class:`~transformers.TrainerCallback` that tries to free memory at the end of every step by running
    the Python garbage collector and calling ``torch.cuda.empty_cache()``.
    """

    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Collect unreachable Python objects first so the tensors they hold can be released,
        # then ask PyTorch to empty its CUDA cache.
        gc.collect()
        torch.cuda.empty_cache()

class MyTrainer(Trainer):
    """Trainer with a hand-written teacher-forcing cross-entropy loss.

    Decoder inputs are derived from ``labels`` by shifting them one position to the
    right, and padded label positions are excluded from the loss.
    """

    def shift_tokens_right(self, input_ids, pad_token_id, decoder_start_token_id=None):
        """Shift ``input_ids`` one position to the right to build decoder inputs.

        Args:
            input_ids: 2-D tensor of token ids (batch, sequence); padding is assumed to
                sit at the end of each row.
            pad_token_id: id used for padding.
            decoder_start_token_id: token written into the first column. When ``None``,
                the last non-pad token of each row (usually ``<eos>``) is wrapped around
                to the front, which is what this method did before the parameter existed.

        Returns:
            A new tensor with the same shape as ``input_ids``.
        """
        prev_output_tokens = input_ids.clone()
        if decoder_start_token_id is None:
            index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
            # squeeze(-1) drops only the gathered column, so the batch dimension survives
            # even when the batch holds a single row.
            prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze(-1)
        else:
            prev_output_tokens[:, 0] = decoder_start_token_id
        prev_output_tokens[:, 1:] = input_ids[:, :-1]
        return prev_output_tokens

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Token-level cross-entropy between the model's logits and ``inputs["labels"]``.

        Args:
            model: the (possibly wrapped) model to run the forward pass on.
            inputs: batch dict with ``input_ids``, ``attention_mask`` and ``labels``;
                labels are expected to be padded with the model's pad token id.
            return_outputs: when true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: accepted because newer ``Trainer`` versions pass it;
                it is not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs["labels"]
        # Read the special token ids from the model being trained rather than from a
        # notebook-level tokenizer variable.
        config = self.model.config
        pad_token_id = config.pad_token_id
        # Start decoding with the token the model is configured to start with, so that
        # training matches what generation feeds the decoder.
        decoder_input_ids = self.shift_tokens_right(
            labels, pad_token_id, getattr(config, "decoder_start_token_id", None)
        )
        outputs = model(inputs['input_ids'],
                        attention_mask=inputs['attention_mask'],
                        decoder_input_ids=decoder_input_ids,
                        output_hidden_states=False,
                        output_attentions=False,
                        use_cache=False)
        logits = outputs["logits"]
        # Targets are padded to a common length; without ignore_index the padded
        # positions would be averaged into the loss.
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
        loss = loss_fct(logits.reshape(-1, logits.shape[-1]), labels.reshape(-1))
        return (loss, outputs) if return_outputs else loss



In [None]:
# ROUGE scorer loaded through the `datasets` library.
metric = datasets.load_metric('rouge')

In [None]:
training_args = TrainingArguments(
    output_dir=f'{PATH}/output',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=f'{PATH}/logs',            # directory for storing logs
    logging_steps=10,
    #load_best_model_at_end  = True,
    #evaluation_strategy = 'epoch',
    #fp16 = True,
    #fp16_full_eval = True,
    #resume_from_checkpoint = f'{PATH}/output/checkpoint-{}'
)


def compute_rouge(eval_pred):
    """Convert a Trainer ``EvalPrediction`` into a flat dict of ROUGE F-measures.

    The Trainer calls ``compute_metrics`` with an ``EvalPrediction``, so the metric
    object cannot be passed directly; this adapter decodes token ids to text first.

    NOTE(review): predictions arrive as full-vocabulary logits, which presumably need a
    lot of memory on a large evaluation set -- confirm before enabling evaluation.
    """
    predictions, label_ids = eval_pred.predictions, eval_pred.label_ids
    # The model output may come back as a tuple; the logits are its first element.
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=-1)
    # -100 is a padding marker in gathered label arrays, not a decodable token id.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    decoded_predictions = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    decoded_references = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    scores = metric.compute(predictions=decoded_predictions, references=decoded_references)
    return {name: score.mid.fmeasure for name, score in scores.items()}


trainer = MyTrainer(
    model=student,                       # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_rouge,       # callable taking an EvalPrediction
    tokenizer=tokenizer,
    callbacks=[MyCallback]
)



In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhadywalied[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss
10,10.9756
20,10.9382
30,10.9449
40,10.8978
50,10.8989
60,10.8903
70,10.8455
80,10.8066
90,10.7582
100,10.7331


Step,Training Loss
10,10.9756
20,10.9382
30,10.9449
40,10.8978
50,10.8989
60,10.8903
70,10.8455
80,10.8066
90,10.7582
100,10.7331
