In [None]:
# Install dependencies with the %pip magic so they land in the environment of
# the running kernel (a bare `!pip` may resolve to a different interpreter).
%pip install datasets
%pip install transformers[torch]
%pip install accelerate -U
%pip install evaluate
%pip install sacrebleu



In [None]:
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
import math

# Set up models

---



Load the pretrained `gpt2-medium` model and tokenizer from Hugging Face, and register a `[PAD]` token.

In [None]:
# Tokenizer first: register a dedicated padding token so batches can be padded.
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Base model that will later receive the LoRA adapters. The embedding matrix is
# resized to the tokenizer's new length so the added token has a row.
model = AutoModelForCausalLM.from_pretrained("gpt2-medium")
with torch.no_grad():
    model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Second, independent copy of gpt2-medium used for the full fine-tuning baseline.
# It shares the tokenizer (and therefore the pad token id) with the LoRA model.
ft_model = AutoModelForCausalLM.from_pretrained("gpt2-medium")
with torch.no_grad():
    ft_model.resize_token_embeddings(len(tokenizer))
ft_model.config.pad_token_id = tokenizer.pad_token_id

Freeze parameters for LoRA model

In [None]:
# Turn off gradient tracking for every existing weight of the LoRA base model.
# Parameters added to the model after this cell are not affected by it.
for param in model.parameters():
    param.requires_grad_(False)

Create the LoRA attention layer: low-rank updates are added to the query (Q) and value (V) projections, while the key (K) projection is left unchanged.

In [None]:
class LoraConv1d(nn.Module):
    """Wrap a fused QKV projection and add low-rank (LoRA) updates to Q and V.

    The wrapped ``layer`` is frozen; only the four LoRA matrices are trainable.
    The output is ``layer(x) + (alpha / rank) * [x A_q B_q | 0 | x A_v B_v]``,
    so the middle (K) slice of the projection is never modified.

    Assumes ``layer`` maps ``features`` inputs to ``3 * features`` outputs laid
    out as Q, K, V along the last dimension (this is how the notebook uses it on
    GPT-2's ``c_attn``).

    Args:
        layer: Projection module to wrap; must expose a ``weight`` tensor.
        features: Width of the input and of each of the Q / K / V slices.
        rank: Rank of the low-rank decomposition.
        alpha: LoRA scaling numerator; the effective scale is ``alpha / rank``.
    """

    def __init__(self, layer, features, rank, alpha):
        super().__init__()
        self.layer = layer
        # Freeze the wrapped projection. The attribute is `requires_grad`; the
        # previous spelling `require_grad` only created an unused attribute.
        self.layer.weight.requires_grad_(False)
        bias = getattr(self.layer, "bias", None)
        if bias is not None:
            bias.requires_grad_(False)

        # Q adapters (created on the same device / dtype as the wrapped weight).
        self.lora_a = nn.Parameter(layer.weight.new_zeros((features, rank)))
        self.lora_b = nn.Parameter(layer.weight.new_zeros((rank, features)))

        # V adapters.
        self.lora_a2 = nn.Parameter(layer.weight.new_zeros((features, rank)))
        self.lora_b2 = nn.Parameter(layer.weight.new_zeros((rank, features)))

        # A is random and B is zero, so the wrapped layer's output is unchanged
        # at initialisation.
        nn.init.kaiming_uniform_(self.lora_a, a=math.sqrt(5))
        nn.init.zeros_(self.lora_b)
        nn.init.kaiming_uniform_(self.lora_a2, a=math.sqrt(5))
        nn.init.zeros_(self.lora_b2)
        self.alpha = alpha / rank

    def forward(self, x):
        """Return the frozen projection of ``x`` plus the scaled LoRA deltas."""
        wq = x @ self.lora_a @ self.lora_b
        wv = x @ self.lora_a2 @ self.lora_b2
        # Zero delta for the K slice, matching the shape, dtype and device of the
        # Q delta instead of allocating a default-dtype tensor and moving it.
        wk = torch.zeros_like(wq)
        lr_qkv = torch.cat((wq, wk, wv), dim=-1)
        return self.layer(x) + self.alpha * lr_qkv


Replace Attention weights with LoRA version of attention

In [None]:
rank = 4
alpha = 32
# Width of the attention input, read from the model config instead of being
# hard-coded (1024 for gpt2-medium).
c_attn_in = model.config.n_embd
for block in model.transformer.h:
    # Skip layers that are already wrapped so re-running this cell does not
    # stack a second LoRA wrapper on top of the first.
    if not isinstance(block.attn.c_attn, LoraConv1d):
        block.attn.c_attn = LoraConv1d(block.attn.c_attn, c_attn_in, rank, alpha)

In [None]:
# Display the module tree of the LoRA model.
# NOTE(review): the saved output shows `c_attn` as a plain `Conv1D()`; re-run
# after the wrapping cell to confirm the LoRA wrapper appears.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50258, bias=False)
)

In [None]:
def _count_trainable(module):
    """Return the number of scalar parameters of `module` that require gradients."""
    return sum(p.numel() for p in module.parameters() if p.requires_grad)

# Both counts use the same `requires_grad` filter so they are directly comparable.
lora_trainable_params = _count_trainable(model)
ft_trainable_params = _count_trainable(ft_model)
print("Number of trainable params in LoRA model:")
print(lora_trainable_params)
print("Number of trainable params in regular model:")
print(ft_trainable_params)

393216

# Load and process dataset

In [None]:
from datasets import load_dataset

# Load every split of the E2E NLG dataset; the later cells index it by
# "train", "validation" and "test".
og_dataset = load_dataset("e2e_nlg")

In [None]:
# Rename the raw columns to the names used by the rest of the notebook.
# Each rename is applied only while the old column still exists, so re-running
# this cell does not fail on an already-renamed dataset.
column_renames = {"meaning_representation": "text", "human_reference": "labels"}
for old_name, new_name in column_renames.items():
    if all(old_name in columns for columns in og_dataset.column_names.values()):
        og_dataset = og_dataset.rename_column(old_name, new_name)

Prepare text and labels so they can be joined into a single sequence: append ` = ` to the text as a separator and the end-of-sequence token to the labels.

In [None]:
def add_end(example):
  """Append the ` = ` separator to the text and the EOS token to the labels.

  The example is updated in place and returned.
  """
  source, target = example["text"], example["labels"]
  example["text"] = "".join((source, " = "))
  example["labels"] = "".join((target, tokenizer.eos_token))
  return example

# Apply `add_end` to every example of `og_dataset`, keeping the result under a new name.
dataset = og_dataset.map(add_end)

Tokenize dataset

In [None]:
def tokenize_function(examples):
    """Tokenize (text, labels) pairs together, padded to 128 tokens."""
    encode_options = {
        "padding": "max_length",
        "max_length": 128,
        "add_special_tokens": True,
    }
    sources, targets = examples["text"], examples["labels"]
    return tokenizer(sources, targets, **encode_options)

def tokenize_test(examples):
    """Tokenize only the prompt text, without padding (used for generation)."""
    prompts = examples["text"]
    return tokenizer(prompts, padding=False)



# Tokenize each split and drop its raw string columns. Every split removes its
# own columns rather than relying on the train split's column list.
train_dataset = dataset["train"].map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
eval_dataset = dataset["validation"].map(tokenize_function, batched=True, remove_columns=dataset["validation"].column_names)
# The test split is tokenized prompt-only because it is used for generation.
test_dataset = dataset["test"].map(tokenize_test, batched=True, remove_columns=dataset["test"].column_names)

Initialize DataCollator

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
# Collator used by both trainers below. `mlm=False` selects causal (not masked)
# language modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training Models

Training configurations

In [None]:
from transformers import Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments
# --- Optimisation hyperparameters -------------------------------------------
weight_decay = 0.01
dropout_prob = 0.1  # not passed to anything in this cell
batch_size = 8
epoch = 5
warmup_steps = 500
label_smooth = 0.1
learning_r = 0.0002
learning_rate_schedule = "linear"
evaluation_strategy = "epoch"


# --- Decoding hyperparameters (declared here, not used by TrainingArguments) --
beam_size = 10
length_penalty = 0.8
no_repeat_ngram_size = 4

output_dir = '/result/'

# Every tunable above is passed explicitly so that editing a constant actually
# changes training. Previously `batch_size`, `label_smooth` and
# `learning_rate_schedule` were declared but never passed. The values match
# the TrainingArguments defaults / literals that were in effect before.
training_args = TrainingArguments(
    output_dir = output_dir,
    weight_decay = weight_decay,
    learning_rate = learning_r,
    lr_scheduler_type = learning_rate_schedule,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    evaluation_strategy = evaluation_strategy,
    warmup_steps = warmup_steps,
    num_train_epochs = epoch,
    label_smoothing_factor = label_smooth,
    save_strategy = "epoch"
)


Train LoRA model

In [None]:
# Train the LoRA-wrapped `model` with the shared training arguments, datasets
# and collator defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

Train Finetuned model

In [None]:
# Train the fully fine-tuned baseline `ft_model` with the same settings.
# NOTE(review): this reuses `training_args`, and therefore the same
# `output_dir`, as the LoRA run -- confirm that overwriting its checkpoints
# is acceptable.
trainer = Trainer(
    model=ft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

# Generate results

Get References

In [None]:
import locale
# Force UTF-8 as the preferred encoding for the rest of the session.
locale.getpreferredencoding = lambda: "UTF-8"

# The test split contains several references per input, stored on consecutive
# rows. `indices` records the first row of each distinct input and `count` is
# the number of distinct inputs. In refs.txt, the references of one input are
# written one per line and groups are separated by a blank line.
count = 1
indices = [0]
test_split = dataset["test"]
# `add_end` appended the EOS token to every label; it must not leak into the
# reference sentences that the metrics are computed against.
eos = tokenizer.eos_token
past = test_split[0]["text"]
with open("refs.txt", 'w', encoding="utf-8") as refs:
  # Read each column once instead of indexing the dataset row by row.
  for i, (p, reference) in enumerate(zip(test_split["text"], test_split["labels"])):
    if p != past:
      count += 1
      indices.append(i)
      refs.write("\n")
    past = p
    if eos and reference.endswith(eos):
      reference = reference[:-len(eos)]
    refs.write(reference)
    refs.write("\n")

Generate Outputs

In [None]:
from transformers import GenerationConfig
# Beam-search settings shared by both generation loops below.
# NOTE(review): `length_p` is 0.9 here while the training-configuration cell
# declares `length_penalty = 0.8`; confirm which value is intended.
num_beams = 10
length_p = 0.9
no_repeat_ngram_size=4

# NOTE(review): presumably `max_length` counts the prompt tokens as well --
# confirm against the GenerationConfig documentation.
gc = GenerationConfig(
    max_length=128,
    num_beams=num_beams,
    length_penalty=length_p,
    no_repeat_ngram_size=no_repeat_ngram_size,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
  )

In [None]:
# Generate one output per distinct test input with the LoRA model.
model.eval()  # make sure dropout is disabled while decoding
lora_outputs = []
for i in indices:
    example = test_dataset[i]
    # Token ids and attention mask are passed as two separate (1, seq_len)
    # tensors. Stacking them into one tensor made `generate` treat the mask as
    # a second input sequence and left the real mask unset.
    input_ids = torch.tensor([example["input_ids"]], device=model.device)
    attention_mask = torch.tensor([example["attention_mask"]], device=model.device)
    output = model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=gc)
    # Keep only the newly generated tokens (drop the prompt) instead of
    # splitting the decoded text on "=", which breaks if "=" is generated.
    output = tokenizer.decode(output[0][input_ids.shape[1]:])
    lora_outputs.append(output)
assert len(lora_outputs) == count

In [None]:
# Write the LoRA generations, one per line, to the file that the evaluation
# cell reads (`lora_outputs.txt`).
with open("lora_outputs.txt", 'w', encoding='utf-8') as out:
  for output in lora_outputs:
    # Drop everything from the first special token ("<|...") onwards; when no
    # special token is present the whole string is kept.
    new_out = output.split("<|")[0]
    # One hypothesis per line: flatten stray newlines and trim whitespace.
    out.write(new_out.replace("\n", " ").strip())
    out.write("\n")

In [None]:
# Generate one output per distinct test input with the fine-tuned model.
ft_model.eval()  # make sure dropout is disabled while decoding
ft_outputs = []
for i in indices:
    example = test_dataset[i]
    # Token ids and attention mask are passed as two separate (1, seq_len)
    # tensors. Stacking them into one tensor made `generate` treat the mask as
    # a second input sequence and left the real mask unset.
    input_ids = torch.tensor([example["input_ids"]], device=ft_model.device)
    attention_mask = torch.tensor([example["attention_mask"]], device=ft_model.device)
    output = ft_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=gc)
    # Keep only the newly generated tokens (drop the prompt) instead of
    # splitting the decoded text on "=", which breaks if "=" is generated.
    output = tokenizer.decode(output[0][input_ids.shape[1]:])
    ft_outputs.append(output)
assert len(ft_outputs) == count

In [None]:
# Write the fine-tuned model's generations, one per line, to the file that the
# evaluation cell reads (`ft_outputs.txt`).
with open("ft_outputs.txt", 'w', encoding='utf-8') as out:
  for output in ft_outputs:
    # Drop everything from the first special token ("<|...") onwards; when no
    # special token is present the whole string is kept.
    new_out = output.split("<|")[0]
    # One hypothesis per line: flatten stray newlines and trim whitespace.
    out.write(new_out.replace("\n", " ").strip())
    out.write("\n")

# Evaluate Results

In [None]:
# Clone the metrics repository only if it is not already present, so that
# re-running this cell does not fail.
!test -d e2e-metrics || git clone https://github.com/tuetschek/e2e-metrics.git

fatal: destination path 'e2e-metrics' already exists and is not an empty directory.


In [None]:
import sys
# Put the cloned repository first on the module search path (relative to the
# current working directory).
sys.path.insert(0, 'e2e-metrics')

In [None]:
# Install the metrics repository's requirements into the running kernel's environment.
%pip install -r e2e-metrics/requirements.txt

In [None]:
# Bootstrap cpanminus by piping the script downloaded from cpanmin.us into perl,
# then use it to install the XML::Twig Perl module.
# NOTE(review): presumably required by the Perl MTEval scorer that the metrics
# script runs -- confirm. This executes a remotely fetched script.
!curl -L https://cpanmin.us | perl - App::cpanminus
!cpanm XML::Twig


Evaluate LoRA model

In [None]:
# Score the LoRA outputs against the references. The script is addressed
# relative to the working directory, where the repository was cloned, not at
# the filesystem root.
!./e2e-metrics/measure_scores.py refs.txt lora_outputs.txt

Running MS-COCO evaluator...
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 129948 tokens at 337157.27 tokens per second.
PTBTokenizer tokenized 17167 tokens at 97048.95 tokens per second.
setting up scorers...
computing METEOR score...
METEOR: 0.449
computing Rouge score...
ROUGE_L: 0.693
computing CIDEr score...
CIDEr: 2.322
Creating temp directory  /tmp/e2e-eval-s04yz03q
Running MTEval to compute BLEU & NIST...
Use of 'Hyphen' in \p{} or \P{} is deprecated because: Supplanted by Line_Break property values; see www.unicode.org/reports/tr14 at /content/e2e-metrics/mteval/mteval-v13a-sig.pl line 993.
Use of 'Hyphen' in \p{} or \P{} is deprecated because: Supplanted by Line_Break property values; see www.unicode.org/reports/tr14 at /content/e2e-metrics/mteval/mteval-v13a-sig.pl line 993.
MT evaluation scorer began on 2024 May 2 at 01:14:46
command line:  /content/e2e-metrics/mt

Evaluate Fine Tuned model

In [None]:
# Score the fine-tuned model's outputs against the references.
# NOTE(review): the absolute `/content/...` path assumes the repository was
# cloned while the working directory was /content -- confirm when running elsewhere.
!/content/e2e-metrics/measure_scores.py refs.txt ft_outputs.txt

Running MS-COCO evaluator...
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 129948 tokens at 490178.18 tokens per second.
PTBTokenizer tokenized 16324 tokens at 101861.22 tokens per second.
setting up scorers...
computing METEOR score...
METEOR: 0.443
computing Rouge score...
ROUGE_L: 0.688
computing CIDEr score...
CIDEr: 2.185
Creating temp directory  /tmp/e2e-eval-xocg7pm0
Running MTEval to compute BLEU & NIST...
Use of 'Hyphen' in \p{} or \P{} is deprecated because: Supplanted by Line_Break property values; see www.unicode.org/reports/tr14 at /content/e2e-metrics/mteval/mteval-v13a-sig.pl line 993.
Use of 'Hyphen' in \p{} or \P{} is deprecated because: Supplanted by Line_Break property values; see www.unicode.org/reports/tr14 at /content/e2e-metrics/mteval/mteval-v13a-sig.pl line 993.
MT evaluation scorer began on 2024 May 2 at 00:44:51
command line:  /content/e2e-metrics/m