In [None]:
import transformers

In [None]:
# Install desired version of transformers
!pip uninstall transformers
!pip install -e /content/drive/MyDrive/transformers

Found existing installation: transformers 4.40.1
Uninstalling transformers-4.40.1:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.40.1.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? Y
  Successfully uninstalled transformers-4.40.1
Obtaining file:///content/drive/MyDrive/transformers
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting sacremoses (from transformers==3.2.0)
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: transformers
  Building editable for transformers (pyproject.toml) ... [?25l[?25hdone


In [None]:
import json

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

# NOTE(review): unidecode is not referenced by any cell visible in this notebook —
# confirm it is still required before keeping this install.
!pip install unidecode



In [None]:
import torch.nn.init as init
class PrefixTuning(nn.Module):
    """Reparameterized prefix generator for a T5-style encoder-decoder.

    Three independent prefixes are learned, one per attention type:

    * ``wte`` / ``control_trans``          -> returned under the ``'self'`` key
    * ``wte2`` / ``control_trans2``        -> returned under ``'encoder_decoder'``
    * ``wte_enc`` / ``control_trans_enc``  -> returned under ``'encoder'``

    Each prefix is an embedding table over ``prompt_len`` virtual tokens followed
    by a 3-layer tanh MLP that emits one key and one value vector per layer and
    attention head.

    Args:
        pretrained_config: Config object exposing ``num_layers``, ``num_heads``
            and ``d_model``. If it also exposes ``d_kv`` that value is used as
            the per-head size; otherwise ``d_model // num_heads`` is used.
        prompt_len: Number of virtual prefix tokens.
        hidden_dim: Width of the two hidden layers of each MLP.
    """

    def __init__(self, pretrained_config, prompt_len=48, hidden_dim = 800):

        super().__init__()

        self.match_n_layer = pretrained_config.num_layers
        self.match_n_head = pretrained_config.num_heads
        self.n_embd = pretrained_config.d_model
        # Per-head key/value size. Prefer the config's explicit d_kv so that
        # checkpoints where num_heads * d_kv != d_model still get correctly
        # sized prefixes; fall back to the original d_model // num_heads.
        head_dim = getattr(pretrained_config, "d_kv", None)
        if head_dim is None:
            head_dim = self.n_embd // self.match_n_head
        self.match_n_embd = head_dim

        # Config of Pre-Trained LM
        self.pretrained_config = pretrained_config
        self.prompt_len = prompt_len
        # torch.tensor([0, 1, 2, .. , prompt_len-1]); moved to the requested
        # device on every forward() call.
        self.pre_prompt = torch.arange(prompt_len)

        # NOTE: submodule names and creation order are kept exactly as before so
        # existing state_dicts load unchanged and seeded initialisation matches.
        self.wte = nn.Embedding(num_embeddings=prompt_len, embedding_dim=self.n_embd)
        self.control_trans = self._make_reparam(hidden_dim)

        self.wte2 = nn.Embedding(num_embeddings=prompt_len, embedding_dim=self.n_embd)
        self.control_trans2 = self._make_reparam(hidden_dim)

        self.wte_enc = nn.Embedding(prompt_len, self.n_embd)
        self.control_trans_enc = self._make_reparam(hidden_dim)

        self.dropout = nn.Dropout(0.1)

    def _make_reparam(self, hidden_dim):
        """Build the MLP mapping a prefix embedding to all per-layer key/value vectors."""
        out_dim = 2 * self.match_n_layer * self.match_n_head * self.match_n_embd
        return nn.Sequential(
            nn.Linear(self.n_embd, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, out_dim)
        )

    def _split_per_layer(self, flat):
        """Reshape (bsz, seqlen, 2*layers*heads*head_dim) into per-layer (key, value) stacks.

        Returns a tuple with one tensor per layer, each of shape
        (2, bsz, n_head, seqlen, head_dim); index 0 is the key, index 1 the value.
        """
        bsz, seqlen, _ = flat.shape
        per_head = flat.view(bsz, seqlen, self.match_n_layer * 2, self.match_n_head,
                             self.match_n_embd)
        per_head = self.dropout(per_head)
        return per_head.permute([2, 0, 3, 1, 4]).split(2)

    @staticmethod
    def _as_cache(key_val):
        """Wrap one layer's stacked (key, value) tensor as a prev_key/prev_value dict."""
        return {"prev_key": key_val[0].contiguous(),
                "prev_value": key_val[1].contiguous()}

    def forward(self, batch_size, device, sample_size = 1):
        """Generate the prefix key/value tensors for every layer.

        Args:
            batch_size: Number of input sequences the prefixes are built for.
            device: Device the prefix token indices are moved to.
            sample_size: When > 1, the 'self' and 'encoder_decoder' prefixes are
                tiled ``sample_size`` times along the batch dimension; the
                'encoder' prefix always keeps ``batch_size`` rows.
                NOTE(review): presumably this matches beam expansion of the
                decoder batch during generate() — confirm against the model code.

        Returns:
            A list with one dict per layer, with keys ``'self'``,
            ``'encoder_decoder'`` and ``'encoder'``; each maps to
            ``{"prev_key", "prev_value"}`` tensors of shape
            (batch, n_head, prompt_len, head_dim).
        """
        # Shape: batch_size, prompt_len
        input_tokens = self.pre_prompt.unsqueeze(0).expand(batch_size, -1).to(device)

        # Each: batch_size, prompt_len, 2 * n_layer * n_head * head_dim
        self_flat = self.control_trans(self.wte(input_tokens))
        cross_flat = self.control_trans2(self.wte2(input_tokens))
        enc_flat = self.control_trans_enc(self.wte_enc(input_tokens))

        if sample_size > 1:
            self_flat = torch.cat(sample_size * [self_flat])
            cross_flat = torch.cat(sample_size * [cross_flat])

        # Dropout is applied inside _split_per_layer; keep this call order
        # (self, cross, encoder) so seeded training runs draw the same masks.
        self_layers = self._split_per_layer(self_flat)
        cross_layers = self._split_per_layer(cross_flat)
        enc_layers = self._split_per_layer(enc_flat)

        result = []
        for self_kv, cross_kv, enc_kv in zip(self_layers, cross_layers, enc_layers):
            result.append({
                'self': self._as_cache(self_kv),
                'encoder_decoder': self._as_cache(cross_kv),
                'encoder': self._as_cache(enc_kv),
            })

        return result

In [None]:
import copy
class LineByLineData2TextTextDataset(Dataset):
    """Tokenized ``source||target`` pairs read from a line-oriented text file.

    Every line that splits into exactly two parts on ``'||'`` becomes one
    example; all other lines (blank, whitespace-only, wrong number of
    separators) are ignored. ``eos_tok`` is appended to both sides, separated by
    a single space, before tokenization.

    Items are ``(source_ids, target_ids)`` pairs of plain Python lists; use
    :meth:`collate_fn` as the DataLoader's ``collate_fn`` to pad and tensorize.
    """

    # The annotation is quoted so that defining the class does not require
    # T5Tokenizer to be imported first.
    def __init__(self, tokenizer: "T5Tokenizer", file_path: str, block_size: int, eos_tok:str):
        """Read and tokenize ``file_path``.

        Args:
            tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``;
                its ``pad_token_id`` is used by :meth:`collate_fn`.
            file_path: UTF-8 text file with one ``source||target`` pair per line.
            block_size: Currently unused (no truncation is applied); kept for
                interface compatibility.
            eos_tok: End-of-sequence token text appended to each source and target.

        Raises:
            ValueError: If the file contains no valid ``source||target`` line.
        """
        with open(file_path, encoding="utf-8") as f:
            # A line that splits into exactly two parts necessarily contains
            # '||', so it is neither empty nor whitespace-only.
            pairs = [parts
                     for parts in (line.split('||') for line in f.read().splitlines())
                     if len(parts) == 2]

        if not pairs:
            raise ValueError("No valid 'source||target' lines found in {}".format(file_path))

        srcs = ['{} {}'.format(src, eos_tok) for src, _ in pairs]
        tgts = ['{} {}'.format(tgt, eos_tok) for _, tgt in pairs]

        batch_encoding_src = tokenizer(srcs, is_split_into_words=False)
        batch_encoding_tgt = tokenizer(tgts, is_split_into_words=False)

        self.srcs = batch_encoding_src["input_ids"]
        self.labels = batch_encoding_tgt["input_ids"]

        self.tokenizer = tokenizer

    def __len__(self):
        """Number of (source, target) examples."""
        return len(self.srcs)

    def __getitem__(self, i):
        """Return the i-th ``(source_ids, target_ids)`` pair as stored (unpadded)."""
        return self.srcs[i], self.labels[i]

    def collate_fn(self, batch):
        """Pad a batch to its longest source/target and convert it to tensors.

        Sources are right-padded with ``tokenizer.pad_token_id``; targets are
        right-padded with -100. The attention mask is 1 wherever the padded
        source id differs from the pad id.

        Padding is done on fresh lists: the examples stored in the dataset are
        never modified, so padding from one batch cannot leak into later batches
        or epochs.

        Returns:
            Tuple ``(input_ids, attention_mask, labels)`` of ``torch.LongTensor``.
        """
        pad_id = self.tokenizer.pad_token_id
        max_len_data = max((len(description) for description, _ in batch), default=0)
        max_len_label = max((len(target) for _, target in batch), default=0)

        descriptions = []
        attn_masks = []
        targets = []

        for description, target in batch:
            padded_src = list(description) + [pad_id] * (max_len_data - len(description))
            descriptions.append(padded_src)
            attn_masks.append([int(tok != pad_id) for tok in padded_src])
            targets.append(list(target) + [-100] * (max_len_label - len(target)))

        return torch.LongTensor(descriptions), torch.LongTensor(attn_masks), torch.LongTensor(targets)


In [None]:
def read_e2e_files(path, tokenizer):
    """Group the targets of a ``source||target`` file by their source.

    Lines that do not split into exactly two parts on ``'||'`` after stripping
    (blank lines, missing or repeated separators) are skipped, mirroring the
    filter used by ``LineByLineData2TextTextDataset``.

    Args:
        path: UTF-8 text file with one ``source||target`` pair per line.
        tokenizer: Unused; kept so existing callers keep working.

    Returns:
        Dict mapping each source string to the list of its targets, in file order.
    """
    file_dict = {}

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('||')
            if len(parts) != 2:
                continue
            src, tgt = parts
            file_dict.setdefault(src, []).append(tgt)
    return file_dict

def write_e2e_corr(prompt_lst, file_dict, corr_path):
    """Write the entries of ``file_dict`` for each prompt to ``corr_path``.

    Prints ``len(prompt_lst)`` to stdout first. For every prompt, each
    non-blank entry of ``file_dict[prompt]`` is written on its own line and the
    group is terminated by an empty line. Entries that are empty or
    whitespace-only are not written; they are reported on stdout instead.
    """
    print(len(prompt_lst))
    with open(corr_path, 'w') as out:
        for prompt in prompt_lst:
            references = file_dict[prompt]
            for ref in references:
                if ref.strip():
                    print(ref, file=out)
                    continue
                print('PROBLEM', ref, 'PROBLEM', references)
            # blank line separates one prompt's group from the next
            out.write('\n')

def write_e2e_src(prompt_lst, corr_path):
    """Write every entry of ``prompt_lst`` on its own line of ``corr_path``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(corr_path, 'w') as sink:
        sink.writelines(str(prompt) + '\n' for prompt in prompt_lst)

In [None]:
def freeze_params(model: nn.Module):
    """Disable gradient tracking for every parameter yielded by ``model.parameters()``."""
    for weight in model.parameters():
        weight.requires_grad_(False)

In [None]:
import json
import yaml

import numpy as np
import torch
from pathlib import Path
from transformers.configuration_t5 import T5Config
from transformers import Adafactor, get_linear_schedule_with_warmup
from transformers.modeling_t5 import T5ForConditionalGeneration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.manual_seed(101)

prefix_size = 5
batch_size = 5
learning_rate = 5e-5
epochs = 10
gradient_accumulation_steps = 1


config = T5Config.from_pretrained('t5-base')
config.use_prefix = True
config.preseqlen = prefix_size

    # Pre-Trained T5 Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')
tokenizer.max_length = 512


    # Pre-Trained T5 Model
model = T5ForConditionalGeneration.from_pretrained('t5-base', config=config).to(device)
model.resize_token_embeddings(len(tokenizer))

freeze_params(model.shared)
for d in [model.encoder, model.decoder]:
      freeze_params(d.embed_tokens)

prefix_model = PrefixTuning(model.config, prefix_size).to(device)

    # Initialize datasets and dataloaders
dataset_train = LineByLineData2TextTextDataset(
        tokenizer,
        "/content/drive/MyDrive/E2E/src1_train.txt",
        tokenizer.max_length,
        tokenizer.eos_token)
dataset_eval = LineByLineData2TextTextDataset(
        tokenizer,
        "/content/drive/MyDrive/E2E/src1_valid.txt",
        tokenizer.max_length,
        tokenizer.eos_token)

dataloader_train = DataLoader(dataset_train, batch_size= batch_size, shuffle=True, collate_fn=dataset_train.collate_fn)
dataloader_eval = DataLoader(dataset_eval, batch_size= batch_size, shuffle=False, collate_fn=dataset_eval.collate_fn)

optimizer = Adafactor(prefix_model.parameters(),
                      lr=learning_rate,
                      scale_parameter=False,
                      relative_step=False)

total_training_steps = epochs * len(dataloader_train)

scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=2000,
    num_training_steps=total_training_steps,
)

for epoch in range(epochs):
  prefix_model.train()
  epoch_loss = 0
  for step, (data, attention_mask, target) in enumerate(dataloader_train):

      data = data.to(device)
      attention_mask = attention_mask.to(device)
      target = target.to(device)

      prefix = prefix_model(batch_size=data.shape[0], device=device)
      outputs = model(input_ids=data, attention_mask=attention_mask, labels=target, past_key_values=prefix, use_cache = False, use_prefix  = True)

      loss = outputs[0]

      if (step + 1) % gradient_accumulation_steps == 0:
        loss.backward()

        optimizer.step()
        scheduler.step()

        optimizer.zero_grad()
        prefix_model.zero_grad()

      epoch_loss += loss.item()

  print("Epoch " + str(epoch) + " loss: " + str(epoch_loss/len(dataloader_train)))

  # Evaluate on entire validation set after an epoch of training
  if epoch % 1 == 0:
    with torch.no_grad():
      prefix_model.eval()

      file_dict = {}

      for step, (data, attention_mask, target) in enumerate(dataloader_eval):
        data = data.to(device)
        attention_mask = attention_mask.to(device)
        target = target.to(device)

        prefix = prefix_model(batch_size=data.shape[0], device=device, sample_size = 5)
        outputs = model.generate(input_ids = data, attention_mask=attention_mask, num_beams=5, max_length = 384, past_key_values = prefix, use_prefix = True, use_cache = True)
        output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        filtered_tensor = torch.where(target == -100, torch.tensor(0, device=device), target)
        ground_truth = tokenizer.batch_decode(filtered_tensor, skip_special_tokens=True)

        for i in range(len(output_text)):
          if output_text[i] not in file_dict:
            file_dict[output_text[i]] = []
          file_dict[output_text[i]].append(ground_truth[i])

      ref_file = f'/content/drive/MyDrive/E2E/eval/gold/base/prefix_dataset{"base"}_model_{"base"}_lr{learning_rate}_prefixlen{prefix_size}_epoch{epoch}.txt'
      pred_file = f'/content/drive/MyDrive/E2E/eval/src/base/prefix_dataset{"base"}_model_{"base"}_lr{learning_rate}_prefixlen{prefix_size}_epoch{epoch}.txt'
      results_file = f'/content/drive/MyDrive/E2E/eval/metrics/base/prefix_dataset{"e2e"}_model_{"base"}_lr{learning_rate}_prefixlen{prefix_size}_epoch{epoch}.txt'

      write_e2e_corr(list(file_dict.keys()), file_dict, ref_file)
      write_e2e_src(list(file_dict.keys()), pred_file)
      !python /content/drive/MyDrive/E2E/e2e-metrics/measure_scores.py {ref_file} {pred_file} -p  -t -H >> {results_file}

      model_name = f'/content/drive/MyDrive/E2E/models/base/prefix_dataset{"e2e"}_model_{"base"}_lr{learning_rate}_prefixlen{prefix_size}_epoch{epoch}best.pt'
      torch.save(prefix_model.state_dict(), model_name)

Init the T5ForConditionalGeneration Model with config.use_prefix=True, config.preseqlen=5


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1630.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))


Epoch 0 loss: 1.161553771724234
Epoch 1 loss: 1.00836163456001
Epoch 2 loss: 0.9848511714324875
Epoch 3 loss: 0.970779952700195
494
Running MS-COCO evaluator...
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 121004 tokens at 595901.65 tokens per second.
PTBTokenizer tokenized 14412 tokens at 130949.63 tokens per second.
setting up scorers...
computing METEOR score...
METEOR: 0.487
computing Rouge score...
ROUGE_L: 0.748
computing CIDEr score...
CIDEr: 2.518
Running Py-MTEval metrics...
Epoch 4 loss: 0.9602026917606249
Epoch 5 loss: 0.9520102148495007
Epoch 6 loss: 0.9448587456951665
465
Running MS-COCO evaluator...
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 121004 tokens at 566953.92 tokens per second.
PTBTokenizer tokenized 13676 tokens at 124351.87 tokens per se

In [None]:
# Re-run the E2E metrics script on the most recently written reference/prediction
# files and append its stdout to results_file. Relies on ref_file, pred_file and
# results_file still being defined by a previously executed cell.
!python /content/drive/MyDrive/E2E/e2e-metrics/measure_scores.py {ref_file} {pred_file} -p  -t -H >> {results_file}

In [None]:
from transformers.configuration_t5 import T5Config

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

prefix_size = 5
batch_size = 5
learning_rate = 5e-5
epochs = 10
gradient_accumulation_steps = 1


config = T5Config.from_pretrained('t5-base')
config.use_prefix = True
config.preseqlen = prefix_size

    # Pre-Trained T5 Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')
tokenizer.max_length = 512


    # Pre-Trained T5 Model
model = T5ForConditionalGeneration.from_pretrained('t5-base', config=config).to(device)
model.resize_token_embeddings(len(tokenizer))

freeze_params(model.shared)
for d in [model.encoder, model.decoder]:
      freeze_params(d.embed_tokens)

prefix_model = PrefixTuning(model.config, prefix_size).to(device)
prefix_model.load_state_dict(torch.load("/content/drive/MyDrive/E2E/models/base/HAprefix_datasete2e_model_base_lr5e-05_prefixlen5_epoch9best.pt"))
prefix_model.eval()

dataset_test = LineByLineData2TextTextDataset(
        tokenizer,
        "/content/drive/MyDrive/E2E/src1_test.txt",
        tokenizer.max_length,
        tokenizer.eos_token)
dataloader_test = DataLoader(dataset_test, batch_size= batch_size, shuffle=True, collate_fn=dataset_test.collate_fn)

with torch.no_grad():
    prefix_model.eval()
    file_dict = {}

    for step, (data, attention_mask, target) in enumerate(dataloader_test):
      data = data.to(device)
      attention_mask = attention_mask.to(device)
      target = target.to(device)

      prefix = prefix_model(batch_size=data.shape[0], device=device, sample_size = 5)
      outputs = model.generate(input_ids = data, attention_mask=attention_mask, num_beams=5, max_length = 384, past_key_values = prefix, use_prefix = True, use_cache = True)
      output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

      filtered_tensor = torch.where(target == -100, torch.tensor(0, device=device), target)
      ground_truth = tokenizer.batch_decode(filtered_tensor, skip_special_tokens=True)
      for i in range(len(output_text)):
        if output_text[i] not in file_dict:
          file_dict[output_text[i]] = []
        file_dict[output_text[i]].append(ground_truth[i])

    ref_file = f'/content/drive/MyDrive/E2E/eval/gold/base/prefix_dataset{"e2e"}_lr{learning_rate}_prefixlen{prefix_size}_test.txt'
    pred_file = f'/content/drive/MyDrive/E2E/eval/src/base/prefix_dataset{"e2e"}_lr{learning_rate}_prefixlen{prefix_size}_test.txt'
    results_file = f'/content/drive/MyDrive/E2E/eval/metrics/base/5prefix_dataset{"e2e"}_lr{learning_rate}_prefixlen{prefix_size}_test.txt'

    write_e2e_corr(list(file_dict.keys()), file_dict, ref_file)
    write_e2e_src(list(file_dict.keys()), pred_file)
    !python /content/drive/MyDrive/E2E/e2e-metrics/measure_scores.py {ref_file} {pred_file} -p  -t -H >> {results_file}

Init the T5ForConditionalGeneration Model with config.use_prefix=True, config.preseqlen=5
576
Running MS-COCO evaluator...
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 132563 tokens at 627841.78 tokens per second.
PTBTokenizer tokenized 16742 tokens at 145886.48 tokens per second.
setting up scorers...
computing METEOR score...
METEOR: 0.465
computing Rouge score...
ROUGE_L: 0.705
computing CIDEr score...
CIDEr: 2.462
Running Py-MTEval metrics...
