In [1]:
import transformers

In [2]:
!pip uninstall transformers -y
!pip install -e /content/drive/MyDrive/transformers

Found existing installation: transformers 4.40.1
Uninstalling transformers-4.40.1:
  Successfully uninstalled transformers-4.40.1
Obtaining file:///content/drive/MyDrive/transformers
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting sacremoses (from transformers==3.2.0)
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: transformers
  Building editable for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-3.2.0-0.editable-py3-none-any.whl size=15625 sha256=53f02dcfba9d0da7063b47d73e432cb28d538593772fb3d5616a0828473756d9
  Stored in directory: /tmp/pip

In [4]:
import json

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

!pip install unidecode



In [5]:
import torch.nn.init as init
class PrefixTuning(nn.Module):

    def __init__(self, pretrained_config, prompt_len=48, hidden_dim = 800):

        super().__init__()

        self.match_n_layer = pretrained_config.num_layers
        self.match_n_head = pretrained_config.num_heads
        self.n_embd = pretrained_config.d_model
        self.match_n_embd = self.n_embd // self.match_n_head

        # Config of Pre-Trained LM
        # torch.tensor([0, 1, 2, .. , prefix_len-1])
        self.pretrained_config = pretrained_config
        self.pre_prompt = torch.arange(prompt_len)

        # Embedding
        self.wte = nn.Embedding(num_embeddings=prompt_len, embedding_dim=self.n_embd)
        # Reparameterization
        self.control_trans = nn.Sequential(
            nn.Linear(self.n_embd, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 2 * self.match_n_layer * self.n_embd)
        )


        self.wte2 = nn.Embedding(num_embeddings=prompt_len, embedding_dim=self.n_embd)
        # Reparameterization
        self.control_trans2 = nn.Sequential(
            nn.Linear(self.n_embd, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 2 * self.match_n_layer * self.n_embd)
        )


        self.wte_enc = nn.Embedding(prompt_len, self.n_embd)
        self.control_trans_enc = nn.Sequential(
                        nn.Linear(self.n_embd, hidden_dim),
                        nn.Tanh(),
                        nn.Linear(hidden_dim, hidden_dim),
                        nn.Tanh(),
                        nn.Linear(hidden_dim, self.match_n_layer * 2 * self.n_embd))

        self.prompt_len = prompt_len
        self.dropout = nn.Dropout(0.1)

    def forward(self, batch_size, device, sample_size = 1):
        # Shape: batch_size, prompt_len
        input_tokens = self.pre_prompt.unsqueeze(0).expand(batch_size, -1).to(device)
        # Shape: batch_size, prompt_len, d_model
        temp_control = self.wte(input_tokens)
        # Shape: batch_size, prompt_len, d_model
        past_key_values = self.control_trans(temp_control)


        temp_control2 = self.wte2(input_tokens)
        past_key_values2 = self.control_trans2(temp_control2)  # bsz, seqlen, layer*emb

        temp_control_enc = self.wte_enc(input_tokens)
        past_key_values_enc = self.control_trans_enc(temp_control_enc)  # bsz, seqlen, layer*emb


        if sample_size > 1:
            past_key_values = torch.cat(sample_size * [past_key_values])

        bsz, seqlen, _ = past_key_values.shape
        past_key_values = past_key_values.view(bsz, seqlen, self.match_n_layer * 2, self.match_n_head,
                                               self.match_n_embd)
        past_key_values = self.dropout(past_key_values)
        past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(2)

        if sample_size > 1:
            past_key_values2 = torch.cat(sample_size * [past_key_values2])

        past_key_values2 = past_key_values2.view(bsz, seqlen, self.match_n_layer * 2, self.match_n_head,
                                                   self.match_n_embd)
        past_key_values2 = self.dropout(past_key_values2)
        past_key_values2 = past_key_values2.permute([2, 0, 3, 1, 4]).split(2)


        bsz_enc, seqlen, _ = past_key_values_enc.shape
        past_key_values_enc = past_key_values_enc.view(bsz_enc, seqlen, self.match_n_layer * 2, self.match_n_head,
                                                     self.match_n_embd)
        past_key_values_enc = self.dropout(past_key_values_enc)
        past_key_values_enc = past_key_values_enc.permute([2, 0, 3, 1, 4]).split(2)

        result = []
        for i, key_val in enumerate(past_key_values):
            temp_dict = {'self': {"prev_key": key_val[0].contiguous(),
                                  "prev_value": key_val[1].contiguous()
                                 },
                        }
            key_val2 = past_key_values2[i]
            temp_dict['encoder_decoder'] = {"prev_key": key_val2[0].contiguous(),
                                                "prev_value": key_val2[1].contiguous()
                                                }
            key_val_enc = past_key_values_enc[i]
            temp_dict['encoder'] = {"prev_key": key_val_enc[0].contiguous(),
                                        "prev_value": key_val_enc[1].contiguous()
                                        }
            result.append(temp_dict)

        return result

In [13]:
import copy
import unidecode
from torch.nn.utils.rnn import pad_sequence
class LineByLineWebNLGTextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """

    def __init__(self, tokenizer: T5Tokenizer, file_path: str, eos_tok:str, is_eval = False):

        with open(file_path) as f:
            lines_dict = json.load(f)

        full_src_lst = []
        full_tgt_lst = []
        full_rela_lst = []

        for i, example in enumerate(lines_dict['entries']):
            sents = example[str(i + 1)]['lexicalisations']
            triples = example[str(i + 1)]['modifiedtripleset']
            rela_lst = []
            temp_triples = ""
            for j, tripleset in enumerate(triples):
                subj, rela, obj = tripleset['subject'], tripleset['property'], tripleset['object']
                rela_lst.append(rela)
                if i > 0:
                  temp_triples += ' | '

                temp_triples += '{} : {} : {}'.format(subj, rela, obj)

            temp_triples = temp_triples.strip()

            for sent in sents:
                if sent["comment"] == 'good':
                    full_tgt_lst.append(sent["lex"])
                    full_src_lst.append(temp_triples)
                    full_rela_lst.append(rela_lst)


        assert len(full_rela_lst) == len(full_src_lst)
        assert len(full_rela_lst) == len(full_tgt_lst)


        full_src_list = [unidecode.unidecode(sent) for sent in full_src_lst]
        full_tgt_list = [unidecode.unidecode(sent) for sent in full_tgt_lst]

        srcs = []
        tgts = []

        for src, tgt in zip(full_src_lst, full_tgt_lst):
            input = '{} {}'.format(src, eos_tok)
            target = '{} {}'.format(tgt, eos_tok)
            srcs.append(src)
            tgts.append(tgt)

        batch_encoding_src = tokenizer(srcs, add_special_tokens= True, is_split_into_words=False)
        batch_encoding_tgt = tokenizer(tgts, add_special_tokens= True, is_split_into_words=False)

        self.srcs = batch_encoding_src["input_ids"]
        self.labels = batch_encoding_tgt["input_ids"]

        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.srcs)

    # def __getitem__(self, i) -> torch.Tensor:
    def __getitem__(self, i):
        return self.srcs[i], self.labels[i]

    def collate_fn(self, batch):
      max_len_data=0
      max_len_label=0
      for description, target in batch:
          if len(description)>max_len_data: max_len_data=len(description)
          if len(target)>max_len_label: max_len_label=len(target)

      attn_masks=[]
      targets=[]
      descriptions=[]

      for description, target in batch:
          description.extend([self.tokenizer.pad_token_id]*(max_len_data-len(description)))
          descriptions.append(description)

          attn_mask=[int(e!=self.tokenizer.pad_token_id) for e in description]
          attn_masks.append(attn_mask)

          target.extend([-100]*(max_len_label-len(target)))
          targets.append(target)

      return torch.LongTensor(descriptions), torch.LongTensor(attn_masks), torch.LongTensor(targets)

In [8]:
def read_webnlg_files(path, tokenizer):
    file_dict = {}

    with open(path) as f:
        lines_dict = json.load(f)

    full_src_lst = []
    total_count = 0

    for i, example in enumerate(lines_dict['entries']):
        sents = example[str(i + 1)]['lexicalisations']
        triples = example[str(i + 1)]['modifiedtripleset']

        temp_triples = ""
        for j, tripleset in enumerate(triples):
            subj, rela, obj = tripleset['subject'], tripleset['property'], tripleset['object']

            if i > 0:
              temp_triples += ' | '

            temp_triples += '{} : {} : {}'.format(subj, rela, obj)

        temp_triples = temp_triples.strip()
        temp_triples = '{} {}'.format(temp_triples, tokenizer.eos_token)

        for sent in sents:
            if (temp_triples) not in file_dict:
                file_dict[temp_triples] = []
                full_src_lst.append(temp_triples)
            file_dict[temp_triples].append(sent["lex"])

    return file_dict


def write_e2e_corr(prompt_lst, file_dict, corr_path):
    with open(corr_path, 'w') as f:
        for x in prompt_lst:
            for line in file_dict[x]:
                if not line.strip():
                    print('PROBLEM', line,'PROBLEM',file_dict[x] )
                else:
                    print(line, file=f)
            print('', file=f)

def write_e2e_src(prompt_lst, corr_path):
    with open(corr_path, 'w') as f:
        for x in prompt_lst:
            print(x, file=f)
    return

In [9]:
!chmod +x /content/drive/MyDrive/web_nlg/evaluation/webnlg-automatic-evaluation/multi-bleu.perl

In [10]:
import copy
from torch.nn.utils.rnn import pad_sequence
import unidecode
class EvalTestDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """
    def __init__(self, tokenizer: T5Tokenizer, file_path: str):
        self.prompt_text_dict = read_webnlg_files(file_path, tokenizer)
        self.prompt_texts = list(self.prompt_text_dict.keys())
        for i in range(len(self.prompt_texts)):
            self.prompt_texts[i] = unidecode.unidecode(self.prompt_texts[i])

        self.prompt_texts = tokenizer(self.prompt_texts, add_special_tokens=True, truncation=False,
                                   is_split_into_words=False)['input_ids']
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.prompt_texts)

    def __get_dict__(self):
        return self.prompt_text_dict


    # def __getitem__(self, i) -> torch.Tensor:
    def __getitem__(self, i):
        return self.prompt_texts[i]

    def collate_fn(self, batch):
      max_len_data=0
      for description in batch:
          if len(description)>max_len_data: max_len_data=len(description)

      descriptions=[]
      attn_masks=[]
      for description in batch:
          description.extend([self.tokenizer.pad_token_id]*(max_len_data-len(description)))
          descriptions.append(description)

          attn_mask=[int(e!=self.tokenizer.pad_token_id) for e in description]
          attn_masks.append(attn_mask)

      return torch.LongTensor(descriptions), torch.LongTensor(attn_masks)


In [11]:
def freeze_params(model: nn.Module):
    """Set requires_grad=False for each of model.parameters()"""
    for par in model.parameters():
        par.requires_grad = False

In [14]:
from pickle import encode_long
import json
import yaml

import os
import logging
import numpy as np
import torch
from pathlib import Path
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers.configuration_t5 import T5Config
from transformers import T5Tokenizer, Adafactor, get_linear_schedule_with_warmup
from transformers.modeling_t5 import T5ForConditionalGeneration

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.manual_seed(101)

prefix_size = 15
batch_size = 5
learning_rate = 5e-5
epochs = 30
gradient_accumulation_steps = 1


config = T5Config.from_pretrained('t5-base')
config.use_prefix = True
config.preseqlen = prefix_size

    # Pre-Trained T5 Tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

    # Pre-Trained T5 Model
model = T5ForConditionalGeneration.from_pretrained('t5-base', config=config).to(device)
model.resize_token_embeddings(len(tokenizer))

freeze_params(model.shared)
for d in [model.encoder, model.decoder]:
      freeze_params(d.embed_tokens)

prefix_model = PrefixTuning(model.config, prefix_size).to(device)

    # Initialize datasets and dataloaders
dataset_train = LineByLineWebNLGTextDataset(
        tokenizer,
        "/content/drive/MyDrive/web_nlg/train.json",
        tokenizer.eos_token)
dataset_test = EvalTestDataset(tokenizer, "/content/drive/MyDrive/web_nlg/test.json")

dataloader_train = DataLoader(dataset_train, batch_size= batch_size, shuffle=True, collate_fn=dataset_train.collate_fn)
dataloader_test = DataLoader(dataset_test, batch_size= batch_size, shuffle=False, collate_fn=dataset_test.collate_fn)

total_training_steps = epochs * len(dataloader_train)
optimizer = Adafactor(prefix_model.parameters(),
                      lr=learning_rate,
                      scale_parameter=False,
                      relative_step=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=2000,
    num_training_steps=total_training_steps,
)

Init the T5ForConditionalGeneration Model with config.use_prefix=True, config.preseqlen=15




In [None]:
from pickle import encode_long
import json
import yaml

import os
import logging
import numpy as np
import torch
from pathlib import Path
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers.configuration_t5 import T5Config
from transformers import T5Tokenizer, Adafactor, get_linear_schedule_with_warmup
from transformers.modeling_t5 import T5ForConditionalGeneration



for epoch in range(epochs):
  prefix_model.train()
  epoch_loss = 0

  for step, (data, attention_mask, target) in enumerate(dataloader_train):
      data = data.to(device)
      attention_mask = attention_mask.to(device)
      target = target.to(device)

      prefix = prefix_model(batch_size = data.shape[0], device = device)

      outputs = model(input_ids=data, attention_mask=attention_mask, labels=target, past_key_values=prefix, use_cache = False, use_prefix  = True)

      loss = outputs[0]
      loss.backward()

      if (step + 1) % gradient_accumulation_steps == 0:

        optimizer.step()
        scheduler.step()

        optimizer.zero_grad()
        prefix_model.zero_grad()


      epoch_loss += loss.item() * gradient_accumulation_steps

  if (epoch % 4 == 0) and epoch != 0:
    with torch.no_grad():
      prefix_model.eval()

      pred_file = f'/content/drive/MyDrive/web_nlg/preds/base/model_{"base"}_lr{learning_rate}_prefixlen{prefix_size}_epoch{epoch}.txt'
      generated_seqs = []

      for step, (data, attention_mask) in enumerate(dataloader_test):
        data = data.to(device)
        attention_mask = attention_mask.to(device)

        prefix = prefix_model(batch_size=data.shape[0], device=device, sample_size = 5)
        outputs = model.generate(input_ids = data, attention_mask=attention_mask, early_stopping =  False, top_p = .9, num_beams=5, past_key_values = prefix, use_prefix = True, use_cache = True)
        output_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        for i in range(len(output_texts)):
          if (len(output_texts[i]) == 0):
            generated_seqs.append("UNKNOWN TOKENS")
          else:
            generated_seqs.append(output_texts[i])

      write_e2e_src(generated_seqs, pred_file)
      model_name = f'/content/drive/MyDrive/web_nlg/models/model_{"base"}_lr{learning_rate}_prefixlen{prefix_size}_epoch{epoch}.pt'
      output_file = f'/content/drive/MyDrive/web_nlg/eval/base/model_{"base"}_lr{learning_rate}_prefixlen{prefix_size}_epoch{epoch}.txt'
      name = f'model_{"base"}_lr{learning_rate}_prefixlen{prefix_size}_epoch{epoch}'

      !bash /content/drive/MyDrive/web_nlg/evaluation/run_eval_on_webnlg.sh {pred_file} {output_file} {name} {"test"}
      model_name = f'/content/drive/MyDrive/web_nlg/models/model_{"base"}_lr{learning_rate}_prefixlen{prefix_size}_epoch{epoch}.pt'
      torch.save(prefix_model.state_dict(), model_name)
  print("Epoch " + str(epoch) + " loss: " + str(epoch_loss/len(dataloader_train)))

Epoch 0 loss: 1.134943574799248
Epoch 1 loss: 0.7463782113195292
Epoch 2 loss: 0.6939649259994496
Epoch 3 loss: 0.6631703717243985
Files creating finished for:  model_base_lr5e-05_prefixlen100_epoch4
ALL:
SEEN:
UNSEEN:
Epoch 4 loss: 0.6402403124203795
Epoch 5 loss: 0.6198029185259061
Epoch 6 loss: 0.6031720535186392
Epoch 7 loss: 0.5873453777433268
Files creating finished for:  model_base_lr5e-05_prefixlen100_epoch8
ALL:
SEEN:
UNSEEN:
Epoch 8 loss: 0.5723025972403369
Epoch 9 loss: 0.5605976353819261
Epoch 10 loss: 0.5484320304182134


In [None]:
dataset_test = EvalTestDataset(tokenizer, "/content/drive/MyDrive/web_nlg/test.json")
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, collate_fn=dataset_test.collate_fn)
with torch.no_grad():
    pred_file = f'/content/drive/MyDrive/web_nlg/preds/TEST1.txt'
    generated_seqs = []
    file_dict = dataset_test.__get_dict__()

    for key in file_dict.keys():
        generated_seqs.append(file_dict[key][0])

    write_e2e_src(generated_seqs, pred_file)
    output_file = f'/content/drive/MyDrive/web_nlg/eval/TEST_SCRIPT'
    name = f'TEST'
    !bash /content/drive/MyDrive/web_nlg/evaluation/run_eval_on_webnlg.sh {pred_file} {output_file} {name} {"test"}

NameError: name 'EvalTestDataset' is not defined

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(101)
prefix_size = 50
batch_size = 5


tokenizer = T5Tokenizer.from_pretrained('t5-large')

model = T5ForConditionalGeneration.from_pretrained('t5-large').to(device)
model.resize_token_embeddings(len(tokenizer))
for param in model.parameters():
    param.requires_grad=False
dataset_test = EvalTestDataset(tokenizer, "/content/drive/MyDrive/web_nlg/test.json")
#dataset_test = LineByLineWebNLGTextDataset(
#        tokenizer,
#        "/content/drive/MyDrive/web_nlg/test.json",
#        "<start>",
#        tokenizer.eos_token)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, collate_fn=dataset_test.collate_fn)

prefix_model = PrefixTuning(model.config, prefix_size).to(device)
prefix_model.load_state_dict(torch.load("/content/drive/MyDrive/web_nlg/models/HELLOmodel_base_lr4e-5_prefixlen50_epoch3.pt"))
prefix_model.eval()

with torch.no_grad():
    pred_file = f'/content/drive/MyDrive/web_nlg/preds/base/MORR1.txt'
    generated_seqs = []
    gold = []

    for step, (data) in enumerate(dataloader_test):
      data = data.to(device)
      prefix = prefix_model(batch_size=data.shape[0], device=device)
      outputs = model.generate(data, num_beams=5, prompt=prefix, length_penalty = 1.2)
      output_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
      for i in range(len(output_texts)):
        if (len(output_texts[i]) == 0):
          generated_seqs.append("UNKNOWN TOKENS")
        else:
          generated_seqs.append(output_texts[i])

    write_e2e_src(generated_seqs, pred_file)
    output_file = f'/content/drive/MyDrive/web_nlg/eval/MORR1.txt'
    name = f'model_{"base"}_lr{.1}_prefixlen{prefix_size}_epoch{17}'
    !bash /content/drive/MyDrive/web_nlg/evaluation/run_eval_on_webnlg.sh {pred_file} {output_file} {name} {"test"}

Files creating finished for:  model_base_lr0.1_prefixlen50_epoch17
ALL:
SEEN:
UNSEEN:
