In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the dataset files under "My Drive" can be read below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Sanity check: list the ToTTo JSONL files available on the mounted Drive.
!ls "/content/drive/My Drive/totto_data"

totto_dev_data.jsonl  totto_train_data.jsonl  unlabeled_totto_test_data.jsonl


In [None]:
!cp "/content/drive/My Drive/totto_data/totto_train_data.jsonl" -r "totto"
!cp "/content/drive/My Drive/totto_data/totto_dev_data.jsonl" -r "totto"

In [1]:
# Pinned on purpose: the training code below uses this release's Lightning API
# (e.g. `trainer.proc_rank`, `early_stop_callback=`, `ModelCheckpoint(filepath=..., prefix=...)`).
# NOTE(review): presumably newer releases rename or remove these — confirm before unpinning.
!pip install pytorch_lightning=='0.7.5'

In [2]:
# Pinned on purpose: the code below passes `lm_labels=` to the T5 model and
# `pad_to_max_length=` to the tokenizer.
# NOTE(review): presumably later transformers releases changed these arguments — confirm before unpinning.
!pip install transformers=='2.9.0'

In [3]:
# Import libraries
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [None]:
# Record the installed torch version alongside the run for reproducibility.
torch.__version__

'1.6.0+cu101'

In [None]:
import copy
import json
from absl import app
from absl import flags
import pandas as pd
import six

In [None]:
# Read the training split as raw JSONL lines (one JSON document per line; parsed later).
# UTF-8 is requested explicitly so decoding does not depend on the platform's default encoding.
with open('totto/totto_train_data.jsonl', 'r', encoding='utf-8') as fd:
  trainDataLst = list(fd)

In [None]:
# Read the dev split as raw JSONL lines (one JSON document per line; parsed later).
# UTF-8 is requested explicitly so decoding does not depend on the platform's default encoding.
with open('totto/totto_dev_data.jsonl', 'r', encoding='utf-8') as fd:
  valDataLst = list(fd)

In [None]:
# Number of raw lines read for the train and dev splits.
len(trainDataLst), len(valDataLst)

(81710, 7700)

In [None]:
# Reference only: a bare list expression with no side effects.
# Presumably the top-level keys of one ToTTo example; several are read by get_highligted_data.
['table', 'table_webpage_url', 'table_page_title', 'table_section_title', 'table_section_text', 
 'highlighted_cells', 'example_id', 'sentence_annotations']

['table',
 'table_webpage_url',
 'table_page_title',
 'table_section_title',
 'table_section_text',
 'highlighted_cells',
 'example_id',
 'sentence_annotations']

In [None]:
def get_highligted_data(trainInput):
  """Flatten one parsed ToTTo example into the fields used to build a model input.

  Args:
    trainInput: dict parsed from one JSONL line. Reads the keys ``table_page_title``,
      ``table_section_title``, ``table_section_text``, ``table`` (indexed as
      ``table[row][col]['value']``), ``highlighted_cells`` (entries indexed as
      ``[row, col]``) and, when present, ``sentence_annotations``.

  Returns:
    A list of lists in this order: three ``[name, text]`` metadata pairs, one
    single-element ``[value]`` list per highlighted cell, and finally
    ``['final_sentence', target]``. ``target`` is the first annotation's
    ``final_sentence``, or ``''`` when the example has no annotations
    (presumably the case for the unlabeled test split), so the output has the
    same shape for every split.
  """
  table = trainInput['table']

  # Page/section metadata comes first, each value tagged with its field name.
  highCellVals = [
      [field, trainInput[field]]
      for field in ('table_page_title', 'table_section_title', 'table_section_text')
  ]

  # Highlighted cells are coordinates into the table; keep only the cell text.
  for cell in trainInput['highlighted_cells']:
    highCellVals.append([table[cell[0]][cell[1]]['value']])

  # Missing or empty annotations yield an empty target instead of a KeyError/IndexError.
  annotations = trainInput.get('sentence_annotations') or []
  target = annotations[0]['final_sentence'] if annotations else ''
  highCellVals.append(['final_sentence', target])

  return highCellVals

In [4]:
# Parse every raw training line and flatten it with get_highligted_data.
# NOTE(review): the [:-1] slice skips the last line read from the file — presumably to
# avoid a trailing/incomplete record; confirm, because it drops a valid example otherwise.
trainDataPrep1 = []

for i, entry in enumerate(trainDataLst[:-1]):
  # Report progress sparsely; printing every index floods the cell output.
  if i % 10000 == 0:
    print(i)
  trainDataPrep1.append(get_highligted_data(json.loads(entry)))

In [None]:
# Inspect one flattened example to check the structure produced by get_highligted_data.
trainDataPrep1[1]

[['table_page_title', 'List of Chicago Bears first-round draft picks'],
 ['table_section_title', 'Player selections'],
 ['table_section_text', ''],
 ['2018'],
 ['Roquan Smith'],
 ['Linebacker'],
 ['Georgia'],
 ['final_sentence',
  'The Chicago Bears recent first round selection (2018) was Roquan Smith, an inside linebacker from Georgia.']]

In [None]:
## CREATE TRAINING DATA
# Each flattened example becomes [prefix, '&&'-joined non-empty input values, target sentence].
# The last entry of an example holds the target; everything before it is input.
prefix = 'sent_generator'
trainDataLst = [
    [
        prefix,
        '&&'.join(field[-1] for field in example[:-1] if field[-1] != ''),
        example[-1][-1],
    ]
    for example in trainDataPrep1
]

In [None]:
#trainDF = pd.DataFrame(trainDataLst, columns = ['prefix','input','target'])

In [None]:
#trainDF.head(10)

In [5]:
# Parse every raw dev line and flatten it with get_highligted_data.
# NOTE(review): the [:-1] slice skips the last line read from the file — presumably to
# avoid a trailing/incomplete record; confirm, because it drops a valid example otherwise.
valDataPrep1 = []

for i, entry in enumerate(valDataLst[:-1]):
  # Report progress sparsely; printing every index floods the cell output.
  if i % 1000 == 0:
    print(i)
  valDataPrep1.append(get_highligted_data(json.loads(entry)))

In [None]:
# CREATE VALIDATION DATA
# Each flattened example becomes [prefix, '&&'-joined non-empty input values, target sentence].
# The last entry of an example holds the target; everything before it is input.
prefix = 'sent_generator'
valDataLst = [
    [
        prefix,
        '&&'.join(field[-1] for field in example[:-1] if field[-1] != ''),
        example[-1][-1],
    ]
    for example in valDataPrep1
]

#valDF = pd.DataFrame(valDataLst, columns = ['prefix','input','target'])

In [None]:
#valDF.head(2)

In [None]:
def set_seed(seed):
  """Seed Python's ``random``, NumPy and torch (plus all CUDA devices when available)."""
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

# Seed once at notebook start-up.
set_seed(42)

In [None]:
class T5FineTuner(pl.LightningModule):
  """LightningModule that wraps a pretrained T5 model and tokenizer for fine-tuning.

  Hyper-parameters are read from ``hparams`` by attribute. The data loaders are built
  through the module-level ``get_dataset`` helper.
  """

  def __init__(self, hparams):
    """Store ``hparams`` and load the pretrained model and tokenizer it names."""
    super(T5FineTuner, self).__init__()
    self.hparams = hparams
    
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
  
  def is_logger(self):
    """Return True when the trainer's ``proc_rank`` is <= 0."""
    return self.trainer.proc_rank <= 0
  
  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
  ):
    """Forward all arguments unchanged to the wrapped T5 model and return its output."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        lm_labels=lm_labels,
    )

  def _step(self, batch):
    """Run one forward pass on ``batch`` and return the first model output as the loss.

    Reads ``source_ids``, ``source_mask``, ``target_ids`` and ``target_mask`` from ``batch``.
    """
    lm_labels = batch["target_ids"]
    # Overwrite pad positions with -100. This is in place, so batch["target_ids"] changes too.
    # NOTE(review): presumably -100 is the label value the loss ignores — confirm for the pinned transformers release.
    lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        lm_labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    """Compute the loss for one training batch and expose it under ``train_loss`` for logging."""
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}
  
  def training_epoch_end(self, outputs):
    """Average the per-step ``loss`` values of the epoch and report them as ``avg_train_loss``."""
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    """Compute the loss for one validation batch."""
    loss = self._step(batch)
    return {"val_loss": loss}
  
  def validation_epoch_end(self, outputs):
    """Average the per-step ``val_loss`` values; logged under the key ``val_loss``."""
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    """Create the AdamW optimizer, keep it on ``self.opt`` and return it in a list.

    The learning-rate schedule is not created here; ``train_dataloader`` builds it from ``self.opt``.
    """

    model = self.model
    # Parameters whose name contains one of these substrings go into the group with weight_decay 0.0.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]
  
  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
    """Step the optimizer, clear its gradients, then step the LR scheduler."""
    if self.trainer.use_tpu:
      # NOTE(review): `xm` is not defined or imported anywhere in the visible notebook, so this
      # branch would raise NameError — presumably torch_xla's xla_model was intended; confirm.
      xm.optimizer_step(optimizer)
    else:
      optimizer.step()
    optimizer.zero_grad()
    self.lr_scheduler.step()
  
  def get_tqdm_dict(self):
    """Return the progress-bar entries: formatted average loss and the scheduler's last LR."""
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the shuffled training DataLoader and create the linear LR scheduler.

    The scheduler is attached to ``self.opt``, which is only set in ``configure_optimizers``;
    this method therefore assumes that hook has already run — TODO confirm call order.
    """
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    # Total scheduler steps: dataset size // (batch size * device count) // accumulation
    # steps, times the number of epochs. The float() makes t_total a float.
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the validation DataLoader (no shuffling) from the "val" dataset."""
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [None]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Lightning callback that logs ``trainer.callback_metrics`` after validation and test runs."""

  @staticmethod
  def _metric_lines(metrics):
    """Lazily yield one "key = value" line per metric, sorted by key, skipping bookkeeping keys."""
    for name in sorted(metrics):
      if name in ["log", "progress_bar"]:
        continue
      yield "{} = {}\n".format(name, str(metrics[name]))

  def on_validation_end(self, trainer, pl_module):
    """Log the header, then (when ``pl_module.is_logger()``) every reported metric."""
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return

    # Log results
    for line in self._metric_lines(trainer.callback_metrics):
      logger.info(line)

  def on_test_end(self, trainer, pl_module):
    """Log the header, then (when ``pl_module.is_logger()``) log and save every reported metric."""
    logger.info("***** Test results *****")
    if not pl_module.is_logger():
      return

    metrics = trainer.callback_metrics

    # Log and save results to file
    results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(results_path, "w") as writer:
      for line in self._metric_lines(metrics):
        logger.info(line)
        writer.write(line)

In [None]:
# Default hyper-parameters for the run; converted to an argparse.Namespace before training.
args_dict = {
    "data_dir": "",  # path for data files
    "output_dir": "",  # path to save the checkpoints
    "model_name_or_path": 't5-base',
    "tokenizer_name_or_path": 't5-base',
    "max_seq_length": 512,
    "learning_rate": 1e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 2,
    "eval_batch_size": 2,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "early_stop_callback": False,
    # if you want to enable 16-bit training then install apex and set this to true
    "fp_16": False,
    # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    "max_grad_norm": 1.0,
    "seed": 42,
}

In [None]:
class tottoDataset(Dataset):
  """Torch dataset that tokenizes (input, target) text pairs once, at construction time.

  Args:
    tokenizer: object exposing ``batch_encode_plus``; its result must be indexable by
      ``"input_ids"`` and ``"attention_mask"``.
    dataLst: sequence of ``[prefix, input_text, target_text]`` entries; only indices
      1 and 2 are read.
    type_path: split name. Kept for interface compatibility; not used by this class.
    max_len: value passed to the tokenizer as ``max_length`` for both inputs and targets.
  """

  def __init__(self, tokenizer, dataLst, type_path,  max_len=512):
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []   # one tokenizer result per input text
    self.targets = []  # one tokenizer result per target text, aligned with self.inputs
    self.dataLst = dataLst
    self._build()

  def __len__(self):
    """Number of tokenized examples."""
    return len(self.inputs)

  def __getitem__(self, index):
    """Return the ids and attention masks of example ``index``, each passed through ``squeeze()``."""
    source = self.inputs[index]
    target = self.targets[index]
    return {
        "source_ids": source["input_ids"].squeeze(),
        "source_mask": source["attention_mask"].squeeze(),
        "target_ids": target["input_ids"].squeeze(),
        "target_mask": target["attention_mask"].squeeze(),
    }

  def _build(self):
    """Populate ``self.inputs`` / ``self.targets`` from ``self.dataLst``."""
    self._buil_examples_from_files()

  def _encode(self, text):
    """Tokenize a single string as a batch of one, padded to ``self.max_len``, as "pt" tensors."""
    return self.tokenizer.batch_encode_plus(
        [text], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
    )

  def _buil_examples_from_files(self):
    """Tokenize every entry of ``self.dataLst`` (despite the name, no files are read)."""
    for text in self.dataLst:
      # The " </s>" marker is appended to both sides explicitly before tokenizing.
      line = text[1].strip() + ' </s>'
      target = text[2] + " </s>"

      tokenized_inputs = self._encode(line)
      tokenized_targets = self._encode(target)

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

In [None]:
# trainDF = pd.DataFrame(trainDataLst, columns = ['prefix','input','target'])
# valDF = pd.DataFrame(valDataLst, columns = ['prefix','input','target'])

In [None]:
#tokenizer = T5Tokenizer.from_pretrained('t5-base')

In [None]:
#dataset = tottoDataset(tokenizer, valDataLst, type_path='test')
#len(dataset)

In [None]:
#len(dataset), len(valDataLst)

In [None]:
# data = dataset[10]
# print(tokenizer.decode(data['source_ids']))
# print(tokenizer.decode(data['target_ids']))

In [None]:
# Create the directory used as `output_dir` for checkpoints (no error if it already exists).
!mkdir -p t5_totto

In [None]:
# Final run configuration: set the output directory and epoch count, then freeze into a namespace.
args_dict.update(output_dir='t5_totto', num_train_epochs=2)
args = argparse.Namespace(**args_dict)

# Checkpointing is driven by the logged "val_loss" (mode "min", save_top_k=5).
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir,
    prefix="checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=5,
)

# Keyword arguments for pl.Trainer, derived from `args`.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "early_stop_callback": False,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
    "checkpoint_callback": checkpoint_callback,
    "callbacks": [LoggingCallback()],
}

In [None]:
def get_dataset(tokenizer, type_path, args):
  """Build the ``tottoDataset`` for the requested split.

  Args:
    tokenizer: tokenizer forwarded to ``tottoDataset``.
    type_path: ``"train"`` or ``"val"``; selects the module-level ``trainDataLst`` or
      ``valDataLst`` respectively.
    args: object with a ``max_seq_length`` attribute.

  Returns:
    A ``tottoDataset`` over the rows of the selected split.

  Raises:
    ValueError: if ``type_path`` is neither ``"train"`` nor ``"val"``.
  """
  # The lists are looked up at call time so the rows built in earlier cells are used.
  # Validation must read valDataLst: using the training rows for every split makes
  # val_loss (and the checkpoint selection based on it) meaningless.
  if type_path == "train":
    dataLst = trainDataLst
  elif type_path == "val":
    dataLst = valDataLst
  else:
    raise ValueError("unknown type_path {!r}; expected 'train' or 'val'".format(type_path))
  return tottoDataset(tokenizer=tokenizer, dataLst=dataLst, type_path=type_path, max_len=args.max_seq_length)

In [None]:
# Instantiate the Lightning module; this loads the pretrained model and tokenizer named in `args`.
model = T5FineTuner(args)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b
INFO:transformers.configuration_utils:Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
   

In [None]:
# Build the Lightning trainer from the keyword arguments collected in `train_params`.
trainer = pl.Trainer(**train_params)

INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


In [None]:
# Run fine-tuning; the data loaders come from the module's train_dataloader/val_dataloader hooks.
trainer.fit(model)

INFO:lightning:
    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5ForConditionalGeneration | 222 M 
1   | model.shared                                                          | Embedding                  | 24 M  
2   | model.encoder                                                         | T5Stack                    | 109 M 
3   | model.encoder.block                                                   | ModuleList                 | 84 M  
4   | model.encoder.block.0                                                 | T5Block                    | 7 M   
5   | model.encoder.block.0.layer                                           | ModuleList                 | 7 M   
6   | model.encoder.block.0.layer.0                                     

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



In [None]:
!mkdir t5_base_totto_pre_trained

In [None]:
## save the model this way so next time you can load it using T5ForConditionalGeneration.from_pretrained
# Save into the directory meant for this run. The previous target, 't5_base_imdb_sentiment',
# is never created in this notebook and names an unrelated task.
# NOTE(review): presumably the pinned transformers release requires the directory to exist
# before save_pretrained is called — hence the explicit makedirs; confirm.
save_dir = 't5_base_totto_pre_trained'
os.makedirs(save_dir, exist_ok=True)
model.model.save_pretrained(save_dir)

In [None]:
# Read the unlabeled test split as raw JSONL lines (one JSON document per line).
# UTF-8 is requested explicitly so decoding does not depend on the platform's default encoding.
with open('totto/unlabeled_totto_test_data.jsonl', 'r', encoding='utf-8') as fd:
  testDataLst = list(fd)

In [None]:
# Inspect the first raw test line (still an unparsed JSON string at this point).
testDataLst[0]

'{"table": [[{"value": "nuclide symbol", "is_header": true, "column_span": 1, "row_span": 2}, {"value": "Z(p)", "is_header": true, "column_span": 1, "row_span": 1}, {"value": "N(n)", "is_header": true, "column_span": 1, "row_span": 1}, {"value": "isotopic mass (u)", "is_header": true, "column_span": 1, "row_span": 1}, {"value": "half-life", "is_header": true, "column_span": 1, "row_span": 2}, {"value": "decay mode(s)", "is_header": true, "column_span": 1, "row_span": 2}, {"value": "daughter isotope(s)", "is_header": true, "column_span": 1, "row_span": 2}, {"value": "nuclear spin and parity", "is_header": true, "column_span": 1, "row_span": 2}, {"value": "representative isotopic composition (mole fraction)", "is_header": true, "column_span": 1, "row_span": 2}, {"value": "range of natural variation (mole fraction)", "is_header": true, "column_span": 1, "row_span": 2}], [{"value": "excitation energy", "is_header": true, "column_span": 3, "row_span": 1}], [{"value": "14F", "is_header": fal