In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
!pip install transformers datasets pytorch_lightning rouge_score nltk
import os
import sys
import numpy as np
import transformers
import datasets
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import re
from torch.nn import CrossEntropyLoss, MultiheadAttention
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import pytorch_lightning as pl
from datasets import load_dataset, load_metric, concatenate_datasets
from transformers import (
     AdamW,
     T5ForConditionalGeneration,
     T5Tokenizer,
     get_linear_schedule_with_warmup,
     AlbertConfig,
     AlbertModel,
     AlbertTokenizer,
     AlbertTokenizerFast,
     get_constant_schedule_with_warmup,
     get_cosine_schedule_with_warmup,
 )
from transformers.modeling_outputs import MultipleChoiceModelOutput
from functools import partial
from typing import Optional, Dict
from rouge_score import rouge_scorer
from utils import load_data, load_data_workhorse

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
torch.cuda.is_available()

True

# Load TOEFL dataset

In [3]:
train_data, dev_data, test_data = load_data("/content/gdrive/MyDrive/TOEFL-QA-master/data")

In [4]:
def convert_form(data_dict):
    new_dict = {}
    new_dict["sentences_word"] = [] #each word is treated spereately and sentence is viewed as a list
    new_dict["sentences_join"] = [] #words in sentence are joined, each sentence is viewed as a string
    new_dict["questions_word"] = []
    new_dict["questions_join"] = []
    new_dict["answers_word"] = []
    new_dict["answers_join"] = []
    new_dict["options_word"] = [] #should be a list of list, word-level
    new_dict["options_join"] = []
    new_dict["context"] = []
    for file in data_dict.keys():
        #can also have word-based version
        new_dict["sentences_word"].append(data_dict[file]["sentences"])
        sentences_list = [(lambda words_list: ' '.join(words_list)) (words_list) for words_list in data_dict[file]["sentences"]]
        new_dict["sentences_join"].append(sentences_list)
        new_dict["context"] = [(lambda sentence_list: ' '.join(sentence_list)) (sentence_list) for sentence_list in new_dict["sentences_join"]]
        
        new_dict['questions_join'].append(" ".join(data_dict[file]["question"]))
        new_dict['questions_word'].append(data_dict[file]["question"])
        
        new_dict['answers_join'].append(" ".join(data_dict[file]["answer"]))
        new_dict['answers_word'].append(data_dict[file]["answer"])
        
        new_dict["options_join"].append([(lambda words_list: ' '.join(words_list)) (words_list) for words_list in data_dict[file]["options"]])
        new_dict["options_word"].append(data_dict[file]["options"])
    return new_dict

In [5]:
train_transform = convert_form(train_data)
dev_transform = convert_form(dev_data)
test_transform = convert_form(test_data)

In [6]:
f = open("/content/gdrive/MyDrive/RACE/train_race_middle.json", 'r')
train_race_middle = json.load(f)
f = open("/content/gdrive/MyDrive/RACE/train_race_high.json", 'r')
train_race_high = json.load(f)
f = open("/content/gdrive/MyDrive/RACE/val_race_high.json", 'r')
val_race_high = json.load(f)
f = open("/content/gdrive/MyDrive/RACE/val_race_middle.json", 'r')
val_race_middle = json.load(f)

In [7]:
train_transform['questions_join'][0]

'based on the conversation , what can be conducted about dialect accommodation'

In [8]:
train_race_middle['questions_join'][0]

'What does Robbie want to do on the rainy day ? '

In [9]:
def merge_datasets(datasets):
  new_dict = {}
  for key in datasets[0].keys():
    new_dict[key] = []
    for d in datasets:
      assert(key in d)
      new_dict[key] += d[key]
  return new_dict

In [10]:
combined_train = merge_datasets([train_transform, train_race_middle, train_race_high])
combined_val = merge_datasets([dev_transform, val_race_middle, val_race_high])
train = datasets.Dataset.from_dict(combined_train)
val = datasets.Dataset.from_dict(combined_val)
test = datasets.Dataset.from_dict(test_transform)

# tokenize the input

In [11]:
from transformers import AutoTokenizer, T5Tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_4

In [12]:
train

Dataset({
    features: ['sentences_word', 'sentences_join', 'questions_word', 'questions_join', 'answers_word', 'answers_join', 'options_word', 'options_join', 'context'],
    num_rows: 88583
})

# Tokenize And Encode Answer As Target

In [13]:
answer_encoding = tokenizer(
     train['answers_join'],
     max_length=24,
     padding='max_length',
     truncation=True,
     return_attention_mask=True,
 )
 
labels = answer_encoding["input_ids"]

In [14]:
#define pytorch dataset for TOEFL-QA
class BioQADataset(Dataset):
    def __init__(
        self,
        dataframe,
        tokenizer:T5Tokenizer,
        source_max_token_len: int = 396,
        source_stride: int = 128,
        target_max_token_len: int = 32,
    ):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.source_max_token_len =  source_max_token_len
        self.source_stride = source_stride
        self.target_max_token_len =  target_max_token_len
        
        #tokenize input
        self.source_encoding = tokenizer(
            self.dataframe["questions_join"],
            self.dataframe["context"],
            truncation="only_second",
            max_length=self.source_max_token_len,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_attention_mask=True,
            stride=self.source_stride,
            padding="max_length",
            return_tensors="pt",
        )
        
        #tokenize output
        self.target_encoding = tokenizer(
            self.dataframe["answers_join"],
            truncation=True,
            max_length=self.target_max_token_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt",
        )
        #this is the mapping from indices of samples to indices of question
        self.sample2indices = self.source_encoding.pop("overflow_to_sample_mapping")
        
    def __len__(self):
        return(len(self.source_encoding))
    
    def __getitem__(self, idx: int):
        label_idx = self.sample2indices[idx] # index of corresponding label
        labels = self.target_encoding['input_ids'][label_idx]
        labels[labels==0] = -100
        
        return dict(
            question=self.dataframe[idx]["questions_join"],
            context=self.dataframe[idx]['context'],
            answer=self.dataframe[idx]['answers_join'],
            input_ids=self.source_encoding["input_ids"][idx].flatten(),
            attention_mask=self.source_encoding['attention_mask'][idx].flatten(),
            labels=labels.flatten(),
        )

In [15]:
class BioDataModule(pl.LightningDataModule):
    def __init__(
        self,
        train_data,
        dev_data,
        test_data,
        tokenizer:T5Tokenizer,
        batch_size: int = 8,
        source_max_token_len: int = 396,
        source_stride: int = 128,
        target_max_token_len: int = 32,
    ):
        super().__init__()
        self.train_data = train_data
        self.dev_data = dev_data
        self.test_data = test_data
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.source_stride = source_stride
        self.target_max_token_len = target_max_token_len
        
    def setup(self):
        self.train_dataset = BioQADataset(
            self.train_data,
            self.tokenizer,
            self.source_max_token_len,
            self.source_stride,
            self.target_max_token_len,
        )
        self.dev_dataset = BioQADataset(
            self.dev_data,
            self.tokenizer,
            self.source_max_token_len,
            self.source_stride,
            self.target_max_token_len,
        )
        self.test_dataset = BioQADataset(
            self.test_data,
            self.tokenizer,
            self.source_max_token_len,
            self.source_stride,
            self.target_max_token_len,
        )
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4,
        )
    def val_dataloader(self):
        return DataLoader(
            self.dev_dataset,
            batch_size=self.batch_size,
            num_workers=4,
        )
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4,
        )

In [16]:
BATCH_SIZE = 4
N_EPOCHS = 6
data_module = BioDataModule(train, val, test, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

In [17]:
data_module

<__main__.BioDataModule at 0x7fe5a8f820d0>

In [18]:
#model definition
class QAModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-small", return_dict=True)
    
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids, 
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions":outputs, "labels": labels}
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=0.0001)
        return optimizer


# genrating answer

In [19]:
model = QAModel()

trainer = pl.Trainer(
     max_epochs=20,
     gpus=0,
     progress_bar_refresh_rate = 30
 )
trainer.fit(model, data_module)

  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  "GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`."
  f"DataModule.{name} has already been called, so it will not be called again. "

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# question answering example

In [20]:
i = 3

source_encoding=tokenizer(
       test[i]["questions_join"],
       test[i]['context'],
       max_length = 396,
       padding="max_length",
       truncation="only_second",
       return_attention_mask=True,
       add_special_tokens=True,
       return_tensors="pt"
   )
generated_ids = model.model.generate(
       input_ids=source_encoding["input_ids"],
       attention_mask=source_encoding["attention_mask"],
       num_beams=1,  # greedy search
       max_length=24,
       repetition_penalty=2.5,
       early_stopping=True,
       use_cache=True	
)

In [21]:
preds = [tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for generated_id in generated_ids]

In [22]:
print(" ".join(preds))

paul i mean, will it cover everything we've done since the mid term exam


In [23]:
print(test[i]["questions_join"])
print(test[i]["context"])
print(test[i]["answers_join"])

what aspect of the hydrologic cycle is the student confused about
hi . paul . what can i do for you ? i have a question about the final exam . i mean , will it cover everything we've done all term ? or just what we've been doing since the mid term exam . everything we've done all term . oh , boy . you know , i am still not too clear about the hydrologic cycle , um , the transfer of water back and forth between the earth and the atmosphere . i really blew the question about it on the mid term exam . i want to do better on the final exam . but i am still having trouble with it . well , uh , have you been to the tutoring center ? no , not for geography anyway . isn't that just for when you need help with writing , like an essay or a research paper . oh , no . you can get tutoring in a lot of subjects . some graduate students from this department tutor there . that's good to know . but i hardly go there because i have a part time job . i never seem to be free when they are open . well , th