In [18]:
import os
import sys
import numpy as np
import transformers
import datasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

In [19]:
from datasets import load_dataset, load_metric
from transformers import (
     AdamW,
     T5ForConditionalGeneration,
     T5Tokenizer,
     get_linear_schedule_with_warmup,
 )

# Load, Preprocess Data

In [12]:
# Add the local ./TOEFL-QA directory to the import path so that `utils` can be
# imported from it (presumably the loader module shipped with that checkout).
# NOTE(review): the path is relative to the notebook's working directory, and
# `load_data_workhorse` is not used by any cell visible in this notebook.
sys.path.append("./TOEFL-QA")
from utils import load_data, load_data_workhorse

In [13]:
# Load the three splits. Each one is later passed to convert_form, which reads
# it as a mapping: file id -> record with "sentences", "question", "answer"
# and "options" entries.
train_data, dev_data, test_data = load_data("TOEFL-QA/data")

In [14]:
def convert_form(data_dict):
    """Convert a TOEFL-QA split into a column-oriented dict of lists.

    Args:
        data_dict: Mapping from a file identifier to a record with the keys
            "sentences" and "options" (each a list of word lists) and
            "question" and "answer" (each a list of words).

    Returns:
        A dict with one list per column; every list has one entry per record,
        in the iteration order of ``data_dict``:

        * ``*_word`` columns keep the original word-level lists.
        * ``sentences_join`` / ``options_join`` hold one space-joined string
          per sentence / option.
        * ``questions_join`` / ``answers_join`` hold one space-joined string.
        * ``context`` holds the whole passage: all joined sentences of the
          record joined again with single spaces.
    """
    # Column order is kept stable because it becomes the column order of the
    # dataset built from this dict.
    column_names = (
        "sentences_word",
        "sentences_join",
        "questions_word",
        "questions_join",
        "answers_word",
        "answers_join",
        "options_word",
        "options_join",
        "context",
    )
    new_dict = {name: [] for name in column_names}

    for record in data_dict.values():
        sentences = [" ".join(words) for words in record["sentences"]]
        new_dict["sentences_word"].append(record["sentences"])
        new_dict["sentences_join"].append(sentences)
        # Append this record's passage once, instead of rebuilding the whole
        # "context" column from scratch on every iteration.
        new_dict["context"].append(" ".join(sentences))

        new_dict["questions_join"].append(" ".join(record["question"]))
        new_dict["questions_word"].append(record["question"])

        new_dict["answers_join"].append(" ".join(record["answer"]))
        new_dict["answers_word"].append(record["answer"])

        new_dict["options_join"].append([" ".join(words) for words in record["options"]])
        new_dict["options_word"].append(record["options"])
    return new_dict

In [26]:
# Reshape the training split into columns and wrap it in a `datasets.Dataset`
# (the class from the `datasets` library, not torch.utils.data.Dataset).
train_transform = convert_form(train_data)
train = datasets.Dataset.from_dict(train_transform) # `datasets.Dataset`, one row per question

In [22]:
# Same conversion for the dev split.
dev_transform = convert_form(dev_data)
dev = datasets.Dataset.from_dict(dev_transform) # `datasets.Dataset`, one row per question

In [23]:
# Same conversion for the test split.
test_transform = convert_form(test_data)
test = datasets.Dataset.from_dict(test_transform)

In [27]:
# Sanity check: the first training question as a single space-joined string.
train[0]["questions_join"]

'why does the student go to the career services office'

In [28]:
# Sanity check: the reference answer for the first training question.
train[0]["answers_join"]

'to find out he is allowed to attend the career fair'

In [29]:
# Sanity check: the full passage (all joined sentences concatenated) for the
# first training question.
train[0]["context"]

"hi , do you have a minute ? sure , how can i help you ? i have a couple of questions about the career fair next week . um well , are seniors the only ones who can go ? i mean , you know , they are finishing school this year and getting their degrees and everything . and , well , it seems like businesses would wanna talk to them and not first year students like me . no , no , the career fair is opened to all our students and we encourage anyone who's interested to go check it out . well , that's good to know . you've seen the flyers and posters around campus , i assume . sure , can't miss them . i mean , they all say where and when the fair is , just not who should attend . actually they do , but it's in the small print . uh , we should probably make that part easier to reach , shouldn't we ? i'll make a note of that right now . so , do you have any other questions ? yes , actually i do now . um since i'd only be going to familiarize myself with the process , you know , check it out , 

In [30]:
# Sanity check: the same passage as a list with one string per sentence.
train[0]["sentences_join"]

['hi , do you have a minute ?',
 'sure , how can i help you ?',
 'i have a couple of questions about the career fair next week .',
 'um well , are seniors the only ones who can go ?',
 'i mean , you know , they are finishing school this year and getting their degrees and everything .',
 'and , well , it seems like businesses would wanna talk to them and not first year students like me .',
 "no , no , the career fair is opened to all our students and we encourage anyone who's interested to go check it out .",
 "well , that's good to know .",
 "you've seen the flyers and posters around campus , i assume .",
 "sure , can't miss them .",
 'i mean , they all say where and when the fair is , just not who should attend .',
 "actually they do , but it's in the small print .",
 "uh , we should probably make that part easier to reach , shouldn't we ?",
 "i'll make a note of that right now .",
 'so , do you have any other questions ?',
 'yes , actually i do now .',
 "um since i'd only be going to

# tokenize the input

In [33]:
# Load the tokenizer for the "t5-small" checkpoint. The printed repr below
# shows that AutoTokenizer returns a PreTrainedTokenizerFast with
# model_max_len=512.
# NOTE(review): T5Tokenizer is already imported in the imports cell at the top.
from transformers import AutoTokenizer, T5Tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_4

In [34]:
# The question that is tokenized together with its passage in the next cell.
train[0]["questions_join"]

'why does the student go to the career services office'

In [35]:
# Tokenize one (question, context) pair without truncation to see how long it
# is. The warning recorded below reports 841 tokens against the model's
# 512-token limit, which motivates the windowed tokenization that follows.
encode = tokenizer(train[0]["questions_join"], train[0]['context'])['input_ids']

Token indices sequence length is longer than the specified maximum sequence length for this model (841 > 512). Running this sequence through the model will result in indexing errors


In [36]:
# Token budget for one (question, context) window in the tokenization demo.
# NOTE(review): BioQADataset defaults to source_max_token_len=396 instead;
# confirm which value is intended.
max_length = 384

In [37]:
# Tokenize every (question, context) pair of the training set into fixed-size
# windows. Only the context (second sequence) is truncated; the tokens that do
# not fit are emitted as additional windows that overlap the previous one by
# `stride` tokens, so one training example yields several rows.
tokenized_trainset = tokenizer(
    train["questions_join"],
    train["context"],
    truncation="only_second",
    max_length=max_length,  # use the constant from the previous cell, not a second literal
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    stride=128,
    padding="max_length",
)

In [38]:
# Detach the window -> source-example index list from the encoding.
# NOTE: `pop` removes the key, so re-running this cell without re-running the
# tokenization cell above raises KeyError.
sample_mapping = tokenized_trainset.pop("overflow_to_sample_mapping") 

In [39]:
# Entry i is the index of the training example that tokenized window i was cut
# from; an index repeats once per window produced for that example.
sample_mapping #map from window index to the index of the originating question

[0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 7,
 7,
 7,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 13,
 13,
 13,
 14,
 14,
 14,
 15,
 15,
 15,
 15,
 15,
 16,
 16,
 16,
 16,
 16,
 17,
 17,
 17,
 17,
 17,
 18,
 18,
 19,
 19,
 19,
 20,
 20,
 20,
 21,
 21,
 21,
 21,
 21,
 22,
 22,
 22,
 22,
 23,
 23,
 23,
 23,
 23,
 24,
 24,
 24,
 25,
 25,
 25,
 25,
 26,
 26,
 26,
 27,
 27,
 27,
 27,
 28,
 28,
 28,
 29,
 29,
 29,
 29,
 29,
 30,
 30,
 30,
 30,
 31,
 31,
 31,
 32,
 32,
 32,
 32,
 32,
 33,
 33,
 33,
 33,
 34,
 34,
 34,
 34,
 34,
 35,
 35,
 35,
 35,
 36,
 36,
 36,
 36,
 36,
 37,
 37,
 37,
 37,
 37,
 38,
 38,
 38,
 39,
 39,
 39,
 40,
 40,
 40,
 40,
 40,
 41,
 41,
 41,
 41,
 41,
 42,
 42,
 42,
 43,
 43,
 43,
 43,
 43,
 44,
 44,
 44,
 45,
 45,
 45,
 45,
 45,
 46,
 46,
 46,
 46,
 46,
 47,
 47,
 47,
 47,
 47,
 48,
 48,
 48,
 49,
 49,
 49,
 50,
 50,
 50,
 50,
 50,
 51,
 51,
 5

In [42]:
# Decode window 2. According to sample_mapping it still belongs to training
# example 0, so it shows the same question followed by a later slice of the
# passage.
tokenizer.decode(tokenized_trainset["input_ids"][2]) #example question and context pair

"why does the student go to the career services office</s>, i haven't declared a major yet, but i'm strongly considering accounting. see, that's part of the reason i wanna go to the fair, to help me decide if that's what i really want to study. that's very wise. well, i suggest that you get on the computer and learn more about the accounting companies in particular that would be attending. you can learn a lot about companies from their internet websites. then prepare a list of questions. questions, hmm so, in a way, i'll be interviewing them? that's one way of looking at it. think about it for a second. what do you want to know about working for an accounting firm? well, there is the job itself, and salary of course, and working conditions, i mean, would i have an office, or would i work in a big room with a zillion other employees, and and maybe about opportunities for advancement. see? those're all important things to know. after you do some research, you'll be able to tailor your qu

# Tokenize And Encode Answer As Target

In [44]:
# Tokenize the reference answers as generation targets, padded or truncated to
# 24 tokens each.
# NOTE(review): BioQADataset re-tokenizes the answers itself with
# target_max_token_len=32, and `labels` is not read by any later cell visible
# here, so this cell appears to be exploratory only.
answer_encoding = tokenizer(
     train['answers_join'],
     max_length=24,
     padding='max_length',
     truncation=True,
     return_attention_mask=True,
 )
 
# One list of token ids per answer (no return_tensors was requested).
labels = answer_encoding["input_ids"]

In [45]:
#define pytorch dataset for TOEFL-QA
class BioQADataset(Dataset):
    """Torch dataset of windowed (question, context) inputs with answer targets.

    Every row of ``dataframe`` is tokenized as a question/context pair. A
    context that does not fit into ``source_max_token_len`` tokens is split
    into several overlapping windows, so the dataset has one item per
    *window*, not per row. All windows cut from the same row share that row's
    answer as target.

    Args:
        dataframe: Table supporting column access by name
            (``dataframe["context"]``) and row access by integer
            (``dataframe[i]["context"]``), with the columns
            "questions_join", "context" and "answers_join".
        tokenizer: Callable tokenizer. It is asked for overflowing tokens and
            an offsets mapping; NOTE(review): in the transformers library this
            presumably requires a "fast" tokenizer - confirm.
        source_max_token_len: Length of each padded input window.
        source_stride: Number of tokens two consecutive windows overlap by.
        target_max_token_len: Length of each padded answer sequence.
    """

    def __init__(
        self,
        dataframe,
        tokenizer: "T5Tokenizer",
        source_max_token_len: int = 396,
        source_stride: int = 128,
        target_max_token_len: int = 32,
    ):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.source_stride = source_stride
        self.target_max_token_len = target_max_token_len

        # Tokenize inputs: only the context (second sequence) is truncated;
        # the overflow becomes additional windows.
        self.source_encoding = tokenizer(
            self.dataframe["questions_join"],
            self.dataframe["context"],
            truncation="only_second",
            max_length=self.source_max_token_len,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_attention_mask=True,
            stride=self.source_stride,
            padding="max_length",
            return_tensors="pt",
        )

        # Tokenize targets: one padded sequence per row of `dataframe`.
        self.target_encoding = tokenizer(
            self.dataframe["answers_join"],
            truncation=True,
            max_length=self.target_max_token_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt",
        )
        # Maps each window index to the index of the row it was cut from.
        self.sample2indices = self.source_encoding.pop("overflow_to_sample_mapping")

    def __len__(self):
        """Return the number of tokenized windows."""
        # len() of the encoding itself counts its keys, not its rows, so the
        # length has to be taken from one of the per-window fields.
        return len(self.source_encoding["input_ids"])

    def __getitem__(self, idx: int):
        """Return window ``idx`` together with the data of its source row.

        Returns:
            dict with the row's "question", "context" and "answer" strings,
            the window's "input_ids" and "attention_mask", and "labels": the
            answer token ids with padding positions replaced by -100.
        """
        # Row of `dataframe` (and of the target encoding) this window came from.
        sample_idx = int(self.sample2indices[idx])

        # Clone so that masking the padding does not write into the cached
        # target tensor.
        labels = self.target_encoding['input_ids'][sample_idx].clone()
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0  # the value this class hard-coded previously
        labels[labels == pad_id] = -100

        # Text fields are looked up by row index, not by window index.
        row = self.dataframe[sample_idx]
        return dict(
            question=row["questions_join"],
            context=row['context'],
            answer=row['answers_join'],
            input_ids=self.source_encoding["input_ids"][idx].flatten(),
            attention_mask=self.source_encoding['attention_mask'][idx].flatten(),
            labels=labels.flatten(),
        )

In [49]:
class BioDataModule(pl.LightningDataModule):
    """Lightning data module that serves the train/dev/test splits.

    Each split is wrapped in a ``BioQADataset`` with shared tokenization
    settings and exposed through the standard dataloader hooks.

    Args:
        train_data, dev_data, test_data: The three splits, in the form
            ``BioQADataset`` expects for its ``dataframe`` argument.
        tokenizer: Tokenizer passed through to ``BioQADataset``.
        batch_size: Batch size of the train and validation loaders (the test
            loader always uses a batch size of 1).
        source_max_token_len: Length of each padded input window.
        source_stride: Overlap between consecutive input windows.
        target_max_token_len: Length of each padded answer sequence.
        num_workers: Worker processes used by every DataLoader.
    """

    def __init__(
        self,
        train_data,
        dev_data,
        test_data,
        tokenizer:T5Tokenizer,
        batch_size: int = 8,
        source_max_token_len: int = 396,
        source_stride: int = 128,
        target_max_token_len: int = 32,
        num_workers: int = 4,
    ):
        super().__init__()
        self.train_data = train_data
        self.dev_data = dev_data
        self.test_data = test_data
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.source_stride = source_stride
        self.target_max_token_len = target_max_token_len
        self.num_workers = num_workers

    def _build_dataset(self, split):
        """Wrap one split in a BioQADataset using this module's settings."""
        return BioQADataset(
            split,
            self.tokenizer,
            self.source_max_token_len,
            self.source_stride,
            self.target_max_token_len,
        )

    def setup(self, stage=None):
        """Tokenize all three splits and build their datasets.

        Args:
            stage: Accepted because Lightning passes the current stage to this
                hook; it is ignored and all splits are always built, so a
                manual ``setup()`` call keeps working as before.
        """
        self.train_dataset = self._build_dataset(self.train_data)
        self.dev_dataset = self._build_dataset(self.dev_data)
        self.test_dataset = self._build_dataset(self.test_data)

    def train_dataloader(self):
        """Return the shuffled loader over the training windows."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        """Return the loader over the dev windows, in dataset order."""
        return DataLoader(
            self.dev_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def test_dataloader(self):
        """Return the loader over the test windows, one window per batch."""
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=self.num_workers,
        )

In [50]:
# Training configuration.
BATCH_SIZE = 4
# NOTE(review): N_EPOCHS is not read by the Trainer cell below, which passes
# max_epochs=20 directly - confirm which value is intended.
N_EPOCHS = 6
data_module = BioDataModule(train, dev, test, tokenizer, batch_size=BATCH_SIZE)
# Build the three datasets right away; this tokenizes every split.
data_module.setup()

In [51]:
#model definition
class QAModel(pl.LightningModule):
    """Lightning wrapper around a T5 conditional-generation model.

    Args:
        model_name: Checkpoint passed to
            ``T5ForConditionalGeneration.from_pretrained``.
        learning_rate: Learning rate of the AdamW optimizer.
    """

    def __init__(self, model_name: str = "t5-small", learning_rate: float = 0.0001):
        super().__init__()
        self.learning_rate = learning_rate
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``labels`` is forwarded unchanged to the wrapped model; the returned
        loss is whatever the model reports for it.
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Run one batch through the model and log its loss as ``log_name``."""
        loss, outputs = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss, outputs

    def training_step(self, batch, batch_idx):
        """Log "train_loss" and return the loss with the logits and labels."""
        loss, outputs = self._shared_step(batch, "train_loss")
        return {"loss": loss, "predictions": outputs, "labels": batch['labels']}

    def validation_step(self, batch, batch_idx):
        """Log "val_loss" and return the loss."""
        loss, _ = self._shared_step(batch, "val_loss")
        return loss

    def test_step(self, batch, batch_idx):
        """Log "test_loss" and return the loss."""
        loss, _ = self._shared_step(batch, "test_loss")
        return loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters of this module."""
        return AdamW(self.parameters(), lr=self.learning_rate)


# Train the model for generating answers

In [52]:
# Instantiate the model and fine-tune it on the data module.
model = QAModel()

# NOTE(review): max_epochs is hard-coded to 20 although N_EPOCHS = 6 is defined
# in the configuration cell - confirm which value is intended.
# NOTE(review): the recorded run below stopped with "CUDA out of memory" on
# GPU 0 with only 4.56 MiB free, so the outputs in the later cells were not
# produced by this execution.
trainer = pl.Trainer(
     max_epochs=20,
     gpus=1,
     progress_bar_refresh_rate = 30
 )
trainer.fit(model, data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 10.76 GiB total capacity; 185.79 MiB already allocated; 4.56 MiB free; 196.00 MiB reserved in total by PyTorch)

# question answering example

In [159]:
# Generate an answer for the first test question.
# Switch to eval mode so dropout is disabled during generation.
model.eval()

# Encode the (question, context) pair as one padded window. Only the context
# is truncated and no overflow windows are requested, so anything beyond the
# first 396 tokens is not seen by the model. The tensors are moved to the
# device the wrapped model currently lives on.
source_encoding = tokenizer(
    test[0]["questions_join"],
    test[0]['context'],
    max_length=396,
    padding="max_length",
    truncation="only_second",
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt",
).to(model.model.device)

generated_ids = model.model.generate(
    input_ids=source_encoding["input_ids"],
    attention_mask=source_encoding["attention_mask"],
    num_beams=1,  # greedy search
    max_length=24,
    repetition_penalty=2.5,
    early_stopping=True,
    use_cache=True,
)

In [160]:
# Turn every generated id sequence back into text, dropping special tokens.
preds = tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

In [161]:
# Show the decoded predictions on one line, separated by single spaces.
print(*preds)

bart k was born in austria hungary.


In [53]:
# Print question, passage and reference answer of the first test example,
# one per line, for comparison with the prediction.
print(
    *(test[0][column] for column in ("questions_join", "context", "answers_join")),
    sep="\n",
)

according to the professor, why was bartk music popular in austria hungary
so i just finished reviewing your papers on the influence of nationalism on the composers music . and initially i was surprised none of you chose to write about b la bart k , that is until i remembered we haven't had a chance to discuss him in class yet . he was a wonderful and ground breaking composer . b la bart k was a hungarian , whose life stretched from the late nineteenth century to the middle of twentieth century . but he was not a fan of the romantic style of music that was popular in his homeland during his youth . wait , hungary wasn't a country in 19 , was it ? you are right . i should have been clear . bartok was born in austria hungary , a nation that broke apart when he was about forty years old . actually , the town where he was born is presently part of romania . the political history of that region is complex . suffice to say that bartok is generally known as a hungarian composer . so during ba