In [None]:
# Clone the TOEFL-QA repository into ./data/TOEFL-QA, then step back to the
# directory the notebook started in.
# NOTE(review): this cell is not idempotent -- on a re-run `mkdir` and
# `git clone` are expected to complain that their target already exists.
% mkdir data
% cd data
! git clone https://github.com/iamyuanchung/TOEFL-QA.git
% cd ..

In [None]:
! pip install transformers
! pip install sentencepiece
! pip install rouge-score
! pip install -U nltk
! pip install datasets

# Imports

In [None]:
import os
import sys
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import spacy
from tqdm.notebook import tqdm
import re
from pprint import pprint
import sentencepiece
import nltk
from rouge_score import rouge_scorer
nltk.download('all')
from nltk.translate import meteor_score
from datasets import load_dataset

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

In [None]:
import importlib
# The cloned repository directory is named "TOEFL-QA"; a hyphen cannot appear
# in an `import` statement, so its utils module is loaded by dotted path.
util = importlib.import_module("data.TOEFL-QA.utils")
TOEFL_PATH = "./data/TOEFL-QA/data/"
raw = util.load_data(TOEFL_PATH)
# The result of load_data is unpacked as three parts: train, dev, test.
train_raw, dev_raw, test_raw = tuple(raw)
# RACE, "high" configuration, loaded through datasets.load_dataset.
dataset = load_dataset("race", "high")

# Options

In [9]:
# Torch device for the notebook. The CUDA auto-detection line is commented
# out, so `device` is always the CPU here.
# NOTE(review): evaluate_model chooses its own device and does not read this
# variable.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
print('Using device:', device)

Using device: cpu


In [10]:
# Checkpoint name handed to T5Tokenizer.from_pretrained.
PRETRAINED_MODEL = 't5-base'
# NOTE(review): not referenced in the visible cells -- presumably a directory
# for a saved question-generation model; confirm.
DIR = "question_generator/"
# Batch size of the validation DataLoader.
BATCH_SIZE = 1
# max_length used when the tokenizer pads/truncates passages and questions.
SEQ_LENGTH = 512
# NOTE(review): not referenced in the visible cells.
EPOCHS = 200
# When True, model inputs are built as '<answer> ... <context> ...' instead of
# the bare passage.
USE_ANSWERS = False

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

# Load the checkpoint named by PRETRAINED_MODEL (the same constant the
# tokenizer is built from) instead of repeating the literal name, so changing
# the option updates the model and the tokenizer together.
qg_model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

# Utility Functions

In [12]:
def get_sent_str(sentence_list):
    """Join tokens with single spaces, then remove the space before '.', '?' and ','.

    Args:
        sentence_list: iterable of string tokens.

    Returns:
        The joined sentence as one string.
    """
    joined = " ".join(sentence_list)
    return re.sub(r" (?P<punc>[.?,])", r"\1", joined)

def get_sent_list(sentences):
    """Format every token list in ``sentences`` with get_sent_str, keeping order.

    Args:
        sentences: iterable of token lists.

    Returns:
        A list with one formatted string per input token list.
    """
    return [get_sent_str(tokens) for tokens in sentences]

In [13]:
def get_contexts(sentences, window=3):
    """Build overlapping contexts of ``window`` consecutive sentences.

    Args:
        sentences: iterable of tokenised sentences (each an iterable of
            string tokens accepted by get_sent_str).
        window: number of consecutive sentences per context. Defaults to 3,
            the size that used to be hard-coded.

    Returns:
        A list of strings, one per sliding-window position, each made of
        ``window`` formatted sentences joined by single spaces. The list is
        empty when fewer than ``window`` sentences are given.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    # Format each sentence once; the windows below only slice and join.
    formatted = [get_sent_str(sent) for sent in sentences]
    return [
        " ".join(formatted[end - window:end])
        for end in range(window, len(formatted) + 1)
    ]

def encode_contexts(inputs, answers=None):
    """Tokenise each context string for the question-generation model.

    When the module-level USE_ANSWERS flag is set, every input is formatted
    as '<answer> {answer} <context> {context}', the same layout make_text
    uses; otherwise the context is tokenised as-is.

    Args:
        inputs: sequence of context strings.
        answers: sequence of answer strings aligned with ``inputs``. Only
            read when USE_ANSWERS is True.

    Returns:
        A list with one tokenizer output per input, padded/truncated to
        SEQ_LENGTH and returned as PyTorch tensors.

    Raises:
        ValueError: if USE_ANSWERS is True and ``answers`` is missing or
            shorter than ``inputs``.
    """
    if USE_ANSWERS:
        if answers is None or len(answers) < len(inputs):
            raise ValueError(
                "USE_ANSWERS is enabled: `answers` must provide one answer per input"
            )
        # The answer follows '<answer>' and the passage follows '<context>'.
        texts = [
            '<answer> ' + answers[i] + " <context> " + inputs[i]
            for i in range(len(inputs))
        ]
    else:
        texts = [inputs[i] for i in range(len(inputs))]

    return [
        tokenizer(
            text,
            # `padding='max_length'` is the replacement for the deprecated
            # `pad_to_max_length=True`.
            padding='max_length',
            max_length=SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        )
        for text in texts
    ]

# Download Tokenizer

In [None]:
# Build the tokenizer from PRETRAINED_MODEL and register the two marker tokens
# used by the answer-aware input format ('<answer> ... <context> ...').
# NOTE(review): no resize_token_embeddings call on qg_model is visible after
# adding these tokens -- confirm the new ids fit the model's embedding table.
# The trailing ';' is kept so the cell does not echo the call's return value.
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL)
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<answer>', '<context>']}
);

In [15]:
def make_text(row):
    """Tokenise one example into model inputs and target question ids.

    Args:
        row: mapping with string fields 'article' and 'question', plus
            'answer' when USE_ANSWERS is True.
            NOTE(review): in RACE the 'answer' field looks like an option
            letter rather than the answer text -- confirm before enabling
            USE_ANSWERS.

    Returns:
        A dict with three 1-D tensors, each padded/truncated to SEQ_LENGTH:
        'input_ids' and 'attention_mask' for the source text, and
        'input_ids_question' for the reference question.
    """
    if USE_ANSWERS:
        source_text = '<answer> ' + row['answer'] + ' <context> ' + row['article']
    else:
        source_text = row['article']

    # One shared set of tokenizer arguments keeps source and target encoding
    # in sync. `padding='max_length'` replaces the deprecated
    # `pad_to_max_length=True`.
    tokenizer_kwargs = dict(
        padding='max_length',
        max_length=SEQ_LENGTH,
        truncation=True,
        return_tensors="pt",
    )
    encoded_text = tokenizer(source_text, **tokenizer_kwargs)
    encoded_question = tokenizer(row['question'], **tokenizer_kwargs)

    # The tensors carry a leading batch axis for the single input; drop only
    # that axis so the sequence axis can never be squeezed away by accident.
    return {
        'input_ids': encoded_text['input_ids'].squeeze(0),
        'attention_mask': encoded_text['attention_mask'].squeeze(0),
        'input_ids_question': encoded_question['input_ids'].squeeze(0),
    }

# Tokenise the dataset with make_text, expose only the three tensor columns as
# torch tensors, and wrap the validation split in a DataLoader.
# NOTE(review): this rebinds `dataset`, so re-running the cell maps an
# already-mapped dataset.
dataset = dataset.map(make_text)
dataset.set_format(type = 'torch', columns=['input_ids', 'attention_mask', 'input_ids_question'])
valid_loader = DataLoader(dataset["validation"], batch_size=BATCH_SIZE, shuffle=True)

  0%|          | 0/3498 [00:00<?, ?ex/s]



  0%|          | 0/62445 [00:00<?, ?ex/s]

  0%|          | 0/3451 [00:00<?, ?ex/s]

In [16]:
# Smoke test: generate from the first validation example and print the
# decoded text.
q = []  # NOTE(review): never used in the visible cells
x = dataset['validation']["input_ids"][0]
# Rebuild the ids as a nested list so generate() receives a batch of one.
y = torch.tensor([x.tolist()])
question = qg_model.generate(input_ids=y)
z = tokenizer.decode(question[0], skip_special_tokens=True)
print(z)

<extra_id_0> Timothy was being robbed of his childhood.<extra_id_1> a sewing machine.<extra_id_2> Timothy


# Evaluation of Model

In [17]:
def all_tpos(raw_data):
  """Group question numbers by the passage name derived from each key.

  Every key is expected to contain at least three digit groups. The passage
  name is 'tpo_<first>-<kind>_<second>', where <kind> is 'conversation' if
  that word occurs in the key and 'lecture' otherwise; the third digit group
  is recorded as the question number.

  Args:
    raw_data: mapping whose keys are the sample names.

  Returns:
    A dict from passage name to the list of question-number strings, in key
    order.
  """
  grouped = {}
  for key in raw_data.keys():
    numbers = re.findall(r'\d+', key)
    kind = 'lecture'
    if 'conversation' in key:
      kind = 'conversation'
    passage = 'tpo_' + numbers[0] + "-" + kind + "_" + numbers[1]
    grouped.setdefault(passage, []).append(numbers[2])
  return grouped

def all_race_passage(raw_data):
  """Map every distinct article to the list of questions asked about it.

  Args:
    raw_data: table-like object whose 'article' and 'question' columns are
      aligned, equally long sequences of strings.

  Returns:
    A dict from article text to its questions, in row order. Articles appear
    in order of first occurrence.
  """
  # Read each column exactly once. Column lookup can be expensive (a Hugging
  # Face Dataset is expected to rebuild the whole column on every
  # `raw_data[name]` access), so doing it per row made the loop quadratic.
  articles = raw_data['article']
  questions = raw_data['question']
  by_article = {}
  for article, question in zip(articles, questions):
    by_article.setdefault(article, []).append(question)
  return by_article

# To evaluate on RACE, pass dataset['validation'] as dev_raw and set TOEFL=False.
def _best_reference_scores(generated, references, scorer):
  """Return the best (BLEU, METEOR, ROUGE-1) of ``generated`` over all references."""
  generated_tokens = generated.split(" ")
  best_bleu = 0.0
  best_meteor = 0.0
  best_rouge = 0.0
  for reference in references:
    reference_tokens = reference.split(" ")
    # Index 2 of the rouge1 score tuple is the value that was always used here
    # (the F-measure in rouge_score's Score tuple).
    rouge = scorer.score(reference, generated)['rouge1'][2]
    # sentence_bleu takes a *list of references*, each a token list. Passing a
    # bare token list makes NLTK treat every word as its own reference.
    bleu = nltk.translate.bleu_score.sentence_bleu([reference_tokens], generated_tokens)
    meteor = nltk.translate.meteor_score.meteor_score([reference_tokens], generated_tokens)
    best_bleu = max(best_bleu, bleu)
    best_meteor = max(best_meteor, meteor)
    best_rouge = max(best_rouge, rouge)
  return best_bleu, best_meteor, best_rouge


def evaluate_model(model, dev_raw, print_detail = False, TOEFL= False):
  """Generate questions with ``model`` and score them against reference questions.

  Every generated question is compared with all reference questions of the
  same passage; the best BLEU, METEOR and ROUGE-1 value is kept per generated
  question, and those best values are averaged per evaluated item.

  Args:
    model: generation model exposing ``to``, ``eval`` and ``generate``.
    dev_raw: with ``TOEFL=True``, a mapping from sample name to a record with
      'sentences' and 'question' token lists; with ``TOEFL=False``, a dataset
      with 'input_ids', 'article' and 'question' columns.
    print_detail: when True, print the per-question scores of every item.
    TOEFL: selects the TOEFL branch (True) or the RACE branch (False).

  Returns:
    ``(bleu_total, meteor_total, rouge_total)``: three lists with one averaged
    score per evaluated item. TOEFL passages that yield no context and RACE
    samples whose generated question was already scored are skipped.
  """
  # The TOEFL branch stays on the CPU as before. The RACE branch uses the GPU
  # when one exists and otherwise falls back to the CPU instead of crashing.
  device = "cuda" if (not TOEFL and torch.cuda.is_available()) else "cpu"
  model.to(device)
  model.eval()
  if TOEFL:
    raw = all_tpos(dev_raw)
    iterate = raw.keys()
  else:
    iterate = range(len(dev_raw))
    lookup = all_race_passage(dev_raw)
    # Read the columns once instead of once per sample.
    articles = dev_raw['article']
    all_input_ids = dev_raw["input_ids"]
  bleu_total = []
  meteor_total = []
  rouge_total = []
  scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
  seen = set()
  # NOTE(review): generate() is called without an attention_mask although the
  # inputs are padded to SEQ_LENGTH -- consider passing one.
  # Wrapping the iterable keeps the progress bar moving on skipped items too.
  for sentence in tqdm(iterate, total = len(iterate)):
    if TOEFL:
      passage = sentence
      question_ids = raw[passage]
      # All questions of a passage share its sentences; read them from the first.
      first_key = passage + "_" + question_ids[0]
      contexts = get_contexts(dev_raw[first_key]["sentences"])
      questions = []
      for encoded in encode_contexts(contexts):
        generated_ids = model.generate(input_ids=encoded["input_ids"])
        questions.append(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
      # Collect every question of this passage. Matching keys on a prefix that
      # already ends in the first question number dropped the sibling questions.
      ground_truth = [
          get_sent_str(dev_raw[passage + "_" + qid]['question'])
          for qid in question_ids
      ]
    else:
      input_ids = all_input_ids[sentence].unsqueeze(0).to(device)
      generated_ids = model.generate(input_ids = input_ids)
      generated_text = tokenizer.decode(generated_ids[0].to('cpu'), skip_special_tokens=True)
      if generated_text in seen:
        continue
      seen.add(generated_text)
      questions = [generated_text]
      ground_truth = lookup[articles[sentence]]
    if not questions:
      # Passage too short to form a single context: nothing to average.
      continue
    bleus = []
    meteors = []
    rouges = []
    for generated in questions:
      best_bleu, best_meteor, best_rouge = _best_reference_scores(generated, ground_truth, scorer)
      bleus.append(best_bleu)
      meteors.append(best_meteor)
      rouges.append(best_rouge)
    if print_detail:
      # Report the best score of each generated question, not the values left
      # over from the last reference that happened to be compared.
      result = [
                {
                    'generated_question' : generated,
                    'bleu_score' : best_bleu,
                    'meteor_score' : best_meteor,
                    'rouge_score' : best_rouge
                } for generated, best_bleu, best_meteor, best_rouge in zip(questions, bleus, meteors, rouges)
               ]
      print(result)
    bleu_total.append(sum(bleus) / len(bleus))
    meteor_total.append(sum(meteors) / len(meteors))
    rouge_total.append(sum(rouges) / len(rouges))
  return bleu_total, meteor_total, rouge_total

In [None]:
# Number of examples in the validation split of `dataset`.
len(dataset['validation'])

3451

In [None]:
# Score qg_model on the TOEFL dev split without printing per-item detail.
evaluate_model(qg_model, dev_raw, print_detail=False, TOEFL=True)