In [None]:
% mkdir data
% cd data
! git clone https://github.com/iamyuanchung/TOEFL-QA.git
% cd ..

In [None]:
! pip install transformers
! pip install sentencepiece
! pip install rouge-score
! pip install -U nltk

# Imports

In [None]:
import os
import sys
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import spacy
from tqdm.notebook import tqdm
import re
from pprint import pprint
import sentencepiece
import nltk
from rouge_score import rouge_scorer
# Fetches the complete 'all' NLTK data collection, which is a large download.
# NOTE(review): the METEOR metric used further down presumably only needs the
# WordNet data — confirm nothing else depends on other corpora before narrowing.
nltk.download('all')
from nltk.translate import meteor_score

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

In [None]:
import importlib
# "TOEFL-QA" contains a hyphen, so the package cannot be named in a regular
# `import` statement; importlib resolves it from the dotted path string instead.
util = importlib.import_module("data.TOEFL-QA.utils")
TOEFL_PATH = "./data/TOEFL-QA/data/"
# The loader's result is unpacked below into the train / dev / test splits.
raw = util.load_data(TOEFL_PATH)
train_raw, dev_raw, test_raw = tuple(raw)

# Options

In [None]:
# Everything runs on the CPU; the commented-out line below is the alternative
# that selects CUDA whenever it is available.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
print('Using device:', device)

Using device: cpu


In [None]:
# Notebook-wide settings.
PRETRAINED_MODEL = 't5-base'  # checkpoint name handed to T5Tokenizer.from_pretrained
DIR = "question_generator/"  # not referenced in the cells shown here
BATCH_SIZE = 1  # not referenced in the cells shown here
SEQ_LENGTH = 512  # max_length for tokenizing contexts (inputs are padded/truncated to it)
EPOCHS = 200  # not referenced in the cells shown here
USE_ANSWERS = False  # when True, encode_contexts adds <answer>/<context> markers to its input

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
# Load the checkpoint named in the options cell so the model and the tokenizer
# are always created from the same pretrained name.
qg_model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)

# Utility Functions

In [None]:
def get_sent_str(sentence_list):
    """Join a list of tokens into one string.

    Tokens are separated by single spaces, after which the space in front of
    a '.', '?' or ',' is dropped so the punctuation attaches to the
    preceding word.
    """
    joined = " ".join(sentence_list)
    return re.sub(r" (?P<punc>[.?,])", r"\1", joined)

def get_sent_list(sentences):
    """Turn every token list in `sentences` into a string via get_sent_str."""
    return [get_sent_str(tokens) for tokens in sentences]

In [None]:
def get_contexts(sentences, window=3):
    """Build overlapping contexts of `window` consecutive sentences.

    Args:
        sentences: sequence of tokenised sentences (each a list of tokens).
        window: how many consecutive sentences are joined into one context.

    Returns:
        One string per window position, in document order. The list is empty
        when there are fewer than `window` sentences.

    Raises:
        ValueError: if `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    if len(sentences) < window:
        return []
    # Detokenise each sentence once rather than once per window it appears in.
    sent_strs = [get_sent_str(sent) for sent in sentences]
    return [
        " ".join(sent_strs[end - window:end])
        for end in range(window, len(sent_strs) + 1)
    ]

def encode_contexts(inputs, answers=None):
    """Tokenize each context into model-ready tensors.

    Args:
        inputs: list of context strings.
        answers: list of answer strings aligned with `inputs`. Only read when
            USE_ANSWERS is enabled.

    Returns:
        A list with one tokenizer output per context, each padded/truncated
        to SEQ_LENGTH tokens and holding PyTorch tensors.

    Raises:
        ValueError: if USE_ANSWERS is enabled and `answers` is missing or does
            not have the same length as `inputs`.
    """
    if USE_ANSWERS:
        if answers is None or len(answers) != len(inputs):
            raise ValueError(
                "USE_ANSWERS is enabled: `answers` must be given and match `inputs` in length"
            )
        # Each marker is followed by the text it labels: the answer after
        # <answer>, the context after <context>.
        # NOTE(review): a checkpoint fine-tuned with a different prompt layout
        # has to be evaluated with that same layout.
        texts = [
            '<answer> ' + answer + " <context> " + context
            for context, answer in zip(inputs, answers)
        ]
    else:
        texts = list(inputs)

    return [
        tokenizer(
            text,
            # `padding="max_length"` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding="max_length",
            max_length=SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        )
        for text in texts
    ]

# Download Tokenizer

In [None]:
# Load the tokenizer for PRETRAINED_MODEL and register the two marker tokens
# that encode_contexts inserts when USE_ANSWERS is enabled.
# NOTE(review): the model's embedding table is not resized after the tokens are
# added — confirm the new token ids are valid for qg_model before enabling
# USE_ANSWERS.
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL)
# The trailing ';' keeps the cell from echoing the call's return value.
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<answer>', '<context>']}
);

# Evaluation of Model

In [None]:
def all_tpos(raw_data):
  """Group the question ids found in the keys of `raw_data` by passage.

  Every key must contain at least three numbers; the first two identify the
  passage and the third the question. A key containing 'conversation' is
  filed as a conversation, anything else as a lecture.

  Returns:
    dict mapping 'tpo_<a>-<conversation|lecture>_<b>' to the list of question
    id strings for that passage, in the order the keys were encountered.
  """
  grouped = {}
  for key in raw_data.keys():
    numbers = re.findall(r'\d+', key)
    kind = 'lecture'
    if 'conversation' in key:
      kind = 'conversation'
    passage = 'tpo_' + numbers[0] + "-" + kind + "_" + numbers[1]
    grouped.setdefault(passage, []).append(numbers[2])
  return grouped

def _best_reference_scores(generated, references, scorer):
  """Return the best (BLEU, METEOR, ROUGE-1 F) of `generated` over `references`."""
  # Tokenise into a separate name so the raw string stays available for ROUGE
  # on every reference.
  generated_tokens = generated.split()
  best_bleu = best_meteor = best_rouge = 0.0
  for reference in references:
    reference_tokens = reference.split()
    # rouge_scorer works on raw strings, ordered (target, prediction).
    rouge = scorer.score(reference, generated)['rouge1'].fmeasure
    # Both NLTK metrics take a *list* of tokenised references.
    bleu = nltk.translate.bleu_score.sentence_bleu([reference_tokens], generated_tokens)
    meteor = nltk.translate.meteor_score.meteor_score([reference_tokens], generated_tokens)
    best_bleu = max(best_bleu, bleu)
    best_meteor = max(best_meteor, meteor)
    best_rouge = max(best_rouge, rouge)
  return best_bleu, best_meteor, best_rouge


def evaluate_model(model, dev_raw, print_detail=False):
  """Generate questions for every passage in `dev_raw` and score them.

  For each passage, one question is generated per sentence window and
  compared with every ground-truth question of that passage; the best BLEU,
  METEOR and ROUGE-1 F-measure over those references is kept per generated
  question and then averaged over the passage.

  Args:
    model: seq2seq model exposing `to`, `eval` and `generate`.
    dev_raw: dict keyed by question id whose values hold the tokenised
      'sentences' of the passage and the tokenised 'question'.
    print_detail: when True, print the per-question scores of every passage.

  Returns:
    Three lists (BLEU, METEOR, ROUGE) with one passage-level average each.
    Passages too short to yield a context are skipped and contribute no entry.
  """
  model.to(device)
  model.eval()
  scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
  bleu_total = []
  meteor_total = []
  rouge_total = []

  for passage, question_ids in all_tpos(dev_raw).items():
    keys = [passage + "_" + question_id for question_id in question_ids]
    # All questions of a passage share its sentences, so any key works here.
    contexts = get_contexts(dev_raw[keys[0]]["sentences"])
    if not contexts:
      # Nothing to generate or score; also avoids dividing by zero below.
      continue

    questions = []
    for encoded in encode_contexts(contexts):
      output = model.generate(
          input_ids=encoded["input_ids"].to(device),
          # Inputs are padded to a fixed length; mask the padding explicitly.
          attention_mask=encoded["attention_mask"].to(device),
      )
      questions.append(tokenizer.decode(output[0], skip_special_tokens=True))

    # Every question asked about this passage is a valid reference.
    references = [get_sent_str(dev_raw[key]['question']) for key in keys]

    bleus = []
    meteors = []
    rouges = []
    # Iterating through tqdm closes the progress bar when the loop finishes.
    for generated in tqdm(questions):
      bleu, meteor, rouge = _best_reference_scores(generated, references, scorer)
      bleus.append(bleu)
      meteors.append(meteor)
      rouges.append(rouge)

    if print_detail:
      result = [
          {
              'generated_question': question,
              'bleu_score': bleu,
              'meteor_score': meteor,
              'rouge_score': rouge,
          }
          for question, bleu, meteor, rouge in zip(questions, bleus, meteors, rouges)
      ]
      print(result)

    bleu_total.append(sum(bleus) / len(bleus))
    meteor_total.append(sum(meteors) / len(meteors))
    rouge_total.append(sum(rouges) / len(rouges))
  return bleu_total, meteor_total, rouge_total

In [None]:
# Score the loaded model on the dev split, printing per-question details.
evaluate_model(qg_model, dev_raw, print_detail=True)