In [None]:
% mkdir data
% cd data
! git clone https://github.com/iamyuanchung/TOEFL-QA.git
% cd ..
! pip install transformers
! pip install sentencepiece
! pip install rouge-score
! pip install -U nltk
! pip install datasets
! pip install bert_score

# Imports

In [None]:
import os
import sys
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import spacy
from tqdm.notebook import tqdm
import re
from pprint import pprint
import sentencepiece
import nltk
from rouge_score import rouge_scorer
nltk.download('all')
from nltk.translate import meteor_score
from datasets import load_dataset
from datasets import load_metric
import statistics
import string

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

In [4]:
import importlib
# "TOEFL-QA" contains a hyphen, so the dotted path is not a valid identifier for a
# plain `import` statement; importlib.import_module accepts it as a string instead.
# NOTE(review): this resolves relative to the import path, so it presumably relies on
# the notebook's working directory containing the `data/` folder — confirm.
util = importlib.import_module("data.TOEFL-QA.utils")
TOEFL_PATH = "./data/TOEFL-QA/data/"
# The value returned by load_data is unpacked below into three TOEFL splits.
raw = util.load_data(TOEFL_PATH)
train_raw, dev_raw, test_raw = tuple(raw)
# RACE, "high" configuration, loaded through Hugging Face `datasets`.
dataset = load_dataset("race", "high")

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading and preparing dataset race/high (download: 24.26 MiB, generated: 133.63 MiB, post-processed: Unknown size, total: 157.89 MiB) to /root/.cache/huggingface/datasets/race/high/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b...


Downloading:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset race downloaded and prepared to /root/.cache/huggingface/datasets/race/high/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Options

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Uncomment the next line to force CPU execution.
# device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [6]:
# Hugging Face checkpoint name; passed to T5Tokenizer.from_pretrained below.
PRETRAINED_MODEL = 't5-base'
# Not referenced anywhere in the visible part of this notebook.
BATCH_SIZE = 1
# max_length for the T5 tokenizer calls in make_text and encode_contexts.
SEQ_LENGTH = 512
# Not referenced anywhere in the visible part of this notebook.
EPOCHS = 200
# Path of a fine-tuned checkpoint; only used by the commented-out torch.load below.
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted
# at /content/drive (Colab); confirm before running elsewhere.
FNAME = "/content/drive/MyDrive/race_finetune_withanswer_epoch3.pt"
# When True, make_text builds '<answer> ... <context> ...' inputs instead of the bare article.
USE_ANSWERS = True

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
# Load the question-generation model and its tokenizer from the SAME checkpoint name,
# so changing PRETRAINED_MODEL cannot leave the two out of sync.
qg_model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)

tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL)
# Markers used by make_text / get_contexts to delimit the answer and the context.
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<answer>', '<context>']}
)

# The embedding matrix must grow to cover the two special tokens added above.
qg_model.resize_token_embeddings(len(tokenizer))
# To evaluate a fine-tuned checkpoint instead of the base weights, uncomment:
#trained = torch.load(FNAME)
#qg_model.load_state_dict(trained["model_state_dict"])
qg_model = qg_model.to(device)

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
  
# Token-classification model (and matching tokenizer) used by get_causation_prediction
# to tag cause / effect tokens. The model is left on its default device here.
causal_tokenizer = AutoTokenizer.from_pretrained("noahjadallah/cause-effect-detection")

causal_model = AutoModelForTokenClassification.from_pretrained("noahjadallah/cause-effect-detection")

# Reference notebook for this model:
# https://colab.research.google.com/drive/14V9Ooy3aNPsRfTK88krwsereia8cfSPc?usp=sharing#scrollTo=eqYFDe_2HfQ7


# Label ids as listed in the reference above:
# 2.2. Cause Begin -> B-Cause -> 1
# 2.3. Cause Inside -> I-Cause -> 2
# 2.4. Effect Begin -> B-Effect -> 3
# 2.5. Effect Inside -> I-Effect -> 4

# Presumably indexed by class id (0 = 'O'); not referenced elsewhere in the visible code.
label_list = ['O', 'B-CAUSE', 'I-CAUSE', 'B-EFFECT', 'I-EFFECT']

In [10]:
def make_text(row):
    """Tokenize one dataset row into model inputs and target question ids.

    Args:
        row: mapping with 'article' and 'question' strings, plus 'answer' when
            USE_ANSWERS is True.

    Returns:
        dict with 'input_ids' and 'attention_mask' for the source text and
        'input_ids_question' for the question, each squeezed to drop the batch
        dimension added by return_tensors="pt". Every sequence is padded or
        truncated to SEQ_LENGTH tokens.
    """
    if USE_ANSWERS:
        # NOTE(review): in the Hugging Face RACE dataset `answer` is, as far as I
        # know, the option letter ("A".."D") rather than the answer text — confirm
        # this is what the fine-tuned checkpoint was trained on.
        source_text = '<answer> ' + row['answer'] + ' <context> ' + row['article']
    else:
        source_text = row['article']

    # padding='max_length' is the supported spelling of the deprecated
    # pad_to_max_length=True; with max_length set, both pad to exactly SEQ_LENGTH.
    encoded_text = tokenizer(
        source_text,
        padding='max_length',
        max_length=SEQ_LENGTH,
        truncation=True,
        return_tensors="pt"
    )
    encoded_question = tokenizer(
        row['question'],
        padding='max_length',
        max_length=SEQ_LENGTH,
        truncation=True,
        return_tensors='pt'
    )
    return {
        'input_ids': torch.squeeze(encoded_text['input_ids']),
        'attention_mask': torch.squeeze(encoded_text['attention_mask']),
        'input_ids_question': torch.squeeze(encoded_question['input_ids']),
    }

# Tokenize every row of every split with make_text; the result rebinds `dataset`.
dataset = dataset.map(make_text)
# Return the three tokenized columns as torch tensors when rows are accessed.
dataset.set_format(type = 'torch', columns=['input_ids', 'attention_mask', 'input_ids_question'])

  0%|          | 0/3498 [00:00<?, ?ex/s]



  0%|          | 0/62445 [00:00<?, ?ex/s]

  0%|          | 0/3451 [00:00<?, ?ex/s]

# Utility Functions

In [11]:
def get_sent_str(sentence_list):
    """Join tokens with single spaces, then drop the space before '.', '?' or ','."""
    joined = " ".join(sentence_list)
    return re.sub(r" (?P<punc>[.?,])", r"\1", joined)

def get_sent_list(sentences):
    """Return one detokenized string (via get_sent_str) per token list in `sentences`."""
    return [get_sent_str(tokens) for tokens in sentences]

In [12]:
def set_fuzzy_context(key, raw_data):
    """Store the sentences most similar to the question as raw_data[key]["context"].

    Each sentence of raw_data[key]["sentences"] is scored against
    raw_data[key]["question"] with BERTScore; the (up to) five sentences with the
    highest precision are joined, in their original order, into one string.

    NOTE(review): `bertscore` is read from module scope but is not defined anywhere
    in the visible part of the notebook — confirm it is created before this is called.

    Args:
        key: key of the example inside `raw_data`.
        raw_data: mapping of examples; the selected example is modified in place.
    """
    question = [raw_data[key]["question"]]
    # Build the sentence strings once and reuse them for scoring and for the output.
    sent_list = get_sent_list(raw_data[key]["sentences"])
    results = [
        bertscore.compute(predictions=question, references=[ref], lang='en')
        for ref in sent_list
    ]
    # Negating the precisions makes argsort return indices in descending-score order.
    idx = np.argsort(-1 * np.array([res["precision"] for res in results]).ravel())
    top5 = idx[:5]
    # sorted() restores document order among the selected sentences.
    raw_data[key]["context"] = " ".join([sent_list[i] for i in sorted(top5)])

In [13]:
def get_causation_prediction(sequence: str):
    """Return the tokens of `sequence` that the causal model labels as effect.

    Args:
        sequence: raw sentence text.

    Returns:
        List of tokenizer tokens whose predicted class id is greater than 2
        (ids 3 and 4 are B-EFFECT / I-EFFECT in the label comments of this notebook).
    """
    inputs = causal_tokenizer.encode(sequence, return_tensors="pt")
    # Take the tokens straight from the ids fed to the model, so tokens[i] and
    # predictions[0][i] always describe the same position. Re-tokenizing decoded
    # text is not guaranteed to reproduce the same number of tokens.
    tokens = causal_tokenizer.convert_ids_to_tokens(inputs[0].tolist())

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = causal_model(inputs).logits
    predictions = torch.argmax(outputs, dim=2).numpy()
    effects = [token for token, label in zip(tokens, predictions[0]) if label > 2]
    return effects

def get_contexts(sentences):
    """Build '<answer> ... <context> ...' prompts from sliding four-sentence windows.

    For every index `center` from 2 to len(sentences) - 2, the effect tokens of
    sentences[center] become the answer and sentences[center-2:center+2] the
    context. Inputs with fewer than four sentences produce no prompts.
    """
    prompts = []
    for center in range(2, len(sentences) - 1):
        effect_tokens = get_causation_prediction(get_sent_str(sentences[center]))
        # It's possible there's no (or only a one-token) effect in this sentence.
        if len(effect_tokens) < 2:
            continue
        answer = " ".join(effect_tokens)
        window = sentences[center - 2:center + 2]
        context = " ".join([get_sent_str(tokens) for tokens in window])
        prompts.append('<answer> ' + answer + " <context> " + context)
    return prompts

def encode_contexts(inputs, answers=None):
    """Tokenize each prompt string and move the encoding to `device`.

    Args:
        inputs: sequence of prompt strings.
        answers: unused; kept so existing callers keep working.

    Returns:
        List with one tokenizer output per prompt, padded or truncated to
        SEQ_LENGTH tokens and returned as PyTorch tensors on `device`.
    """
    # padding='max_length' is the supported spelling of the deprecated
    # pad_to_max_length=True; with max_length set, both pad to exactly SEQ_LENGTH.
    return [
        tokenizer(
            text,
            padding='max_length',
            max_length=SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        ).to(device)
        for text in inputs
    ]

# Evaluation of Model

In [14]:
def all_tpos(raw_data):
  """Group the third number of each key under a passage name built from the first two.

  The passage name is 'tpo_<n0>-<conversation|lecture>_<n1>', where n0 and n1 are the
  first two digit runs in the key; the third digit run is appended to that passage's
  list, in key order.
  """
  grouped = {}
  for key in raw_data.keys():
    numbers = re.findall(r'\d+', key)
    kind = 'conversation' if 'conversation' in key else 'lecture'
    passage = 'tpo_' + numbers[0] + "-" + kind + "_" + numbers[1]
    grouped.setdefault(passage, []).append(numbers[2])
  return grouped

def all_race_passage(raw_data):
  """Map every article text to the list of questions asked about it.

  Args:
    raw_data: table-like object supporting len() and column access through
      raw_data['article'] and raw_data['question'].

  Returns:
    dict from article text to its questions, in row order.
  """
  # Read each column once: indexing raw_data['article'][i] inside the loop fetches
  # the whole column again on every iteration.
  articles = raw_data['article']
  questions = raw_data['question']
  questions_by_article = {}
  for i in range(len(raw_data)):
    questions_by_article.setdefault(articles[i], []).append(questions[i])
  return questions_by_article

# To evaluate on RACE, pass dataset[x] for x in {'validation', 'test'} and TOEFL = False.
# To evaluate on TOEFL, pass dev_raw or test_raw and TOEFL = True.
# To print the questions generated for each item, pass print_detail = True.
def evaluate_model(model, dev_raw, print_detail = False, TOEFL= False):
  """Generate questions with `model` and score them against reference questions.

  Every generated question is compared with each reference question of the same
  passage using BLEU, METEOR, ROUGE-1 (index 2 of the rouge1 score) and BERTScore
  F1; the best score over the references is kept for each metric.

  Args:
    model: generation model; moved to `device` and switched to eval mode.
    dev_raw: a RACE split processed by make_text (TOEFL=False), or a TOEFL
      mapping of key -> example with "sentences" and "question" (TOEFL=True).
    print_detail: when True, print the generated questions of every item.
    TOEFL: selects the TOEFL branch; otherwise the RACE branch is used.

  Returns:
    (bleu_total, meteor_total, rouge_total, bert_total, results). The four lists
    hold one mean score per evaluated item. `results` maps the item key (the RACE
    row index, or the TOEFL key of the passage's first question) to its generated
    questions, per-question best scores and reference questions. Items that are
    skipped (duplicate RACE generations, TOEFL passages without any generated
    question) appear in none of them.
  """
  results = {}
  model.to(device)
  model.eval()
  if TOEFL:
    raw = all_tpos(dev_raw)
    iterate = raw.keys()
  else:
    iterate = range(len(dev_raw))
    lookup = all_race_passage(dev_raw)
    # Read the article column once instead of once per example.
    articles = dev_raw['article']
  bleu_total = []
  meteor_total = []
  rouge_total = []
  bert_total = []
  scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
  bar1 = tqdm(total = len(iterate))
  metric_bert = load_metric("bertscore")
  strip_punctuation = str.maketrans('', '', string.punctuation)
  seen = set()
  for sentence in iterate:
    if TOEFL:
      passage = sentence
      question_ids = raw[passage]
      # All questions of a passage share its sentences, so the first key is enough.
      sentence = passage + "_" + question_ids[0]
      contexts = get_contexts(dev_raw[sentence]["sentences"])
      encoded_contexts = encode_contexts(contexts)
      questions = []
      seen_TOEFL = set()
      for encoded in encoded_contexts:
        generated_ids = model.generate(input_ids=encoded["input_ids"])
        question = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        question = question.translate(strip_punctuation)
        if question not in seen_TOEFL:
          questions.append(question)
          seen_TOEFL.add(question)
      # References are ALL questions of this passage. Filtering keys with
      # startswith(first-question key) kept only the first question (plus ids that
      # merely share its prefix, e.g. "_1" and "_10").
      ground_truth = [dev_raw[passage + "_" + qid]['question'] for qid in question_ids]
      if not questions:
        # Nothing was generated (no usable context): statistics.mean([]) would raise.
        bar1.update(1)
        continue
    else:
      # Row access fetches one example; dev_raw["input_ids"][i] would fetch the
      # whole column on every iteration.
      input_ids = dev_raw[sentence]["input_ids"].to(device)
      input_ids_fixed = input_ids.unsqueeze(0)  # add the batch dimension
      question = model.generate(input_ids = input_ids_fixed)
      questions = tokenizer.decode(question[0].to('cpu'), skip_special_tokens=True)
      if questions in seen:
        # Identical generation already scored; still advance the progress bar.
        bar1.update(1)
        continue
      seen.add(questions)
      questions = [questions]
      # References are whitespace-split so BLEU/METEOR receive token lists.
      ground_truth = [reference.split(" ") for reference in lookup[articles[sentence]]]
    bleus = []
    meteors = []
    rouges = []
    berts = []
    for generated in questions:
      generated_split = generated.split(" ")
      highest_bleu = 0.0
      highest_meteor = 0.0
      highest_rouge = 0.0
      highest_bert = 0.0
      for qs in ground_truth:
        truth = " ".join(qs)
        # bert_score expects a language code; "English" is not one and would not
        # select the English model.
        bert_f1 = metric_bert.compute(predictions = [generated], references = [truth], lang = "en")["f1"][0]
        rouge = scorer.score(truth, generated)['rouge1'][2]
        bleu = nltk.translate.bleu_score.sentence_bleu([qs], generated_split)
        meteor = nltk.translate.meteor_score.meteor_score([qs], generated_split)
        highest_bleu = max(highest_bleu, bleu)
        highest_meteor = max(highest_meteor, meteor)
        highest_rouge = max(highest_rouge, rouge)
        highest_bert = max(highest_bert, bert_f1)
      bleus.append(highest_bleu)
      meteors.append(highest_meteor)
      rouges.append(highest_rouge)
      berts.append(highest_bert)
    results[sentence] = {
            "questions": questions,
            "bleu": bleus,
            "meteor": meteors,
            "rouge": rouges,
            "bert": berts,
            "ground_truth": [" ".join(x) for x in ground_truth]
        }
    if print_detail:
      print(results[sentence]["questions"])
    bleu_total.append(statistics.mean(bleus))
    meteor_total.append(statistics.mean(meteors))
    rouge_total.append(statistics.mean(rouges))
    bert_total.append(statistics.mean(berts))
    bar1.update(1)
  bar1.close()
  return bleu_total, meteor_total, rouge_total, bert_total, results

In [None]:
# Score the question-generation model on the RACE test split (RACE branch, no per-item printing).
bleu_total, meteor_total, rouge_total, bert_total, results = evaluate_model(
    qg_model,
    dataset['test'],
    print_detail=False,
    TOEFL=False,
)

In [None]:
# Five highest per-item scores of each metric (BLEU, METEOR, ROUGE, BERTScore), ascending, one metric per line.
print(
    *(sorted(scores)[-5:] for scores in (bleu_total, meteor_total, rouge_total, bert_total)),
    sep="\n",
)

In [None]:
# Rank the per-item results by their best BLEU score, highest first.
sorted(results.items(), key=lambda entry: max(entry[1]["bleu"]), reverse=True)