# Imports

In [52]:
import os
import sys
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import spacy
from tqdm.notebook import tqdm
import re
from pprint import pprint

In [53]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

In [54]:
import importlib

# The package directory is named "TOEFL-QA"; the hyphen means it cannot be
# written in a regular ``import`` statement, so the dotted path is handed to
# importlib as a string instead.
TOEFL_PATH = "./data/TOEFL-QA/data/"
util = importlib.import_module("data.TOEFL-QA.utils")

# load_data's result is unpacked into exactly three splits: train, dev, test.
raw = util.load_data(TOEFL_PATH)
train_raw, dev_raw, test_raw = raw

# Options

In [55]:
# Device selection. The automatic CUDA-if-available choice is kept below as a
# commented-out alternative; the notebook currently pins everything to the CPU.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
print('Using device:', device)

Using device: cpu


In [56]:
# Hugging Face checkpoint name; passed to T5Tokenizer.from_pretrained below.
PRETRAINED_MODEL = 't5-base'
# Directory prefix that is prepended to the fine-tuned checkpoint filename
# when it is loaded with torch.load.
DIR = "question_generator/"
# Not referenced by any cell visible in this part of the notebook.
BATCH_SIZE = 1
# Passed as max_length to the tokenizer in encode_contexts, together with
# padding and truncation, so every encoded input has this many tokens.
SEQ_LENGTH = 512
# Not referenced by any cell visible in this part of the notebook.
# NOTE(review): the checkpoint filename loaded later contains "epoch200" --
# presumably the same number; confirm.
EPOCHS = 200
# NOTE(review): presumably toggles answer-conditioned model inputs in
# encode_contexts -- confirm the flag name used there.
USE_ANSWER = True

# Utility Functions

In [57]:
def get_sent_str(sentence_list):
    """Join a list of token strings into one sentence string.

    Tokens are joined with single spaces, then any space that sits directly
    in front of a period, question mark or comma is removed so the
    punctuation attaches to the preceding token.
    """
    spaced = " ".join(sentence_list)
    return re.sub(r" (?P<punc>[.?,])", r"\1", spaced)

def get_sent_list(sentences):
    """Return one string per entry of ``sentences``, each built by get_sent_str."""
    return [get_sent_str(tokens) for tokens in sentences]

In [58]:
def get_contexts(sentences, window=3):
    """Build overlapping multi-sentence context strings.

    Each context is ``window`` consecutive sentences (formatted with
    get_sent_str) joined by single spaces; consecutive contexts are shifted
    by one sentence.

    Args:
        sentences: sequence of tokenized sentences (each a list of strings).
        window: number of consecutive sentences per context. Defaults to 3,
            the size this notebook has always used.

    Returns:
        A list of context strings. Empty when there are fewer than
        ``window`` sentences.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")
    # Format every sentence once up front instead of once per window it
    # appears in.
    sent_strs = [get_sent_str(sent) for sent in sentences]
    return [
        " ".join(sent_strs[end - window:end])
        for end in range(window, len(sent_strs) + 1)
    ]

def encode_contexts(inputs, answers=None):
    """Tokenize each input string for the question-generation model.

    Args:
        inputs: list of context strings.
        answers: optional list of strings parallel to ``inputs``. Only used
            when the module-level ``USE_ANSWER`` flag is truthy; when it is
            omitted the inputs are encoded on their own.

    Returns:
        A list with one tokenizer output per input, each padded/truncated to
        ``SEQ_LENGTH`` tokens and returned as PyTorch tensors.
    """
    # The config cell defines USE_ANSWER (singular). Answers can only be
    # mixed in when the caller actually supplied them.
    with_answers = bool(USE_ANSWER) and answers is not None

    out = []
    for i, text in enumerate(inputs):
        if with_answers:
            # NOTE(review): the input text follows the '<answer>' marker and
            # the answer follows '<context>', which looks swapped. Left as-is
            # because the fine-tuned checkpoint may have been trained on this
            # exact layout -- confirm before changing.
            text = '<answer> ' + text + " <context> " + answers[i]
        out.append(tokenizer(
            text,
            padding="max_length",  # replaces the deprecated pad_to_max_length=True
            max_length=SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        ))
    return out

# Model Init

In [59]:
# Load the T5 tokenizer for the configured checkpoint and register the two
# marker strings that encode_contexts inserts into model inputs as special
# tokens. The trailing ';' suppresses the cell's output (add_special_tokens
# returns a value that would otherwise be displayed).
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL)
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<answer>', '<context>']}
);

In [67]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

# Start from the same base checkpoint the tokenizer was loaded from, then grow
# the embedding matrix so it matches the tokenizer after the extra special
# tokens were added.
qg_model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)
qg_model.resize_token_embeddings(len(tokenizer)) # to account for new special tokens

# Restore the fine-tuned weights. map_location remaps tensors that were saved
# from a GPU so the checkpoint also loads on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
trained = torch.load(
    DIR + "toeflqa_finetune_epoch200.pt",
    map_location=torch.device('cpu'),
)
qg_model.load_state_dict(trained["model_state_dict"])
qg_model = qg_model.to(torch.device('cpu'))

In [68]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# Load the extractive question-answering model and its tokenizer.
qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Question-answering pipeline built from the objects loaded above. Later cells
# call it as ``nlp``; it was previously only defined in commented-out code, so
# a fresh kernel failed with NameError on first use.
#
# Usage: nlp([{'question': '...', 'context': '...'}, ...]) returns one result
# dict per input, each with an 'answer' entry.
nlp = pipeline('question-answering', model=qa_model, tokenizer=qa_tokenizer)

# Recreating questions for a sample lecture

In [69]:
# Reference question/answer pairs from the test split: every entry whose key
# starts with "tpo_22-lecture_1", with token lists turned back into strings.
ground_truth = [
    {
        "question": get_sent_str(entry['question']),
        "answer": get_sent_str(entry['answer'])
    }
    for key, entry in test_raw.items()
    if key.startswith("tpo_22-lecture_1")
]
ground_truth

[{'question': 'why does the professor mention upper, middle, and lower classes',
  'answer': 'to further describe the organization of states'},
 {'question': "what is the professor's opinion about the environmental approach",
  'answer': 'some evidence supports it, but other evidence contradicts it'},
 {'question': 'what does the professor mainly discuss',
  'answer': 'common political problems of chiefdoms'},
 {'question': 'according to the professor, what are two typical characteristics of a band',
  'answer': 'it is able to meet its own basic needs'},
 {'question': 'what does the professor say is a characteristic feature of states',
  'answer': 'states manage food production for the entire population'},
 {'question': 'what reason for prehistoric social problems does the professor mention',
  'answer': 'competition for resources'}]

In [70]:
# Sliding multi-sentence contexts for one lecture of the test split, followed
# by their tokenized form for the question-generation model.
lecture_sentences = test_raw["tpo_22-lecture_1_11"]["sentences"]
contexts = get_contexts(lecture_sentences)
encoded_contexts = encode_contexts(contexts)

In [71]:
# Generate one question per encoded context and decode the first returned
# sequence back to text, dropping special tokens.
questions = [
    tokenizer.decode(
        qg_model.generate(input_ids=encoded["input_ids"])[0],
        skip_special_tokens=True,
    )
    for encoded in encoded_contexts
]

In [72]:
# Pair every generated question with the context it was generated from and
# run the batch through the question-answering pipeline.
QA_input = [
    {'question': generated_question, 'context': source_context}
    for source_context, generated_question in zip(contexts, questions)
]
answers = nlp(QA_input)

In [73]:
# Collect question, source context and extracted answer text into one record
# per context for display.
generated = []
for q_text, ctx_text, qa_result in zip(questions, contexts, answers):
    generated.append({
        "question": q_text,
        "context": ctx_text,
        "answer": qa_result['answer'],
    })
generated

[{'question': 'what is the lecture mainly about',
  'context': "one of the big questions when we look at prehistory is why did the earliest states form? well, to begin we'd better define exactly what we mean when we talk about states. the human groups that are the smallest and have the least social and political complexity, we call bands.",
  'answer': 'states'},
 {'question': 'what does the professor imply about states',
  'context': "well, to begin we'd better define exactly what we mean when we talk about states. the human groups that are the smallest and have the least social and political complexity, we call bands. the groups that are the largest and most socially and politically complex, we call states.",
  'answer': 'the groups that are the largest and most socially and politically complex'},
 {'question': 'what is the lecture mainly about',
  'context': 'the human groups that are the smallest and have the least social and political complexity, we call bands. the groups that are