In [2]:
import torch, datasets
import sys
import importlib  
from external.questiongenerator import *
import re
util = importlib.import_module("data.TOEFL-QA.utils")

In [3]:
TOEFL_DATA = "./data/TOEFL-QA/data"

In [4]:
train_data, validation_data, test_data = tuple(util.load_data(TOEFL_DATA))

In [5]:
print(train_data.keys())

dict_keys(['tpo_1-conversation_1_1', 'tpo_1-conversation_1_2', 'tpo_1-conversation_1_3', 'tpo_1-conversation_1_4', 'tpo_1-conversation_2_1', 'tpo_1-conversation_2_2', 'tpo_1-conversation_2_3', 'tpo_1-conversation_2_4', 'tpo_1-conversation_2_5', 'tpo_1-lecture_1_10', 'tpo_1-lecture_1_6', 'tpo_1-lecture_1_7', 'tpo_1-lecture_1_8', 'tpo_1-lecture_1_9', 'tpo_1-lecture_2_12', 'tpo_1-lecture_2_13', 'tpo_1-lecture_2_15', 'tpo_1-lecture_2_16', 'tpo_1-lecture_3_10', 'tpo_1-lecture_3_11', 'tpo_1-lecture_3_6', 'tpo_1-lecture_3_7', 'tpo_1-lecture_3_8', 'tpo_1-lecture_3_9', 'tpo_1-lecture_4_12', 'tpo_1-lecture_4_13', 'tpo_1-lecture_4_15', 'tpo_10-conversation_1_1', 'tpo_10-conversation_1_2', 'tpo_10-conversation_1_3', 'tpo_10-conversation_1_4', 'tpo_10-conversation_2_1', 'tpo_10-conversation_2_2', 'tpo_10-conversation_2_3', 'tpo_10-conversation_2_4', 'tpo_10-lecture_1_10', 'tpo_10-lecture_1_11', 'tpo_10-lecture_1_6', 'tpo_10-lecture_1_7', 'tpo_10-lecture_1_8', 'tpo_10-lecture_1_9', 'tpo_10-lecture_2

In [6]:
def print_data(key="tpo_7-lecture_3_9"):
    print(" ".join([" ".join(i) for i in train_data[key]["sentences"]]))
    print(" ".join(train_data[key]["question"]))
    print(" ".join(train_data[key]["answer"]))

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
  
tokenizer = AutoTokenizer.from_pretrained("noahjadallah/cause-effect-detection")

model = AutoModelForTokenClassification.from_pretrained("noahjadallah/cause-effect-detection")

# https://colab.research.google.com/drive/14V9Ooy3aNPsRfTK88krwsereia8cfSPc?usp=sharing#scrollTo=eqYFDe_2HfQ7
label_list = ['O', 'B-CAUSE', 'I-CAUSE', 'B-EFFECT', 'I-EFFECT']

In [8]:
def get_sentence_str(sentence_list):
    sent = " ".join(sentence_list)
    sent = re.sub(r" (?P<punc>[.?,])", r"\1", sent)
    return sent
def get_full_paragraph(sentences):
    sent_list = []
    for sent in sentences:
        sent_list.append(get_sentence_str(sent))
    return " ".join(sent_list)
print(get_full_paragraph(train_data["tpo_7-lecture_3_9"]["sentences"]))

professor so we've been discussing 16th century native american life, and today we're going to focus on the iroquois and huron peoples. they lived in the northeastern great lakes region of north america. now, back then, their lifes depended on the natural resources of the forests, especially the birch tree. the birch tree can grow in many different types of soils and is prevalent in that area. now can anyone here describe a birch tree? student they are tall and white, the bark, i mean. professor yes. the birch tree has white bark, and this tough protective outer layer of the tree, this white bark, is waterproof. and this waterproof quality of the bark, it made it useful for making things like cooking containers, a variety of utensils. and if you peel birch bark in the winter, we call it the winter bark, another layer, a tougher inner layer of the tree adheres to the bark, producing a stronger material. so the winter bark was used for larger utensils and containers. student i know peopl

In [9]:
def get_causation_prediction(sequence: str):
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
    inputs = tokenizer.encode(sequence, return_tensors="pt")

    outputs = model(inputs).logits
    predictions = torch.argmax(outputs, dim=2)
    return [(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())]
# sequence = " ".join(train_data["tpo_7-lecture_3_9"]["sentences"][3])
# get_causation_prediction(sequence)
# get_causation_prediction(get_full_paragraph(train_data["tpo_7-lecture_3_9"]["sentences"]))

In [10]:
# https://huggingface.co/iarfmoose/t5-base-question-generator
# https://github.com/AMontgomerie/question_generator
qg = QuestionGenerator()

In [11]:
# qg.generate(effect, 
#     num_questions=10, 
#     answer_style='all')
custom_inputs = ["<answer> many different types of soils <context> the birch tree can grow in many different types of soils and is prevalent in that area"]
custom_answers = [
    [
        {'answer': 'native americans', 'correct': False}, 
        {'answer': 'north american', 'correct': True}, 
        {'answer': 'americans', 'correct': False}, 
        {'answer': 'native american', 'correct': False}
    ]
]

# qg_inputs = qg.generate_qg_inputs(text, "multiple_choice")
# qg.generate_questions_from_inputs(qg_inputs)
qg._generate_question(custom_inputs)

'what types of soils can the birch tree grow in??'

In [12]:
inputs = []
passage = "tpo_7-lecture_3_9"
sentences = train_data[passage]["sentences"]

for ind in range(len(sentences)):
    sentence = sentences[ind]
    if ind == 0:
        context = " ".join([get_sentence_str(i) for i in sentences[ind: ind + 2]])
    elif ind == len(sentences) - 1:
        context = " ".join([get_sentence_str(i) for i in sentences[ind - 2: ind]])
    else:
        context = " ".join([get_sentence_str(i) for i in sentences[ind - 1: ind + 1]])
    sent_str = get_sentence_str(sentence)
    causation = get_causation_prediction(sent_str)
    tokens = [token for token, pred in causation if pred.endswith("EFFECT")]
    if len(tokens) > 3:
        ans = " ".join(tokens)
        inputs.append(f"<answer> {ans} <context> {context}")
for i in inputs:
    print(qg._generate_question(i))
    print(f"\t {i.split('<')[1]}")
    print(f"\t {i.split('<')[2]}")
    print()

what is the topic today???????
	 answer> going to focus on and peoples 
	 context> professor so we've been discussing 16th century native american life, and today we're going to focus on the iroquois and huron peoples. they lived in the northeastern great lakes region of north america.

where did the iroquois and huron peoples live??
	 answer> they lived in northeastern lakes region of america 
	 context> professor so we've been discussing 16th century native american life, and today we're going to focus on the iroquois and huron peoples. they lived in the northeastern great lakes region of north america.

what tree was the main source of their life?????????
	 answer> their life ##s depended on forests birch 
	 context> they lived in the northeastern great lakes region of north america. now, back then, their lifes depended on the natural resources of the forests, especially the birch tree.

what type of soil can the birch tree grow in???
	 answer> the birch tree can grow in many differ

what kind of boats were needed for small streams??
	 answer> they made narrow , maneuver ##able boats needed 
	 context> you see, the native americans made canoes of all types, for travel on small streams or on large open ocean waters. for small streams, they made narrow, maneuverable boats, while, while larger canoes were needed for the ocean.

how many canoes were needed for the ocean??
	 answer> they could travel throughout the area occasionally to canoe 
	 context> for small streams, they made narrow, maneuverable boats, while, while larger canoes were needed for the ocean. they could travel throughout the area only occasionally having to portage, to carry the canoe over a land short distance to another nearby stream.

how did they get there??
	 answer> [CLS] this wasn ' t a difficult task 
	 context> they could travel throughout the area only occasionally having to portage, to carry the canoe over a land short distance to another nearby stream. and since the canoes were so light, 

In [13]:
text = get_full_paragraph(train_data["tpo_7-lecture_3_9"]["sentences"][:20])
qg.generate(text, 
    num_questions=20, 
    answer_style='sentences')

Generating questions...





Evaluating QA pairs...



[{'question': 'what is the name of the bark?',
  'answer': 'and if you peel birch bark in the winter, we call it the winter bark, another layer, a tougher inner layer of the tree adheres to the bark, producing a stronger material.'},
 {'question': 'what shape could the native americans make utensils out of?',
  'answer': 'they could fold the bark into many shapes.'},
 {'question': 'what did the native americans do with the bark?',
  'answer': 'the native americans would cut the bark and fold it into any shape they needed, then secure it with cords until it dried.'},
 {'question': 'what type of soil can birch trees grow in?',
  'answer': 'the birch tree can grow in many different types of soils and is prevalent in that area.'},
 {'question': 'what is the best way to describe birch bark?',
  'answer': "professor oh, that's one of the great things about birch bark."},
 {'question': 'what was the most important use of birch bark?',
  'answer': 'now, back then, their lifes depended on the n