In [20]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [21]:
def take_question(url: str):
    """Load the tab-separated question table at ``url`` and return it cleaned.

    The file is read as ISO-8859-1 encoded, tab-delimited text. Rows with any
    missing value and exact duplicate rows are discarded. Two diagnostics are
    printed: the per-column null count of the cleaned frame, and every column
    label except the last one. The returned frame has a fresh 0..n-1 index.
    """
    cleaned = (
        pd.read_csv(url, sep='\t', encoding='ISO-8859-1')
        .dropna()
        .drop_duplicates()
    )
    null_counts = cleaned.isnull().sum()
    print(f"{null_counts}\n")
    leading_columns = cleaned.columns[0:-1]
    print(f"Data has {leading_columns}")
    return cleaned.reset_index(drop=True)

In [22]:
def take_answer(url: str):
    """Read an article file and return its distinct, non-empty lines.

    The file at ``url`` is decoded as ISO-8859-1 and split on newline
    characters. Empty lines are discarded and a repeated line is kept only
    at its first occurrence, so file order is preserved.

    Returns:
        dict: ``{'text': [line, ...]}``.
    """
    with open(url, "r", encoding='ISO-8859-1') as handle:
        raw_lines = handle.read().split('\n')
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_lines = dict.fromkeys(line for line in raw_lines if line != '')
    return {'text': list(unique_lines)}

In [23]:
def check_my_question(data_question, myquestion: str):
    """Find the stored question that best matches ``myquestion``.

    An exact, case-insensitive match wins outright: the matching rows are
    printed and the first of them is returned. Otherwise every stored
    question is scored by how many whitespace-separated tokens of
    ``myquestion`` (repeats included) also occur in it, and the first
    question reaching the highest positive score is returned.

    Args:
        data_question: DataFrame with a ``'Question'`` column of strings.
        myquestion: The query text to look up.

    Returns:
        tuple: ``(best_match_sentence, index)`` where ``index`` is the
        0-based row position of the match. ``(None, 0)`` is returned when no
        stored question shares a single token with ``myquestion``.
    """
    questions = data_question['Question']
    target = myquestion.lower()

    exact_mask = questions.str.lower() == target
    if exact_mask.any():
        print(data_question[exact_mask])
        # This branch used to fall through and return (None, 0), which sent
        # callers to row 0 regardless of which question actually matched.
        position = int(exact_mask.to_numpy().argmax())
        return questions.iloc[position], position

    # Tokenise the query once instead of once per stored question.
    target_tokens = target.split()

    max_match = 0
    best_match_sentence = None
    index = 0
    for i, candidate in enumerate(questions):
        candidate_words = set(candidate.lower().split())
        # Counting per token (not per distinct word) keeps repeated query
        # words weighted by how often they appear in the query.
        current_match = sum(token in candidate_words for token in target_tokens)
        # Strict '>' keeps the earliest row on ties.
        if current_match > max_match:
            max_match = current_match
            best_match_sentence = candidate
            index = i
    return best_match_sentence, index

In [24]:
def create_context_from_index_question(data_question, index, data_dir="./data/S08"):
    """Load the article text that backs the question at row ``index``.

    Args:
        data_question: DataFrame with an ``'ArticleFile'`` column holding
            article file names without the ``.txt`` extension.
        index: 0-based row position of the question, as produced by
            ``check_my_question``.
        data_dir: Directory that contains the article ``.txt`` files.
            Defaults to the location this function previously hard-coded.

    Returns:
        Whatever ``take_answer`` returns for the resolved article path.
    """
    # check_my_question reports an enumerate() position, so look the row up
    # by position; plain [] is label-based and only agrees with the position
    # while the frame carries a fresh 0..n-1 index.
    article_file = data_question['ArticleFile'].iloc[index]
    article_path = f"{data_dir}/{article_file}.txt"
    return take_answer(article_path)

In [25]:
# Loaded (model, tokenizer) pairs keyed by checkpoint name, so repeated calls
# in one kernel session do not reload the weights every time.
_T5_CACHE = {}


def _load_t5(model_name):
    """Return the cached ``(model, tokenizer)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _T5_CACHE:
        _T5_CACHE[model_name] = (
            T5ForConditionalGeneration.from_pretrained(model_name),
            T5Tokenizer.from_pretrained(model_name),
        )
    return _T5_CACHE[model_name]


def answer_my_question(myquestion, context, model_name='t5-base', max_length=50):
    """Generate an answer to ``myquestion`` from ``context`` with a T5 model.

    The prompt is built as ``"question: <myquestion> context: <context>"``,
    encoded, passed to ``model.generate`` and decoded with special tokens
    removed. The answer is printed as ``"Answer: <answer>"`` and also returned.

    Args:
        myquestion: The question text.
        context: The passage to answer from. It is interpolated into the
            prompt with plain string formatting, so pass the passage text
            itself rather than a container holding it.
        model_name: Checkpoint name given to ``from_pretrained``.
        max_length: ``max_length`` forwarded to ``model.generate``.

    Returns:
        str: The decoded answer.
    """
    model, tokenizer = _load_t5(model_name)

    input_text = f"question: {myquestion} context: {context}"
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=max_length)

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Answer: {answer}")
    return answer

In [52]:
# Load the cleaned question/answer table; the DataFrame is bound to `question`.
question = take_question('./data/S08/question_answer_pairs.txt')
# Free-text query that is matched against the 'Question' column below.
myquestion = "he managed his own reelection in the"
# best_match_sentence may be None (with index 0) when no stored question
# shares a word with the query.
best_match_sentence, index = check_my_question(question, myquestion)
# `context` is the dict returned by take_answer: {'text': [line, ...]}.
context = create_context_from_index_question(question, index)
# NOTE(review): left disabled; `context` here is the whole dict rather than a
# single passage string.
# answer_my_question(myquestion, context)

ArticleTitle                0
Question                    0
Answer                      0
DifficultyFromQuestioner    0
DifficultyFromAnswerer      0
ArticleFile                 0
dtype: int64

Data has Index(['ArticleTitle', 'Question', 'Answer', 'DifficultyFromQuestioner',
       'DifficultyFromAnswerer'],
      dtype='object')


In [53]:
# Display one line of the loaded article; the position 2 is hard-coded here.
context['text'][2]

'Adams, a sponsor of the American Revolution in Massachusetts, was a driving force for independence in 1776; Jefferson called him the "Colossus of Independence". He represented the Continental Congress in Europe. He was a major negotiator of the eventual peace treaty with Great Britain, and chiefly responsible for obtaining the loans from the Amsterdam money market necessary for the conduct of the Revolution. His prestige secured his two elections as Washington\'s Vice President and his election to succeed him.   As President, he was frustrated by battles inside his own Federalist party against a faction led by Alexander Hamilton, but he broke with them to avert a major conflict with France in 1798, during the Quasi-War crisis. He became the founder of an important family of politicians, diplomats and historians, and in recent years his reputation has improved.'

In [50]:
# NOTE(review): this cell's execution count (50) is lower than that of the
# cells above it (52, 53), so the recorded answer may have been produced from
# an earlier `myquestion`/`context` — re-run the notebook top to bottom to confirm.
answer_my_question(myquestion, context['text'][2])

Answer: Lincoln closely supervised the victorious war effort, especially the selection of top generals, including Ulysses S. Grant
