In [19]:
import pandas as pd
import sys
import spacy
from spacy.tokens import Doc
from spacy.tokens import Token

# Directory that holds the QA CSV files.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# data folder before running the notebook.
DATA_DIR = '/Users/hoanh139/Desktop/pythonProject/NLP_QA'

# Both paths are built from DATA_DIR so the location is configured in one place.
train_data = pd.read_csv(f'{DATA_DIR}/QA_train_data.csv')
test_data = pd.read_csv(f'{DATA_DIR}/QA_test_data.csv')

In [20]:
# Shared spaCy pipeline; the helper functions in this notebook call it for
# tokenisation, lemmas, stop-word flags and similarity scores.
# NOTE(review): spaCy documents its small ("sm") pipelines as shipping without
# static word vectors, so .similarity() results may be unreliable here —
# confirm, and consider a "md"/"lg" pipeline.
nlp = spacy.load("en_core_web_sm")
# Lower-cased lemmas that mark a token as the question word of a sentence.
# NOTE(review): 'where' is not listed — confirm whether that is intentional.
question_words = ['which', 'what', 'when', 'who', 'how', 'why']


def extract_question_word(doc: Doc) -> Token:
    """Return the first token of ``doc`` whose lower-cased lemma is a question word.

    Membership is tested against the module-level ``question_words`` list.
    ``None`` is returned when no token matches.
    """
    matches = (token for token in doc if token.lemma_.lower() in question_words)
    return next(matches, None)


def calculate_similarity_for_question_word(doc: Doc, compare_doc: Doc) -> float:
    """Score how alike the question words of two parsed questions are.

    Args:
        doc: Parsed question.
        compare_doc: Parsed question to compare against.

    Returns:
        ``1.0`` when neither doc contains a question word. Otherwise the
        ``similarity`` between the two question words, where a missing question
        word is replaced by the tokenizer's output for the empty string.
    """
    own_word = extract_question_word(doc)
    other_word = extract_question_word(compare_doc)

    if own_word is None and other_word is None:
        return 1.0

    if own_word is not None and other_word is not None:
        return own_word.similarity(other_word)

    # Exactly one side lacks a question word: compare the other side against
    # an empty placeholder produced by the tokenizer.
    placeholder = nlp.tokenizer("")
    if own_word is None:
        return placeholder.similarity(other_word)
    return own_word.similarity(placeholder)


def remove_stop_words(text: str) -> str:
    """Parse ``text`` and return the lemmas of its non-stop-word tokens joined by spaces."""
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)


def calculate_similarity_between_tokens(token_list: list, other_token_list: list, number_of_intersecting_tokens: int) -> float:
    """Average the pairwise similarity of two token lists, weighted by shared-token count.

    Args:
        token_list: Objects exposing ``similarity(other)``.
        other_token_list: Objects passed as ``other`` to ``similarity``.
        number_of_intersecting_tokens: Weight multiplied onto the mean
            pairwise similarity.

    Returns:
        ``mean(t1.similarity(t2)) * number_of_intersecting_tokens`` over every
        ``(t1, t2)`` pair, or ``0.0`` when either list is empty (there are no
        pairs to average, and dividing by zero pairs would raise).
    """
    pair_count = len(token_list) * len(other_token_list)
    if pair_count == 0:
        return 0.0

    # Start from 0.0 so the accumulation is done exactly as a float running sum.
    total = sum(
        (t1.similarity(t2) for t1 in token_list for t2 in other_token_list),
        0.0,
    )

    return total / pair_count * number_of_intersecting_tokens


def calculate_similarity_for_docs(doc: Doc, compare_doc: Doc) -> float:
    """Score two parsed texts from their shared and non-shared tokens.

    Tokens are matched by their ``text``. The score is the mean pairwise
    similarity between the tokens unique to each side, multiplied by the number
    of tokens of ``doc`` that also occur in ``compare_doc``. When one side has
    no unique tokens, the shared tokens stand in for it.

    Args:
        doc: Parsed text.
        compare_doc: Parsed text to compare against.

    Returns:
        ``sys.float_info.max`` when neither side has a token the other lacks;
        ``0.0`` when the two sides share no token at all; otherwise the
        weighted mean similarity described above.
    """
    # Build the text lookups once instead of once per token.
    doc_texts = {token.text for token in doc}
    compare_texts = {token.text for token in compare_doc}

    tokens_without_compare_tokens = [t for t in doc if t.text not in compare_texts]
    compare_tokens_without_tokens = [t for t in compare_doc if t.text not in doc_texts]
    intersecting_tokens = [t for t in doc if t.text in compare_texts]
    number_of_intersecting_tokens = len(intersecting_tokens)

    if not tokens_without_compare_tokens and not compare_tokens_without_tokens:
        return sys.float_info.max

    # With nothing shared the weight is zero, so the score is zero. Returning
    # here also covers the case where one side is empty, which would otherwise
    # hand an empty list of shared tokens to the pairwise average.
    if number_of_intersecting_tokens == 0:
        return 0.0

    if not tokens_without_compare_tokens:
        return calculate_similarity_between_tokens(
            compare_tokens_without_tokens, intersecting_tokens, number_of_intersecting_tokens
        )
    if not compare_tokens_without_tokens:
        return calculate_similarity_between_tokens(
            tokens_without_compare_tokens, intersecting_tokens, number_of_intersecting_tokens
        )
    return calculate_similarity_between_tokens(
        tokens_without_compare_tokens, compare_tokens_without_tokens, number_of_intersecting_tokens
    )


def calculate_similarity(question: str, compare_question: str) -> float:
    """Score how similar two question strings are.

    Question marks are stripped, both strings are parsed, and the result is the
    product of the question-word similarity and the similarity of the
    stop-word-free, lemmatised content of each question.

    Args:
        question: Question text.
        compare_question: Question text to compare against.

    Returns:
        ``calculate_similarity_for_question_word(...)`` multiplied by
        ``calculate_similarity_for_docs(...)``.
    """
    doc = nlp(question.replace("?", ""))
    compare_doc = nlp(compare_question.replace("?", ""))

    question_word_similarity = calculate_similarity_for_question_word(doc, compare_doc)

    # Build the stop-word-free lemma text from the docs parsed above; parsing
    # the same string a second time would yield the same tokens at extra cost.
    content_text = ' '.join(tok.lemma_ for tok in doc if not tok.is_stop)
    compare_content_text = ' '.join(tok.lemma_ for tok in compare_doc if not tok.is_stop)

    doc_without_stop_words = nlp(content_text)
    compare_doc_without_stop_words = nlp(compare_content_text)

    content_similarity = calculate_similarity_for_docs(
        doc_without_stop_words, compare_doc_without_stop_words
    )

    return question_word_similarity * content_similarity

In [21]:
def predict_answer(question: str, df: pd.DataFrame, min_similarity: float = 1.0):
    """Return the answer of the most similar known question, or ``None``.

    Args:
        question: Question to answer.
        df: Frame with a ``'Question'`` and an ``'Answer'`` column.
        min_similarity: Lowest best-match score that still yields an answer.

    Returns:
        The ``'Answer'`` value of the row whose question scores highest against
        ``question``; ``None`` when ``df`` has no rows or the best score is
        below ``min_similarity``.

    Raises:
        KeyError: If ``df`` lacks the ``'Question'`` or ``'Answer'`` column.
    """
    questions = df['Question']
    answers = df['Answer']

    # idxmax() raises on an empty Series, so there is nothing to match against.
    if questions.empty:
        return None

    similarities = questions.apply(lambda candidate: calculate_similarity(question, candidate))

    best_index = similarities.idxmax()
    if similarities.loc[best_index] < min_similarity:
        return None
    return answers.loc[best_index]

In [22]:
# Answer every test question by looking up the closest training question;
# predict_answer needs the frame to search in as its second argument.
predicted = test_data['Question'].apply(lambda q: predict_answer(q, train_data))
actual = test_data['Answer']

In [None]:
def evaluate_result(predicted: str, actual: str) -> int:
    """Score a single prediction.

    Args:
        predicted: Predicted answer, or a missing value (``None``/NaN) when no
            answer was given.
        actual: Expected answer.

    Returns:
        ``0`` for a missing prediction, ``1`` when ``predicted == actual``,
        ``-1`` otherwise.
    """
    # pd.isna also catches NaN / pd.NA, which is how a missing prediction can
    # appear once it has been stored in a pandas Series.
    if pd.isna(predicted):
        return 0
    if predicted == actual:
        return 1
    return -1

def evaluate_results(predicted: pd.Series, actual: pd.Series) -> float:
    """Average the per-item scores of ``predicted`` against ``actual``.

    Each prediction is scored with ``evaluate_result`` against the entry of
    ``actual`` that carries the same index label.

    Args:
        predicted: Predicted answers.
        actual: Expected answers, indexable by the labels of ``predicted``.

    Returns:
        Sum of the per-item scores divided by the number of predictions, or
        ``0.0`` when ``predicted`` is empty.
    """
    if predicted.size == 0:
        return 0.0

    total = sum(
        evaluate_result(value, actual[index])
        for index, value in predicted.items()
    )
    return total / predicted.size
# Mean per-question score (+1 correct, 0 unanswered, -1 wrong), shown as a percentage.
score = evaluate_results(predicted, actual)
print('Accuracy: {:.2f}%'.format(100 * score))