In [1]:
from functools import partial
from itertools import product
from pathlib import Path

import editdistance
from tqdm import tqdm

from textmining.index import DiskIndex
from textmining.lemmatization import Lemmas
from textmining.tokenization import tokenize

In [2]:
# Load the questions and the provided baseline answers, one entry per line.
# The encoding is pinned so that reading does not depend on the platform
# default (files are assumed to be UTF-8 -- TODO confirm).
with open("../data/poleval/pytania.txt", encoding="utf-8") as f:
    questions = [line.strip() for line in f]
with open("../data/poleval/odpowiedzi_baseline.txt", encoding="utf-8") as f:
    baseline_answers = [line.strip() for line in f]

In [3]:
# Gold answers: every line holds the tab-separated accepted answers for one
# question. The encoding is pinned so that reading does not depend on the
# platform default (file is assumed to be UTF-8 -- TODO confirm).
with open("../data/poleval/odpowiedzi.txt", encoding="utf-8") as f:
    correct = [line.strip().split("\t") for line in f]

In [4]:
def scaled_editdist(ans, cor):
    """Return the case-insensitive edit distance between two strings, scaled by ``len(cor)``.

    Args:
        ans: Candidate string.
        cor: Reference string; its (lowercased) length normalizes the distance.

    Returns:
        The edit distance divided by the length of the reference. When the
        reference is empty the ratio is undefined, so ``0.0`` is returned if
        the candidate is empty as well and ``float("inf")`` otherwise.
    """
    ans = ans.lower()
    cor = cor.lower()

    # Guard against division by zero for an empty reference (e.g. a blank
    # line in the answers file).
    if not cor:
        return 0.0 if not ans else float("inf")

    return editdistance.eval(ans, cor) / len(cor)
    
def is_correct(response, ground_truth):
    """Decide whether ``response`` is close enough to ``ground_truth``.

    The response is accepted when its scaled edit distance to the ground
    truth is strictly below 0.5.
    """
    distance = scaled_editdist(response, ground_truth)
    return distance < 0.5

In [5]:
# Load the lemmatizer from disk, then open the on-disk index and hand it the
# lemmatizer's `lemmatize` method.
# NOTE(review): the lemma file is presumably unpickled by `from_file` --
# only load it from a trusted source.
lemmas_path = Path("../data/lemmas.pickle")
index_dir = Path("../data/index")

lemmas = Lemmas.from_file(lemmas_path)
index = DiskIndex(lemmas.lemmatize, index_dir)

In [6]:
def search(query, title_importance=1, exact_importance=1, total_importance=1):
    """Retrieve documents for ``query`` and rank them by weighted token overlap.

    Every document returned by ``index.search`` gets three counters attached:

    * ``title_matching`` -- title tokens sharing at least one lemma with the query,
    * ``exact_matching`` -- title and content tokens that (lowercased) equal
      a query token,
    * ``total_matching`` -- title and content tokens sharing at least one
      lemma with the query.

    Args:
        query: Query string; it is lowercased before tokenization.
        title_importance: Weight of ``title_matching`` in the ranking score.
        exact_importance: Weight of ``exact_matching`` in the ranking score.
        total_importance: Weight of ``total_matching`` in the ranking score.

    Returns:
        A list of the documents sorted by descending weighted score.
    """
    # Materialize the hits: they are iterated here and sorted afterwards,
    # which would silently yield nothing for a one-shot iterator.
    docs = list(index.search(query))

    # A set gives constant-time membership tests and stays valid even if
    # `tokenize` returns a one-shot iterator.
    query_tokens = set(tokenize(query.lower()))
    query_lemmas = {
        lemma for token in query_tokens for lemma in index.lemmatize(token)
    }

    # Lowercased token -> whether it shares a lemma with the query. The same
    # words recur across documents, so each is lemmatized once per call
    # (assumes `index.lemmatize` is deterministic).
    lemma_hit_cache = {}

    def count_matches(text):
        """Return (lemma-level hits, exact hits) over the tokens of ``text``."""
        lemma_hits = 0
        exact_hits = 0
        for token in tokenize(text):
            lowered = token.lower()
            hit = lemma_hit_cache.get(lowered)
            if hit is None:
                hit = not query_lemmas.isdisjoint(index.lemmatize(lowered))
                lemma_hit_cache[lowered] = hit
            if hit:
                lemma_hits += 1
            if lowered in query_tokens:
                exact_hits += 1
        return lemma_hits, exact_hits

    for doc in docs:
        title_lemma_hits, title_exact_hits = count_matches(doc.title)
        content_lemma_hits, content_exact_hits = count_matches(doc.content)

        doc.title_matching = title_lemma_hits
        doc.exact_matching = title_exact_hits + content_exact_hits
        # Title hits are counted in the total as well.
        doc.total_matching = title_lemma_hits + content_lemma_hits

    return sorted(
        docs, reverse=True,
        key=lambda d: (
            title_importance * d.title_matching
            + exact_importance * d.exact_matching
            + total_importance * d.total_matching
        )
    )

In [7]:
def answer(question, title_importance=1, exact_importance=1, total_importance=1):
    """Answer ``question`` with the title of the best-ranked suitable document.

    The question is lowercased and tokenized, and single-character tokens are
    dropped. The remaining tokens form a query for ``search``. The first
    ranked document whose title has no token resembling a question token
    (scaled edit distance <= 0.5) provides the answer. If no document
    qualifies, the leading query token is removed and the search is repeated
    until the query is empty.

    Args:
        question: The question text.
        title_importance: Forwarded to ``search``.
        exact_importance: Forwarded to ``search``.
        total_importance: Forwarded to ``search``.

    Returns:
        The chosen title, cut before its first "(" and stripped of surrounding
        whitespace (the full title is kept if nothing would remain), or a
        fixed fallback string when no candidate was found.
    """
    question_tokens = [token for token in tokenize(question.lower()) if len(token) > 1]
    while question_tokens:
        query = " ".join(question_tokens)

        for doc in search(query, title_importance, exact_importance, total_importance):
            title = doc.title
            title_tokens = tokenize(title.lower())

            # Skip titles that merely repeat a word from the question.
            echoes_question = any(
                scaled_editdist(title_token, question_token) <= 0.5
                for title_token, question_token in product(title_tokens, question_tokens)
            )
            if echoes_question:
                continue

            paren_index = title.find("(")
            if paren_index != -1:
                # Strip the whitespace left before the parenthesis: a trailing
                # space would count against the answer in edit-distance
                # scoring. Keep the full title if nothing precedes the "(".
                trimmed = title[:paren_index].strip()
                if trimmed:
                    title = trimmed
            return title
        # if answer not found, remove first token of query
        del question_tokens[0]
    return "nie mam pojęcia, sorry"

In [8]:
def accuracy(answers):
    """Return the fraction of ``answers`` that match the gold answers.

    Answers are paired positionally with the module-level ``correct`` list;
    an answer counts as right when ``is_correct`` accepts it against any of
    the accepted answers for its question.

    Args:
        answers: Sequence of answer strings, in question order.

    Returns:
        Number of right answers divided by ``len(answers)``; ``0.0`` for an
        empty sequence.
    """
    # Avoid dividing by zero when there is nothing to score.
    if len(answers) == 0:
        return 0.0

    n_correct = sum(
        # Generator instead of a list so `any` can stop at the first match.
        any(is_correct(given, accepted) for accepted in accepted_answers)
        for given, accepted_answers in zip(answers, correct)
    )
    return n_correct / len(answers)

In [9]:
# Reference point: accuracy of the baseline answers loaded from
# odpowiedzi_baseline.txt.
accuracy(baseline_answers)

0.091

In [10]:
# Equal weights for all three ranking signals.
# `map` has no length, so the total is passed explicitly to give tqdm a
# progress bar and an ETA.
answers = list(tqdm(
    map(partial(answer, title_importance=1, exact_importance=1, total_importance=1), questions),
    total=len(questions),
))
accuracy(answers)

1000it [06:31,  2.55it/s]


0.044

In [11]:
# Double weight on the total (title + content) lemma matches.
# `map` has no length, so the total is passed explicitly to give tqdm a
# progress bar and an ETA.
answers = list(tqdm(
    map(partial(answer, title_importance=1, exact_importance=1, total_importance=2), questions),
    total=len(questions),
))
accuracy(answers)

1000it [06:39,  2.51it/s]


0.041

In [12]:
# Double weight on exact (non-lemmatized) token matches.
# `map` has no length, so the total is passed explicitly to give tqdm a
# progress bar and an ETA.
answers = list(tqdm(
    map(partial(answer, title_importance=1, exact_importance=2, total_importance=1), questions),
    total=len(questions),
))
accuracy(answers)

1000it [06:42,  2.48it/s]


0.044

In [13]:
# Double weight on lemma matches in the title.
# `map` has no length, so the total is passed explicitly to give tqdm a
# progress bar and an ETA.
answers = list(tqdm(
    map(partial(answer, title_importance=2, exact_importance=1, total_importance=1), questions),
    total=len(questions),
))
accuracy(answers)

1000it [06:46,  2.46it/s]


0.044

In [14]:
# Rank by total lemma matches only; title and exact signals are switched off.
# `map` has no length, so the total is passed explicitly to give tqdm a
# progress bar and an ETA.
answers = list(tqdm(
    map(partial(answer, title_importance=0, exact_importance=0, total_importance=1), questions),
    total=len(questions),
))
accuracy(answers)

1000it [06:39,  2.50it/s]


0.041

Który potwór zabijał wzrokiem?
* Correct: Bazyliszek
* Baseline: Bazyliszek
* Solution: Herensugue