In [4]:
from functools import partial
from itertools import product
from pathlib import Path

import editdistance
from tqdm import tqdm

from textmining.index import DiskIndex, DiskPositionIndex
from textmining.lemmatization import Lemmas
from textmining.tokenization import tokenize

In [13]:
import editdistance
import sys


def scaled_editdist(ans, cor):
    ans = ans.lower()
    cor = cor.lower()
    
    return editdistance.eval(ans, cor) / len(cor)
    
def single_match(a, c):
    if c.isdecimal():
        return a == c
    return scaled_editdist(a, c) < 0.5
        
def match(ans, cor):
    return any(single_match(ans, c) for c in cor)

In [11]:
with open("../data/poleval/pytania.txt") as f:
    questions = [l.lower().strip() for l in f]

In [12]:
with open("../data/poleval/odpowiedzi.txt") as f:
    correct_answers = [x.lower().split('\t') for x in f]

In [14]:
lemmas = Lemmas.from_file(Path("../data/lemmas.pickle"))
index = DiskIndex(lemmas.lemmatize, Path("../data/index"))
pos_index = DiskPositionIndex(lemmas.lemmatize, Path("../data/position_index"))

In [15]:
def search(query, title_importance=1, exact_importance=1, total_importance=1):
    docs = index.search(query)
    qtokens = tokenize(query.lower())
    qlemmas = {lemma for token in qtokens for lemma in index.lemmatize(token)}
    for doc in docs:
        doc.title_matching = 0
        doc.exact_matching = 0
        for token in tokenize(doc.title):
            lemmas = set(index.lemmatize(token.lower()))
            if qlemmas.intersection(lemmas):
                doc.title_matching += 1
            if token.lower() in qtokens:
                doc.exact_matching += 1

        doc.total_matching = doc.title_matching
        for token in tokenize(doc.content):
            lemmas = set(index.lemmatize(token.lower()))
            if qlemmas.intersection(lemmas):
                doc.total_matching += 1
            if token.lower() in qtokens:
                doc.exact_matching += 1
    return sorted(
        docs, reverse=True,
        key=lambda d: (
            title_importance * d.title_matching
            + exact_importance * d.exact_matching
            + total_importance * d.total_matching
        )
    )

In [16]:
def answer(question, title_importance=1, exact_importance=1, total_importance=1):
    question_tokens = [token for token in tokenize(question.lower()) if len(token) > 1]
    if question_tokens[0] == "czy":
        return "Tak"
    while question_tokens:
        query = " ".join(question_tokens)

        for doc in search(query, title_importance, exact_importance, total_importance):
            result = doc.title
            res_tokens = tokenize(result.lower())

            for t1, t2 in product(res_tokens, question_tokens):
                if scaled_editdist(t1, t2) <= 0.5:
                    break
            else:
                paren_index = result.find("(")
                if paren_index != -1:
                    result = result[:paren_index]
                return result
        # if answer not found, remove first token of query
        del question_tokens[0]
    return "nie mam pojęcia, sorry"

In [17]:
answers = list(tqdm(map(answer, questions)))

1000it [06:16,  2.65it/s]


In [18]:
N = len(correct_answers)
score = 0.0

for ans, cor in zip(answers, correct_answers):    
    if match(ans, cor):
        score += 1
        
print ('TOTAL SCORE:', score)        

TOTAL SCORE: 93.0


In [19]:
def phrase_answer(question, title_importance=1, exact_importance=1, total_importance=1):
    question_tokens = [token for token in tokenize(question.lower()) if len(token) > 1]
    if question_tokens[0] == "czy":
        return "Tak"
    while question_tokens:
        query = " ".join(question_tokens)

        for doc in pos_index.search(query):
            result = doc.title
            res_tokens = tokenize(result.lower())

            for t1, t2 in product(res_tokens, question_tokens):
                if scaled_editdist(t1, t2) <= 0.5:
                    break
            else:
                paren_index = result.find("(")
                if paren_index != -1:
                    result = result[:paren_index]
                return result
        # if answer not found, remove first token of query
        del question_tokens[0]
    return "nie mam pojęcia, sorry"

In [20]:
phrase_answers = list(tqdm(map(phrase_answer, questions)))

N = len(correct_answers)
score = 0.0

for ans, cor in zip(answers, correct_answers):    
    if match(ans, cor):
        score += 1
        
print ('TOTAL SCORE:', score)        

1000it [15:03,  1.11it/s]

TOTAL SCORE: 93.0



