In [63]:
import random
from tqdm import tqdm
path_to_data = '../../datasets/riddles'

bases = {}
allowed_words_set = set()
allowed_words = []
answers = []
queries = []

In [64]:
def get_word_base(word):
    global bases
    word = word.lower()
    ret = bases.get(word)
    if ret:
        return ret
    return word

for x in open(f'{path_to_data}/superbazy_clean.txt'):
    word,base = x.lower().split()
    bases[word] = base

print("Loading allowed vocabulary...")
with open(f'{path_to_data}/plwiktionary_definitions_clean.txt', 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.split('###')
        word = parts[0].strip()
        if ' ' not in word:
            allowed_words_set.add(word)

allowed_words = list(allowed_words_set)
print(f"Loaded {len(allowed_words)} unique allowed words.")

with open(f'{path_to_data}/zagadki_do_testow_clean.txt') as file:
    for line in file:
        line = line.replace(';;', '').split()
        answers.append(line[0])
        queries.append(' '.join(line[1:]))

Loading allowed vocabulary...
Loaded 8085 unique allowed words.


In [65]:
print("\n=== DATA LOADED SUCCESSFULLY ===\n")
print(f"1. QUERIES (Total: {len(queries)})")
print(f"   Sample [0]: {queries[0]}")
print(f"   Sample [1]: {queries[1]}")
print("-" * 40)

print(f"2. ANSWERS (Total: {len(answers)})")
print(f"   Sample [0]: {answers[0]}")
print(f"   Sample [1]: {answers[1]}")
print("-" * 40)

print(f"3. ALLOWED VOCABULARY (Total unique words: {len(allowed_words)}")
print(f"   Sample keys: {list(allowed_words)[:10]}")
print("-" * 40)

print(f"4. BASES (Total forms: {len(bases)})")
print(f"   Sample: 'psa' -> '{bases['psa']}'")
print(f"   Sample: 'zrobił' -> '{bases['zrobił']}'")


=== DATA LOADED SUCCESSFULLY ===

1. QUERIES (Total: 1993)
   Sample [0]: rękopiśmienny tekst lub dokument, niepublikowany drukiem.
   Sample [1]: stan emocjonalny charakteryzujący się radością, życzliwością i łatwością w wywoływaniu uśmiechu.
----------------------------------------
2. ANSWERS (Total: 1993)
   Sample [0]: manuskrypt
   Sample [1]: wesołość
----------------------------------------
3. ALLOWED VOCABULARY (Total unique words: 8085
   Sample keys: ['spichlerz', 'prawniczka', 'tyran', 'rywalka', 'drogowskaz', 'internet', 'koparka', 'zgrupowanie', 'drzewko', 'mus']
----------------------------------------
4. BASES (Total forms: 1776667)
   Sample: 'psa' -> 'pies'
   Sample: 'zrobił' -> 'zrobić'


In [57]:
def mean_reciprocal_rank(real_answers, computed_answers, K=20):
    positions = []

    for real_answer, computed_answer in zip(real_answers, computed_answers):
        if real_answer in computed_answer[:K]:
            pos = computed_answer.index(real_answer) + 1
            positions.append(1 / pos)

    mrr = sum(positions) / len(real_answers)
    print('Mean Reciprocal Rank =', mrr)

    return mrr

def evaluate_algorithm(score_function, queries, answers, K):
    computed_answers = []
    for query in tqdm(queries, desc="queries answered"):
        computed_answers.append(score_function(query, K=K))
    score = mean_reciprocal_rank(answers, computed_answers, K=K)
    
    return score

In [66]:
def answer_riddle(riddle, K):
    return random.sample(allowed_words, K)

In [67]:
PART_OF_DATA = 100
K = 20
valid_queries = queries[:PART_OF_DATA]
valid_answers = answers[:PART_OF_DATA]
score = evaluate_algorithm(answer_riddle, valid_queries, valid_answers, K=K)
print(f"Score: {score}")

queries answered: 100%|██████████| 100/100 [00:00<00:00, 79558.12it/s]

Mean Reciprocal Rank = 0.0
Score: 0.0



