In [18]:
import os

Adapted from TestTrecEval.java

- TREC data must be converted to JSON via the deserializer and placed in resources/trec (as czechData.json and topicData.json)
- This notebook must be run from the root of the project, because the paths used below are relative to it

In [19]:
# Folder that holds the TREC data converted to JSON
trec_folder = 'resources/trec'

# Paths of the document collection and of the topics inside that folder
czech_data, topic_data = (
    os.path.join(trec_folder, file_name)
    for file_name in ('czechData.json', 'topicData.json')
)


In [20]:
import json

# Load both files
# czech_data / topic_data start out as file paths and are rebound to the parsed
# JSON content below. Only load while a name still holds a path, so that re-running
# this cell on its own does not call open() on data that was already parsed.
# To force a reload from disk, re-run the cell that defines the paths first.
if isinstance(czech_data, str):
    with open(czech_data, 'r', encoding='utf8') as f:
        czech_data = json.load(f)

if isinstance(topic_data, str):
    with open(topic_data, 'r', encoding='utf8') as f:
        topic_data = json.load(f)

In [21]:
# Setup NLTK
import nltk
from nltk_dependencies import setup_dependencies

# Download dependencies
setup_dependencies()

# Configure NLTK - set the resource path to correct location
# The path is joined from its components so the separator is right on every OS
# (a hard-coded 'resources\\nltk\\' is only a valid directory path on Windows).
nltk_resources_dir = os.path.join(os.getcwd(), 'resources', 'nltk')

# Guard the append so re-running this cell does not add the same entry repeatedly
if nltk_resources_dir not in nltk.data.path:
    nltk.data.path.append(nltk_resources_dir)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\dev\ir\irsp\resources/nltk/...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\dev\ir\irsp\resources/nltk/...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\dev\ir\irsp\resources/nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\dev\ir\irsp\resources/nltk/...
[nltk_data]   Unzipping corpora\stopwords.zip.


Copying file: "czech" to "C:\dev\ir\irsp/nltk/corpora/stopwords"


In [22]:
from src.preprocessing.preprocessor_configurations import czech_default_stemmer
from src.index.index import Index
from src.index.index_config import IndexConfig
from src.preprocessing.preprocessing import Preprocessor

# Build an empty index for the Czech collection: the indexed documents are Czech,
# so the Czech preprocessing configuration is used
czech_preprocessor = Preprocessor(czech_default_stemmer)
index_config = IndexConfig(name='dummyIdxCs', preprocessor=czech_preprocessor)
index = Index(config=index_config, initial_batch=[])

In [23]:
# Turn every raw JSON record into a domain document and run it through the
# index's preprocessing
docs = [
    index.preprocess_document(index._parse_document_from_dict(raw_record))
    for raw_record in czech_data
]

f'{len(docs)} documents loaded'

'81735 documents loaded'

In [None]:
# Add documents to index
# NOTE(review): presumably re-running this cell adds the same batch a second time —
# confirm how add_batch treats documents that are already indexed
index.add_batch(docs)
# Last expression of the cell, so the resulting count is displayed as the cell output
f'{len(index.documents)} documents added to index'

In [None]:
from typing import List
from src.api.dtos import QueryDto, ModelVariant

# Ways of turning a topic into a query string; each value is one of the variants
# that build_query_variant accepts
query_variants = ['title', 'description', 'both']


def build_query_variant(topic, variant) -> str:
    """
    Pick the query text for a topic according to the requested variant
    :param topic: Mapping that provides the 'title' and/or 'description' entry
        required by the chosen variant
    :param variant: 'title', 'description' or 'both'
    :return: The title, the description, or both joined by a single space
    :raises ValueError: If the variant is none of the supported values
    """
    # Only the entries needed for the requested variant are read from the topic
    if variant == 'both':
        return f'{topic["title"]} {topic["description"]}'
    if variant == 'title':
        return topic['title']
    if variant == 'description':
        return topic['description']

    raise ValueError(f'Unknown variant: {variant}')


def search_topics(model_variant: ModelVariant, query_variant) -> List[str]:
    """
    Run every topic from topic_data against the index and collect the result lines

    Each returned document yields one line of the form
    '<topic id> Q0 <document id> <rank> <score> runindex1', where the rank is the
    1-based position of the document in the search result. A topic without any
    result contributes a single placeholder line instead.
    :param model_variant: Model variant to use
    :param query_variant: Query variant to use (see build_query_variant)
    :return: Result lines for all topics, in topic order
    """
    result_lines = []
    for topic in topic_data:
        topic_id = topic['id']

        # Build the query object and search in one go; per the original author's
        # note the returned documents are already sorted by relevance
        documents = index.search(
            QueryDto(
                query=build_query_variant(topic, query_variant),
                model=model_variant
            )
        ).documents

        if len(documents) == 0:
            # Placeholder entry so that the topic still shows up in the output
            result_lines.append(f'{topic_id} Q0 abc 99 0.0 runindex1')
        else:
            result_lines.extend(
                f'{topic_id} Q0 {document.id} {idx + 1} {document.score} runindex1'
                for idx, document in enumerate(documents)
            )

    return result_lines


In [None]:
# Now run search_topics for each model variant and write one result file per
# (model, query variant) combination

# 'variant_name' is the label used in the output file name, 'model' the value
# passed to search_topics
models = [
    {'variant_name': 'TFIDF', 'model': ModelVariant.TFIDF},
    {'variant_name': 'BOOL', 'model': ModelVariant.BOOL},
    {'variant_name': 'BM25', 'model': ModelVariant.BM25},
]

output_path = 'trec_eval_output'
os.makedirs(output_path, exist_ok=True)

for model in models:
    for variant in query_variants:
        # The model variant is stored under 'model'; the dicts have no 'variant'
        # key, so looking that up would raise a KeyError
        lines = search_topics(model['model'], variant)
        with open(os.path.join(output_path, f'{model["variant_name"]}_{variant}.txt'), 'w', encoding='utf8') as f:
            f.write('\n'.join(lines))
