This notebook tests the search models (boolean and TF-IDF) against a small index built from sample English documents.

In [1]:
import nltk
import os
from nltk_dependencies import setup_dependencies

# Download dependencies if necessary (project helper from nltk_dependencies).
setup_dependencies()

# Configure NLTK - add the project-local resource directory to its search path.
# The path is resolved against the current working directory, so the notebook
# has to be started from the directory that contains `resources/`.
# It is built from separate components so that it is valid on every OS, not
# only on Windows where '\\' happens to be the path separator.
nltk_resources_dir = os.path.join(os.getcwd(), 'resources', 'nltk')

# Re-running this cell must not keep appending the same entry.
if nltk_resources_dir not in nltk.data.path:
    nltk.data.path.append(nltk_resources_dir)

Copying file: "czech" to "C:\dev\ir\irsp/nltk/corpora/stopwords"


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\dev\ir\irsp\resources/nltk/...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\dev\ir\irsp\resources/nltk/...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\dev\ir\irsp\resources/nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\dev\ir\irsp\resources/nltk/...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
import json

from src.preprocessing.preprocessor_configurations import english_default_stemmer
from src.index.index import Index
from src.index.index_config import IndexConfig

# Load sample file. The encoding is stated explicitly: without it `open()`
# falls back to the platform's locale encoding, which is not UTF-8 on every
# machine, while JSON files are expected to be UTF-8.
with open('resources/docs/english_documents.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

# Show the raw records as the cell output.
documents

[{'text': 'tropical fish include fish found in tropical enviroments'},
 {'text': 'fish live in a sea'},
 {'text': 'tropical fish are popular aquarium fish'},
 {'text': 'fish also live in Czechia'},
 {'text': 'Czechia is a country'}]

In [3]:
from src.preprocessing.preprocessing import Preprocessor
from src.index.index import add_index, delete_index

# Name of the index used throughout this cell (cleanup, config, registration).
INDEX_NAME = 'test'

# Best-effort cleanup so the cell can be re-run: remove an index registered by
# a previous run. A failure here is deliberately ignored (presumably raised
# when no such index exists - verify against `delete_index`), but only
# `Exception` is caught so KeyboardInterrupt / SystemExit still propagate.
try:
    delete_index(INDEX_NAME)
except Exception:
    pass

# Create new index (with an empty initial batch) and add it to the app
index = Index(
    config=IndexConfig(
        name=INDEX_NAME,
        preprocessor=Preprocessor(english_default_stemmer),
    ),
    initial_batch=[]
)

add_index(INDEX_NAME, index)

In [4]:
# Convert every raw JSON record into a domain object, then hand the whole
# batch to the index in a single call.
docs_domain = [index._parse_document_from_dict(raw_doc) for raw_doc in documents]
index.add_batch(docs_domain)

# Number of documents the index holds after the batch insert.
len(index.documents)

5

In [12]:
from src.api.indices_dtos import QueryDto, ModelVariant

# A simple boolean query
boolean_query = 'Are Fish tropical'
query_dto = QueryDto(query=boolean_query, model=ModelVariant.BOOL)

# Keep the hits under their own name: `documents` holds the raw JSON records
# loaded earlier, and overwriting it with search results would break a re-run
# of the cell that feeds those records into the index.
bool_hits = index.search(query_dto)

# Report the hit count like the other query cells do, so an empty result is
# visible instead of producing no output at all.
print(f'Found {len(bool_hits)} documents')
for hit in bool_hits:
    print(hit)

In [6]:
from src.api.indices_dtos import QueryDto, ModelVariant

# A boolean query with explicit NOT / AND operators and mixed-case terms
boolean_query = '(NOT FISH) AND czechiA'
query_dto = QueryDto(query=boolean_query, model=ModelVariant.BOOL)

# Keep the hits under their own name: `documents` holds the raw JSON records
# loaded earlier, and overwriting it with search results would break a re-run
# of the cell that feeds those records into the index.
bool_operator_hits = index.search(query_dto)

print(f'Found {len(bool_operator_hits)} documents')
for hit in bool_operator_hits:
    print(hit)

Found 1 documents
{'score': None, 'document': DocumentDto(docId=UUID('179c9e29-18d2-4799-974b-4bce4ed092dd'), text='Czechia is a country', additionalProperties={})}


In [7]:
# Now test TF-IDF
tfidf_query = 'czechia'
query_dto = QueryDto(query=tfidf_query, model=ModelVariant.TFIDF)

# Keep the hits under their own name: `documents` holds the raw JSON records
# loaded earlier, and overwriting it with search results would break a re-run
# of the cell that feeds those records into the index.
tfidf_hits = index.search(query_dto)

# Each printed hit carries a 'score' next to the matched document.
print(f'Found {len(tfidf_hits)} documents')
for hit in tfidf_hits:
    print(hit)

Found 2 documents
{'score': 0.4947592105690922, 'document': DocumentDto(docId=UUID('179c9e29-18d2-4799-974b-4bce4ed092dd'), text='Czechia is a country', additionalProperties={})}
{'score': 0.4408883034708105, 'document': DocumentDto(docId=UUID('7c413469-317b-4147-ac23-340a9e8f8c63'), text='fish also live in Czechia', additionalProperties={})}
