In [None]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.http.models import (FieldCondition, Filter, MatchText,
                                       MatchValue)
from sentence_transformers import SentenceTransformer

In [None]:
# Name of the Qdrant collection that the cells below query.
collection_name = "kva_test_collection"

# Qdrant client: the first positional argument is an empty string, port is 6333.
# NOTE(review): the empty address looks redacted or relies on a client-side
# default -- confirm the intended host before running.
client = QdrantClient("", port=6333)

# Sentence embedding model used to encode query texts.
model = SentenceTransformer("intfloat/multilingual-e5-base")

### Baasi ülevaade

In [None]:
# Database overview: print the distinct filenames stored in the collection.
# Only points with page_number == 1 are fetched (presumably every file has at
# least one such point -- TODO confirm). scroll() returns one page of records
# plus a continuation offset, so keep requesting pages until the offset is
# None; a single capped request would silently drop files as soon as more
# than `limit` points match the filter.
first_page_records = []
scroll_offset = None
while True:
    batch, scroll_offset = client.scroll(
        collection_name=collection_name,
        scroll_filter=Filter(
            must=[
                FieldCondition(key="page_number", match=MatchValue(value=1)),
            ]
        ),
        limit=20,  # page size per request, not a cap on the total
        offset=scroll_offset,
        with_payload=True,
        with_vectors=False,
    )
    first_page_records.extend(batch)
    if scroll_offset is None:
        break

# Keep the (records, next_offset) shape of a single client.scroll() call so
# that indexing with [0] still yields the list of records.
first_page_results = (first_page_records, None)

files_in_collection = {record.payload['filename'] for record in first_page_records}
# Sets have no stable order; sort so the listing is reproducible between runs.
print('\n'.join(sorted(files_in_collection)))

### Filtrid

In [None]:
# Otsi ainult valideeritud andmetest
query_filter = Filter(
            must=[
            FieldCondition(
                key="validated",
                match=MatchValue(
                    value=True,
                ),
            )
        ])

# Valideeritud andmed märksõnaga
query_filter_subtoken_search = Filter(
            must=[
            FieldCondition(
                key="validated",
                match=MatchValue(
                    value=True),),
            FieldCondition(
                key="content_type",
                match=MatchValue(
                    value="term"),)
        ])


"""
FieldCondition(
    key="text",
    match=MatchText(text="temporary area defined by the Supreme Allied"),
    ),
"""


### Näited

**LOGISTICS**
* 'logistika'
* 'logistics'
* 'logistics can be defined as something that'
* 'logistika tähendab näiteks'
* 'logistika: kaupade, teenuste ja info liikumine, selle juhtimine ja korraldamine (hõlmab nt hankimist, vedu, ladustamist, jaotamist, tarbimist)' 

In [None]:
def get_similarities(text, query_filter, collection_name="kva_test_collection", limit=5):
    """Run a vector similarity search and return the hits as a styled table.

    The query text is embedded with the notebook-level ``model`` (normalized
    embeddings) and searched through the notebook-level ``client``.

    Args:
        text: Query string to embed.
        query_filter: Qdrant filter passed to the search as ``query_filter``.
        collection_name: Name of the collection to search.
        limit: Maximum number of hits to request.

    Returns:
        A pandas ``Styler`` over a DataFrame with the columns ``text``,
        ``score``, ``content_type``, ``file`` and ``page`` (one row per hit),
        with cell wrapping and a fixed cell width applied.

    Raises:
        KeyError: If a hit's payload lacks ``text``, ``content_type``,
            ``filename`` or ``page_number``.
    """
    query_vector = model.encode(text, normalize_embeddings=True).astype(float).tolist()

    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        # Honour the filter supplied by the caller rather than any
        # notebook-level filter, so different filters can be compared.
        query_filter=query_filter,
        limit=limit,
    )

    # Build the DataFrame: one record per hit, columns in display order.
    columns = ['text', 'score', 'content_type', 'file', 'page']
    rows = [
        {
            'text': point.payload["text"],
            'score': point.score,
            'content_type': point.payload["content_type"],
            'file': point.payload["filename"],
            'page': point.payload["page_number"],
        }
        for point in search_result
    ]

    df_properties = {
        'white-space': 'pre-wrap',  # Allows text to wrap within cells
        'width': '300px',  # Adjust as needed
    }

    # Passing `columns` keeps the headers even when there are no hits.
    return pd.DataFrame(rows, columns=columns).style.set_properties(**df_properties)


In [None]:
# Single-word Estonian query; `query_filter` is handed to get_similarities.
# NOTE(review): the multilingual-e5 model card documents a lowercase
# "query: " prefix -- confirm the capitalised form matches what was used
# when the collection was indexed.
text = 'Query: ' + 'logistika'
get_similarities(text=text, query_filter=query_filter)

In [None]:
# Single-word English query; `query_filter` is handed to get_similarities.
text = 'Query: ' + 'logistics'
get_similarities(text=text, query_filter=query_filter)

In [None]:
# Partial English sentence as the query; `query_filter` is handed to
# get_similarities.
# NOTE(review): "a something" reads like a typo in the query text -- confirm
# whether it is intentional before changing it, since it alters the embedding.
text = 'Query: ' + 'logistics can be defined as a something that'
get_similarities(text=text, query_filter=query_filter)

In [None]:
# Full Estonian definition as the query; this cell hands
# `query_filter_subtoken_search` to get_similarities.
text = 'Query: ' + 'logistikaks nimetatakse kaupade, teenuste ja info liikumine, selle juhtimine ja korraldamine (hõlmab nt hankimist, vedu, ladustamist, jaotamist, tarbimist)'
get_similarities(text=text, query_filter=query_filter_subtoken_search)