In [59]:
# метрики: hits@1, hits@5, ndcg@5, hits@10, ndcg@10, ndcg@20 (5 результатов на первой странице)

In [1]:
import os
from pypdf import PdfReader
from pathlib import Path
from tqdm.notebook import tqdm
import re
import numpy as np
import time
import pickle
from copy import deepcopy

# getting data ready

## documents

In [6]:
with open('files_cropped/res_dict_10.pickle', 'rb') as file:
    gd = pickle.load(file)

In [16]:
# Flatten the nested mapping `gd` (main group -> section -> list of document
# dicts) into a single list. Each document is tagged in place with the group
# and section it came from and receives a sequential integer id.
all_docs = []
i = 0  # next id to hand out; always equals the document's index in all_docs
for main_name, main_section in gd.items():
    for section_name, section_docs in main_section.items():
        for doc in section_docs:
            # Drop the markdown payload if this document carries one.
            doc.pop('file_markdown', None)
            doc.update(OKS_main=main_name, OKS_section=section_name, id=i)
            all_docs.append(doc)
            i += 1

In [31]:
all_docs[0].keys()

dict_keys(['number', 'title', 'status', 'date_start', 'date_cancel', 'replaced_by', 'OKS', 'file_path', 'file_url', 'OKS_main', 'OKS_section', 'id'])

In [34]:
len(np.unique([doc['number'] for doc in all_docs])), len(all_docs)

(3519, 3519)

In [36]:
with open('files_cropped/docs_flattened.pickle', 'wb') as file:
    pickle.dump(all_docs, file)

In [52]:
with open('dataset/documents_info.pickle', 'wb') as file:
    pickle.dump(all_docs, file)

## queries

In [45]:
with open('final_dataset.pickle', 'rb') as file:
    queries = pickle.load(file)

In [48]:
# Index the documents by file path once, so that each query file is resolved
# with a dict lookup instead of a scan over all of all_docs.
# setdefault keeps the FIRST document seen for a path ("first match wins").
docs_by_path = {}
for doc in all_docs:
    docs_by_path.setdefault(doc['file_path'], doc)

# One record per query file: the id/number/title of the target document plus
# the list of search queries written for it.
new_queries = []
for filename, doc_queries in queries.items():
    doc = docs_by_path.get(filename)
    if doc is None:
        # Fail with the offending file name rather than a bare IndexError.
        raise KeyError(f"No document in all_docs has file_path == {filename!r}")
    new_queries.append({
        'doc_id': doc['id'],
        'filename': doc['file_path'],
        'doc_gost_number': doc['number'],
        'doc_title': doc['title'],
        # Copied so that later edits to `queries` do not leak into the dataset.
        'queries': deepcopy(doc_queries),
    })

In [51]:
with open('final_dataset_final.pickle', 'wb') as file:
    pickle.dump(new_queries, file)

In [53]:
with open('dataset/queries.pickle', 'wb') as file:
    pickle.dump(new_queries, file)

# Texts

## little preprocessing

In [4]:
doc_folder = 'files_cropped/'

In [6]:
from joblib import Parallel, delayed

def process_doc(doc):
    """Attach extracted PDF text to a document record.

    Opens ``doc_folder + doc['file_path']`` with ``PdfReader`` and concatenates
    the text of every page except the first and the last one (``pages[1:-1]``),
    each followed by a newline. The result is stored under ``doc['text']`` as
    the title, a blank line, and then the extracted body.

    Extraction is best effort: if the PDF cannot be read, a ``RuntimeWarning``
    is emitted and whatever pages were extracted before the failure are kept.
    Documents with an empty ``file_path`` get a text made of the title only.

    Args:
        doc: dict with at least the keys 'title' and 'file_path'.
    Returns:
        The same dict object, with the 'text' key set.
    """
    import warnings

    page_texts = []
    if doc['file_path']:
        try:
            reader = PdfReader(doc_folder + doc['file_path'])
            for page in reader.pages[1:-1]:
                page_texts.append(page.extract_text() + '\n')
        except Exception as exc:
            # Keep going (one broken PDF must not abort the whole batch), but
            # make the failure visible instead of swallowing it silently.
            warnings.warn(
                f"Could not extract text from {doc['file_path']!r}: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
    doc['text'] = doc['title'] + '\n\n' + ''.join(page_texts)
    return doc

# Run process_doc over every document in parallel; n_jobs=-1 asks joblib to
# use all available CPUs. In a notebook kernel __name__ is '__main__', so the
# guard always passes here (presumably kept for process spawning on Windows
# -- TODO confirm). all_docs is rebound to the list returned by the workers.
if __name__ == '__main__':
    with Parallel(n_jobs=-1) as parallel:
        all_docs = parallel(delayed(process_doc)(doc) for doc in tqdm(all_docs))

  0%|          | 0/3519 [00:00<?, ?it/s]

In [5]:
all_docs[0].keys()

dict_keys(['number', 'title', 'status', 'date_start', 'date_cancel', 'replaced_by', 'OKS', 'file_path', 'file_url', 'OKS_main', 'OKS_section', 'id'])

In [None]:
# Remove every soft hyphen (U+00AD) that sits directly before or after a
# newline, together with that newline -- presumably hyphenation left at line
# breaks by the PDF text extraction (TODO confirm).
for doc in all_docs:
    without_trailing = doc['text'].replace('\xad\n', '')
    doc['text'] = without_trailing.replace('\n\xad', '')

In [50]:
with open('dataset/docs_with_text.pickle', 'wb') as f:
    pickle.dump(all_docs, f)

In [55]:
def lemmatize_doc(doc):
    """Return a copy of ``doc`` whose 'text' has been normalised and lemmatized.

    The text is lower-cased, tokenized with ``word_tokenize``, each token is
    stripped of the characters in ``punct``, empty tokens and stop words are
    dropped, and the remaining tokens (joined by spaces) are passed to
    ``lemmatizer.lemmatize``; the pieces it returns are concatenated.

    The input dict is NOT modified: a shallow copy is returned. This keeps the
    raw text in ``all_docs`` intact, so the same list can also be fed to
    ``stem_doc`` regardless of whether the work runs serially or in worker
    processes.

    Args:
        doc: dict with at least a 'text' key.
    Returns:
        A new dict with the same keys as ``doc`` and the processed 'text'.
    """
    # Set lookup instead of scanning the stop-word list for every token.
    stopword_set = set(stopwords)
    tokens = (w.strip(punct) for w in word_tokenize(doc['text'].lower()))
    kept = ' '.join(w for w in tokens if w and w not in stopword_set)
    result = dict(doc)
    result['text'] = ''.join(lemmatizer.lemmatize(kept))
    return result
def stem_doc(doc):
    """Return a copy of ``doc`` whose 'text' has been normalised and stemmed.

    The text is lower-cased, tokenized with ``word_tokenize``, each token is
    stripped of the characters in ``punct``, empty tokens and stop words are
    dropped, and every remaining token is passed through ``stemmer.stem``.
    The stems are joined with single spaces.

    The input dict is NOT modified: a shallow copy is returned, so the raw
    text stays available to other preprocessing variants.

    Args:
        doc: dict with at least a 'text' key.
    Returns:
        A new dict with the same keys as ``doc`` and the processed 'text'.
    """
    # Set lookup instead of scanning the stop-word list for every token.
    stopword_set = set(stopwords)
    tokens = (w.strip(punct) for w in word_tokenize(doc['text'].lower()))
    result = dict(doc)
    result['text'] = ' '.join(
        stemmer.stem(w) for w in tokens if w and w not in stopword_set
    )
    return result

In [56]:
from joblib import Parallel, delayed

# Lemmatize every document in parallel (n_jobs=-1: all available CPUs) and
# collect the results in docs_lemmatized; all_docs itself is only read here.
if __name__ == '__main__':
    with Parallel(n_jobs=-1) as parallel:
        docs_lemmatized = parallel(delayed(lemmatize_doc)(doc) for doc in tqdm(all_docs))

  0%|          | 0/3519 [00:00<?, ?it/s]

In [57]:
with open('dataset/lemmatized_docs.pickle', 'wb') as f:
    pickle.dump(docs_lemmatized, f)

In [29]:
# Stem every document in parallel (n_jobs=-1: all available CPUs) and collect
# the results in docs_stemmed; all_docs itself is only read here.
if __name__ == '__main__':
    with Parallel(n_jobs=-1) as parallel:
        docs_stemmed = parallel(delayed(stem_doc)(doc) for doc in tqdm(all_docs))

  0%|          | 0/3519 [00:00<?, ?it/s]

In [30]:
with open('dataset/stemmed_docs.pickle', 'wb') as f:
    pickle.dump(docs_stemmed, f)

## testing

In [2]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from pymystem3 import Mystem
from nltk.tokenize import word_tokenize
from langchain_community.retrievers import BM25Retriever, TFIDFRetriever
from langchain_core.documents import Document
import pymorphy2

In [7]:
def pretty_print_results(results, precision=2):
    """Print a human-readable report for a dict of evaluation runs.

    Args:
        results: mapping of run name (e.g. 'bm25_clear') to its metrics dict.
        precision: number of decimals used for the Hits / DCG values.

    For each run a capitalised heading is printed, followed by one line per
    recognised metric: Hits/DCG values rounded to ``precision``, the time per
    query converted to milliseconds, and the number of not-found queries.
    Metrics with any other name are skipped. A blank line ends each run.
    """
    for run_name, run_metrics in results.items():
        heading = run_name.replace('_', ' ').capitalize()
        print(f"{heading}:")
        for key, value in run_metrics.items():
            if 'hits' in key or 'dcg' in key:
                label = key.upper().replace('_', '@')
                print(f"{label}: {round(value, precision)}")
                continue
            if key == 'time_per_query':
                millis = round(value*1000, 1)
                print(f"Average time per query: {millis}ms")
                continue
            if key == 'not_found_queries':
                print(f"Number of not found queries: {len(value)}")
        print()

In [4]:
# Import the corpus reader under an alias: binding the word list to the name
# `stopwords` no longer destroys the module reference, so this cell can be
# re-run without failing on "list has no attribute 'words'".
from nltk.corpus import stopwords as nltk_stopwords

# Russian stop words; the preprocessing functions test tokens against this.
stopwords = nltk_stopwords.words('russian')
stemmer = SnowballStemmer('russian')
# Characters removed from both ends of a token with str.strip(), i.e. this is
# a character set, not a regex. Written as a raw string: the backslashes were
# already kept literally (they are not valid escapes), so the resulting value
# is unchanged, but Python no longer warns about invalid escape sequences.
punct = r'!"#$%&()*\+,-\./:;<=>?@\[\]^_`{|}~„“«»†*\—/\-‘’'
lemmatizer = Mystem(grammar_info=False)
morph = pymorphy2.MorphAnalyzer()

In [2]:
with open('dataset/queries.pickle', 'rb') as f:
    queries = pickle.load(f)
with open('dataset/docs_with_text.pickle', 'rb') as f:
    all_docs = pickle.load(f)
with open('dataset/results.pickle', 'rb') as f:
    results = pickle.load(f)

In [5]:
def hits_k(requested_doc_id: int, found_doc_idx: list, ks: list):
    """Compute Hits@K for one query at several cut-offs.

    Args:
        requested_doc_id: id of the document the query should retrieve.
        found_doc_idx: retrieved document ids, best match first.
        ks: cut-offs to evaluate.
    Returns:
        list of ``(k, hit)`` tuples in the order of ``ks``; ``hit`` is 1 when
        the requested id is among the first ``k`` results, otherwise 0.
    """
    return [
        (k, 1 if requested_doc_id in found_doc_idx[:k] else 0)
        for k in ks
    ]

In [7]:
def dcg_k(requested_doc_id: int, found_doc_idx: list, ks: list):
    """Compute DCG@K for one query that has a single relevant document.

    Args:
        requested_doc_id: id of the document the query should retrieve.
        found_doc_idx: retrieved document ids, best match first.
        ks: cut-offs to evaluate.
    Returns:
        list of ``(k, dcg)`` tuples in the order of ``ks``. ``dcg`` is
        ``1 / log2(1 + rank)`` when the 1-based rank of the requested document
        is at most ``k`` and zero otherwise. If the document was not retrieved
        at all, every entry is ``(k, 0)``.
    """
    # Duplicate ids in the result list are tolerated: only the first
    # occurrence of the requested document determines its rank.
    rank = next(
        (
            pos
            for pos, doc_id in enumerate(found_doc_idx, start=1)
            if doc_id == requested_doc_id
        ),
        None,
    )
    if rank is None:
        return [(k, 0) for k in ks]
    gain = 1 / np.log2(1 + rank)
    # Multiplying by the boolean zeroes the gain for cut-offs above the rank.
    return [(k, gain * (rank <= k)) for k in ks]

In [8]:
def test_retriever_pipeline(
    retriever,
    queries: list,
    hits_ks: list,
    dcg_ks: list,
    not_found_queries_threshold: int = 20,
    progress_bar: bool = True,
    query_preproc_fn = None
):
    """
    Args:
        retriever: LangChain retriever
        queries: search queries in following format: [{'doc_id': 123, 'queries': ['query1', 'query2', ...], ...}, ...]
        hits_ks: for wich Ks compute Hits@K in format of list: [1, 2, 3]
        dcg_ks: for which Ks compute DCG@K in format of list: [1, 2, 3]
        not_found_queries_threshold: return also queries that are not in top k documents (or None not to return them at all)
        progress_bar: whether to enable progress bar
        query_preproc_fn: function that preprocesses query
    Returns:
        dict: metrics in following format: {'hits_1': 0, 'hits_2': 0, ...}
    """
    metrics = dict()
    not_found_queries = []
    for k in hits_ks:
        metrics[f"hits_{k}"] = 0
    for k in dcg_ks:
        metrics[f"dcg_{k}"] = 0
    query_counter = 0
    t = time.time()
    for query in tqdm(queries, desc='Scoring queries', disable=not progress_bar):
        requested_doc_id = query['doc_id']
        for subquery in query['queries']:
            query_counter += 1
            subquery_text = subquery
            if query_preproc_fn:
                subquery_text = query_preproc_fn(subquery)
            found_doc_idx = [res.metadata['doc_id'] for res in retriever.invoke(subquery_text)]
            subquery_hits = hits_k(requested_doc_id, found_doc_idx, hits_ks)
            subquery_dcgs = dcg_k(requested_doc_id, found_doc_idx, dcg_ks)
            if not_found_queries_threshold and requested_doc_id not in found_doc_idx[:not_found_queries_threshold]:
                not_found_queries.append({'doc_id': requested_doc_id, 'doc_title': query['doc_title'], 'query': subquery})
            for k, score in subquery_hits:
                metrics[f"hits_{k}"] += score
            for k, score in subquery_dcgs:
                metrics[f"dcg_{k}"] += score
    metrics['time_per_query'] = time.time() - t
    for metric_name, metric_value in metrics.items():
        metrics[metric_name] = metric_value / query_counter
    metrics['not_found_queries'] = not_found_queries
    return metrics

In [9]:
def lemmatize_query(text):
    """Normalise a search query and lemmatize it with ``lemmatizer``.

    The query is lower-cased and tokenized, tokens are stripped of the
    characters in ``punct``, stop words and empty tokens are removed, and the
    space-joined remainder is passed to ``lemmatizer.lemmatize``; the pieces
    it returns are concatenated into the result string.
    """
    stripped = (token.strip(punct) for token in word_tokenize(text.lower()))
    kept = ' '.join(
        token for token in stripped if token not in stopwords and token != ''
    )
    return ''.join(lemmatizer.lemmatize(kept))
def lemmatize_query2(text):
    """Normalise a search query and lemmatize it word by word with ``morph``.

    The query is lower-cased and tokenized, tokens are stripped of the
    characters in ``punct``, and stop words and empty tokens are removed.
    Each remaining token is replaced by the ``normal_form`` of the first
    parse returned by ``morph.parse``; the lemmas are joined with spaces.
    """
    lemmas = []
    for raw_token in word_tokenize(text.lower()):
        token = raw_token.strip(punct)
        if token in stopwords or token == '':
            continue
        lemmas.append(morph.parse(token)[0].normal_form)
    return ' '.join(lemmas)
def stem_query(text):
    """Normalise a search query and stem every remaining word.

    The query is lower-cased and tokenized, tokens are stripped of the
    characters in ``punct``, stop words and empty tokens are removed, and
    each remaining token is passed through ``stemmer.stem``. The stems are
    joined with single spaces.
    """
    stems = []
    for raw_token in word_tokenize(text.lower()):
        token = raw_token.strip(punct)
        if token in stopwords or token == '':
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

## bm25

In [9]:
retriever = BM25Retriever.from_documents(
    [Document(page_content=doc['text'], metadata={'doc_id': doc['id'], 'title': doc['title']}) for doc in all_docs],
    k=20
)

In [22]:
results['bm25_clear'] = test_retriever_pipeline(
    retriever=retriever,
    queries=queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

In [23]:
del retriever

In [33]:
bm25_stemmed = BM25Retriever.from_documents(
    [Document(page_content=doc['text'], metadata={'doc_id': doc['id'], 'title': doc['title']}) for doc in docs_stemmed],
    k=20
)

In [52]:
results['bm25_stemmed'] = test_retriever_pipeline(
    retriever=bm25_stemmed,
    queries=queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=stem_query
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

In [54]:
del bm25_stemmed, docs_stemmed

In [59]:
bm25_lemmatized = BM25Retriever.from_documents(
    [Document(page_content=doc['text'], metadata={'doc_id': doc['id'], 'title': doc['title']}) for doc in docs_lemmatized],
    k=20
)

In [72]:
results['bm25_lemmatized'] = test_retriever_pipeline(
    retriever=bm25_lemmatized,
    queries=queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=lemmatize_query
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

In [78]:
pretty_print_results(results, 5)

Bm25 clear:
HITS@1: 0.395
HITS@5: 0.655
HITS@10: 0.72
DCG@5: 0.52954
DCG@10: 0.55046
DCG@20: 0.57054
Average time per query: 0.01036s
Number of not found queries: 40

Bm25 stemmed:
HITS@1: 0.56
HITS@5: 0.83
HITS@10: 0.895
DCG@5: 0.70912
DCG@10: 0.73019
DCG@20: 0.74035
Average time per query: 0.00997s
Number of not found queries: 13

Bm25 lemmatized:
HITS@1: 0.57
HITS@5: 0.82
HITS@10: 0.9
DCG@5: 0.70672
DCG@10: 0.73225
DCG@20: 0.74098
Average time per query: 1.95256s
Number of not found queries: 13



## tf-idf

In [10]:
with open('dataset/lemmatized_docs.pickle', 'rb') as f:
    docs_lemmatized = pickle.load(f)

In [11]:
tfidf_lemmatized = TFIDFRetriever.from_documents(
    [Document(page_content=doc['text'], metadata={'doc_id': doc['id'], 'title': doc['title']}) for doc in docs_lemmatized],
    k=20
)

In [22]:
results['tfidf_lemmatized'] = test_retriever_pipeline(
    retriever=tfidf_lemmatized,
    queries=queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=lemmatize_query2
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

In [24]:
del docs_lemmatized, tfidf_lemmatized

In [26]:
tfidf_clear = TFIDFRetriever.from_documents(
    [Document(page_content=doc['text'], metadata={'doc_id': doc['id'], 'title': doc['title']}) for doc in all_docs],
    k=20
)

In [27]:
results['tfidf_clear'] = test_retriever_pipeline(
    retriever=tfidf_clear,
    queries=queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=None
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

In [29]:
del tfidf_clear

In [31]:
with open('dataset/stemmed_docs.pickle', 'rb') as f:
    docs_stemmed = pickle.load(f)

In [32]:
tfidf_stemmed = TFIDFRetriever.from_documents(
    [Document(page_content=doc['text'], metadata={'doc_id': doc['id'], 'title': doc['title']}) for doc in docs_stemmed],
    k=20
)

In [33]:
results['tfidf_stemmed'] = test_retriever_pipeline(
    retriever=tfidf_stemmed,
    queries=queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=stem_query
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

In [34]:
del docs_stemmed, tfidf_stemmed

## embeddings

In [10]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from langchain.retrievers import EnsembleRetriever

In [3]:
with open('dataset/splitted_texts_query.pickle', 'rb') as f:
    embedded_chunks = pickle.load(f)

In [11]:
def rank_documents_from_chunks(ranked_chunks):
    """Turn scored chunks into a ranked list of document ids.

    Args:
        ranked_chunks: iterable of ``(doc_id, score)`` pairs, one per chunk.
    Returns:
        list of distinct doc ids ordered by the mean score of their chunks,
        smallest mean first. Documents with equal means keep the order in
        which they first appeared in ``ranked_chunks``.
    """
    totals = {}
    counts = {}
    for doc_id, score in ranked_chunks:
        if doc_id in totals:
            totals[doc_id] = totals[doc_id] + score
            counts[doc_id] += 1
        else:
            totals[doc_id] = score
            counts[doc_id] = 1
    mean_scores = {doc_id: totals[doc_id] / counts[doc_id] for doc_id in totals}
    # Ascending order: the lowest mean score ranks first.
    return sorted(mean_scores, key=mean_scores.__getitem__)

In [12]:
def test_vectorstore_pipeline(
    vectorstore,
    queries: list,
    hits_ks: list,
    dcg_ks: list,
    not_found_queries_threshold: int = 20,
    progress_bar: bool = True,
    query_preproc_fn = None,
    doc_id_str = "doc_id",
    chunk_search=False
):
    """
    Args:
        vectorstore: LangChain vectorstore
        queries: search queries in following format: [{'doc_id': 123, 'queries': ['query1', 'query2', ...], ...}, ...]
        hits_ks: for wich Ks compute Hits@K in format of list: [1, 2, 3]
        dcg_ks: for which Ks compute DCG@K in format of list: [1, 2, 3]
        not_found_queries_threshold: return also queries that are not in top k documents (or None not to return them at all)
        progress_bar: whether to enable progress bar
        query_preproc_fn: function that preprocesses query
    Returns:
        dict: metrics in following format: {'hits_1': 0, 'hits_2': 0, ...}
    """
    metrics = dict()
    not_found_queries = []
    for k in hits_ks:
        metrics[f"hits_{k}"] = 0
    for k in dcg_ks:
        metrics[f"dcg_{k}"] = 0
    query_counter = 0
    t = time.time()
    for query in tqdm(queries, desc='Scoring queries', disable=not progress_bar):
        requested_doc_id = query['doc_id']
        for subquery in query['queries']:
            query_counter += 1
            subquery_text = subquery
            if query_preproc_fn:
                subquery_text = query_preproc_fn(subquery)
            if not chunk_search:
                found_doc_idx = [res[0].metadata[doc_id_str] for res in vectorstore.similarity_search_with_relevance_scores(subquery_text, k=20)]
            else:
                ranked_chunks = [
                    (res[0].metadata[doc_id_str], res[1]) 
                    for res in vectorstore.similarity_search_with_score(subquery_text, k=1000)
                ]
                found_doc_idx = rank_documents_from_chunks(ranked_chunks)
            subquery_hits = hits_k(requested_doc_id, found_doc_idx, hits_ks)
            subquery_dcgs = dcg_k(requested_doc_id, found_doc_idx, dcg_ks)
            if not_found_queries_threshold and requested_doc_id not in found_doc_idx[:not_found_queries_threshold]:
                not_found_queries.append({'doc_id': requested_doc_id, 'doc_title': query['doc_title'], 'query': subquery})
            for k, score in subquery_hits:
                metrics[f"hits_{k}"] += score
            for k, score in subquery_dcgs:
                metrics[f"dcg_{k}"] += score
    metrics['time_per_query'] = time.time() - t
    for metric_name, metric_value in metrics.items():
        metrics[metric_name] = metric_value / query_counter
    metrics['not_found_queries'] = not_found_queries
    return metrics

In [13]:
model_name = "intfloat/multilingual-e5-small" # large
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [14]:
# Embed every document title, one call per document, and store the vector on
# the document dict itself under 'embedding_title' (all_docs is modified in
# place). Titles get the same "query: " prefix that the evaluation cells put
# in front of the search queries.
for doc in tqdm(all_docs):
    text = f"query: {doc['title']}"
    doc['embedding_title'] = deepcopy(hf.embed_query(text))

  0%|          | 0/3519 [00:00<?, ?it/s]

In [106]:
with open('dataset/all_docs_with_title_embeddings.pickle', 'wb') as f:
    pickle.dump(all_docs, f)

In [35]:
faiss_vectorstore = FAISS.from_embeddings(
    [(chunk.page_content, chunk.metadata['embedding']) for chunk in embedded_chunks],
    hf,
    metadatas=[
        {
            'doc_id': chunk.metadata['doc_id'],
            'title': chunk.metadata['title']
        }
        for chunk in embedded_chunks
    ]
)

In [15]:
faiss_vectorstore_titles = FAISS.from_embeddings(
    [(doc['title'], doc['embedding_title']) for doc in all_docs],
    hf,
    metadatas=[
        {
            'doc_id': doc['id']
        }
        for doc in all_docs
    ]
)

In [88]:
test_vectorstore_pipeline( # e5-large chunk search (doc_score = mean embedding score)
    faiss_vectorstore,
    queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=lambda query: f"query: {query}",
    chunk_search=True
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

{'hits_1': 0.565,
 'hits_5': 0.875,
 'hits_10': 0.94,
 'dcg_5': 0.7386086454133057,
 'dcg_10': 0.7594829837499144,
 'dcg_20': 0.7660488660743507,
 'time_per_query': 0.04973018288612366,
 'not_found_queries': [{'doc_id': 2211,
   'doc_title': 'Тара транспортная наполненная. Метод испытания в водяных брызгах',
   'query': 'защита товара от дождя гост'},
  {'doc_id': 1369,
   'doc_title': 'Транзисторы биполярные. Метод измерения постоянной времени цепи обратной связи на высокой частоте',
   'query': 'методы тестирования транзисторов с точностью'},
  {'doc_id': 1918,
   'doc_title': 'Болты клеммные для рельсовых скреплений железнодорожного пути. Технические условия',
   'query': 'болты клеммные для рельсовых скреплений железнодорожного пути технические условия'},
  {'doc_id': 817,
   'doc_title': 'Цепи тяговые вильчатые. Технические условия',
   'query': 'технические условия цепей'},
  {'doc_id': 1050,
   'doc_title': 'Котлы отопительные водогрейные теплопроизводительностью до 100 кВт. Общ

In [16]:
test_vectorstore_pipeline( # e5-small
    faiss_vectorstore_titles,
    queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=lambda query: f"query: {query}"
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

{'hits_1': 0.765,
 'hits_5': 0.935,
 'hits_10': 0.955,
 'dcg_5': 0.8585697951412524,
 'dcg_10': 0.8649134284312257,
 'dcg_20': 0.8701623767662581,
 'time_per_query': 0.017327221632003783,
 'not_found_queries': [{'doc_id': 1258,
   'doc_title': 'Аппараты пускорегулирующие для разрядных ламп. Общие технические требования',
   'query': 'тест-таблицы для факсимильных аппаратов'},
  {'doc_id': 2211,
   'doc_title': 'Тара транспортная наполненная. Метод испытания в водяных брызгах',
   'query': 'защита товара от дождя гост'},
  {'doc_id': 1369,
   'doc_title': 'Транзисторы биполярные. Метод измерения постоянной времени цепи обратной связи на высокой частоте',
   'query': 'методы тестирования транзисторов с точностью'},
  {'doc_id': 2263,
   'doc_title': 'Материалы ворсовые. Метод определения несминаемости ворса',
   'query': 'материалы искусственные мехов проверка'},
  {'doc_id': 2253,
   'doc_title': 'Волокно и жгут химические. Методы определения разрывной нагрузки и удлинение при разрыве',

In [18]:
test_vectorstore_pipeline( # e5-large
    faiss_vectorstore_titles,
    queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=lambda query: f"query: {query}"
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

{'hits_1': 0.74,
 'hits_5': 0.94,
 'hits_10': 0.97,
 'dcg_5': 0.8478792682167118,
 'dcg_10': 0.8576362683546332,
 'dcg_20': 0.8616240208821239,
 'time_per_query': 0.032037713527679444,
 'not_found_queries': [{'doc_id': 42,
   'doc_title': 'Огнеупоры. Буквенные обозначения величин, применяемых при испытаниях',
   'query': 'исо стандарты огнеупоры'},
  {'doc_id': 1258,
   'doc_title': 'Аппараты пускорегулирующие для разрядных ламп. Общие технические требования',
   'query': 'тест-таблицы для факсимильных аппаратов'},
  {'doc_id': 2211,
   'doc_title': 'Тара транспортная наполненная. Метод испытания в водяных брызгах',
   'query': 'защита товара от дождя гост'}]}

In [59]:
test_vectorstore_pipeline( # e5-large only chunks
    faiss_vectorstore,
    queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=lambda query: f"query: {query}"
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

{'hits_1': 0.71,
 'hits_5': 0.935,
 'hits_10': 0.985,
 'dcg_5': 0.8317361854103221,
 'dcg_10': 0.848172067535831,
 'dcg_20': 0.8493953202464221,
 'time_per_query': 0.07090999364852905,
 'not_found_queries': [{'doc_id': 2211,
   'doc_title': 'Тара транспортная наполненная. Метод испытания в водяных брызгах',
   'query': 'защита товара от дождя гост'},
  {'doc_id': 1523,
   'doc_title': 'Совместимость технических средств электромагнитная. Радиопомехи индустриальные от устройств с двигателями внутреннего сгорания. Нормы и методы испытаний',
   'query': 'методы измерения радиопомех от двс'}]}

In [26]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

# Connect to a Qdrant server on localhost:6333.
client = QdrantClient(host="localhost", port=6333)

# Create the collection that will hold one title vector per document,
# compared by cosine distance.
# NOTE(review): the vector size is hard-coded to 1024; it has to equal the
# length of doc['embedding_title'] that is upserted into this collection --
# confirm against the embedding model that is actually loaded.
collection_name = "gosts"
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=1024,
        distance=models.Distance.COSINE
    )
)

True

In [28]:
# Upload the documents to Qdrant, one upsert request per document.
# Point id = doc['id'], vector = the title embedding; the payload keeps the
# full text under 'text' and the title under 'metadata', which are the keys
# the Qdrant vector store wrapper is configured to read.
for doc in tqdm(all_docs):
    client.upsert(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=doc['id'],
                vector=doc['embedding_title'],
                payload={
                    "text": doc['text'],
                    'metadata': {
                        "title": doc['title']
                    }
                    
                }
            )
        ]
    )

  0%|          | 0/3519 [00:00<?, ?it/s]

In [30]:
qdrant_vectorstore = Qdrant(client, collection_name, hf, content_payload_key='text', metadata_payload_key='metadata')

In [34]:
test_vectorstore_pipeline(
    qdrant_vectorstore,
    queries,
    hits_ks=[1, 5, 10],
    dcg_ks=[5, 10, 20],
    not_found_queries_threshold=20,
    progress_bar=True,
    query_preproc_fn=lambda query: f"query: {query}",
    doc_id_str='_id'
)

Scoring queries:   0%|          | 0/100 [00:00<?, ?it/s]

{'hits_1': 0.725,
 'hits_5': 0.94,
 'hits_10': 0.97,
 'dcg_5': 0.8423432145202834,
 'dcg_10': 0.8521002146582051,
 'dcg_20': 0.8560879671856957,
 'time_per_query': 0.08718831777572632,
 'not_found_queries': [{'doc_id': 42,
   'doc_title': 'Огнеупоры. Буквенные обозначения величин, применяемых при испытаниях',
   'query': 'исо стандарты огнеупоры'},
  {'doc_id': 1258,
   'doc_title': 'Аппараты пускорегулирующие для разрядных ламп. Общие технические требования',
   'query': 'тест-таблицы для факсимильных аппаратов'},
  {'doc_id': 2211,
   'doc_title': 'Тара транспортная наполненная. Метод испытания в водяных брызгах',
   'query': 'защита товара от дождя гост'}]}

In [8]:
pretty_print_results(results, 3)

Bm25 clear:
HITS@1: 0.395
HITS@5: 0.655
HITS@10: 0.72
DCG@5: 0.53
DCG@10: 0.55
DCG@20: 0.571
Average time per query: 10.4ms
Number of not found queries: 40

Bm25 stemmed:
HITS@1: 0.56
HITS@5: 0.83
HITS@10: 0.895
DCG@5: 0.709
DCG@10: 0.73
DCG@20: 0.74
Average time per query: 10.0ms
Number of not found queries: 13

Bm25 lemmatized:
HITS@1: 0.57
HITS@5: 0.82
HITS@10: 0.9
DCG@5: 0.707
DCG@10: 0.732
DCG@20: 0.741
Average time per query: 1952.6ms
Number of not found queries: 13

Tfidf lemmatized:
HITS@1: 0.42
HITS@5: 0.775
HITS@10: 0.875
DCG@5: 0.616
DCG@10: 0.647
DCG@20: 0.66
Average time per query: 12756.1ms
Number of not found queries: 15

Tfidf clear:
HITS@1: 0.41
HITS@5: 0.74
HITS@10: 0.83
DCG@5: 0.586
DCG@10: 0.615
DCG@20: 0.626
Average time per query: 7514.4ms
Number of not found queries: 26

Tfidf stemmed:
HITS@1: 0.445
HITS@5: 0.775
HITS@10: 0.9
DCG@5: 0.63
DCG@10: 0.67
DCG@20: 0.681
Average time per query: 7095.6ms
Number of not found queries: 11



## other

In [36]:
with open('dataset/results.pickle', 'wb') as f:
    pickle.dump(results, f)

In [89]:
k = 0
queries[k]

{'doc_id': 2414,
 'filename': 'gost_30422-96.pdf',
 'doc_gost_number': 'ГОСТ 30422-96',
 'doc_title': 'Табак и табачные изделия. Сигареты. Определение скорости свободного горения',
 'queries': ['скорость сигаретного горения', 'определение горения сигареты']}

In [91]:
result = retriever.invoke(queries[k]['queries'][1])
[res.metadata for res in result]

[{'doc_id': 2414,
  'title': 'Табак и табачные изделия. Сигареты. Определение скорости свободного горения'},
 {'doc_id': 2410, 'title': 'Сигареты. Отбор проб'},
 {'doc_id': 2415, 'title': 'Сигареты. Определение степени осыпаемости'},
 {'doc_id': 2411, 'title': 'Сигареты. Отбор проб'},
 {'doc_id': 2413,
  'title': 'Сигареты и фильтры. Определение номинального диаметра. Пневматический метод'},
 {'doc_id': 1175,
  'title': 'Материалы электроизоляционные твердые. Метод определения стойкости к действию электрической дуги постоянного тока низкого напряжения'},
 {'doc_id': 478,
  'title': 'Система стандартов безопасности труда. Пожаровзрывоопасность веществ и материалов. Номенклатура показателей и методы их определения'},
 {'doc_id': 1174,
  'title': 'Материалы электроизоляционные твердые. Метод определения стойкости к действию электрической дуги малого тока высокого напряжения'},
 {'doc_id': 476,
  'title': 'Система стандартов безопасности труда. Пожаровзрывоопасность веществ и материалов. Н