In [2]:
import os
from pypdf import PdfReader
from pathlib import Path
from tqdm.notebook import tqdm
import re
from razdel import sentenize
import numpy as np
import time
from joblib import Parallel, delayed
from copy import deepcopy
import pickle

## old

In [25]:
# Load the flattened GOST metadata records (one dict per document; see the sample
# record displayed in the next cell).
# NOTE(review): pickle.load executes arbitrary code from the file - only use this
# with a pickle produced by this project.
with open('files_cropped/docs_flattened.pickle', 'rb') as f:
    all_docs = pickle.load(f)

In [26]:
all_docs[0]

{'number': 'ГОСТ ISO/TR 10993-33-2018',
 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 33. Руководство по испытаниям на генотоксичность. Дополнение к ISO 10993-3',
 'status': 'active',
 'date_start': '06.01.2019',
 'date_cancel': '',
 'replaced_by': '',
 'OKS': '01.020',
 'file_path': '',
 'file_url': 'https://allgosts.ru/01/020/gost_iso!tr_10993-33-2018',
 'OKS_main': '01 ОБЩИЕ ПОЛОЖЕНИЯ. ТЕРМИНОЛОГИЯ. СТАНДАРТИЗАЦИЯ. ДОКУМЕНТАЦИЯ',
 'OKS_section': '01.020 Терминология (принципы и координация)',
 'id': 0}

In [29]:
def get_text(file):
    """Build the text record for one GOST metadata entry.

    When the entry has no ``file_path`` the title is used as the text.
    Otherwise the PDF under ``files_cropped/`` is read, the first and the last
    page are skipped, and the remaining page texts are joined and cleaned of
    soft hyphens, line breaks and (partially) repeated spaces.  If reading or
    extraction fails, the title is used as a fallback.

    Args:
        file: Mapping with at least the keys ``number``, ``title`` and
            ``file_path``.

    Returns:
        dict with the keys ``gost_number``, ``title``, ``filename`` and ``text``.
    """
    title = file['title']
    text = title
    if file['file_path']:
        try:
            reader = PdfReader('files_cropped/' + file['file_path'])
            # Every page text is prefixed with a space, exactly as before.
            raw = ''.join(' ' + page.extract_text() for page in reader.pages[1:-1])
            text = raw \
                    .replace('\xad\n', '') \
                    .replace('\n\xad', '') \
                    .replace('\n\n', ' ') \
                    .replace('\n', ' ') \
                    .replace('   ', ' ') \
                    .replace('  ', ' ')
        except Exception:
            # Best-effort fallback for unreadable PDFs.  `Exception` (not a bare
            # `except`) so that KeyboardInterrupt/SystemExit still stop the run.
            text = title
    return {
        'gost_number': file['number'],
        'title': title,
        'filename': file['file_path'],
        'text': text
    }

In [30]:
# Extract text for every metadata record in parallel; n_jobs=-1 asks joblib to
# use all available CPU cores.  The result keeps the order of all_docs.
# NOTE(review): the __main__ guard is presumably a leftover from a script version -
# inside a notebook kernel it should always be true; confirm before relying on it.
if __name__ == '__main__':
    with Parallel(n_jobs=-1) as parallel:
        docs_w_text = parallel(delayed(get_text)(doc) for doc in tqdm(all_docs))

  0%|          | 0/3519 [00:00<?, ?it/s]

In [61]:
with open('files_cropped/doc_with_text_for_similarity.pickle', 'wb') as f:
    pickle.dump(docs_w_text, f)

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pymorphy2
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Characters removed from both ends of every token via str.strip().  This is a
# character set, not a regex: the backslashes are literal members of the set.
# The raw-string prefix keeps the value unchanged while avoiding Python's
# invalid-escape-sequence warning for `\+`, `\.`, `\[`, ...
punct = r'!"#$%&()*\+,-\./:;<=>?@\[\]^_`{|}~„“«»†*\—/\-‘’'
stemmer = SnowballStemmer('russian')
# Rebinds the imported `stopwords` corpus name to the Russian stop-word
# collection.  A set makes the per-token `w not in stopwords` checks O(1).
stopwords = set(stopwords.words('russian'))
morph = pymorphy2.MorphAnalyzer()

In [3]:
def stem_doc(doc):
    """Return a deep copy of ``doc`` with a stemmed ``text_normalized`` field.

    The ``text`` field is lower-cased and tokenized; every token is stripped of
    the characters in ``punct``; empty tokens and stop words are dropped; the
    remaining tokens are stemmed with ``stemmer`` and joined with single spaces.
    The input dict itself is not modified.
    """
    result = deepcopy(doc)
    stems = []
    for raw_token in word_tokenize(result['text'].lower()):
        token = raw_token.strip(punct)
        if token == '' or token in stopwords:
            continue
        stems.append(stemmer.stem(token))
    result['text_normalized'] = ' '.join(stems)
    return result

def lemmatize_doc(doc):
    """Return a deep copy of ``doc`` with a lemmatized ``text_normalized`` field.

    Same token clean-up as ``stem_doc`` (lower-case, tokenize, strip ``punct``,
    drop empty tokens and stop words), but each kept token is replaced by the
    ``normal_form`` of its first ``morph.parse`` result.
    """
    result = deepcopy(doc)
    stripped = (token.strip(punct) for token in word_tokenize(result['text'].lower()))
    kept = [token for token in stripped if token != '' and token not in stopwords]
    result['text_normalized'] = ' '.join(morph.parse(token)[0].normal_form for token in kept)
    return result

In [5]:
def get_tagged_doc(doc, index):
    """Wrap one document as a gensim ``TaggedDocument``.

    The words are the stemmed tokens of ``doc['text']`` (via ``stem_text``) and
    the single tag is ``index`` converted to a string.
    """
    tokens = stem_text(doc['text'])
    tag = str(index)
    return TaggedDocument(words=tokens, tags=[tag])

In [74]:
if __name__ == '__main__':
    with Parallel(n_jobs=-1) as parallel:
        tagged_data = parallel(delayed(get_tagged_doc)(doc, i) for i, doc in enumerate(tqdm(docs_w_text)))

  0%|          | 0/3519 [00:00<?, ?it/s]

In [77]:
vector_size = 40
window = 3
min_count = 20
epochs = 100

In [79]:
# Train a Doc2Vec model on the tagged (stemmed) documents using the
# hyper-parameters set in the previous cell.
# NOTE(review): gensim's documentation says a fully reproducible run needs a
# single worker thread in addition to `seed`; with workers=8 the seed alone
# presumably does not make training deterministic - confirm if that matters.
model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs, seed=11, workers=8)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [99]:
model.save('doc2vec_stemmed.pickle')

## new

In [3]:
with open('files_cropped/doc_with_text_for_similarity.pickle', 'rb') as f:
    docs_w_text = pickle.load(f)

In [4]:
def stem_text(text):
    """Turn raw text into a list of stemmed tokens.

    The text is lower-cased and tokenized, each token is stripped of the
    characters in ``punct``, empty tokens and stop words are discarded, and the
    rest are stemmed with ``stemmer``.
    """
    stems = []
    for raw_token in word_tokenize(text.lower()):
        token = raw_token.strip(punct)
        if token and token not in stopwords:
            stems.append(stemmer.stem(token))
    return stems

def lemmatize_text(text):
    """Turn raw text into a list of lemmas.

    Same clean-up as ``stem_text``, but every kept token is replaced by the
    ``normal_form`` of its first ``morph.parse`` result instead of being stemmed.
    """
    lemmas = []
    for raw_token in word_tokenize(text.lower()):
        token = raw_token.strip(punct)
        if token in stopwords or token == '':
            continue
        lemmas.append(morph.parse(token)[0].normal_form)
    return lemmas

In [5]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from uuid import UUID

client = QdrantClient(host="localhost", port=6333)
collection_name = "gosts_titles2"

In [6]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pymorphy2
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Characters removed from both ends of every token via str.strip().  This is a
# character set, not a regex: the backslashes are literal members of the set.
# The raw-string prefix keeps the value unchanged while avoiding Python's
# invalid-escape-sequence warning for `\+`, `\.`, `\[`, ...
punct = r'!"#$%&()*\+,-\./:;<=>?@\[\]^_`{|}~„“«»†*\—/\-‘’'
stemmer = SnowballStemmer('russian')
# Rebinds the imported `stopwords` corpus name to the Russian stop-word
# collection.  A set makes the per-token `w not in stopwords` checks O(1).
stopwords = set(stopwords.words('russian'))
morph = pymorphy2.MorphAnalyzer()

In [7]:
model_doc2vec = Doc2Vec.load('doc2vec_stemmed.pickle')

In [8]:
def pp_list(l):
    """Print a numbered (1-based) list of scored entries, one per line.

    Two-element entries are printed as ``<n>. <a> <score>``; any other entry is
    treated as a triple and printed as ``<n>. <a>. <b> <score>``.  The score is
    formatted with two decimals.
    """
    for position, entry in enumerate(l, start=1):
        if len(entry) != 2:
            line = f"{position}. {entry[0]}. {entry[1]} {entry[2]:.2f}"
        else:
            line = f"{position}. {entry[0]} {entry[1]:.2f}"
        print(line)

In [28]:
collection_name = 'gosts_titles2'
collection_name2 = 'gosts_longformer'
collection_name3 = "gosts_mean_chunks"

In [29]:
def _qdrant_neighbours(collection, gost_number, topn):
    """Return the nearest neighbours of a stored document in one Qdrant collection.

    The document's own vector is fetched by filtering on the ``gost_number``
    payload field and is then used as the search query.

    Args:
        collection: Name of the Qdrant collection to query.
        gost_number: ``gost_number`` payload value of the query document.
        topn: Maximum number of neighbours to return.

    Returns:
        List of ``(title, score)`` tuples in the order Qdrant returned them,
        without the query document itself.

    Raises:
        IndexError: If the collection holds no point with this ``gost_number``.
    """
    matches, _ = client.scroll(
        collection,
        scroll_filter=models.Filter(
            must=[
                models.FieldCondition(key='gost_number', match=models.MatchValue(value=gost_number))
            ]
        ),
        with_vectors=True
    )
    if not matches:
        raise IndexError(f"no point with gost_number {gost_number!r} in collection {collection!r}")

    # Request one extra hit because the query document is normally among the
    # results; it is removed by identity rather than by assuming it is ranked first.
    hits = client.search(
        collection,
        np.array(matches[0].vector),
        limit=topn + 1
    )
    neighbours = [
        (hit.payload['title'], hit.score)
        for hit in hits
        if hit.payload.get('gost_number') != gost_number
    ]
    return neighbours[:topn]


def get_similar(doc, topn=10):
    """Print the documents most similar to ``doc`` according to four models.

    Doc2vec neighbours come from inferring a vector for the stemmed text of
    ``doc``; the other three lists come from the Qdrant collections named by the
    notebook globals ``collection_name`` (printed as "E5"), ``collection_name2``
    ("Longformer") and ``collection_name3`` ("Chunks mean").  Those globals are
    read when the function is called, so they must point at the intended
    collections at that moment.

    Args:
        doc: Entry of ``docs_w_text`` (needs ``title``, ``text``, ``gost_number``).
        topn: Number of neighbours to print per model (default 10, as before).

    Raises:
        IndexError: If ``doc`` is missing from one of the Qdrant collections.
    """
    print(f"Document with title {doc['title']}\n")

    tokens = stem_text(doc['text'])
    inferred_vector = model_doc2vec.infer_vector(tokens)
    candidates = model_doc2vec.dv.most_similar([inferred_vector], topn=topn + 1)
    # The inferred vector is not guaranteed to rank the query document first,
    # so drop it by gost_number instead of blindly discarding the top hit.
    similar_docs = []
    for doc_id, score in candidates:
        candidate = docs_w_text[int(doc_id)]
        if candidate['gost_number'] != doc['gost_number']:
            similar_docs.append((candidate['title'], score))
    print('Doc2vec:')
    pp_list(similar_docs[:topn])

    for label, collection in (
        ('E5', collection_name),
        ('Longformer', collection_name2),
        ('Chunks mean', collection_name3),
    ):
        print(f'\n{label}:')
        pp_list(_qdrant_neighbours(collection, doc['gost_number'], topn))

In [30]:
# interesting docs: 2500, 2623

In [38]:
get_similar(docs_w_text[1101])

Document with title Генераторы нейтронов. Типы и параметры

Doc2vec:
1. Генераторы нейтронов. Метод измерения потока быстрых нейтронов 0.75
2. Глина бентонитовая для тонкой и строительной керамики. Методы определения показателя адсорбции и емкости катионного обмена 0.68
3. Индикаторы знакосинтезирующие. Методы измерения яркости, силы света, неравномерности яркости и неравномерности силы света 0.68
4. Детекторы ионизирующих излучений сцинтилляционные. Общие положения по методам измерений сцинтилляционных параметров 0.67
5. Индикаторы знакосинтезирующие. Методы измерения времени готовности 0.67
6. Источники света искусственные. Метод определения плотности потока энергии ультрафиолетового излучения 0.65
7. Приборы электронно-лучевые приемные. Метод измерения неравномерности яркости свечения экрана 0.64
8. Реактивы. Свинец (II) уксуснокислый 3-водный. Технические условия 0.64
9. Фотоумножители. Методы измерения спектральной анодной чувствительности 0.64
10. Концентрат вольфрамовый. Метод о

In [39]:
from sklearn.metrics import ndcg_score

In [47]:
# Scratch check of sklearn's ndcg_score on hand-typed two-item examples
# (first argument: relevance grades, second: predicted scores).
# NOTE(review): both calls are labelled 'doc2vec:' - the second label was
# presumably meant for a different model; confirm and rename.
# The first score is printed explicitly; the second is shown as the cell's value.
print('doc2vec:')
print(
    ndcg_score(
    [[2, 3]],
    [[0.0, 1.0]],
    k=5
)
)
print('doc2vec:')
ndcg_score(
    [[1, 2]],
    [[0.0, 1.0]],
    k=10
)

doc2vec:
1.0
doc2vec:


1.0

In [11]:
with open('files_cropped/longformer_embeddings.pickle', 'rb') as f:
    lf_embs = pickle.load(f)

In [19]:
collection_name2 = 'gosts_longformer'

In [26]:
client.create_collection(
    collection_name=collection_name2,
    vectors_config=models.VectorParams(
        size=312,
        distance=models.Distance.COSINE
    )
)

True

In [27]:
# Upload one point per document to the Longformer collection.
# ids, payload and vectors are parallel sequences: docs_w_text supplies ids and
# payload, lf_embs supplies vectors.
# NOTE(review): this assumes lf_embs is in the same order and of the same length
# as docs_w_text - confirm.
client.upload_collection(
        collection_name2,
        # Point ids are 1-based, i.e. id = position in docs_w_text + 1.
        ids=list(range(1, len(docs_w_text)+1)),
        payload=[
            {
                'gost_number': doc['gost_number'],
                'title': doc['title']
            }
            for doc in docs_w_text
        ],
        # Each embedding is converted to a float64 numpy array before upload.
        vectors=[np.array(vec, dtype=np.float64) for vec in lf_embs],
        parallel=1,
        max_retries=3
)

In [32]:
with open('dataset/splitted_texts_query.pickle', 'rb') as f:
    chunked_texts = pickle.load(f)

In [42]:
# Sanity check: the last chunk must belong to the last document.
assert chunked_texts[-1].metadata['doc_id'] == len(docs_w_text) - 1

# Group the chunk embeddings by document in a single pass.  The previous version
# rescanned every chunk for every document (O(docs * chunks)).  Chunk order
# within a document is preserved, so the means are unchanged.
embeddings_by_doc = {}
for chunk in chunked_texts:
    embeddings_by_doc.setdefault(chunk.metadata['doc_id'], []).append(chunk.metadata['embedding'])

# Store the mean chunk embedding on each document record (mutates docs_w_text in
# place).  np.mean already returns a fresh array, so no extra copy is needed.
# A document without chunks still yields NaN, as before.
for i, doc in enumerate(tqdm(docs_w_text)):
    doc['chunk_embedding_mean'] = np.mean(embeddings_by_doc.get(i, []), axis=0)

  0%|          | 0/3519 [00:00<?, ?it/s]

In [44]:

client.create_collection(
    collection_name=collection_name3,
    vectors_config=models.VectorParams(
        size=1024,
        distance=models.Distance.COSINE
    )
)

True

In [45]:
client.upload_collection(
        collection_name3,
        ids=list(range(1, len(docs_w_text)+1)),
        payload=[
            {
                'gost_number': doc['gost_number'],
                'title': doc['title']
            }
            for doc in docs_w_text
        ],
        vectors=[doc['chunk_embedding_mean'] for doc in docs_w_text],
        parallel=1,
        max_retries=3
)

In [65]:
[i for i, doc in enumerate(docs_w_text) if 'Язык программирования' in doc['title']]

[1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618]

# Re-uploading the gosts_chunks_mean Qdrant points with ids derived from MongoDB `_id`s

In [1]:
from pymongo import MongoClient

def get_database():
    """Connect to the local MongoDB server and return the ``GOSTS`` database.

    Every call creates a new ``MongoClient`` for ``mongodb://localhost:27017``.
    """
    connection_string = "mongodb://localhost:27017"
    mongo = MongoClient(connection_string)
    return mongo['GOSTS']

In [2]:
db = get_database()

In [3]:
doc_collection = db.documents

In [1]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from uuid import UUID

client = QdrantClient(host="localhost", port=6333)
collection_name = 'gosts_chunks_mean'

In [5]:
# Number of documents currently stored in MongoDB.  count_documents lets the
# server do the counting instead of pulling every document through a cursor.
s = doc_collection.count_documents({})
s

3518

## Restoring the lost document

In [7]:
import pickle
with open('dataset/docs_before_db.pickle', 'rb') as f:
    docs_before_db = pickle.load(f)

In [19]:
# Find the first pre-import document that is missing from MongoDB and stop there,
# raising ValueError carrying its position in docs_before_db.
for i, doc in enumerate(tqdm(docs_before_db)):
    # find_one stops at the first match, so no full cursor is materialised
    # just to test for existence.
    if doc_collection.find_one({'gost_number': doc['gost_number']}) is None:
        raise ValueError(i)

  0%|          | 0/3519 [00:00<?, ?it/s]

ValueError: 3518

In [20]:
doc_collection.insert_one(docs_before_db[-1])

InsertOneResult(ObjectId('663fa9788275d659b2aa3caf'), acknowledged=True)

In [22]:
del docs_before_db

## rest

In [2]:
def get_id(id: str):
    """Convert an id into a canonical UUID string by appending eight zeros.

    ``id`` is converted to its string form, eight ``'0'`` characters are appended
    and the result is parsed as a UUID.  A 24-character hex id (for example the
    string form of a MongoDB ObjectId) therefore becomes a 32-digit UUID.
    """
    padded_hex = f"{id}{'0' * 8}"
    return str(UUID(padded_hex))

In [29]:
docs = []
for doc in tqdm(doc_collection.find(), total=3519):
    doc_id = get_id(doc['_id'])
    res = client.scroll(
        collection_name,
        scroll_filter=models.Filter(
            must=[
                models.FieldCondition(key="gost_number", match=models.MatchValue(value=doc['gost_number'])),
            ]
        ),
        with_vectors=True,
        with_payload=True
    )[0]
    assert len(res) == 1
    docs.append({
        'id': doc_id,
        'gost_number': doc['gost_number'],
        'title': doc['title'],
        'vector': deepcopy(res[0].vector)
    })

  0%|          | 0/3519 [00:00<?, ?it/s]

In [30]:
collection_name2 = 'gosts_mean_chunks'

In [31]:
client.create_collection(
    collection_name=collection_name2,
    vectors_config=models.VectorParams(
        size=1024,
        distance=models.Distance.COSINE
    )
)

True

In [32]:
client.upload_collection(
        collection_name2,
        ids=[doc['id'] for doc in docs],
        payload=[
            {
                'gost_number': doc['gost_number'],
                'title': doc['title']
            }
            for doc in docs
        ],
        vectors=[doc['vector'] for doc in docs],
        parallel=1,
        max_retries=3
)

In [11]:
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np
import torch
import torch.nn.functional as F

In [4]:
tokenizer = AutoTokenizer.from_pretrained('models/multilingual-e5-large/tokenizer')
session = ort.InferenceSession('models/multilingual-e5-large/model/multilingual-e5-large.onnx')


In [7]:
def get_embeddings(texts: list) -> torch.Tensor:
    """Embed ``texts`` with the ONNX session and mean-pool the token states.

    The texts are tokenized with the notebook-level ``tokenizer``, run through
    ``session`` and pooled with ``average_pool(..., normalize=True)``.
    ``average_pool`` is called with its default ``return_tensors='np'``, so
    despite the annotation the result is a numpy array.
    """
    model_inputs = ort_tokenize(texts, tokenizer)
    model_outputs = session.run(None, model_inputs)
    return average_pool(model_outputs, model_inputs, normalize=True)

In [9]:
def ort_tokenize(texts: list, tokenizer) -> dict:
    """Tokenize ``texts`` and build the input feed for the ONNX session.

    The tokenizer is asked for numpy output, padded, truncated to 512 tokens.
    Only ``input_ids`` and ``attention_mask`` are kept, both cast to int64.
    """
    encoded = tokenizer(texts, return_tensors="np", max_length=512, padding=True, truncation=True)
    return {
        field: encoded[field].astype(np.int64)
        for field in ("input_ids", "attention_mask")
    }

def average_pool(ort_outs: np.ndarray, ort_inputs: dict, normalize: bool = True, return_tensors: str = 'np') -> torch.Tensor:
    """Mean-pool token hidden states over the non-padding positions.

    Args:
        ort_outs: Model outputs; only ``ort_outs[0]`` is used, as the token
            hidden states.
        ort_inputs: Model inputs; ``ort_inputs['attention_mask']`` marks the
            positions to average over (non-zero = keep).
        normalize: If true, L2-normalize every pooled vector.
        return_tensors: ``'np'`` converts the result to a numpy array; any other
            value returns the torch tensor.

    Returns:
        One pooled vector per input row.  With the default ``return_tensors``
        this is a numpy array, despite the annotation.
    """
    hidden = torch.Tensor(ort_outs[0])
    mask = torch.Tensor(ort_inputs['attention_mask'])
    # Zero the padded positions so they do not contribute to the sum.
    is_padding = ~mask.unsqueeze(-1).bool()
    summed = hidden.masked_fill(is_padding, 0.0).sum(dim=1)
    pooled = summed / mask.sum(dim=1, keepdim=True)
    if normalize:
        pooled = F.normalize(pooled, p=2, dim=1)
    return np.array(pooled) if return_tensors == 'np' else pooled

In [21]:
# Add a single point for 'ГОСТ 20809-75' to the gosts_titles2 collection.
# The point id is derived with get_id from a hard-coded 24-character hex string,
# and the vector is the embedding of the document title.
# NOTE(review): the hex string matches the ObjectId shown by the insert_one output
# in the "lost document" section - confirm it is the same document.
client.upload_collection(
    'gosts_titles2',
    ids=[get_id('663fa9788275d659b2aa3caf')],
    payload=[
        {'gost_number': 'ГОСТ 20809-75', 'title': 'Патроны охотничьи 9х53. Типы и основные размеры'}
    ],
    # NOTE(review): get_embeddings is annotated to take a list but receives a bare
    # string here; presumably the tokenizer returns a batch of one - confirm.
    vectors=get_embeddings('Патроны охотничьи 9х53. Типы и основные размеры')
)