In [1]:
import pickle
from pypdf import PdfReader
from pathlib import Path
from tqdm.notebook import tqdm
from copy import deepcopy
import os
import numpy as np

# mongodb

In [1]:
from pymongo import MongoClient

def get_database():
   # Provide the mongodb atlas url to connect python to mongodb using pymongo
   CONNECTION_STRING = "mongodb://localhost:27017"
   # Create a connection using MongoClient. You can import MongoClient or use pymongo.MongoClient
   client = MongoClient(CONNECTION_STRING)
   # Create the database for our example (we will use the same database throughout the tutorial
   return client['GOSTS']

In [2]:
db = get_database()

In [3]:
doc_collection = db.documents

In [5]:
def get_doc_text(filename):
    if not filename:
        return ''
    dirpath = Path('c:/users/1645286/vkr/files_cropped')
    file_path = os.path.join(dirpath, filename)
    if filename.endswith('.pdf'):
        reader = PdfReader(file_path)
        text = ''
        for page in reader.pages:
            page_text = page.extract_text()
            text += (page_text + ' ')
    elif filename.endswith('.txt'):
        with open(file_path, 'r') as file:
            text = file.read()
    else:
        print('file name with unknown extension: ', filename)
        return ''
    text = text.replace('\xad\n', '').replace('\n\xad', '').replace('\n', ' ').replace('   ', ' ').replace('  ', ' ')
    return text

In [6]:
with open('res_dict_10.pickle', 'rb') as f:
    gd = pickle.load(f)
all_docs = []
for main_name, sections in gd.items():
    for section_name, section_docs in sections.items():
        for doc in section_docs:
            doc['main_section'] = main_name
            doc['subsection'] = section_name
            all_docs.append(deepcopy(doc))
del gd

In [7]:
statuses = {
    'active': 'Действует',
    'cancelled': 'Отменен',
    'replaced': 'Заменен'
}

In [8]:
for doc in all_docs:
    if doc['status'] in statuses:
        doc['status'] = statuses[doc['status']]

In [9]:
all_docs[0]

{'number': 'ГОСТ ISO/TR 10993-33-2018',
 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 33. Руководство по испытаниям на генотоксичность. Дополнение к ISO 10993-3',
 'status': 'Действует',
 'date_start': '06.01.2019',
 'date_cancel': '',
 'replaced_by': '',
 'OKS': '01.020',
 'file_path': '',
 'file_markdown': '<article class="article" id="article" itemprop="articleBody">\n<p align="right">ГОСТ ISO/TR 10993-33-2018 </p><p align="right"></p><p align="right"></p><p align="center">МЕЖГОСУДАРСТВЕННЫЙ СТАНДАРТ </p><p align="center">ИЗДЕЛИЯ МЕДИЦИНСКИЕ. ОЦЕНКА БИОЛОГИЧЕСКОГО ДЕЙСТВИЯ МЕДИЦИНСКИХ ИЗДЕЛИЙ </p><p align="center">Часть 33 </p><p align="center">Руководство по испытаниям на генотоксичность. Дополнение к ISO 10993-3 </p><p align="center">Medical devices. Biological evaluation of medical devices. Part 33. Guidance on tests to evaluate genotoxicity. Supplement to ISO 10993-3 </p><p>МКС 01.020 </p><p align="right">Дата введения 2019-06-01 </p><

In [11]:
from joblib import Parallel, delayed

def process_doc(doc):
    doc_temp = dict()
    doc_temp['gost_number'] = doc['number']
    doc_temp['filename'] = doc['file_path']
    for field in ['title', 'status', 'date_start', 'date_cancel', 'replaced_by', 'main_section', 'subsection', 'OKS', 'file_url']:
        doc_temp[field] = doc[field]
    doc_text = ''
    try:
        doc_text = get_doc_text(doc['file_path'])
    except:
        pass
    doc_temp['text_plain'] = doc_text
    doc_temp['text_markdown'] = doc['file_markdown']
    return doc_temp

In [12]:
results = Parallel(n_jobs=6)(delayed(process_doc)(doc) for doc in tqdm(all_docs))

  0%|          | 0/3519 [00:00<?, ?it/s]

In [14]:
with open('dataset/docs_before_db.pickle', 'wb') as f:
    pickle.dump(results, f)

In [13]:
doc_collection.insert_many(results)

InsertManyResult([ObjectId('663fa9788275d659b2aa2ef1'), ObjectId('663fa9788275d659b2aa2ef2'), ObjectId('663fa9788275d659b2aa2ef3'), ObjectId('663fa9788275d659b2aa2ef4'), ObjectId('663fa9788275d659b2aa2ef5'), ObjectId('663fa9788275d659b2aa2ef6'), ObjectId('663fa9788275d659b2aa2ef7'), ObjectId('663fa9788275d659b2aa2ef8'), ObjectId('663fa9788275d659b2aa2ef9'), ObjectId('663fa9788275d659b2aa2efa'), ObjectId('663fa9788275d659b2aa2efb'), ObjectId('663fa9788275d659b2aa2efc'), ObjectId('663fa9788275d659b2aa2efd'), ObjectId('663fa9788275d659b2aa2efe'), ObjectId('663fa9788275d659b2aa2eff'), ObjectId('663fa9788275d659b2aa2f00'), ObjectId('663fa9788275d659b2aa2f01'), ObjectId('663fa9788275d659b2aa2f02'), ObjectId('663fa9788275d659b2aa2f03'), ObjectId('663fa9788275d659b2aa2f04'), ObjectId('663fa9788275d659b2aa2f05'), ObjectId('663fa9788275d659b2aa2f06'), ObjectId('663fa9788275d659b2aa2f07'), ObjectId('663fa9788275d659b2aa2f08'), ObjectId('663fa9788275d659b2aa2f09'), ObjectId('663fa9788275d659b2aa2f

In [15]:
del results, all_docs

# qdrant

In [4]:
with open('dataset/docs_with_text.pickle', 'rb') as f:
    all_docs = pickle.load(f)

In [2]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

client = QdrantClient(host="localhost", port=6333)

In [None]:
collection_name = "gosts_titles"
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=1024,
        distance=models.Distance.COSINE
    )
)

In [8]:
all_docs[0]

{'number': 'ГОСТ ISO/TR 10993-33-2018',
 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 33. Руководство по испытаниям на генотоксичность. Дополнение к ISO 10993-3',
 'status': 'active',
 'date_start': '06.01.2019',
 'date_cancel': '',
 'replaced_by': '',
 'OKS': '01.020',
 'file_path': '',
 'file_url': 'https://allgosts.ru/01/020/gost_iso!tr_10993-33-2018',
 'OKS_main': '01 ОБЩИЕ ПОЛОЖЕНИЯ. ТЕРМИНОЛОГИЯ. СТАНДАРТИЗАЦИЯ. ДОКУМЕНТАЦИЯ',
 'OKS_section': '01.020 Терминология (принципы и координация)',
 'id': 0,
 'text': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 33. Руководство по испытаниям на генотоксичность. Дополнение к ISO 10993-3\n\n'}

In [1]:
from sentence_transformers import SentenceTransformer
encoder = SentenceTransformer('intfloat/multilingual-e5-large', device='cpu')

In [4]:
%%time 
encoder.encode(f"query: вчера была хорошая погода")

CPU times: total: 766 ms
Wall time: 203 ms


array([ 0.03238133, -0.03997267, -0.04472129, ...,  0.00433277,
       -0.0406142 , -0.02151736], dtype=float32)

In [25]:
for i, doc in tqdm(enumerate(all_docs), total=len(all_docs)):
    doc_id = doc_collection.find_one({'gost_number': doc['number']})['_id']
    embedding = encoder.encode(f'query: {doc["title"]}')
    client.upsert(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=i,
                vector=embedding,
                payload={
                    "document_id": str(doc_id),
                    "gost_number": doc['number'],
                    "title": doc['title']
                }
            )
        ]
    )

  0%|          | 0/3519 [00:00<?, ?it/s]

In [15]:
query = "защита тар от воды"
client.search(collection_name, encoder.encode(f"query: {query}"), score_threshold=0.82, limit=1000)

[ScoredPoint(id=2137, version=2137, score=0.85592663, payload={'document_id': '663fa9788275d659b2aa374a', 'gost_number': 'ГОСТ 13905-2005', 'title': 'Тара стеклянная. Метод контроля водостойкости внутренней поверхности'}, vector=None, shard_key=None),
 ScoredPoint(id=2168, version=2168, score=0.8458297, payload={'document_id': '663fa9788275d659b2aa3769', 'gost_number': 'ГОСТ 13905-78', 'title': 'Тара стеклянная. Метод определения водостойкости внутренней поверхности'}, vector=None, shard_key=None),
 ScoredPoint(id=2211, version=2211, score=0.8435078, payload={'document_id': '663fa9788275d659b2aa3794', 'gost_number': 'ГОСТ 18119-72', 'title': 'Тара транспортная наполненная. Метод испытания в водяных брызгах'}, vector=None, shard_key=None),
 ScoredPoint(id=2133, version=2133, score=0.83860785, payload={'document_id': '663fa9788275d659b2aa3746', 'gost_number': 'ГОСТ 13903-2005', 'title': 'Тара стеклянная. Методы контроля термической стойкости'}, vector=None, shard_key=None),
 ScoredPoint(

In [1]:
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('models/multilingual-e5-large/tokenizer')

In [2]:
type(tokenizer)

transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast

In [None]:
tokenizer.save_pretrained(model_name.split('/')[-1] + '_tokenizer')

In [3]:
from transformers import AutoModel, AutoTokenizer
import torch

In [1]:


model_name = "intfloat/multilingual-e5-large"
model = AutoModel.from_pretrained(model_name)
#tokenizer = AutoTokenizer.from_pretrained(model_name)

In [2]:
tokenizer = AutoTokenizer.from_pretrained('models/multilingual-e5-large/tokenizer')
dummy_input = tokenizer("Hello, world!", return_tensors="pt", max_length=512, padding=True, truncation=True)
output_path = "models/multilingual-e5-large/model/multilingual-e5-large.onnx"

# Установка модели в режим вывода
model.eval()

# Конвертация модели в ONNX
input_names = ["input_ids", "attention_mask"]
output_names = ["last_hidden_state", "pooler_output"]
dynamic_axes = {
    "input_ids": {0: "batch_size", 1: 'sequence_length'},
    "attention_mask": {0: "batch_size", 1: 'sequence_length'},
    "last_hidden_state": {0: "batch_size", 1: 'sequence_length'},
    "pooler_output": {0: "batch_size"},
}

with torch.no_grad():
    torch.onnx.export(
        model,
        (dummy_input["input_ids"], dummy_input["attention_mask"]),
        output_path,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=dynamic_axes,
        opset_version=13,
    )

In [3]:
from transformers import AutoTokenizer
import onnxruntime as ort

model_path = "models/multilingual-e5-large/model/multilingual-e5-large.onnx"
tokenizer_path = "models/multilingual-e5-large/tokenizer"

session = ort.InferenceSession(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [51]:
import numpy as np
import torch.nn.functional as F
def predict(input_text: str):
    inputs = tokenizer(input_text, return_tensors="np", max_length=512, padding=True, truncation=True)
    ort_inputs = {
        "input_ids": inputs['input_ids'].astype(np.int64),
        "attention_mask": inputs['attention_mask'].astype(np.int64),
    }
    ort_outs = session.run(None, ort_inputs)
    last_hidden_states = torch.Tensor(ort_outs[0])
    attention_mask = torch.Tensor(inputs['attention_mask'])
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    embedding = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    embedding = F.normalize(embedding, p=2, dim=1)
    return embedding

In [52]:
np.linalg.norm(predict('dsads dasd'))

1.0

In [3]:
import numpy as np
type(np.array([12]))

numpy.ndarray

In [5]:
import json
json.dumps([12])

'[12]'

In [23]:
import requests
import time
from tqdm.notebook import tqdm
import random

In [27]:
times = []
for i in tqdm(range(100)):
    t = time.time()
    requests.get(
        'http://localhost:5000/search/text',
        params={
            'query': 'гост'
        }
    )
    times.append(time.time() - t)
print(f'Среднее время ответа: {(sum(times) / len(times))*1000:.0f} миллисекунд')

  0%|          | 0/100 [00:00<?, ?it/s]

Среднее время ответа: 258 миллисекунд


In [28]:
times = []
queries = [
    'гост',
    'стандарт',
    'строительство',
    'дороги'
]
for i in tqdm(range(100)):
    t = time.time()
    _ = requests.get(
        'http://localhost:5000/search/semantic',
        params={
            'query': random.choice(queries)
        }
    )
    times.append(time.time() - t)
print(f'Среднее время ответа: {(sum(times) / len(times))*1000:.0f} миллисекунд')

  0%|          | 0/100 [00:00<?, ?it/s]

Среднее время ответа: 915 миллисекунд


In [8]:
import numpy as np

In [10]:
from transformers import AutoTokenizer
import onnxruntime as ort

model_path = "models/multilingual-e5-large/model/multilingual-e5-large.onnx"
tokenizer_path = "models/multilingual-e5-large/tokenizer"

session = ort.InferenceSession(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [4]:
tokenizer = AutoTokenizer.from_pretrained('models/multilingual-e5-large/tokenizer')

In [49]:
texts = ['123', '321']
inputs = tokenizer(texts, return_tensors="np", max_length=512, padding='do_not_pad', truncation=True)
inputs

{'input_ids': array([array([    0, 37638,     2]),
       array([     0,      6, 103705,      2])], dtype=object), 'attention_mask': array([array([1, 1, 1]), array([1, 1, 1, 1])], dtype=object)}

In [50]:
ort_inputs = {
        "input_ids": inputs['input_ids'].astype(np.int64),
        "attention_mask": inputs['attention_mask'].astype(np.int64),
    }
outs = session.run(None, ort_inputs)

ValueError: setting an array element with a sequence.

In [18]:
outs[0].shape

(2, 4, 1024)

In [25]:
outs[1][0]

array([-0.5958997 ,  0.16221763, -0.4026752 , ..., -0.54916334,
       -0.8697545 ,  0.03568749], dtype=float32)

In [29]:
outs[1][0]

array([-0.5958997 ,  0.16221763, -0.4026752 , ..., -0.54916334,
       -0.8697545 ,  0.03568749], dtype=float32)

In [32]:
outs[1][1]

array([-0.78014565,  0.09020353, -0.2220357 , ..., -0.0879686 ,
       -0.0972221 ,  0.16961685], dtype=float32)

In [40]:
import torch.nn.functional as F

In [41]:
def ort_tokenize(texts: list, tokenizer) -> dict:
    inputs = tokenizer(texts, return_tensors="np", max_length=512, padding=True, truncation=True)
    ort_inputs = {
        "input_ids": inputs['input_ids'].astype(np.int64),
        "attention_mask": inputs['attention_mask'].astype(np.int64),
    }
    return ort_inputs

def average_pool(ort_outs: np.ndarray, ort_inputs: dict, normalize: bool = True) -> torch.Tensor:
    last_hidden_states = torch.Tensor(ort_outs[0])
    attention_mask = torch.Tensor(ort_inputs['attention_mask'])
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    embeddings = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    if normalize:
        embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings

In [42]:
def get_embeddings(texts: str) -> torch.Tensor:
    ort_inputs = ort_tokenize(texts, tokenizer)
    ort_outs = session.run(None, ort_inputs)
    embeddings = average_pool(ort_outs, ort_inputs, normalize=True)
    return embeddings

In [52]:
get_embeddings(['text', 'text2'])[1]

tensor([ 0.0089,  0.0165, -0.0119,  ...,  0.0141, -0.0202,  0.0098])

In [3]:
collection_name = 'gosts_titles'

In [5]:
import qdrant_client

In [6]:
client.retrieve(
    collection_name=collection_name,
    ids=[1, 2, 3]
)

[Record(id=1, payload={'document_id': '663fa9788275d659b2aa2ef2', 'gost_number': 'ГОСТ ISO 10993-13-2016', 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 13. Идентификация и количественное определение продуктов деструкции полимерных медицинских изделий'}, vector=None, shard_key=None),
 Record(id=2, payload={'document_id': '663fa9788275d659b2aa2ef3', 'gost_number': 'ГОСТ ISO 10993-16-2016', 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 16. Концепция токсикокинетических исследований продуктов разложения и выщелачиваемых веществ'}, vector=None, shard_key=None),
 Record(id=3, payload={'document_id': '663fa9788275d659b2aa2ef4', 'gost_number': 'ГОСТ ISO 10993-3-2018', 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 3. Исследования генотоксичности, канцерогенности и токсического действия на репродуктивную функцию'}, vector=None, shard_key=None)]

In [9]:
client.upload_collection(
    collection_name=collection_name,
    payload=[
        {'color': 'green'},
        {'color': 'red'}
    ],
    vectors=[
        np.random.randn(1024),
        np.random.randn(1024)
    ],
    parallel=4,
    max_retries=3
)

In [13]:
client.search(collection_name, np.random.randn(1024))

[ScoredPoint(id='3d6f07de-4cda-45ee-964b-e646be712a61', version=3521, score=0.033385467, payload={'color': 'green'}, vector=None, shard_key=None),
 ScoredPoint(id='a2217640-f3e8-465d-b384-5c642389b021', version=3521, score=0.032190464, payload={'color': 'red'}, vector=None, shard_key=None),
 ScoredPoint(id=2643, version=2643, score=0.024467152, payload={'document_id': '663fa9788275d659b2aa3944', 'gost_number': 'ГОСТ 19662-89', 'title': 'Резервуары изотермические для жидкой двуокиси углерода. Типы, основные параметры и размеры'}, vector=None, shard_key=None),
 ScoredPoint(id=1661, version=1661, score=0.020367432, payload={'document_id': '663fa9788275d659b2aa356e', 'gost_number': 'ГОСТ 23468-85', 'title': 'Микрокалькуляторы. Общие технические условия'}, vector=None, shard_key=None),
 ScoredPoint(id=2218, version=2218, score=0.019718118, payload={'document_id': '663fa9788275d659b2aa379b', 'gost_number': 'ГОСТ 18477-79', 'title': 'Контейнеры универсальные. Типы, основные параметры и размер

In [73]:
client.delete(
    collection_name=collection_name,
    points_selector=models.PointIdsList(
        points=['4248f87d-7780-499a-8303-631d4363d212', '661fb845-7337-4341-bb68-1df09b715017'],
    ),
)

UpdateResult(operation_id=3520, status=<UpdateStatus.COMPLETED: 'completed'>)

In [23]:
client.delete(
    collection_name=collection_name,
    points_selector=models.FilterSelector(
        filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="color",
                    match=models.MatchAny(any=['green', 'red']),
                )
            ],
        )
    ),
)

UpdateResult(operation_id=3522, status=<UpdateStatus.COMPLETED: 'completed'>)

In [16]:
from qdrant_client.http import models

In [24]:
client.scroll(
    collection_name=collection_name,
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="color",
                match=models.MatchAny(any=['green', 'red']),
            )
        ]
    )
)

([], None)

In [27]:
client.scroll(
    collection_name=collection_name,
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="document_id",
                match=models.MatchValue(value='663fa9788275d659b2aa2ef1'),
            )
        ]
    )
)[0][0].id

0

In [None]:
client.retrieve(
    collection_name=collection_name,
)

# inserting docs once again

In [1]:
from pymongo import MongoClient

def get_database():
   # Provide the mongodb atlas url to connect python to mongodb using pymongo
   CONNECTION_STRING = "mongodb://localhost:27017"
   # Create a connection using MongoClient. You can import MongoClient or use pymongo.MongoClient
   client = MongoClient(CONNECTION_STRING)
   # Create the database for our example (we will use the same database throughout the tutorial
   return client['GOSTS']

db = get_database()
doc_collection = db.documents

In [2]:
from transformers import AutoTokenizer
import onnxruntime as ort

model_path = "models/multilingual-e5-large/model/multilingual-e5-large.onnx"
tokenizer_path = "models/multilingual-e5-large/tokenizer"

session = ort.InferenceSession(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [3]:
import torch.nn.functional as F
import numpy as np
import torch
def ort_tokenize(texts: list, tokenizer) -> dict:
    inputs = tokenizer(texts, return_tensors="np", max_length=512, padding=True, truncation=True)
    ort_inputs = {
        "input_ids": inputs['input_ids'].astype(np.int64),
        "attention_mask": inputs['attention_mask'].astype(np.int64),
    }
    return ort_inputs

def average_pool(ort_outs: np.ndarray, ort_inputs: dict, normalize: bool = True) -> torch.Tensor:
    last_hidden_states = torch.Tensor(ort_outs[0])
    attention_mask = torch.Tensor(ort_inputs['attention_mask'])
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    embeddings = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    if normalize:
        embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings

def get_embeddings(texts: str) -> torch.Tensor:
    ort_inputs = ort_tokenize(texts, tokenizer)
    ort_outs = session.run(None, ort_inputs)
    embeddings = average_pool(ort_outs, ort_inputs, normalize=True)
    return embeddings

In [4]:
collection_name = 'gosts_titles2'

In [5]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

client = QdrantClient(host="localhost", port=6333)

In [80]:
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=1024,
        distance=models.Distance.COSINE
    )
)

True

In [2]:
from tqdm.notebook import tqdm

In [37]:
documents = [
    {
        '_id': doc['_id'],
        'title': doc['title'],
        'gost_number': doc['gost_number']
    }
    for doc in tqdm(doc_collection.find(), total=3519)
]

  0%|          | 0/3519 [00:00<?, ?it/s]

In [28]:
documents[0]

{'_id': ObjectId('663fa9788275d659b2aa2ef1'),
 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 33. Руководство по испытаниям на генотоксичность. Дополнение к ISO 10993-3',
 'gost_number': 'ГОСТ ISO/TR 10993-33-2018'}

In [29]:
batch_size = 32

In [47]:
from copy import deepcopy

batches = []
batch = []
for i, document in tqdm(enumerate(documents), total=len(documents)):
    batch.append(document)
    if len(batch) == batch_size:
        batches.append(deepcopy(batch))
        batch = []
    elif i == len(documents) - 1:
        batches.append(deepcopy(batch))

  0%|          | 0/3519 [00:00<?, ?it/s]

In [49]:
s = 0
for b in batches:
   s += len(b)
s

3519

In [50]:
len(documents)

3519

In [53]:
documents[0]

{'_id': ObjectId('663fa9788275d659b2aa2ef1'),
 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 33. Руководство по испытаниям на генотоксичность. Дополнение к ISO 10993-3',
 'gost_number': 'ГОСТ ISO/TR 10993-33-2018'}

In [73]:
final = {
    'ids': [],
    'gost_numbers': [],
    'titles': [],
    'vectors': torch.Tensor([])
}
for batch in tqdm(batches):
    final['ids'].extend([doc['_id'] for doc in batch])
    final['gost_numbers'].extend([doc['gost_number'] for doc in batch])
    embeddings = get_embeddings([doc['title'] for doc in batch])
    final['vectors'] = torch.concat([final['vectors'], embeddings])

  0%|          | 0/110 [00:00<?, ?it/s]

In [84]:
for batch in tqdm(batches):
    final['titles'].extend([doc['title'] for doc in batch])

  0%|          | 0/110 [00:00<?, ?it/s]

In [122]:
for name in ['ids', 'gost_numbers', 'titles', 'vectors']:
    print(len(final[name]))

3519
3519
3519
3519


In [151]:
from uuid import UUID

In [153]:
client.upload_collection(
    collection_name=collection_name,
    ids=[str(UUID(str(doc_id) + '0'*8)) for doc_id in final['ids']],
    payload=[
        {
            'gost_number': doc_number,
            'title': doc_title
        }
        for doc_number, doc_title in zip(final['gost_numbers'], final['titles'])
    ],
    vectors=[np.array(vector) for vector in final['vectors']],
    parallel=6,
    max_retries=3
)

In [1]:
str(UUID(str(doc_id) + '0'*8))

NameError: name 'UUID' is not defined

In [6]:
client.scroll('gosts_titles2')

([Record(id='663fa978-8275-d659-b2aa-2ef100000000', payload={'gost_number': 'ГОСТ ISO/TR 10993-33-2018', 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 33. Руководство по испытаниям на генотоксичность. Дополнение к ISO 10993-3'}, vector=None, shard_key=None),
  Record(id='663fa978-8275-d659-b2aa-2ef200000000', payload={'gost_number': 'ГОСТ ISO 10993-13-2016', 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 13. Идентификация и количественное определение продуктов деструкции полимерных медицинских изделий'}, vector=None, shard_key=None),
  Record(id='663fa978-8275-d659-b2aa-2ef300000000', payload={'gost_number': 'ГОСТ ISO 10993-16-2016', 'title': 'Изделия медицинские. Оценка биологического действия медицинских изделий. Часть 16. Концепция токсикокинетических исследований продуктов разложения и выщелачиваемых веществ'}, vector=None, shard_key=None),
  Record(id='663fa978-8275-d659-b2aa-2ef400000000', payloa

In [6]:
client.search(
    collection_name,
    get_embeddings('query: патроны охотничьи')[0]
)

[ScoredPoint(id='663fa978-8275-d659-b2aa-3caf00000000', version=109, score=0.85418797, payload={'gost_number': 'ГОСТ 20809-75', 'title': 'Патроны охотничьи 9х53. Типы и основные размеры'}, vector=None, shard_key=None),
 ScoredPoint(id='663fa978-8275-d659-b2aa-3cae00000000', version=109, score=0.8514027, payload={'gost_number': 'ГОСТ 20808-75', 'title': 'Патроны охотничьи 5,6x39. Типы и основные размеры'}, vector=None, shard_key=None),
 ScoredPoint(id='663fa978-8275-d659-b2aa-3caa00000000', version=109, score=0.82560974, payload={'gost_number': 'ГОСТ 18406-79', 'title': 'Ружья охотничьи гладкоствольные двуствольные. Общие технические требования'}, vector=None, shard_key=None),
 ScoredPoint(id='663fa978-8275-d659-b2aa-3c9200000000', version=109, score=0.82201856, payload={'gost_number': 'ГОСТ Р 51715-2001', 'title': 'Изделия декоративные и сувенирные, сходные по внешнему строению с холодным или метательным оружием. Общие технические требования'}, vector=None, shard_key=None),
 ScoredPoin

In [7]:
client.delete(
    collection_name,
    points_selector=models.PointIdsList(
            points=['663fa978-8275-d659-b2aa-3caf00000000'],
        )
)

UpdateResult(operation_id=159, status=<UpdateStatus.COMPLETED: 'completed'>)