In [1]:
import json
import os
from typing import List

import spacy
from document_structure import (Chunk, ContentTextData, Document, FootnoteData,
                                TableData, TermData)
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, PointStruct, VectorParams
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class SpacySenter:
    def __init__(self) -> None:
        self.nlp = spacy.load("en_core_web_trf", enable=[
                              'transformer', 'parser'])

    def get_sentences(self, text: str = '') -> List[dict]:
        sentence_data = list()
        for sent in self.nlp(text).sents:
            sentence_data.append({
                'text': sent.text,
                'start_char': sent.start_char,
                'end_char': sent.end_char,
                'length_token': len(sent)
            })

        return sentence_data


class WhitespaceTokenizer:

    def get_tokens(self, text: str = '') -> List[dict]:
        token_data = list()

        tokens = text.split(' ')

        char_counter = 0

        for token in tokens:
            token_data.append({
                'text': token,
                'start_char': char_counter,
                'end_char': char_counter + len(token),
            })
            char_counter += len(token) + 1
        return token_data

class E5Tokenizer:

    def __init__(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')

    def get_tokens(self, text: str = '') -> List[dict]:


        token_data = list()

        encoded_input = self.tokenizer.encode_plus(text, return_offsets_mapping=True, add_special_tokens=False)

        # The 'offset_mapping' contains the start and end positions of each token in the original text
        offset_mapping = encoded_input['offset_mapping']

        for token_index, (start_pos, end_pos) in enumerate(offset_mapping):
            token_data.append({
                'text': text[start_pos:end_pos],
                'start_char': start_pos,
                'end_char': end_pos,
            })

        return token_data

def section_chunks_to_points(document_metadata: dict, section_chunks: List[Chunk], last_idx: int, model):
    section_points = list()

    for i, chunk in enumerate(section_chunks, 1):    
        try:
            chunk_text = 'passage:' + chunk.get_text()
        except TypeError:
            print(chunk)
            print(chunk_text)
            raise TypeError
        payload = document_metadata.copy()
        payload.update(chunk.get_data())
        section_points.append(
                PointStruct(id = last_idx + i,
                            vector=list(model.encode(chunk_text, normalize_embeddings=True).astype(float)),
                            payload=payload)
                            )
    return section_points

---

In [3]:
# todo parameetriks? sh parameetrite suurus, tokenite arv, lausete hulk chunkis, overlap?
model = SentenceTransformer('intfloat/multilingual-e5-base')
client = QdrantClient("172.24.228.4", port=6333)

collection_name = "test_collection"
embedding_size = 768
max_tokens = 450
sentence_block_size = 5

existing_collections = [coll.name for coll in client.get_collections().collections]

if collection_name not in existing_collections:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_size, distance=Distance.COSINE),
    )



In [5]:
tokenizer = E5Tokenizer()
senter = SpacySenter()


fpath = 'C:\\Users\\sandra.eiche\\OneDrive - Eesti Keele Instituut\\Documents\\KVA\\kva_parsed_jsons'
fname = '20231004-JDP_0_01_1_2023_Edition_B_web.json'

collection_info = client.get_collection(collection_name)

In [6]:
with open(os.path.join(fpath, fname), 'r') as fin:

    document_json = json.loads(fin.read())

    document = Document(
        json_filename=fname,
        filename=document_json['filename'],
        publication=document_json['publication'],
        publication_year=document_json['publication_year'],
        title=document_json['title'],
        author=document_json['author'],
        languages=document_json['languages'],
        field_keywords=document_json['field_keywords'],
        header_height=document_json['header_height'],
        footer_height=document_json['footer_height'],
        table_extraction_strategy=document_json['table_extraction_strategy'],
        horizontal_sorting=document_json['horizontal_sorting'],
        footnote_regex=document_json['footnote_regex'],
        footnote_group=document_json['footnote_group'],
        custom_regex=document_json['custom_regex'],
        term_data= TermData(document_json['term_data']),
        footnote_data=FootnoteData(document_json['footnote_data']),
        table_data=TableData(document_json['table_data']),
        content_text_data=ContentTextData(document_json['content_text_data'])
        )

    document_metadata = document.get_metadata()

    # parse content chunks one by one 
    content_chunks = document.content_text_data.to_chunks(sentensizer=senter, tokenizer=tokenizer, 
                                                          max_tokens=max_tokens, 
                                                          n_sentences_in_block=sentence_block_size)
    term_chunks = document.term_data.to_chunks()
    footnote_chunks = document.footnote_data.to_chunks()
    table_chunks = document.table_data.to_chunks(tokenizer=tokenizer, max_tokens=max_tokens)


    # Chunks to PointStruct
    for section_chunks in [content_chunks, term_chunks, footnote_chunks, table_chunks]:
        last_idx = client.get_collection(collection_name).vectors_count
        if not last_idx:
            last_idx = 0

        section_points = section_chunks_to_points(document_metadata, section_chunks, last_idx, model=model)
        
        step = 100
        for i in range(0, len(section_points), step): 
            x = i 
            operation_info = client.upsert(
                collection_name=collection_name,
                wait=False,
                points=section_points[x:x+step])
            print(operation_info)

operation_id=1 status=<UpdateStatus.ACKNOWLEDGED: 'acknowledged'>
operation_id=2 status=<UpdateStatus.ACKNOWLEDGED: 'acknowledged'>
operation_id=3 status=<UpdateStatus.ACKNOWLEDGED: 'acknowledged'>
operation_id=4 status=<UpdateStatus.ACKNOWLEDGED: 'acknowledged'>
operation_id=5 status=<UpdateStatus.ACKNOWLEDGED: 'acknowledged'>
operation_id=6 status=<UpdateStatus.ACKNOWLEDGED: 'acknowledged'>
