# BEIR data + hybrid search
* dense (bge-m3) + sparse (bm42)

## embedding
**Sparse**
* bm42 with fastembed.SparseTextEmbedding model
    * https://huggingface.co/Qdrant/all_miniLM_L6_v2_with_attentions
* use IDF (inverse document frequency) modifier

In [1]:
import json
from pathlib import Path
import time
from typing import Any, Dict, List, Optional

import jsonlines
import pandas as pd
from tqdm import tqdm

from config import settings

In [2]:
import sys
import os

# Put the in-repo `src/psiking` directory on the import path so the `core.*`
# imports in later cells can resolve (assumes the kernel's working directory
# sits directly under the repository root).
parent_dir = os.path.dirname(os.getcwd())
core_src_dir = os.path.join(parent_dir, "src/psiking")
# Guard the append: re-running this cell must not stack duplicate sys.path entries.
if core_src_dir not in sys.path:
    sys.path.append(core_src_dir)

In [3]:
## Import Core Schemas
from core.base.schema import Document, TextNode

# 1. Read data
* read BEIR data & convert to Document
    * https://huggingface.co/BeIR
* test with `scifact` dataset
    * 5K passages
    * https://huggingface.co/datasets/BeIR/scifact

In [4]:
## Implement Custom Readers
from core.reader.base import BaseReader

class QuoraDataReader(BaseReader):
    """Reader that turns JSON Lines corpus records into ``Document`` objects.

    Despite the class name, this notebook uses it for the BEIR scifact corpus.
    Every record with non-empty text becomes one ``Document`` wrapping a single
    ``TextNode``.
    """

    def __init__(self):
        pass

    def read(self, data: dict, extra_info: Optional[dict] = None,) -> Optional[Document]:
        """Build a ``Document`` from one corpus record.

        Expected record keys: ['_id', 'title', 'text', 'metadata'].

        Args:
            data: One parsed corpus record.
            extra_info: Optional metadata; attached to the text node and merged
                into the document metadata (its keys win on collision).

        Returns:
            The new ``Document``, or ``None`` when 'text' is missing or empty.

        Raises:
            KeyError: If a record that has text lacks '_id' or 'title'.
        """
        shared_meta = extra_info if extra_info else {}

        body = data.get('text', '')
        if body:
            text_node = TextNode(text=body, metadata=shared_meta)
            # Record-level fields first, then the caller-supplied metadata on top.
            doc_meta = {"source_id": data['_id'], "title": data['title']}
            doc_meta.update(shared_meta)
            return Document(nodes=[text_node], metadata=doc_meta)
        return None

    def run(self, file_path: str | Path,extra_info: Optional[dict] = None) -> List[Document]:
        """Read every record of a JSON Lines file and collect the resulting documents.

        Args:
            file_path: Path of the ``.jsonl`` corpus file.
            extra_info: Optional metadata forwarded (as a fresh copy per record)
                to ``read``.

        Returns:
            Documents for all records that ``read`` did not skip, in file order.
        """
        base_meta = extra_info or {}
        with jsonlines.open(file_path) as records:
            return [
                doc
                for record in records
                if (doc := self.read(record, extra_info={**base_meta}))
            ]

In [5]:
# Location of the scifact corpus file under the configured data directory.
document_path = os.path.join(settings.data_dir, "beir/scifact/corpus.jsonl")

# Parse the corpus into Document objects; `source_file` is passed along as extra metadata.
reader = QuoraDataReader()
documents = reader.run(document_path, extra_info={"source_file": "beir-scifact-corpus"})
# Number of documents that were read.
print(len(documents))

5183


In [6]:
# Peek at the first two parsed documents.
documents[:2]

[Document(id_='8add4065-38d4-4023-b65d-82c921451926', metadata={'source_id': '4983', 'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.', 'source_file': 'beir-scifact-corpus'}, nodes=[TextNode(id_='4f84faae-82af-4f9b-a8f8-31d1ea61713f', metadata={'source_file': 'beir-scifact-corpus'}, text_type=<TextType.PLAIN: 'plain'>, label=<TextLabel.PLAIN: 'plain'>, resource=MediaResource(data=None, text='Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of premat

# 2. Run Splitter
* simple, unoptimized splitting loop: documents are processed one at a time, node by node

In [None]:
# 3. Run Splitter
from core.splitter.text.langchain_text_splitters import LangchainRecursiveCharacterTextSplitter

# Splitter configured with chunk_size=1024 and chunk_overlap=128.
splitter = LangchainRecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)

# Produce one new Document per input node: a TextNode is replaced by the nodes
# the splitter returns for it, any other node type is carried over unchanged.
chunks = []
for document in documents:
    # NOTE: the chunk's "source_id" is the parent Document's id_, not the corpus `_id`.
    parent_id = document.id_
    chunks.extend([
        Document(
            nodes=splitter.run(node) if isinstance(node, TextNode) else [node],
            metadata={
                "source_id": parent_id,
                "source_file": document.metadata['source_file'],
                "title": document.metadata['title'],
            },
        )
        for node in document.nodes
    ])
print(len(chunks))

5183


# 4. Format (Prepare Embedding Input)

In [8]:
from core.formatter.document.simple import SimpleTextOnlyFormatter

# use default templates
formatter = SimpleTextOnlyFormatter()
# Presumably one formatted string per chunk, index-aligned with `chunks`
# (later cells index `chunks` with positions computed from this list).
formatted_texts = formatter.run(chunks)

def select_embedding_input_idxs(texts: List[str], min_length: int = 20) -> List[int]:
    """Return the positions of the texts that are long enough to embed.

    Args:
        texts: Candidate texts to filter.
        min_length: Length threshold. A text is kept only when its
            whitespace-stripped length is strictly greater than this value.

    Returns:
        Indices into ``texts``, in ascending order, of the entries that pass.
    """
    return [idx for idx, text in enumerate(texts) if len(text.strip()) > min_length]

# Positions of the formatted texts whose stripped length exceeds 20 characters.
embedding_input_idxs = select_embedding_input_idxs(formatted_texts, min_length=20)
print(len(embedding_input_idxs))

5183


# 5. Embedder
## 5-1. Dense Embedder

In [9]:
# Dense
from openai import OpenAI
from core.embedder.openai.text_embedder import OpenAITextEmbedder

# Show which embedding model the settings point at before any request is made.
print(settings.openai_embedding_model)

# Texts selected for embedding, in the order given by `embedding_input_idxs`.
embedding_inputs = [formatted_texts[idx] for idx in embedding_input_idxs]

# OpenAI-compatible client aimed at the configured embedding endpoint.
client = OpenAI(
    api_key=settings.openai_embedding_api_key,
    base_url=settings.openai_embedding_base_url,
)
dense_embedder = OpenAITextEmbedder(client=client)

# Embed in batches of 12 with the progress bar enabled.
dense_embeddings = dense_embedder.run(
    texts=embedding_inputs,
    model=settings.openai_embedding_model,
    batch_size=12,
    disable_tqdm=False,
)
# Number of embeddings and the dimensionality of the first one.
print(len(dense_embeddings), len(dense_embeddings[0]))

baai/bge-m3


100%|██████████| 432/432 [00:58<00:00,  7.38it/s]

5183 1024





## 5-2. Sparse Embedder
* Embed using BM42 Sparse embedder model
    * https://huggingface.co/Qdrant/all_miniLM_L6_v2_with_attentions

### Loading model from pre-downloaded directory
* Load model using 'specific model path'
    * specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
    * download_model method skips download phase (available > v0.5.1 )
        * https://github.com/qdrant/fastembed/blob/a931f143ef3543234bc9d8d0c305496c67199972/fastembed/common/model_management.py#L367
    * build from source with commit `a931f143ef3543234bc9d8d0c305496c67199972`
* cache_dir (str, optional): The path to the cache directory.
    Can be set using the `FASTEMBED_CACHE_PATH` env variable.
    Defaults to `fastembed_cache` in the system's temp directory.
```
cd poetry
poetry build
pip install --force-reinstall fastembed-0.5.1-py3-none-any.whl
```

In [10]:
# Point fastembed's cache at a `fastembed` folder under the current working directory.
fastembed_cache_dir = os.path.join(os.getcwd(), "fastembed")
os.environ["FASTEMBED_CACHE_PATH"] = fastembed_cache_dir
print(os.environ["FASTEMBED_CACHE_PATH"])

# Directory holding the pre-downloaded sparse model; list it to confirm the files are there.
sparse_model_dir = os.path.join(settings.model_weight_dir, "fastembed/sparse/all_miniLM_L6_v2_with_attentions")
os.listdir(sparse_model_dir)

/Users/id4thomas/github/psi-king/examples/fastembed


['tokenizer_config.json',
 'special_tokens_map.json',
 'config.json',
 'tokenizer.json',
 'README.md',
 'vocab.txt',
 'model.onnx',
 '.gitattributes',
 '.git',
 'stopwords.txt']

In [11]:
# Load fastembed model
from fastembed import SparseTextEmbedding

# test specific_model_path function
# NOTE(review): relies on a fastembed build whose `download_model` accepts
# `specific_model_path` (see the markdown notes above this cell) — confirm the
# installed version supports it.
downloaded_dir = SparseTextEmbedding.download_model(
    model={},
    cache_dir=os.environ["FASTEMBED_CACHE_PATH"],
    specific_model_path=sparse_model_dir,
)
print(downloaded_dir)

# Build the BM42 sparse model from the local directory, with CUDA disabled and
# lazy loading turned off.
sparse_model = SparseTextEmbedding(
    model_name="Qdrant/bm42-all-minilm-l6-v2-attentions",
    specific_model_path=sparse_model_dir,
    cuda=False,
    lazy_load=False
)

# Smoke test: embed one short string and inspect the sparse values / indices.
embeddings = list(sparse_model.embed(["hi"]))
print(embeddings)
embeddings[0].values.tolist(), embeddings[0].indices.tolist()

/Users/id4thomas/models/fastembed/sparse/all_miniLM_L6_v2_with_attentions
[SparseEmbedding(values=array([0.30918342]), indices=array([948991206]))]


([0.3091834199811786], [948991206])

In [45]:
# Korean sample sentence, used to inspect how the sparse model's tokenizer handles it.
text = "안녕하세요 제 이름은 송영록입니다."
print("ENCODED:")
encoded = sparse_model.model.tokenizer.encode(text)  # type: ignore[union-attr]
print(len(encoded.ids))
print(encoded.tokens[:10]) # first 10 sub-word tokens ('##' marks continuation pieces in the output)
# Pair every token with its position. `enumerate` is a one-shot iterator, so
# this object can only be consumed once.
document_tokens_with_ids = enumerate(encoded.tokens)

print("RECONSTRUCTED:")
# NOTE(review): `_reconstruct_bpe` is a private fastembed method; judging by the
# output it merges sub-word pieces back into words along with their token
# positions — may change between fastembed versions.
reconstructed = sparse_model.model._reconstruct_bpe(document_tokens_with_ids)
print(len(reconstructed))
for reconstructed_word in reconstructed:
    print(reconstructed_word)

ENCODED:
41
['[CLS]', 'ᄋ', '##ᅡ', '##ᆫ', '##ᄂ', '##ᅧ', '##ᆼ', '##ᄒ', '##ᅡ', '##ᄉ']
RECONSTRUCTED:
5
('안녕하세요', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
('제', [13, 14])
('이름은', [15, 16, 17, 18, 19, 20, 21, 22])
('송영록입니다', [23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38])
('.', [39])


In [12]:
# Load Embedder
from core.embedder.fastembed.local_sparse import LocalFastEmbedSparseEmbedder

# Wrap the loaded fastembed model in the project's sparse embedder.
sparse_embedder = LocalFastEmbedSparseEmbedder(
    model=sparse_model
)

# Embed the same inputs that were used for the dense vectors. The result unpacks
# into two sequences (values, indices) — presumably index-aligned with the inputs.
sparse_embedding_values, sparse_embedding_indices = sparse_embedder.run(
    embedding_inputs, batch_size=256
)

# 6. Add to VectorStore
* Qdrant vector store with one dense vector and one sparse vector per point

In [13]:
from qdrant_client import QdrantClient
from core.storage.vectorstore.qdrant import QdrantSingleHybridVectorStore

# initialize client
# NOTE(review): `client` was bound to the OpenAI client in the dense-embedder
# cell; this rebinds the same name to a Qdrant client, so the name now refers to
# a different kind of object. Consider a distinct name.
client = QdrantClient(url=settings.qdrant_url)
collection_name = "beir-scifact-hybrid"

# Store wrapper bound to the collection name and client defined above.
vector_store = QdrantSingleHybridVectorStore(
    collection_name=collection_name,
    client=client
)

In [14]:
## Create Collection
from qdrant_client.http import models

# bge-m3 1024 dim
dense_embedding_dim=1024
dense_vectors_config = models.VectorParams(
    size=dense_embedding_dim,
    distance=models.Distance.COSINE,
    on_disk=True,
)

# Sparse vector config: only the IDF modifier is set.
# NOTE(review): `sparse_embedding_dim` is not referenced anywhere else in this
# cell — looks like a leftover.
sparse_embedding_dim=10
sparse_vectors_config = models.SparseVectorParams(
    modifier=models.Modifier.IDF,
)

# Create VectorStore
vector_store.create_collection(
    dense_vector_config=dense_vectors_config,
    sparse_vector_config=sparse_vectors_config,
    on_disk_payload=True,  # store the payload on disk
)

# Create Index
# Full-text index on the "text" field, using the multilingual tokenizer.
vector_store.create_index(
    field_name="text",
    field_schema=models.TextIndexParams(
        type="text",
        tokenizer=models.TokenizerType.MULTILINGUAL,
    ),
)

In [15]:
# Sanity check: source documents, embedding inputs and both embedding outputs
# are expected to have matching counts before insertion.
item_counts = [
    len(seq)
    for seq in (
        documents,
        embedding_inputs,
        dense_embeddings,
        sparse_embedding_values,
        sparse_embedding_indices,
    )
]
print(*item_counts)

5183 5183 5183 5183 5183


In [16]:
# Chunks that were actually embedded, in the same order as `embedding_inputs`.
embedded_chunks = [chunks[idx] for idx in embedding_input_idxs]

# Insert the chunks with their texts and both embeddings; only the listed
# metadata keys are passed as `metadata_keys`.
vector_store.add(
    documents=embedded_chunks,
    texts=embedding_inputs,
    dense_embeddings=dense_embeddings,
    sparse_embedding_values=sparse_embedding_values,
    sparse_embedding_indices=sparse_embedding_indices,
    metadata_keys=["source_file", "source_id", "title"],
)

In [17]:
# check collection
# NOTE(review): reaches into the store's private `_client` attribute to query
# Qdrant directly.
collection_info = vector_store._client.get_collection(
    collection_name=vector_store.collection_name
)
# Dump the collection description as indented JSON.
print(collection_info.model_dump_json(indent=4))

{
    "status": "green",
    "optimizer_status": "ok",
    "vectors_count": null,
    "indexed_vectors_count": 5183,
    "points_count": 5183,
    "segments_count": 8,
    "config": {
        "params": {
            "vectors": {
                "vector_dense": {
                    "size": 1024,
                    "distance": "Cosine",
                    "hnsw_config": null,
                    "quantization_config": null,
                    "on_disk": true,
                    "datatype": null,
                    "multivector_config": null
                }
            },
            "shard_number": 1,
            "sharding_method": null,
            "replication_factor": 1,
            "write_consistency_factor": 1,
            "read_fan_out_factor": null,
            "on_disk_payload": true,
            "sparse_vectors": {
                "vector_sparse": {
                    "index": null,
                    "modifier": "idf"
                }
            }
        },
       

In [18]:
# check point
# Fetch the first point that was actually inserted. Only the chunks selected by
# `embedding_input_idxs` were added to the store, so look the id up through that
# index list instead of assuming `chunks[0]` passed the length filter.
first_added_chunk = chunks[embedding_input_idxs[0]]
points = vector_store._client.retrieve(
    collection_name=vector_store.collection_name,
    ids=[first_added_chunk.id_],
    with_vectors=True
)

In [28]:
# Display the retrieved record in full (id, payload and vectors).
points[0]

Record(id='5cdc8783-3524-4c79-b980-9b0dcd08d10c', payload={'source_id': '8add4065-38d4-4023-b65d-82c921451926'}, vector={'vector_sparse': SparseVector(indices=[64890871, 104660491, 119240421, 150760872, 160759034, 194714415, 242156862, 243669559, 291984105, 301030427, 354307472, 436751995, 440354553, 442064690, 512480045, 555641829, 581432272, 602572328, 611385325, 640186380, 640477688, 659326392, 754797265, 756007634, 794129062, 804460016, 880670268, 915998194, 967714644, 1001081408, 1031134330, 1058501323, 1082468256, 1093482543, 1162692215, 1230423685, 1296924235, 1298296063, 1326575350, 1338150097, 1365144653, 1376624736, 1595981574, 1701701189, 1724426273, 1727802518, 1852771076, 1923685331, 1960942172, 2031875777, 2058513491], values=[0.20482236, 0.11231826, 0.043780316, 0.036478095, 0.083974384, 0.0488945, 0.13082118, 0.044764306, 0.15345916, 0.12254513, 0.060797732, 0.033715118, 0.11159267, 0.04200773, 0.111143656, 0.08230578, 0.15471552, 0.045902673, 0.1920953, 0.15333788, 0.0

In [19]:
# Id and stored payload of the retrieved point.
print(points[0].id)
print(points[0].payload)

5cdc8783-3524-4c79-b980-9b0dcd08d10c
{'source_id': '8add4065-38d4-4023-b65d-82c921451926'}


In [21]:
# Names of the vectors stored on the point.
print(points[0].vector.keys())

dict_keys(['vector_sparse', 'vector_dense'])


In [22]:
# dense: length of the stored dense vector
len(points[0].vector['vector_dense'])

1024

In [27]:
# sparse
sparse_vector = points[0].vector['vector_sparse']
sparse_value, sparse_index = sparse_vector.values, sparse_vector.indices
# For each component, show its number of entries and its first ten items.
for component in (sparse_value, sparse_index):
    print(len(component), component[:10])

51 [0.20482236, 0.11231826, 0.043780316, 0.036478095, 0.083974384, 0.0488945, 0.13082118, 0.044764306, 0.15345916, 0.12254513]
51 [64890871, 104660491, 119240421, 150760872, 160759034, 194714415, 242156862, 243669559, 291984105, 301030427]
