In [1]:
import json
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

import jsonlines
import pandas as pd
from tqdm import tqdm

from config import settings

In [2]:
# import sys
# import os

# parent_dir = os.path.dirname(os.getcwd())
# core_src_dir = os.path.join(parent_dir, "src/psiking")
# sys.path.append(core_src_dir)

In [3]:
## Import Core Schemas
from psiking.core.base.schema import Document, TextNode

# 1. Read data
* read BEIR data & convert to Document
    * https://huggingface.co/BeIR
* test with `scifact` dataset
    * 5K passages
    * https://huggingface.co/datasets/BeIR/scifact

In [4]:
## Implement Custom Readers
from psiking.core.reader.base import BaseReader

class QuoraDataReader(BaseReader):
    """Reader that turns BEIR-style corpus records into ``Document`` objects.

    Records are expected to be dicts with the keys
    ``'_id'``, ``'title'``, ``'text'`` and ``'metadata'``.
    """

    def __init__(self):
        pass

    def read(
        self,
        data: dict,
        extra_info: Optional[dict] = None,
    ) -> Optional[Document]:
        """Convert one corpus record into a single-node ``Document``.

        Args:
            data: Record providing ``'_id'`` and ``'title'``; ``'text'`` may be
                missing or empty.
            extra_info: Optional metadata. It is attached to the text node and
                merged into the document metadata, where its keys take
                precedence over ``source_id`` / ``title`` on conflict.

        Returns:
            The built ``Document``, or ``None`` when the record has no text.

        Raises:
            KeyError: If a record with text lacks ``'_id'`` or ``'title'``.
        """
        shared_meta = extra_info or {}

        body = data.get('text', '')
        if not body:
            # Records without text cannot be embedded; the caller drops them.
            return None

        text_node = TextNode(text=body, metadata=shared_meta)
        document_meta = {"source_id": data['_id'], "title": data['title']}
        document_meta.update(shared_meta)
        return Document(nodes=[text_node], metadata=document_meta)

    def run(
        self,
        file_path: str | Path,
        extra_info: Optional[dict] = None
    ) -> List[Document]:
        """Read a JSON Lines file and convert every record that has text.

        Args:
            file_path: Path of the ``.jsonl`` corpus file.
            extra_info: Optional metadata applied to every document; each
                record receives its own shallow copy.

        Returns:
            Documents in file order, excluding records that ``read`` rejected.
        """
        base_meta = extra_info or {}

        with jsonlines.open(file_path) as records:
            candidates = (
                self.read(record, extra_info=dict(base_meta))
                for record in records
            )
            return [doc for doc in candidates if doc]

In [5]:
# Convert the BEIR scifact corpus (JSON Lines) into Document objects.
reader = QuoraDataReader()
document_path = os.path.join(
    settings.data_dir,
    "retrieval_dataset/beir/scifact/corpus.jsonl",
)
documents = reader.run(
    document_path,
    extra_info={"source_file": "beir-scifact-corpus"},
)
print(len(documents))

5183


In [6]:
# Peek at the first two converted documents (a slice keeps the output short).
documents[:2]

[Document(id_='6e109d68-c0db-490b-ae7b-4bfb17b2e254', metadata={'source_id': '4983', 'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.', 'source_file': 'beir-scifact-corpus'}, nodes=[TextNode(id_='cc617258-b857-41f0-a74e-9723b45b24ad', metadata={'source_file': 'beir-scifact-corpus'}, text_type=<TextType.PLAIN: 'plain'>, label=<TextLabel.PLAIN: 'plain'>, resource=MediaResource(data=None, text='Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of premat

# 2. Run Splitter
* simple, inefficient splitter: every node is split one at a time in a plain Python loop

In [7]:
# 3. Run Splitter
from psiking.core.splitter.text.langchain_text_splitters import LangchainRecursiveCharacterTextSplitter

splitter = LangchainRecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)

# Build one new Document per source node. Text nodes are replaced by the nodes
# the splitter returns for them; any other node type is carried over as-is.
chunks = []
for document in documents:
    for node in document.nodes:
        pieces = splitter.run(node) if isinstance(node, TextNode) else [node]
        chunks.append(
            Document(
                nodes=pieces,
                metadata={
                    # The chunk's "source_id" is the parent Document's id_;
                    # the corpus '_id' held in document.metadata["source_id"]
                    # is not carried over.
                    "source_id": document.id_,
                    "source_file": document.metadata['source_file'],
                    "title": document.metadata['title'],
                },
            )
        )
print(len(chunks))

5183


# 4. Formatter

In [8]:
# 4. Format (Prepare Embedding Input)
from psiking.core.formatter.document.simple import SimpleTextOnlyFormatter

# use default templates
# Turn every chunk Document into the text that is later sent to the embedder.
# NOTE(review): later cells index `formatted_texts` and `chunks` with the same
# positions, so this presumably returns exactly one string per chunk — confirm.
formatter = SimpleTextOnlyFormatter()
formatted_texts = formatter.run(chunks)

In [9]:
def select_embedding_input_idxs(texts: List[str], min_length: int = 20) -> List[int]:
    """Return the positions of the texts that are long enough to embed.

    Args:
        texts: Candidate texts, one entry per chunk.
        min_length: Length threshold, measured after stripping surrounding
            whitespace. The comparison is strict: a text is selected only
            when it is longer than ``min_length`` characters.

    Returns:
        Indices into ``texts``, in ascending order, of the entries that pass
        the length check. Empty when ``texts`` is empty or nothing qualifies.
    """
    return [idx for idx, text in enumerate(texts) if len(text.strip()) > min_length]

# Keep only the positions of formatted texts longer than 20 characters.
embedding_input_idxs = select_embedding_input_idxs(texts=formatted_texts, min_length=20)
print(len(embedding_input_idxs))

5183


# 5. Embedder

In [10]:
# 5. Embed
from openai import OpenAI
from psiking.core.embedder.openai.text_embedder import OpenAITextEmbedder

# Show which embedding model is configured before any request is made.
print(settings.openai_embedding_model)
# Endpoint and API key are read from `settings`; nothing is hard-coded here.
# NOTE(review): the name `client` is rebound to a QdrantClient in a later
# cell. The embedder receives this object now, so it presumably keeps using
# the OpenAI client afterwards — confirm.
client = OpenAI(
    base_url=settings.openai_embedding_base_url,
    api_key=settings.openai_embedding_api_key
)

embedder = OpenAITextEmbedder(
    client = client
)

baai/bge-m3


In [13]:
# Embed only the texts that passed the length filter, in filter order, so that
# embeddings[i] belongs to the chunk at embedding_input_idxs[i].
embedding_inputs = [formatted_texts[idx] for idx in embedding_input_idxs]
embeddings = embedder.run(
    texts=embedding_inputs,
    model=settings.openai_embedding_model,
    batch_size=32,
    disable_tqdm=False,
)

100%|██████████| 162/162 [00:59<00:00,  2.74it/s]


In [14]:
# Sanity check: number of embedding vectors and the length of the first one.
print(len(embeddings), len(embeddings[0]))

5183 1024


# 6. Add to VectorStore
* Qdrant single-vector vector store (`QdrantSingleVectorStore`) backed by an in-memory client

In [15]:
# 6. Add to VectorStore
from qdrant_client import QdrantClient
from psiking.core.storage.vectorstore.qdrant import QdrantSingleVectorStore

# initialize client
# ":memory:" starts qdrant-client in its local in-memory mode, so nothing is
# persisted to disk.
# NOTE(review): this rebinds `client`, which previously held the OpenAI client.
client = QdrantClient(":memory:")
collection_name = "beir-scifact"

# The store wraps the client and is bound to this single collection name.
vector_store = QdrantSingleVectorStore(
    collection_name=collection_name,
    client=client
)

In [16]:
from qdrant_client.http import models

# Size the collection from the embeddings that were actually produced instead
# of assuming a fixed dimension.
embedding_dim = len(embeddings[0])

# NOTE(review): the collection info printed further down reports
# "on_disk_payload": null, so this flag may not be applied by the in-memory
# client — confirm if on-disk payload storage matters.
vector_store.create_collection(
    on_disk_payload=True,  # store the payload on disk
    vectors_config = models.VectorParams(
        size=embedding_dim,
        distance=models.Distance.COSINE,
        on_disk=True,
    )
)

In [17]:
# Store the chunks that were embedded, in the same order as `embeddings`, and
# copy the three listed metadata keys along with them.
vector_store.add(
    documents=[chunks[idx] for idx in embedding_input_idxs],
    embeddings=embeddings,
    metadata_keys=["source_file", "source_id", "title"],
)

In [18]:
# check collection
# Reads the collection description through the store's private `_client`
# attribute and prints it as indented JSON.
collection_info = vector_store._client.get_collection(
    collection_name=vector_store.collection_name
)
print(collection_info.model_dump_json(indent=4))

{
    "status": "green",
    "optimizer_status": "ok",
    "vectors_count": null,
    "indexed_vectors_count": 0,
    "points_count": 5183,
    "segments_count": 1,
    "config": {
        "params": {
            "vectors": {
                "size": 1024,
                "distance": "Cosine",
                "hnsw_config": null,
                "quantization_config": null,
                "on_disk": true,
                "datatype": null,
                "multivector_config": null
            },
            "shard_number": null,
            "sharding_method": null,
            "replication_factor": null,
            "write_consistency_factor": null,
            "read_fan_out_factor": null,
            "on_disk_payload": null,
            "sparse_vectors": null
        },
        "hnsw_config": {
            "m": 16,
            "ef_construct": 100,
            "full_scan_threshold": 10000,
            "max_indexing_threads": 0,
            "on_disk": null,
            "payload_m": null

In [19]:
# check point
# Fetch the first chunk that was actually written to the store. Only chunks at
# positions listed in `embedding_input_idxs` were added, so indexing through
# that list (instead of assuming chunks[0]) keeps this check valid even when
# the first chunk was dropped by the minimum-length filter.
# NOTE(review): assumes the store uses the chunk's `id_` as the point id —
# confirm against QdrantSingleVectorStore.add.
first_indexed_chunk = chunks[embedding_input_idxs[0]]
points = vector_store._client.retrieve(
    collection_name=vector_store.collection_name,
    ids=[first_indexed_chunk.id_],
    with_vectors=True
)

In [20]:
# Show the retrieved point's id, its stored payload, and the vector length.
print(points[0].id)
print(points[0].payload)
print(len(points[0].vector))

1cdfe8db-19df-494b-8c25-7aed9645dbcc
{'source_id': '6e109d68-c0db-490b-ae7b-4bfb17b2e254', 'source_file': 'beir-scifact-corpus', 'title': 'Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.'}
1024


# 7. Retrieval

In [21]:
import numpy as np
from psiking.core.storage.vectorstore.schema import (
    MetadataFilters,
    FilterOperator,
    VectorStoreQuery,
    VectorStoreQueryMode,
    VectorStoreQueryOptions,
)   

In [22]:
# Smoke-test query: a random vector stands in for a real embedded question, so
# the scores only show that the search path works, not retrieval quality.
# The generator is seeded so Restart & Run All returns the same neighbours, and
# the vector is sized from `embedding_dim` (set when the collection was
# created) so it always matches the collection instead of a hard-coded 1024.
query_rng = np.random.default_rng(42)
query_embedding = query_rng.standard_normal(embedding_dim)

vsquery=VectorStoreQuery(
    dense_embedding=query_embedding
)
vsoptions=VectorStoreQueryOptions(
    mode=VectorStoreQueryMode.DENSE,
    top_k=10
)

In [23]:
# Run the dense query; `points` (previously the retrieved point) is rebound
# to the search results.
points = vector_store.query(
    query=vsquery,
    options=vsoptions
)

In [24]:
# Display the scored search results.
points

[ScoredPoint(id='8f64a3bb-498c-4d42-bafc-7340ba3a0e6d', version=0, score=0.11509909600173898, payload={'source_id': '17a624da-f1bd-4ddf-8330-f78ee459082b', 'source_file': 'beir-scifact-corpus', 'title': 'Matrix-embedded osteocytes regulate mobilization of hematopoietic stem/progenitor cells.'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='846be232-1024-490d-8c77-f6214a892eb9', version=0, score=0.11258887816288517, payload={'source_id': 'ca109d79-7e45-4764-be6c-c4159d5d289f', 'source_file': 'beir-scifact-corpus', 'title': 'Cell and biomolecule delivery for tissue repair and regeneration in the central nervous system.'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id='c21053da-26a8-4d95-968f-f4fc2a821ce2', version=0, score=0.10986213965508985, payload={'source_id': '828e2fb8-c793-4940-ac0a-d8c8b55adb7e', 'source_file': 'beir-scifact-corpus', 'title': 'The N-end rule pathway regulates pathogen responses in plants.'}, vector=None, shard_key=None, order_