# 3_2 index text (messages)
* index `jinaai/JinaVDR`-`github-readme-retrieval-multilingual` dataset
* only index 'text' in 'messages' format

In [1]:
import os
import sys
parent_dir = os.path.dirname(os.path.dirname(os.getcwd()))
print(parent_dir)
core_src_dir = os.path.join(parent_dir, "src/psiking-core")
print(core_src_dir)
sys.path.append(core_src_dir)

from datasets import load_dataset, load_from_disk

from config import settings

/Users/id4thomas/github/psi-king
/Users/id4thomas/github/psi-king/src/psiking-core


# 1. Load Dataset
* `ko` split of `github-readme-retrieval-multilingual` 

In [2]:
ds = load_from_disk(
    os.path.join(
        settings.data_dir,
        'github-readme-retrieval-multilingual/data/ko'
    )
)

In [3]:
id_column = list(range(len(ds['test'])))
ds['test']=ds['test'].add_column(
    name='id',
    column=id_column
)

In [4]:
ds['test'][0]

{'query': '페이퍼 머니를 사용하는 주식 거래 앱입니다.',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=618x1080>,
 'image_filename': 'images/1249_screenshot.png',
 'license_type': 'APACHE_2',
 'license_text': '                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      "License" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      "Licensor" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      "Legal Entity" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      "control" means (i) the power, 

# 2. Prepare PSIKing Documents
* dataset text -> PSIKing Document

In [5]:
from psiking.core.base.schema import TextNode, Document

In [6]:
def prepare_documents(x):
    node = TextNode(
        text=x['text_description'],
        metadata={
            'docid': x['id']
        }
    )
    document = Document(
        nodes=[node],
        metadata={
            'docid': x['id']
        }
    )
    return {'doc': document.to_dict()}

In [7]:
collection = ds.map(
    function=prepare_documents,
    batched=False
)

In [8]:
collection['test'][6]

{'query': 'Bulma를 기반으로 하는 Vue.js용 경량 UI 구성 요소',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=540x1080>,
 'image_filename': 'images/774_screenshot.png',
 'license_type': 'MIT',
 'license_text': 'MIT License\n\nCopyright (c) 2017-2019 Rafael Beraldo\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS F

# 3. Chunk, Embed

## 3-1. Chunk Texts

In [9]:
from psiking.core.splitter.text.langchain_text_splitters import LangchainRecursiveCharacterTextSplitter

splitter = LangchainRecursiveCharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap = 128
)

In [10]:
chunks = []
for sample in collection['test']:
    document = Document.from_dict(sample['doc'])
    
    document_chunks = []
    source_id = document.id_
    for i, node in enumerate(document.nodes):
        # Run Splitter
        if isinstance(node, TextNode):
            try:
                split_nodes = splitter.run(node)
            except Exception as e:
                print(i, node)
                print(str(e))
                raise e
        else:
            split_nodes = [node]
        
        # Create New Document
        # print(len(document.nodes[0].text), "SPLIT:", len(split_nodes))
        for split_node in split_nodes:
            # Each Document contains single node
            chunk = Document(
                nodes=[split_node],
                metadata={
                    "source_id": source_id,
                    "docid": document.metadata['docid'],
                }
            )
            document_chunks.append(chunk)
    chunks.extend(document_chunks)
print(len(chunks))

1435


In [26]:
# Save Chunks (for testing)
import json
with open('cache/text_messages/chunks.json', 'w') as f:
    f.write(
        json.dumps(
            {
              'chunks': [
                  x.to_dict() for x in chunks
              ]  
            },
            indent=4,
            ensure_ascii=False
        )
    )

## 3-2. Embed with jina-emb-v4

In [12]:
import asyncio
from tqdm.asyncio import tqdm

from psiking.core.embedder.vllm.online_jina_emb_v4 import VLLMOnlineJinaEmbV4Embedder

In [13]:
VLLM_ENDPOINT="http://localhost:8080"
MODEL="jina-embeddings-v4-vllm-retrieval"
embedder = VLLMOnlineJinaEmbV4Embedder(
    base_url=VLLM_ENDPOINT,
    model=MODEL
)

In [14]:
async def embed(semaphore, doc: Document):
    text = doc.nodes[0].text
    messages = [
        {'role': 'user', 'content': text}
    ]
    async with semaphore:
        embedding = await embedder.arun(
            input=messages,
            input_format='messages',
            pool=True,
            normalize=True
        )
    return embedding

In [15]:
semaphore = asyncio.Semaphore(32)

tasks = []
for chunk in chunks:
    task = embed(semaphore, chunk)
    tasks.append(task)

embeddings = await tqdm.gather(*tasks)

100%|██████████| 1435/1435 [1:52:07<00:00,  4.69s/it]  


In [16]:
print(len(embeddings))
print(len(embeddings[0]))

1435
2048


In [17]:
type(embeddings[0].tolist()[0])

float

# 4. Insert into VectorStore

In [18]:
from qdrant_client import QdrantClient
from psiking.core.storage.vectorstore.qdrant import QdrantSingleVectorStore

# initialize client
client = QdrantClient(host="localhost", port=6333)
collection_name = "jinavdr-github-text-messages"

vector_store = QdrantSingleVectorStore(
    collection_name=collection_name,
    client=client
)

In [19]:
from qdrant_client.http import models

embedding_dim = 2048

vector_store.create_collection(
    on_disk_payload=True,  # store the payload on disk
    vectors_config = models.VectorParams(
        size=embedding_dim,
        distance=models.Distance.COSINE,
        on_disk=True,
    )
)

In [20]:
# np array -> List[float]
embeddings_float = [
    x.tolist() for x in embeddings
]

In [21]:
vector_store.add(
    documents=chunks,
    embeddings=embeddings_float,
    metadata_keys=["source_file", "docid"]
)

In [22]:
chunks[0].id_

'a181f2b9-49ca-4e60-9d04-0165d6d44755'

In [23]:
points = vector_store._client.retrieve(
    collection_name=vector_store.collection_name,
    ids=[chunks[0].id_],
    with_vectors=True
)

In [24]:
print(points[0].id)
print(points[0].payload)
print(len(points[0].vector))

a181f2b9-49ca-4e60-9d04-0165d6d44755
{'docid': 0}
2048


In [25]:
points[0]

Record(id='a181f2b9-49ca-4e60-9d04-0165d6d44755', payload={'docid': 0}, vector=[-0.015094001, -0.019790087, -0.0030749126, -0.021820737, 0.0034364625, -0.0022062545, 0.025856335, 0.031427637, -0.0049702073, -0.0061005964, -0.015703578, 0.01009177, -0.01278171, 0.01913805, -0.032352127, -0.025651608, 0.01780954, -0.018801076, 0.055043537, -0.02959405, -0.038807396, 0.009012873, 0.041322988, -0.0030946515, 0.00393653, -0.024757117, -0.045410987, -0.008298751, 0.033571877, -0.014848868, 0.014466347, 0.024501072, 0.010748489, 0.0124892695, -0.0714017, -0.008384778, -0.026107293, -0.004878163, -0.0039856643, -0.00475566, -0.0024958306, -0.007319636, -0.036342904, -0.052934628, -0.013474823, -0.093797, -0.005896045, -0.011144181, 0.0151680205, -0.018928787, 0.018489074, -0.0014877085, -0.057965178, -0.028002363, -0.011120492, 0.022317015, 0.02291552, -0.0053579556, -0.014189871, 0.0040804925, 0.0026239015, -0.025605505, 0.034305587, 0.018617958, 0.025937406, -0.022483382, -0.036429934, 0.024