# 4_1 evaluate text query
* evaluate with queries in text format
* calculate mAP, mRR

In [5]:
import os
import sys
parent_dir = os.path.dirname(os.path.dirname(os.getcwd()))
print(parent_dir)
core_src_dir = os.path.join(parent_dir, "src/psiking-core")
print(core_src_dir)
sys.path.append(core_src_dir)

from config import settings

/Users/id4thomas/github/psi-king
/Users/id4thomas/github/psi-king/src/psiking-core


# 1. Load Data

## 1-1. Load Dataset

In [6]:
from datasets import load_dataset, load_from_disk

ds = load_from_disk(
    os.path.join(
        settings.data_dir,
        'github-readme-retrieval-multilingual/data/ko'
    )
)

In [7]:
id_column = list(range(len(ds['test'])))
ds['test']=ds['test'].add_column(
    name='id',
    column=id_column
)

In [9]:
queries = ds['test']['query']
print(len(queries))
queries[:3]

1000


['페이퍼 머니를 사용하는 주식 거래 앱입니다.',
 '내장된 글꼴과 하위 수준의 선 및 선반 반복기를 갖춘 Hershey 글꼴 라이브러리',
 '🖍 Text Marker(하이라이터)는 Sublime Text에서 단어를 강조 표시합니다.']

## 1-2. Calculate Query Embeddings

In [3]:
import asyncio
from tqdm.asyncio import tqdm

from psiking.core.embedder.vllm.online_jina_emb_v4 import VLLMOnlineJinaEmbV4Embedder

VLLM_ENDPOINT="http://localhost:8080"
MODEL="jina-embeddings-v4-vllm-retrieval"
embedder = VLLMOnlineJinaEmbV4Embedder(
    base_url=VLLM_ENDPOINT,
    model=MODEL
)

In [10]:
async def embed(semaphore, text: str):
    async with semaphore:
        embedding = await embedder.arun(
            input=text,
            input_format='text',
            mode='query',
            pool=True,
            normalize=True
        )
    return embedding

In [24]:
semaphore = asyncio.Semaphore(32)

tasks = []
for query in queries:
    task = embed(semaphore, query)
    tasks.append(task)

query_embeddings = await tqdm.gather(*tasks)

 18%|█▊        | 175/1000 [25:06<1:58:23,  8.61s/it]
100%|██████████| 1000/1000 [01:06<00:00, 14.97it/s]


# Load VectorStores

In [77]:
from qdrant_client import QdrantClient
from psiking.core.storage.vectorstore.qdrant import QdrantSingleVectorStore

# initialize client
client = QdrantClient(host="localhost", port=6333)
collection_name = "jinavdr-github-text"
collection_name = "jinavdr-github-text-messages"
collection_name = "jinavdr-github-multimodal-messages"

vector_store = QdrantSingleVectorStore(
    collection_name=collection_name,
    client=client
)

In [78]:
import numpy as np
query_embedding = np.random.randn(2048)
print(query_embedding.shape)

(2048,)


In [79]:
from qdrant_client.http.models import SearchRequest

query_embedding = query_embeddings[0]
similarity_top_k = 10
response = vector_store._client.search_batch(
    collection_name=collection_name,
    requests = [
        SearchRequest(
            vector=query_embedding,
            limit=similarity_top_k,
            with_payload=True
        )
    ]
)

  response = vector_store._client.search_batch(


In [80]:
response

[[ScoredPoint(id='9257681f-037e-4af2-b1cb-a3c57796e237', version=0, score=0.6773325, payload={'docid': 0}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id='5dd6e597-c99d-4867-8bb6-fb59027f96ad', version=0, score=0.6773325, payload={'docid': 0}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id='3337df3a-1beb-467e-9298-b3fd6ccadc82', version=1, score=0.6398333, payload={'docid': 63}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id='d289037b-f54d-4171-b37a-4434e2161308', version=1, score=0.6398333, payload={'docid': 63}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id='7ef9e592-371c-4169-bb1e-ee0bcd2c7cd1', version=20, score=0.6379668, payload={'docid': 934}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id='8ce40ee0-56e5-4a07-aca5-117f72921a0f', version=1, score=0.61766183, payload={'docid': 58}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id='ca3d8128-4686-479b-b655-b11827a6eb2e', versio

In [81]:
def retrieve(query_embedding, k=10):
    response = vector_store._client.search_batch(
        collection_name=collection_name,
        requests = [
            SearchRequest(
                vector=query_embedding,
                limit=k,
                with_payload=True
            )
        ]
    )
    return [x.payload['docid'] for x in response[0]]

In [82]:
retrieved_docids = []
for query_embedding in query_embeddings:
    retrieved_docids.append(retrieve(query_embedding))

  response = vector_store._client.search_batch(


# Evaluate

mAP:
```
jinavdr-github-text mAP: 0.123
jinavdr-github-text-messages mAP: 0.134
jinavdr-github-multimodal-messages mAP: 0.327
```

mRR:
```
jinavdr-github-text mRR: 0.377
jinavdr-github-text-messages mRR: 0.415
jinavdr-github-multimodal-messages mRR: 0.854
```

In [83]:
def calculate_ap(docids, answer_id):
    ap = 0.0
    n_relevant = 0
    for i in range(len(docids)):
        if docids[i]==answer_id:
            n_relevant+=1
        ap += n_relevant/(i+1)
    
    ap/=len(docids)
    return ap

In [84]:
aps = [calculate_ap(x, i) for i, x in enumerate(retrieved_docids)]
map = sum(aps)/len(aps)
print("{} mAP: {:.3f}".format(collection_name, map))

jinavdr-github-multimodal-messages mAP: 0.327


In [85]:
def calculate_rr(docids, answer_id):
    rr = 0.0
    for i, docid in enumerate(docids):
        if docid==answer_id:
            rr = 1/(i+1)
            break
    return rr

In [86]:
rrs = [calculate_rr(x, i) for i, x in enumerate(retrieved_docids)]
mrr = sum(rrs)/len(rrs)
print("{} mRR: {:.3f}".format(collection_name, mrr))

jinavdr-github-multimodal-messages mRR: 0.854
