# **Framework, Library, and API Key**

In [1]:
import pandas as pd
import json
import time
import resource 

from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_qdrant import Qdrant

import qdrant_client
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance


  from .autonotebook import tqdm as notebook_tqdm


# **Dataset**

In [2]:
# Read the FAQ records from disk; the file is decoded as UTF-8.
with open("../data/full_faq.json", "r", encoding="utf-8") as faq_file:
    faq_data = json.loads(faq_file.read())

# Show the first record as a quick look at the fields each entry carries.
faq_data[0]


{'question': 'Di mana ada lokasi Rumah Sakit Siloam?',
 'answer': 'Ada 40 Rumah Sakit modern yang terdiri dari 13 Rumah Sakit di Jabodetabek dan 27 rumah sakit yang tersebar di Jawa, Sumatera, Kalimantan, Sulawesi, serta Bali dan Nusa Tenggara.',
 'category': 'FAQ Website'}

# **Embedding Model**

# **sentence-transformers/paraphrase-multilingual-mpnet-base-v2**

In [5]:
# Qdrant connection and collection settings shared by the cells below.
url = "http://localhost:6333"
collection_name = "faq-question"
distance = Distance.COSINE
dimension = 768  # vector size passed to VectorParams when the collection is created

# Embedding model loaded through FastEmbed, using a local cache directory.
embeddings = FastEmbedEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    cache_dir="../embedding_cache",
)


In [16]:
# Connect through the shared `url` setting instead of a second hard-coded
# host/port, so every cell in the notebook talks to the same Qdrant instance.
client = QdrantClient(url=url)

# List the collections currently present on the server.
collections = client.get_collections()
print(collections)


collections=[CollectionDescription(name='faq-question')]


In [None]:
# # Show collection details (optional; uncomment to inspect the collection configured above).
# collection_info = client.get_collection(collection_name=collection_name)
# print(collection_info)


In [6]:
# # Delete collections (destructive; uncomment only to reset Qdrant before re-indexing).
# client.delete_collection(collection_name="faq-question")
# client.delete_collection(collection_name="faq-question-answer")
# client.delete_collection(collection_name="faq-query-passage")


True

In [8]:
def moveEmbedding(faq_data, batch_size=100):
    """
    Embed FAQ questions and upsert them into the Qdrant collection in batches.

    Only the 'question' text is embedded. The full FAQ record is stored in the
    point payload as {"page_content": <question>, "metadata": {...}} so the
    answer and category can be read back from search hits.

    Args:
        faq_data: List of dicts, each with the keys 'question', 'answer' and
            'category'. The position of an entry in this list is used as its
            Qdrant point id.
        batch_size: Number of FAQs embedded and upserted per request. Must be
            a positive integer.

    Raises:
        ValueError: If batch_size is smaller than 1.

    Relies on the notebook-level names `url`, `collection_name`, `dimension`,
    `distance` and `embeddings`. Prints per-batch timing and a final summary;
    returns None.
    """
    import sys  # local import: only needed for the platform check at the end

    # A zero batch size would divide by zero and a negative one would silently
    # process nothing while still reporting success, so reject both up front.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    client = qdrant_client.QdrantClient(url=url)

    # Create collection if it doesn't exist.
    if not client.collection_exists(collection_name=collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=dimension, distance=distance),
        )

    # Ceiling division; evaluates to 0 for an empty list, so the loop is skipped.
    total_batches = (len(faq_data) - 1) // batch_size + 1
    overall_start_time = time.perf_counter()

    for batch_num in range(total_batches):
        batch_start_time = time.perf_counter()
        start = batch_num * batch_size
        batch_faq = faq_data[start:start + batch_size]

        # Only the question is embedded; answer and category travel in the payload.
        texts = [faq['question'] for faq in batch_faq]
        payloads = [
            {
                "page_content": faq['question'],
                "metadata": {
                    "question": faq['question'],
                    "answer": faq['answer'],
                    "category": faq['category']
                }
            }
            for faq in batch_faq
        ]

        # Batch embed the texts.
        batch_embeddings = embeddings.embed_documents(texts)

        # Point ids are the FAQ's index in faq_data, hence enumerate from `start`.
        points = [
            PointStruct(id=point_id, vector=emb, payload=payload)
            for point_id, (emb, payload) in enumerate(
                zip(batch_embeddings, payloads), start=start
            )
        ]

        # Upsert the current batch of points into Qdrant.
        client.upsert(
            collection_name=collection_name,
            points=points
        )

        batch_elapsed = time.perf_counter() - batch_start_time
        print(f"Processed batch {batch_num+1}/{total_batches} in {batch_elapsed:.2f} seconds")

    total_elapsed = time.perf_counter() - overall_start_time

    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS, so the
    # divisor must depend on the platform for the value to really be in MB.
    max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    peak_memory = max_rss / (1024 * 1024 if sys.platform == "darwin" else 1024)
    print(f"Added: {len(faq_data)} FAQs in {total_elapsed:.2f} seconds, Peak Memory: {peak_memory:.2f} MB")


In [9]:
# Embed every FAQ question and upsert it into Qdrant using the default batch size.
moveEmbedding(faq_data)

Processed batch 1/4 in 4.36 seconds
Processed batch 2/4 in 2.44 seconds
Processed batch 3/4 in 2.11 seconds
Processed batch 4/4 in 0.92 seconds
Added: 349 FAQs in 9.83 seconds, Peak Memory: 3910.23 MB


In [10]:
# Wrap the collection in a LangChain vector store so it can be queried by text.
# NOTE(review): the truncated warning this cell emits looks like a deprecation
# notice for `Qdrant` — confirm against the installed langchain_qdrant version.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


  qdrant = Qdrant(


In [11]:
# Sanity check: run one sample query and print "score | stored question" per hit.
hits = qdrant.similarity_search_with_score(
    'Di mana ada lokasi Rumah Sakit Siloam',
    k=20,
    score_threshold=0.1,
)
for document, score in hits:
    print(f"{score:.4f} | {document.metadata['question']}")


0.9851 | Di mana ada lokasi Rumah Sakit Siloam?
0.9205 | Alamat Siloam Hospitals Buton dimana?
0.9033 | Alamat Siloam Hospitals Kupang dimana?
0.8978 | Alamat Siloam Hospitals Medan dimana?
0.8948 | Alamat Siloam Hospitals Bangka Belitung dimana?
0.8934 | Alamat Siloam Hospitals Paal Dua dimana?
0.8925 | Alamat Siloam Hospitals Labuan Bajo dimana?
0.8875 | Rumah sakit Siloam di Tangerang apa saja?
0.8851 | Alamat Siloam Hospitals Kelapa Dua dimana?
0.8809 | Alamat Siloam Hospitals Agora Cempaka Putih dimana?
0.8801 | Alamat Siloam Hospitals Jember dimana?
0.8789 | Alamat Siloam Hospitals Mampang dimana?
0.8766 | Alamat Siloam Hospitals Lubuk Linggau dimana?
0.8721 | Alamat Siloam Hospitals Bekasi Timur dimana?
0.8649 | Alamat Siloam Hospitals Bogor dimana?
0.8615 | Alamat Siloam Hospitals Ambon dimana?
0.8597 | Alamat MRCCC Siloam Hospitals Semanggi dimana?
0.8589 | Alamat Siloam Hospitals Sentosa dimana?
0.8542 | Alamat Siloam Hospitals Mataram dimana?
0.8536 | Alamat Siloam Hospitals

In [12]:
# Baseline test cases: each FAQ question is used as a query that should
# retrieve that same question.
test_list = []
for faq in faq_data:
    test_list.append({'query': faq['question'], 'expected': faq['question']})
print(test_list[0])

{'query': 'Di mana ada lokasi Rumah Sakit Siloam?', 'expected': 'Di mana ada lokasi Rumah Sakit Siloam?'}


In [13]:
# Hand-written queries, each mapped to the FAQ question it is expected to retrieve.
manual_tests = [
    {
        "query": "siloam ada dimana saja",
        "expected": "Di mana ada lokasi Rumah Sakit Siloam?"
    },
    {
        "query": "jam buka mcu",
        "expected": "Jam buka MCU / Medical Check Up?"
    },
    {
        "query": "layanan kesehatan di rumah",
        "expected": "Apa itu Siloam at Home? Apakah Siloam memberikan layanan kesehatan di rumah atau Homecare?"
    },
    {
        "query": "cek obat beli di siloam dari aplikasi",
        "expected": "Halo saya mau cek obat yang saya beli dari Siloam, bisakah dari aplikasi MySiloam?"
    },
    {
        "query": "asuransi kerjasa sama dengan siloam",
        "expected": "Asuransi apa saja yang bekerja sama dengan Siloam?"
    },
    # NOTE(review): this case shows up as a failure in the report below, so it
    # presumably acts as a negative control — confirm that is intended.
    {
        "query": "test",
        "expected": "test"
    },
]

# Drop any copies left by a previous run before appending, so re-running this
# cell leaves test_list unchanged instead of duplicating the manual cases.
test_list = [case for case in test_list if case not in manual_tests] + manual_tests


In [14]:
def testing(qdrant, q, expected):
    doc = qdrant.similarity_search_with_score(q, score_threshold=0.1, k=20)
    i = 0
    for d in doc:
        if(d[0].metadata["question"] == expected):
            return [ True, d]
        i += 1
    if len(doc) == 0:
        return [False, "No result"]
    return [False, doc[0]]


In [15]:
# With question only: collect the test cases whose expected question was not
# found among the search hits.
results = []
for t in test_list:
    status, data = testing(qdrant, t["query"], t["expected"])
    if status:
        continue
    if data == "No result":
        result_text = score_value = "No result"
    else:
        result_text = data[0].metadata['question']
        score_value = data[1]
    results.append({"query": t["query"], "result": result_text, "score": score_value})

# Show every failing case with the row and column display limits lifted.
df = pd.DataFrame(data=results)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    

Unnamed: 0,query,result,score
0,test,Berapa lama hasil dari MCU / Medical Check Up ...,0.52157
