# **Frameworks and Libraries**

In [1]:
import pandas as pd
import json
import time
import resource 

from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_qdrant import Qdrant

import qdrant_client
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance


  from .autonotebook import tqdm as notebook_tqdm


# **Dataset**

In [2]:
# Read the FAQ records from disk (JSON, UTF-8 encoded).
faq_path = "../data/full_faq.json"
with open(faq_path, mode="r", encoding="utf-8") as faq_file:
    faq_data = json.load(faq_file)

# Show the first record as a quick sanity check of the schema.
faq_data[0]


{'question': 'Di mana ada lokasi Rumah Sakit Siloam?',
 'answer': 'Ada 40 Rumah Sakit modern yang terdiri dari 13 Rumah Sakit di Jabodetabek dan 27 rumah sakit yang tersebar di Jawa, Sumatera, Kalimantan, Sulawesi, serta Bali dan Nusa Tenggara.',
 'category': 'FAQ Website'}

# **Embedding Model**

In [3]:
from fastembed import TextEmbedding

# Catalogue of the models FastEmbed supports, smallest first, without the bulky columns.
hidden_columns = ["sources", "model_file", "additional_files", "license", "tasks"]
supported_models = (
    pd.DataFrame(TextEmbedding.list_supported_models())
    .sort_values("size_in_GB")
    .drop(columns=hidden_columns)
    .reset_index(drop=True)
)

# Keep only models described as multilingual, leaving out the jinaai ones.
is_multilingual = supported_models['description'].str.contains("multilingual", case=False, na=False)
is_jinaai = supported_models['model'].str.contains("jinaai", case=False, na=False)
filtered_models = supported_models[is_multilingual & ~is_jinaai]

# Render the shortlist without truncating the long description column.
with pd.option_context('display.max_colwidth', None):
    display(filtered_models)


Unnamed: 0,model,description,size_in_GB,dim
9,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,"Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Prefixes for queries/documents: not necessary, 2019 year.",0.22,384
24,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,"Text embeddings, Unimodal (text), Multilingual (~50 languages), 384 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year.",1.0,768
28,intfloat/multilingual-e5-large,"Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year.",2.24,1024


# **sentence-transformers/paraphrase-multilingual-mpnet-base-v2**

In [4]:
# Qdrant connection and collection settings shared by the cells below.
url = "http://localhost:6333"
collection_name = "faq-question"
distance = Distance.COSINE
dimension = 768  # matches the `dim` listed for this model in the table above

# Embedding model; model files are cached under ../embedding_cache.
embedding_model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
embeddings = FastEmbedEmbeddings(
    model_name=embedding_model_name,
    cache_dir="../embedding_cache",
)


[32m2025-03-18 01:13:27.322[0m | [31m[1mERROR   [0m | [36mfastembed.common.model_management[0m:[36mdownload_model[0m:[36m429[0m - [31m[1mCould not download model from HuggingFace: 403 Forbidden: None.
Cannot access content at: https://huggingface.co/api/models/xenova/paraphrase-multilingual-mpnet-base-v2.
Make sure your token has the correct permissions. Falling back to other sources.[0m
[32m2025-03-18 01:13:27.323[0m | [31m[1mERROR   [0m | [36mfastembed.common.model_management[0m:[36mdownload_model[0m:[36m450[0m - [31m[1mCould not download model from either source, sleeping for 3.0 seconds, 2 retries left.[0m
[32m2025-03-18 01:13:30.328[0m | [31m[1mERROR   [0m | [36mfastembed.common.model_management[0m:[36mdownload_model[0m:[36m429[0m - [31m[1mCould not download model from HuggingFace: 403 Forbidden: None.
Cannot access content at: https://huggingface.co/api/models/xenova/paraphrase-multilingual-mpnet-base-v2.
Make sure your token has the corre

KeyboardInterrupt: 

In [7]:
# Connect to the Qdrant server running on this machine.
qdrant_host = "localhost"
qdrant_port = 6333
client = QdrantClient(qdrant_host, port=qdrant_port)

# Print the collections currently present on the server.
collections = client.get_collections()
print(collections)


collections=[CollectionDescription(name='faq-question'), CollectionDescription(name='faq-question-answer')]


In [None]:
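# # Show collection details (disabled; "test_faq_openai" is not among the collections listed above, so point this at an existing collection before uncommenting).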
# collection_info = client.get_collection(collection_name="test_faq_openai")
# print(collection_info)


In [6]:
# # Delete collections (destructive: uncomment only to wipe the stored vectors before re-ingesting).
# client.delete_collection(collection_name="faq-question")
# client.delete_collection(collection_name="faq-question-answer")
# client.delete_collection(collection_name="faq-query-passage")


True

In [8]:
def moveEmbedding(faq_data, batch_size=100, text_builder=None):
    """
    Embed FAQ entries and upsert them into the Qdrant collection in batches.

    Uses the notebook-level settings ``url``, ``collection_name``, ``dimension``
    and ``distance`` together with the ``embeddings`` model. The collection is
    created if it does not exist yet. Each point id is the position of the FAQ
    in ``faq_data``, and each payload stores the embedded text under
    ``page_content`` plus the question/answer/category under ``metadata``.

    Args:
        faq_data: Sequence of dicts with the keys 'question', 'answer' and 'category'.
        batch_size: Number of FAQs embedded and upserted per round trip (>= 1).
        text_builder: Optional callable that receives one FAQ dict and returns
            the text to embed. When omitted, the text is
            ``"query: <question>\\npassage: <answer>"``. Pass e.g.
            ``lambda faq: faq['question']`` to embed the question only.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    import sys  # local import: only needed to pick the ru_maxrss unit below

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if text_builder is None:
        def build_text(faq):
            return f"query: {faq['question']}\npassage: {faq['answer']}"
    else:
        build_text = text_builder

    client = qdrant_client.QdrantClient(url=url)

    # Create collection if it doesn't exist.
    if not client.collection_exists(collection_name=collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=dimension, distance=distance),
        )

    # Ceiling division; evaluates to 0 for empty input so the loop is skipped.
    total_batches = (len(faq_data) + batch_size - 1) // batch_size
    overall_start_time = time.perf_counter()

    for batch_num in range(total_batches):
        batch_start_time = time.perf_counter()
        start = batch_num * batch_size
        batch_faq = faq_data[start:start + batch_size]

        # Build texts and payloads first so malformed records fail before any embedding work.
        texts = [build_text(faq) for faq in batch_faq]
        payloads = [
            {
                "page_content": text,
                "metadata": {
                    "question": faq['question'],
                    "answer": faq['answer'],
                    "category": faq['category'],
                },
            }
            for faq, text in zip(batch_faq, texts)
        ]

        # Batch embed the texts.
        batch_embeddings = embeddings.embed_documents(texts)

        # Ids continue from the batch offset so they equal the index in faq_data.
        points = [
            PointStruct(id=point_id, vector=vector, payload=payload)
            for point_id, (vector, payload) in enumerate(zip(batch_embeddings, payloads), start=start)
        ]

        # Upsert the current batch of points into Qdrant.
        client.upsert(collection_name=collection_name, points=points)

        batch_elapsed = time.perf_counter() - batch_start_time
        print(f"Processed batch {batch_num+1}/{total_batches} in {batch_elapsed:.2f} seconds")

    total_elapsed = time.perf_counter() - overall_start_time

    # Peak resident memory in MB. ru_maxrss is reported in kilobytes on Linux
    # but in bytes on macOS, so the divisor depends on the platform.
    rss_units_per_mb = 1024 * 1024 if sys.platform == "darwin" else 1024
    peak_memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / rss_units_per_mb
    print(f"Added: {len(faq_data)} FAQs in {total_elapsed:.2f} seconds, Peak Memory: {peak_memory:.2f} MB")


# **Question Only**

In [9]:
# Embed every FAQ and upsert it into the Qdrant collection.
# NOTE(review): the output below presumably comes from a run where the
# `text = faq['question']` variant was active inside moveEmbedding — confirm before re-running.
moveEmbedding(faq_data)

Processed batch 1/4 in 4.71 seconds
Processed batch 2/4 in 2.59 seconds
Processed batch 3/4 in 2.43 seconds
Processed batch 4/4 in 1.09 seconds
Added: 349 FAQs in 10.82 seconds, Peak Memory: 2322.62 MB


In [10]:
# Expose the populated collection as a LangChain vector store for similarity search.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


  qdrant = Qdrant(


In [11]:
# Spot-check retrieval: list the score and stored question of each hit for one sample query.
sample_query = 'Di mana ada lokasi Rumah Sakit Siloam'
doc = qdrant.similarity_search_with_score(sample_query, k=20, score_threshold=0.1)
for document, score in doc:
    print(f"{score:.4f} | {document.metadata['question']}")


0.9851 | Di mana ada lokasi Rumah Sakit Siloam?
0.9205 | Alamat Siloam Hospitals Buton dimana?
0.9033 | Alamat Siloam Hospitals Kupang dimana?
0.8978 | Alamat Siloam Hospitals Medan dimana?
0.8948 | Alamat Siloam Hospitals Bangka Belitung dimana?
0.8934 | Alamat Siloam Hospitals Paal Dua dimana?
0.8925 | Alamat Siloam Hospitals Labuan Bajo dimana?
0.8875 | Rumah sakit Siloam di Tangerang apa saja?
0.8851 | Alamat Siloam Hospitals Kelapa Dua dimana?
0.8809 | Alamat Siloam Hospitals Agora Cempaka Putih dimana?
0.8801 | Alamat Siloam Hospitals Jember dimana?
0.8789 | Alamat Siloam Hospitals Mampang dimana?
0.8766 | Alamat Siloam Hospitals Lubuk Linggau dimana?
0.8721 | Alamat Siloam Hospitals Bekasi Timur dimana?
0.8649 | Alamat Siloam Hospitals Bogor dimana?
0.8615 | Alamat Siloam Hospitals Ambon dimana?
0.8597 | Alamat MRCCC Siloam Hospitals Semanggi dimana?
0.8589 | Alamat Siloam Hospitals Sentosa dimana?
0.8542 | Alamat Siloam Hospitals Mataram dimana?
0.8536 | Alamat Siloam Hospitals

In [9]:
# Baseline suite: each FAQ question, used verbatim as the query, should retrieve itself.
test_list = [
    dict(query=faq['question'], expected=faq['question'])
    for faq in faq_data
]
print(test_list[0])

{'query': 'Di mana ada lokasi Rumah Sakit Siloam?', 'expected': 'Di mana ada lokasi Rumah Sakit Siloam?'}


In [10]:
# Hand-written short / keyword-style queries and the FAQ question each one should retrieve.
extra_test_cases = [
    {
        "query": "siloam ada dimana saja",
        "expected": "Di mana ada lokasi Rumah Sakit Siloam?"
    },
    {
        "query": "jam buka mcu",
        "expected": "Jam buka MCU / Medical Check Up?"
    },
    {
        "query": "layanan kesehatan di rumah",
        "expected": "Apa itu Siloam at Home? Apakah Siloam memberikan layanan kesehatan di rumah atau Homecare?"
    },
    {
        "query": "cek obat beli di siloam dari aplikasi",
        "expected": "Halo saya mau cek obat yang saya beli dari Siloam, bisakah dari aplikasi MySiloam?"
    },
    {
        "query": "asuransi kerjasa sama dengan siloam",
        "expected": "Asuransi apa saja yang bekerja sama dengan Siloam?"
    },
    # NOTE(review): presumably a negative control that no FAQ should match — confirm.
    {
        "query": "test",
        "expected": "test"
    },
]

# Append only the cases that are not in test_list yet, so re-running this cell
# does not add the same cases a second time.
test_list = test_list + [case for case in extra_test_cases if case not in test_list]


In [11]:
def testing(qdrant, q, expected):
    doc = qdrant.similarity_search_with_score(q, score_threshold=0.1, k=20)
    i = 0
    for d in doc:
        if(d[0].metadata["question"] == expected):
            return [ True, d]
        i += 1
    if len(doc) == 0:
        return [False, "No result"]
    return [False, doc[0]]


In [19]:
# Question-only embeddings: gather the test cases whose expected question
# was not among the retrieved hits.
results = []
for case in test_list:
    status, data = testing(qdrant, case["query"], case["expected"])
    if status:
        continue
    if data == "No result":
        top_question = top_score = "No result"
    else:
        top_question, top_score = data[0].metadata['question'], data[1]
    results.append({"query": case["query"], "result": top_question, "score": top_score})

# Show every failing case without row or column truncation.
df = pd.DataFrame(data=results)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    

Unnamed: 0,query,result,score
0,test,Berapa lama hasil dari MCU / Medical Check Up ...,0.52157


# **Question + Answer** 

In [23]:
# Re-embed every FAQ and upsert it into the same Qdrant collection.
# NOTE(review): the output below presumably comes from a run where the
# `text = faq['question'] + "\n" + faq['answer']` variant was active inside moveEmbedding — confirm before re-running.
moveEmbedding(faq_data)

Processed batch 1/4 in 51.16 seconds
Processed batch 2/4 in 19.14 seconds
Processed batch 3/4 in 6.59 seconds
Processed batch 4/4 in 2.78 seconds
Added: 349 FAQs in 79.67 seconds, Peak Memory: 6355.68 MB


In [24]:
# Rebuild the LangChain vector store on top of the re-ingested collection.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


In [25]:
# Same spot-check as before, now against the question + answer embeddings.
sample_query = 'Di mana ada lokasi Rumah Sakit Siloam'
doc = qdrant.similarity_search_with_score(sample_query, k=20, score_threshold=0.1)
for document, score in doc:
    print(f"{score:.4f} | {document.metadata['question']}")


0.8152 | Alamat Siloam Hospitals Putera Bahagia dimana?
0.7729 | Alamat Siloam Hospitals Makassar dimana?
0.7695 | Di mana ada lokasi Rumah Sakit Siloam?
0.7602 | Alamat Siloam Hospitals Buton dimana?
0.7588 | Alamat Siloam Hospitals Labuan Bajo dimana?
0.7584 | Rumah sakit Siloam di Tangerang apa saja?
0.7536 | Alamat Siloam Hospitals Sentosa dimana?
0.7476 | Alamat Siloam Hospitals Medan dimana?
0.7469 | Rumah sakit Siloam Hospitals Bangka Belitung punya nama lain apa?
0.7452 | Alamat Siloam Hospitals Yogyakarta dimana?
0.7447 | Rumah sakit Siloam Hospitals Medan punya nama lain apa?
0.7440 | Alamat Siloam Hospitals Banjarmasin dimana?
0.7424 | Rumah sakit Siloam Hospitals Bekasi Timur punya nama lain apa?
0.7402 | Alamat Siloam Hospitals Lubuk Linggau dimana?
0.7389 | Alamat Siloam Hospitals Bekasi Sepanjang Jaya dimana?
0.7378 | Apa misi Siloam Hospitals?
0.7375 | Alamat Siloam Hospitals Jember dimana?
0.7370 | Rumah sakit Siloam Hospitals Putera Bahagia punya nama lain apa?
0.7359

In [26]:
# Question + answer embeddings: gather the test cases whose expected question
# was not among the retrieved hits.
results = []
for case in test_list:
    status, data = testing(qdrant, case["query"], case["expected"])
    if status:
        continue
    if data == "No result":
        top_question = top_score = "No result"
    else:
        top_question, top_score = data[0].metadata['question'], data[1]
    results.append({"query": case["query"], "result": top_question, "score": top_score})

# Show every failing case without row or column truncation.
df = pd.DataFrame(data=results)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    

Unnamed: 0,query,result,score
0,Fasilitas apa yang dimiliki pusat keunggulan R...,Apa visi Siloam Hospitals?,0.650053
1,siloam ada dimana saja,Apa nama lain Siloam Kuta?,0.521162
2,test,Berapa lama hasil dari MCU / Medical Check Up ...,0.378687


# **query: Question + passage: Answer** 

In [12]:
# Re-embed every FAQ using the "query: <question>\npassage: <answer>" text
# built inside moveEmbedding, and upsert it into the same Qdrant collection.
moveEmbedding(faq_data)

Processed batch 1/4 in 51.67 seconds
Processed batch 2/4 in 19.55 seconds
Processed batch 3/4 in 7.29 seconds
Processed batch 4/4 in 3.03 seconds
Added: 349 FAQs in 81.55 seconds, Peak Memory: 4455.13 MB


In [13]:
# Rebuild the LangChain vector store on top of the query/passage-formatted collection.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


  qdrant = Qdrant(


In [14]:
# Same spot-check as before, now against the query/passage-formatted embeddings.
sample_query = 'Di mana ada lokasi Rumah Sakit Siloam'
doc = qdrant.similarity_search_with_score(sample_query, k=20, score_threshold=0.1)
for document, score in doc:
    print(f"{score:.4f} | {document.metadata['question']}")


0.8291 | Alamat Siloam Hospitals Putera Bahagia dimana?
0.7610 | Di mana ada lokasi Rumah Sakit Siloam?
0.7573 | Alamat Siloam Hospitals Labuan Bajo dimana?
0.7409 | Alamat Siloam Hospitals Makassar dimana?
0.7383 | Alamat Siloam Hospitals Buton dimana?
0.7254 | Alamat Siloam Hospitals Banjarmasin dimana?
0.7123 | Alamat Siloam Hospitals Palangkaraya dimana?
0.7117 | Rumah sakit Siloam di Tangerang apa saja?
0.7105 | Alamat Siloam Hospitals Sentosa dimana?
0.7054 | Alamat Siloam Hospitals Purwakarta dimana?
0.7005 | Alamat Siloam Hospitals Lubuk Linggau dimana?
0.6994 | Alamat MRCCC Siloam Hospitals Semanggi dimana?
0.6979 | Alamat Siloam Hospitals Jember dimana?
0.6944 | Apa misi Siloam Hospitals?
0.6934 | Apa visi Siloam Hospitals?
0.6931 | Alamat Siloam Hospitals Bekasi Sepanjang Jaya dimana?
0.6924 | Alamat Siloam Hospitals Semarang dimana?
0.6914 | Alamat Siloam Hospitals Surabaya dimana?
0.6914 | Alamat Siloam Hospitals Bekasi Timur dimana?
0.6911 | Rumah sakit Siloam Hospitals M

In [None]:
# "query: question / passage: answer" embeddings: gather the test cases whose
# expected question was not among the retrieved hits.
results = []
for case in test_list:
    status, data = testing(qdrant, case["query"], case["expected"])
    if status:
        continue
    if data == "No result":
        top_question = top_score = "No result"
    else:
        top_question, top_score = data[0].metadata['question'], data[1]
    results.append({"query": case["query"], "result": top_question, "score": top_score})

# Show every failing case without row or column truncation.
df = pd.DataFrame(data=results)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    

Unnamed: 0,query,result,score
0,Fasilitas apa yang dimiliki pusat keunggulan R...,Apa misi Siloam Hospitals?,0.625438
1,Rumah Sakit Siloam mana saja yang menerima pas...,Data apa yang dibutuhkan saat menggunakan BPJS...,0.65062
2,siloam ada dimana saja,Apa nama lain Siloam Cempaka Putih?,0.386341
3,test,Berapa lama hasil dari MCU / Medical Check Up ...,0.421545


: 