# **Frameworks and Libraries**

In [1]:
import pandas as pd
import json
import time
import resource 

from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_qdrant import Qdrant

import qdrant_client
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance


  from .autonotebook import tqdm as notebook_tqdm


# **Dataset**

In [2]:
# Read the FAQ records (a JSON document) from disk as UTF-8 text.
with open("../data/full_faq.json", "r", encoding="utf-8") as faq_file:
    faq_data = json.loads(faq_file.read())

# Show the first record as a quick look at the schema.
faq_data[0]


{'question': 'Di mana ada lokasi Rumah Sakit Siloam?',
 'answer': 'Ada 40 Rumah Sakit modern yang terdiri dari 13 Rumah Sakit di Jabodetabek dan 27 rumah sakit yang tersebar di Jawa, Sumatera, Kalimantan, Sulawesi, serta Bali dan Nusa Tenggara.',
 'category': 'FAQ Website'}

# **Embedding Model**

In [3]:
from fastembed import TextEmbedding

# Catalogue of the models fastembed reports as supported, ordered by size,
# with the bookkeeping columns removed.
supported_models = (
    pd.DataFrame(TextEmbedding.list_supported_models())
    .sort_values("size_in_GB")
    .drop(columns=["sources", "model_file", "additional_files", "license", "tasks"])
    .reset_index(drop=True)
)

# Keep models whose description mentions "multilingual", excluding the jinaai family.
is_multilingual = supported_models['description'].str.contains("multilingual", case=False, na=False)
is_jinaai = supported_models['model'].str.contains("jinaai", case=False, na=False)
filtered_models = supported_models[is_multilingual & ~is_jinaai]

# Render without truncating the long description column.
with pd.option_context('display.max_colwidth', None):
    display(filtered_models)


Unnamed: 0,model,description,size_in_GB,dim
9,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,"Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Prefixes for queries/documents: not necessary, 2019 year.",0.22,384
24,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,"Text embeddings, Unimodal (text), Multilingual (~50 languages), 384 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year.",1.0,768
28,intfloat/multilingual-e5-large,"Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year.",2.24,1024


# **sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2**

In [37]:
# Qdrant endpoint and vector settings shared by the cells below.
url = "http://localhost:6333"
distance = Distance.COSINE
dimension = 384  # vector size of paraphrase-multilingual-MiniLM-L12-v2 (see the model table above)

# The active collection is switched by hand between experiments.
# Other names used in this notebook: "faq-question", "faq-question-answer".
collection_name = "faq-query-passage"

# Embedding model served through fastembed; downloaded files are cached in ../embedding_cache.
embeddings = FastEmbedEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    cache_dir="../embedding_cache",
)


  values["model"] = fastembed.TextEmbedding(


Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00,  3.23it/s]


In [38]:
# Initiate client against the endpoint configured in `url`, so the address is
# defined in exactly one place (the later cells connect the same way).
client = QdrantClient(url=url)

# Show collections.
collections = client.get_collections()
print(collections)


collections=[CollectionDescription(name='faq-question'), CollectionDescription(name='faq-question-answer'), CollectionDescription(name='faq-query-passage')]


In [None]:
# # Show collection details.
# collection_info = client.get_collection(collection_name="test_faq_openai")
# print(collection_info)


In [23]:
# Delete collection.
# client.delete_collection(collection_name="faq-question-answer")


True

In [32]:
# Text variants compared in this notebook, keyed by the value of `text_mode`.
_FAQ_TEXT_BUILDERS = {
    "question": lambda faq: faq['question'],
    "question-answer": lambda faq: faq['question'] + "\n" + faq['answer'],
    "query-passage": lambda faq: f"query: {faq['question']}\npassage: {faq['answer']}",
}


def moveEmbedding(faq_data, batch_size=100, text_mode="query-passage"):
    """
    Embed FAQ entries and upsert them into a Qdrant collection, batch by batch.

    Reads the notebook-level globals `url`, `collection_name`, `dimension`,
    `distance` and `embeddings`. The collection is created when it does not
    exist yet. Point ids are the positions of the entries in `faq_data`, so
    re-running the function overwrites the same points.

    Args:
        faq_data: Sequence of dicts with the keys 'question', 'answer' and 'category'.
        batch_size: Number of FAQs embedded and upserted per round trip (>= 1).
        text_mode: Which text is embedded and stored as `page_content`:
            "question"        -> the question only
            "question-answer" -> question, newline, answer
            "query-passage"   -> "query: <question>\\npassage: <answer>" (default)
            The category is never embedded; it is stored in the payload metadata only.

    Raises:
        ValueError: If `batch_size` is smaller than 1 or `text_mode` is unknown.
    """
    import sys  # local import: only needed to pick the ru_maxrss unit below

    if batch_size < 1:
        # A negative size used to yield zero batches while still reporting success.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if text_mode not in _FAQ_TEXT_BUILDERS:
        raise ValueError(
            f"Unknown text_mode {text_mode!r}; expected one of {sorted(_FAQ_TEXT_BUILDERS)}"
        )
    build_text = _FAQ_TEXT_BUILDERS[text_mode]
    # NOTE(review): the "query:"/"passage:" prefixes follow the convention of models that
    # require them; the model table above lists prefixes as "not necessary" for MiniLM — confirm
    # that this variant is intended for the model configured in `embeddings`.

    client = qdrant_client.QdrantClient(url=url)
    try:
        # Create collection if it doesn't exist.
        if not client.collection_exists(collection_name=collection_name):
            client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=dimension, distance=distance),
            )

        total_batches = (len(faq_data) + batch_size - 1) // batch_size
        overall_start_time = time.perf_counter()

        for batch_num in range(total_batches):
            batch_start_time = time.perf_counter()
            start = batch_num * batch_size
            batch_faq = faq_data[start:start + batch_size]

            # Batch embed the texts.
            texts = [build_text(faq) for faq in batch_faq]
            batch_embeddings = embeddings.embed_documents(texts)

            points = [
                PointStruct(
                    id=point_id,
                    vector=emb,
                    payload={
                        "page_content": text,
                        "metadata": {
                            "question": faq['question'],
                            "answer": faq['answer'],
                            "category": faq['category']
                        }
                    },
                )
                for point_id, (faq, text, emb) in enumerate(
                    zip(batch_faq, texts, batch_embeddings), start=start
                )
            ]

            # Upsert the current batch of points into Qdrant.
            client.upsert(
                collection_name=collection_name,
                points=points
            )

            batch_elapsed = time.perf_counter() - batch_start_time
            print(f"Processed batch {batch_num+1}/{total_batches} in {batch_elapsed:.2f} seconds")

        total_elapsed = time.perf_counter() - overall_start_time
    finally:
        # Release the connection even when embedding or upserting fails.
        client.close()

    # Peak resident memory of the whole process so far (not of this call alone), in MB.
    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS.
    max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    peak_memory = max_rss / (1024 * 1024 if sys.platform == "darwin" else 1024)
    print(f"Added: {len(faq_data)} FAQs in {total_elapsed:.2f} seconds, Peak Memory: {peak_memory:.2f} MB")


# **Question Only**

In [7]:
# NOTE(review): the recorded output below comes from execution In [7], which ran before the
# visible moveEmbedding definition (In [32]). Presumably an earlier version that embedded
# only faq['question'] was active, with a matching collection_name — confirm before re-running.
moveEmbedding(faq_data)

Processed batch 1/4 in 1.55 seconds
Processed batch 2/4 in 0.75 seconds
Processed batch 3/4 in 0.64 seconds
Processed batch 4/4 in 0.32 seconds
Added: 349 FAQs in 3.26 seconds, Peak Memory: 908.22 MB


In [8]:
# Reconnect and expose the active collection as a LangChain vector store.
# NOTE(review): the recorded output shows a warning raised at the `Qdrant(` call;
# presumably a deprecation notice — confirm and migrate if so.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client=client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


  qdrant = Qdrant(


In [9]:
# Sanity check: list score and stored question for the hits of one known query.
doc = qdrant.similarity_search_with_score(
    'Di mana ada lokasi Rumah Sakit Siloam',
    k=20,
    score_threshold=0.1,
)
for hit, similarity in doc:
    print(f"{similarity:.4f} | {hit.metadata['question']}")


0.9926 | Di mana ada lokasi Rumah Sakit Siloam?
0.9410 | Alamat Siloam Hospitals Mampang dimana?
0.9267 | Alamat Siloam Hospitals Medan dimana?
0.9201 | Alamat Siloam Hospitals Palangkaraya dimana?
0.8930 | Apa visi Siloam Hospitals?
0.8910 | Alamat Siloam Hospitals Jambi dimana?
0.8841 | Alamat Siloam Hospitals Ambon dimana?
0.8835 | Alamat Siloam Hospitals Bangka Belitung dimana?
0.8829 | Alamat Siloam Hospitals Balikpapan dimana?
0.8825 | Alamat Siloam Hospitals Kupang dimana?
0.8800 | Alamat Siloam Hospitals Purwakarta dimana?
0.8763 | Rumah sakit Siloam di Jabodetabek apa saja?
0.8759 | Alamat Siloam Hospitals Semarang dimana?
0.8757 | Alamat Siloam Hospitals Labuan Bajo dimana?
0.8752 | Alamat Siloam Hospitals Putera Bahagia dimana?
0.8752 | Alamat Siloam Hospitals Paal Dua dimana?
0.8739 | Rumah sakit Siloam di Tangerang apa saja?
0.8725 | Alamat Siloam Hospitals Buton dimana?
0.8718 | Apa misi Siloam Hospitals?
0.8705 | Alamat Siloam Hospitals Jember dimana?


In [10]:
# Exact-match cases: each FAQ question is used as the query and as the expected hit.
test_list = []
for faq in faq_data:
    test_list.append({'query': faq['question'], 'expected': faq['question']})
print(test_list[0])

{'query': 'Di mana ada lokasi Rumah Sakit Siloam?', 'expected': 'Di mana ada lokasi Rumah Sakit Siloam?'}


In [11]:
# Hand-written paraphrased / shortened queries and the FAQ question each should retrieve.
paraphrase_cases = [
    {
        "query": "siloam ada dimana saja",
        "expected": "Di mana ada lokasi Rumah Sakit Siloam?"
    },
    {
        "query": "jam buka mcu",
        "expected": "Jam buka MCU / Medical Check Up?"
    },
    {
        "query": "layanan kesehatan di rumah",
        "expected": "Apa itu Siloam at Home? Apakah Siloam memberikan layanan kesehatan di rumah atau Homecare?"
    },
    {
        "query": "cek obat beli di siloam dari aplikasi",
        "expected": "Halo saya mau cek obat yang saya beli dari Siloam, bisakah dari aplikasi MySiloam?"
    },
    {
        # NOTE(review): "kerjasa sama" looks like a typo for "kerja sama"; left untouched
        # because it may be deliberate noisy input — confirm.
        "query": "asuransi kerjasa sama dengan siloam",
        "expected": "Asuransi apa saja yang bekerja sama dengan Siloam?"
    },
]

# Drop any copies left by a previous run of this cell before appending, so that
# re-running the cell does not duplicate the paraphrase cases.
test_list = [case for case in test_list if case not in paraphrase_cases] + paraphrase_cases


In [12]:
def testing(qdrant, q, expected):
    doc = qdrant.similarity_search_with_score(q, score_threshold=0.1, k=20)
    i = 0
    for d in doc:
        if(d[0].metadata["question"] == expected):
            return [ True, d]
        i += 1
    if len(doc) == 0:
        return [False, "No result"]
    return [False, doc[0]]


In [13]:
# With question only.
results = []
for t in test_list:
    [status, data] = testing(qdrant, t["query"], t["expected"])
    if(status == False):
        _data = {
            "query": t["query"],
            "result": data[0].metadata['question'] if data != "No result" else "No result",
            "score": data[1] if data != "No result" else "No result"
        }
        results.append(_data)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    df = pd.DataFrame(data=results)
    display(df)
    

Unnamed: 0,query,result,score
0,jam buka mcu,Apa itu SHKP?,0.600095
1,asuransi kerjasa sama dengan siloam,Bagaimana alur dalam membayar dengan asuransi?,0.731035


# **Question + Answer** 

In [26]:
# NOTE(review): the recorded output below comes from execution In [26], which ran before the
# visible moveEmbedding definition (In [32]). Presumably the question + "\n" + answer text
# variant was active, with a matching collection_name — confirm before re-running.
moveEmbedding(faq_data)

Processed batch 1/4 in 4.98 seconds
Processed batch 2/4 in 5.44 seconds
Processed batch 3/4 in 2.22 seconds
Processed batch 4/4 in 0.85 seconds
Added: 349 FAQs in 13.49 seconds, Peak Memory: 1765.21 MB


In [27]:
# Reconnect and expose the active collection as a LangChain vector store.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client=client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


In [28]:
# Sanity check: list score and stored question for the hits of one known query.
doc = qdrant.similarity_search_with_score(
    'Di mana ada lokasi Rumah Sakit Siloam',
    k=20,
    score_threshold=0.1,
)
for hit, similarity in doc:
    print(f"{similarity:.4f} | {hit.metadata['question']}")


0.8514 | Alamat Siloam Hospitals Putera Bahagia dimana?
0.8158 | Alamat Siloam Hospitals Purwakarta dimana?
0.8148 | Alamat Siloam Hospitals Palangkaraya dimana?
0.8126 | Rumah sakit Siloam Hospitals Palangkaraya punya nama lain apa?
0.8125 | Alamat Siloam Hospitals Labuan Bajo dimana?
0.8114 | Rumah sakit Siloam Hospitals Bekasi Timur punya nama lain apa?
0.8056 | Rumah sakit Siloam di Tangerang apa saja?
0.8052 | Alamat Siloam Hospitals Mampang dimana?
0.7990 | Rumah sakit Siloam Hospitals Ambon punya nama lain apa?
0.7956 | Rumah sakit Siloam Hospitals Bekasi Sepanjang Jaya punya nama lain apa?
0.7943 | Rumah sakit Siloam Hospitals Labuan Bajo punya nama lain apa?
0.7932 | Alamat Siloam Hospitals Ambon dimana?
0.7898 | Rumah sakit Siloam di pulau Kalimantan apa saja?
0.7872 | Alamat Siloam Hospitals Lippo Cikarang dimana?
0.7869 | Di mana ada lokasi Rumah Sakit Siloam?
0.7865 | Rumah sakit Siloam di pulau Jawa apa saja?
0.7851 | Rumah sakit Siloam Hospitals Medan punya nama lain apa

In [29]:
# With question + answer + category.
results = []
for t in test_list:
    [status, data] = testing(qdrant, t["query"], t["expected"])
    if(status == False):
        _data = {
            "query": t["query"],
            "result": data[0].metadata['question'] if data != "No result" else "No result",
            "score": data[1] if data != "No result" else "No result"
        }
        results.append(_data)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    df = pd.DataFrame(data=results)
    display(df)
    

Unnamed: 0,query,result,score
0,Rumah sakit Siloam di Jabodetabek apa saja?,Alamat Siloam Hospitals Labuan Bajo dimana?,0.806024
1,siloam ada dimana saja,Alamat SHCN dimana?,0.294736


# **query: Question + passage: Answer** 

In [33]:
# Ingest with moveEmbedding's current default text format ("query: <question>\npassage: <answer>").
# NOTE(review): the target collection is whatever `collection_name` holds when this runs — confirm
# it is set to the collection intended for this variant.
moveEmbedding(faq_data)

Processed batch 1/4 in 5.37 seconds
Processed batch 2/4 in 5.19 seconds
Processed batch 3/4 in 2.58 seconds
Processed batch 4/4 in 0.99 seconds
Added: 349 FAQs in 14.14 seconds, Peak Memory: 2775.07 MB


In [34]:
# Reconnect and expose the active collection as a LangChain vector store.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client=client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


In [35]:
# Sanity check: list score and stored question for the hits of one known query.
doc = qdrant.similarity_search_with_score(
    'Di mana ada lokasi Rumah Sakit Siloam',
    k=20,
    score_threshold=0.1,
)
for hit, similarity in doc:
    print(f"{similarity:.4f} | {hit.metadata['question']}")


0.8388 | Alamat Siloam Hospitals Putera Bahagia dimana?
0.7805 | Alamat Siloam Hospitals Purwakarta dimana?
0.7638 | Rumah sakit Siloam di pulau Jawa apa saja?
0.7607 | Rumah sakit Siloam di Tangerang apa saja?
0.7554 | Alamat Siloam Hospitals Palangkaraya dimana?
0.7532 | Rumah sakit Siloam Hospitals Bekasi Timur punya nama lain apa?
0.7521 | Alamat Siloam Hospitals Sentosa dimana?
0.7516 | Di mana ada lokasi Rumah Sakit Siloam?
0.7516 | Alamat Siloam Hospitals Bekasi Sepanjang Jaya dimana?
0.7505 | Alamat Siloam Hospitals Lippo Cikarang dimana?
0.7453 | Alamat Siloam Hospitals Kupang dimana?
0.7405 | Alamat Siloam Hospitals Semarang dimana?
0.7401 | Alamat Siloam Hospitals Buton dimana?
0.7381 | Alamat Siloam Hospitals Labuan Bajo dimana?
0.7369 | Rumah sakit Siloam di pulau Kalimantan apa saja?
0.7363 | Rumah sakit Siloam Hospitals Palangkaraya punya nama lain apa?
0.7356 | Alamat Siloam Hospitals Bekasi Timur dimana?
0.7341 | Rumah sakit Siloam Hospitals Ambon punya nama lain apa?


In [36]:
# With question + answer + category.
results = []
for t in test_list:
    [status, data] = testing(qdrant, t["query"], t["expected"])
    if(status == False):
        _data = {
            "query": t["query"],
            "result": data[0].metadata['question'] if data != "No result" else "No result",
            "score": data[1] if data != "No result" else "No result"
        }
        results.append(_data)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    df = pd.DataFrame(data=results)
    display(df)
    

Unnamed: 0,query,result,score
0,Rumah sakit Siloam di Jabodetabek apa saja?,Alamat Siloam Hospitals Putera Bahagia dimana?,0.768076
1,Alamat SHBN dimana?,Alamat SHSB dimana?,0.686908
2,Alamat Siloam Hospitals Medan dimana?,Alamat Siloam Hospitals Putera Bahagia dimana?,0.868845
3,siloam ada dimana saja,Alamat SHLB dimana?,0.284515
