# **Framework, Library, and API Key**

In [1]:
import json
import resource
import sys
import time

import pandas as pd
import qdrant_client
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance


  from .autonotebook import tqdm as notebook_tqdm


# **Dataset**

In [2]:
# Read the FAQ entries from disk, then preview the first record as the cell output.
with open("../data/full_faq.json", mode="r", encoding="utf-8") as faq_file:
    faq_data = json.load(faq_file)

faq_data[0]


{'question': 'Di mana ada lokasi Rumah Sakit Siloam?',
 'answer': 'Ada 40 Rumah Sakit modern yang terdiri dari 13 Rumah Sakit di Jabodetabek dan 27 rumah sakit yang tersebar di Jawa, Sumatera, Kalimantan, Sulawesi, serta Bali dan Nusa Tenggara.',
 'category': 'FAQ Website'}

# **Embedding Model**

In [3]:
from fastembed import TextEmbedding

# Tabulate every model fastembed reports, smallest first, keeping only the
# columns that are useful for choosing a model.
unused_columns = ["sources", "model_file", "additional_files", "license", "tasks"]
supported_models = pd.DataFrame(TextEmbedding.list_supported_models())
supported_models = supported_models.sort_values("size_in_GB")
supported_models = supported_models.drop(columns=unused_columns)
supported_models = supported_models.reset_index(drop=True)

# Keep models whose description mentions "multilingual", excluding the jinaai ones.
is_multilingual = supported_models['description'].str.contains("multilingual", case=False, na=False)
is_jinaai = supported_models['model'].str.contains("jinaai", case=False, na=False)
filtered_models = supported_models[is_multilingual & ~is_jinaai]

# Show the shortlist without truncating the long description column.
with pd.option_context('display.max_colwidth', None):
    display(filtered_models)


Unnamed: 0,model,description,size_in_GB,dim
9,sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2,"Text embeddings, Unimodal (text), Multilingual (~50 languages), 512 input tokens truncation, Prefixes for queries/documents: not necessary, 2019 year.",0.22,384
24,sentence-transformers/paraphrase-multilingual-mpnet-base-v2,"Text embeddings, Unimodal (text), Multilingual (~50 languages), 384 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year.",1.0,768
28,intfloat/multilingual-e5-large,"Text embeddings, Unimodal (text), Multilingual (~100 languages), 512 input tokens truncation, Prefixes for queries/documents: necessary, 2024 year.",2.24,1024


# **intfloat/multilingual-e5-large**

In [3]:
# Qdrant connection and collection settings shared by the cells below.
url = "http://localhost:6333"
# Exactly one collection name is active per experiment; the others stay commented out.
# collection_name = "faq-question"
collection_name = "faq-question-answer"
# collection_name = "faq-query-passage"
distance = Distance.COSINE
# Vector size of the collection; the model table above lists dim 1024 for multilingual-e5-large.
dimension = 1024

# Embedding model used for both ingestion and querying.
embeddings = FastEmbedEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    cache_dir="../embedding_cache",
)


  values["model"] = fastembed.TextEmbedding(


In [6]:
# Connect through the shared `url` setting so this cell talks to the same
# Qdrant instance as the ingestion and search cells (previously the host and
# port were hard-coded here a second time).
client = QdrantClient(url=url)

# Show collections.
collections = client.get_collections()
print(collections)


collections=[CollectionDescription(name='faq-question')]


In [None]:
# NOTE(review): disabled cell. "test_faq_openai" is not one of the collection
# names configured in this notebook — presumably a leftover; confirm before enabling.
# # Show collection details.
# collection_info = client.get_collection(collection_name="test_faq_openai")
# print(collection_info)


In [None]:
# Disabled on purpose: each call below drops a whole collection from Qdrant.
# Uncomment only the line for the collection you intend to rebuild.
# # Delete collection.
# client.delete_collection(collection_name="faq-question")
# client.delete_collection(collection_name="faq-question-answer")
# client.delete_collection(collection_name="faq-query-passage")


True

In [7]:
# Maps a text_mode name to the function that builds the string embedded for one FAQ.
_FAQ_TEXT_BUILDERS = {
    "question": lambda faq: faq['question'],
    "question_answer": lambda faq: faq['question'] + "\n" + faq['answer'],
    "query_passage": lambda faq: f"query: {faq['question']}\npassage: {faq['answer']}",
}


def _peak_memory_mb():
    """Return this process's peak resident set size in megabytes."""
    max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS.
    units_per_mb = 1024 * 1024 if sys.platform == "darwin" else 1024
    return max_rss / units_per_mb


def moveEmbedding(faq_data, batch_size=100, text_mode="question_answer", target_collection=None):
    """
    Embed FAQ entries and upsert them into a Qdrant collection, batch by batch.

    Each FAQ must be a dict with the keys 'question', 'answer', and 'category'.
    The embedded text is built from the question and/or answer according to
    ``text_mode``; 'category' is never embedded, it is only stored in the
    payload metadata next to the question and answer.

    Point ids are the positions of the FAQs in ``faq_data``, so running the
    function again on the same list overwrites the same points.

    Args:
        faq_data: List of FAQ dicts (it is sliced, so it must support slicing).
        batch_size: Number of FAQs embedded and upserted per request.
        text_mode: Which text to embed for each FAQ:
            "question"        -> the question only
            "question_answer" -> question + "\\n" + answer (default)
            "query_passage"   -> "query: <question>\\npassage: <answer>"
        target_collection: Collection to write to. Defaults to the notebook's
            global ``collection_name``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``text_mode`` is unknown.

    Relies on the notebook globals ``url``, ``collection_name``, ``dimension``,
    ``distance`` and ``embeddings``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if text_mode not in _FAQ_TEXT_BUILDERS:
        raise ValueError(
            f"Unknown text_mode {text_mode!r}; expected one of {sorted(_FAQ_TEXT_BUILDERS)}"
        )
    build_text = _FAQ_TEXT_BUILDERS[text_mode]
    collection = collection_name if target_collection is None else target_collection

    client = qdrant_client.QdrantClient(url=url)

    # Create collection if it doesn't exist.
    if not client.collection_exists(collection_name=collection):
        client.create_collection(
            collection_name=collection,
            vectors_config=VectorParams(size=dimension, distance=distance),
        )

    # Ceiling division; yields 0 batches for an empty faq_data.
    total_batches = (len(faq_data) + batch_size - 1) // batch_size
    overall_start_time = time.time()

    for batch_num, start in enumerate(range(0, len(faq_data), batch_size), start=1):
        batch_start_time = time.time()
        batch_faq = faq_data[start:start + batch_size]
        texts = [build_text(faq) for faq in batch_faq]

        # Batch embed the texts.
        # NOTE(review): the model table lists query/passage prefixes as necessary for
        # multilingual-e5-large; whether embed_documents adds them is not visible
        # here — confirm against the FastEmbedEmbeddings configuration.
        batch_embeddings = embeddings.embed_documents(texts)

        points = [
            PointStruct(
                # Global position in faq_data, so ids stay unique across batches.
                id=start + offset,
                vector=vector,
                payload={
                    "page_content": text,
                    "metadata": {
                        "question": faq['question'],
                        "answer": faq['answer'],
                        "category": faq['category']
                    }
                },
            )
            for offset, (faq, text, vector) in enumerate(zip(batch_faq, texts, batch_embeddings))
        ]

        # Upsert the current batch of points into Qdrant.
        client.upsert(
            collection_name=collection,
            points=points
        )

        batch_elapsed = time.time() - batch_start_time
        print(f"Processed batch {batch_num}/{total_batches} in {batch_elapsed:.2f} seconds")

    total_elapsed = time.time() - overall_start_time

    # Peak memory of the whole process so far (in MB), not just of this call.
    peak_memory = _peak_memory_mb()
    print(f"Added: {len(faq_data)} FAQs in {total_elapsed:.2f} seconds, Peak Memory: {peak_memory:.2f} MB")


# **Question Only**

In [9]:
# NOTE(review): this section is titled "Question Only", so this run presumably had
# collection_name = "faq-question" and the question-only text line enabled inside
# moveEmbedding — confirm both before re-running.
moveEmbedding(faq_data)

Processed batch 1/4 in 17.15 seconds
Processed batch 2/4 in 9.09 seconds
Processed batch 3/4 in 7.92 seconds
Processed batch 4/4 in 3.73 seconds
Added: 349 FAQs in 37.89 seconds, Peak Memory: 2049.07 MB


In [10]:
# Reconnect to Qdrant and wrap the currently configured collection as a
# LangChain vector store that embeds queries with the same model.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client=client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


  qdrant = Qdrant(


In [11]:
# Sanity check: list up to k=20 hits (score_threshold=0.1) for a sample location question.
hits = qdrant.similarity_search_with_score(
    'Di mana ada lokasi Rumah Sakit Siloam',
    k=20,
    score_threshold=0.1,
)
for hit_doc, hit_score in hits:
    print(f"{hit_score:.4f} | {hit_doc.metadata['question']}")


0.9801 | Di mana ada lokasi Rumah Sakit Siloam?
0.9303 | Alamat Siloam Hospitals ASRI dimana?
0.9284 | Rumah sakit Siloam di Jakarta apa saja?
0.9253 | Rumah sakit Siloam di Jabodetabek apa saja?
0.9250 | Alamat Siloam Hospitals Medan dimana?
0.9240 | Alamat Siloam Hospitals Surabaya dimana?
0.9219 | Alamat Siloam Hospitals Jambi dimana?
0.9205 | Alamat Siloam Hospitals Mampang dimana?
0.9205 | Alamat Siloam Hospitals Bogor dimana?
0.9201 | Alamat Siloam Hospitals Kebon Jeruk dimana?
0.9199 | Rumah sakit Siloam di pulau Jawa apa saja?
0.9188 | Alamat Siloam Hospitals Makassar dimana?
0.9181 | Alamat Siloam Hospitals Sentosa dimana?
0.9170 | Alamat Siloam Hospitals Semarang dimana?
0.9157 | Rumah sakit Siloam di pulau Sumatera apa saja?
0.9155 | Alamat Siloam Hospitals Lippo Village dimana?
0.9141 | Rumah sakit Siloam di pulau Kalimantan apa saja?
0.9140 | Rumah sakit Siloam di Tangerang apa saja?
0.9136 | Alamat Siloam Hospitals Banjarmasin dimana?
0.9132 | Alamat Siloam Hospitals Yogy

In [12]:
# Baseline cases: every FAQ question, used verbatim as the query, is expected to retrieve itself.
test_list = []
for faq in faq_data:
    test_list.append({'query': faq['question'], 'expected': faq['question']})
print(test_list[0])

{'query': 'Di mana ada lokasi Rumah Sakit Siloam?', 'expected': 'Di mana ada lokasi Rumah Sakit Siloam?'}


In [13]:
# Hand-written queries, each paired with the FAQ question it is expected to retrieve.
manual_cases = [
    {
        "query": "siloam ada dimana saja",
        "expected": "Di mana ada lokasi Rumah Sakit Siloam?"
    },
    {
        "query": "jam buka mcu",
        "expected": "Jam buka MCU / Medical Check Up?"
    },
    {
        "query": "layanan kesehatan di rumah",
        "expected": "Apa itu Siloam at Home? Apakah Siloam memberikan layanan kesehatan di rumah atau Homecare?"
    },
    {
        "query": "cek obat beli di siloam dari aplikasi",
        "expected": "Halo saya mau cek obat yang saya beli dari Siloam, bisakah dari aplikasi MySiloam?"
    },
    {
        "query": "asuransi kerjasa sama dengan siloam",
        "expected": "Asuransi apa saja yang bekerja sama dengan Siloam?"
    },
    # NOTE(review): "test" shows up in every recorded failure table, so it
    # presumably serves as a deliberate miss — confirm.
    {
        "query": "test",
        "expected": "test"
    },
]

# Remove copies left by an earlier run of this cell before appending, so that
# re-running the cell does not duplicate the manual cases.
test_list = [case for case in test_list if case not in manual_cases] + manual_cases


In [14]:
def testing(qdrant, q, expected):
    doc = qdrant.similarity_search_with_score(q, score_threshold=0.1, k=20)
    i = 0
    for d in doc:
        if(d[0].metadata["question"] == expected):
            return [ True, d]
        i += 1
    if len(doc) == 0:
        return [False, "No result"]
    return [False, doc[0]]


In [15]:
# With question only.
# Collect the test cases whose expected question was not among the hits.
results = []
for case in test_list:
    found, hit = testing(qdrant, case["query"], case["expected"])
    if found:
        continue
    # testing() returns the "No result" marker instead of a hit when the search came back empty.
    missing = hit == "No result"
    results.append({
        "query": case["query"],
        "result": "No result" if missing else hit[0].metadata['question'],
        "score": "No result" if missing else hit[1],
    })
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    df = pd.DataFrame(data=results)
    display(df)
    

Unnamed: 0,query,result,score
0,test,Bagaimana jika ada rujukan untuk pemeriksaan L...,0.784677


# **Question + Answer** 

In [8]:
# Ingest using the question + "\n" + answer text, the variant left active in moveEmbedding.
# NOTE(review): expects collection_name = "faq-question-answer" in the config cell — confirm before re-running.
moveEmbedding(faq_data)

Processed batch 1/4 in 175.97 seconds
Processed batch 2/4 in 67.59 seconds
Processed batch 3/4 in 24.06 seconds
Processed batch 4/4 in 10.04 seconds
Added: 349 FAQs in 277.66 seconds, Peak Memory: 6229.53 MB


In [9]:
# Reconnect to Qdrant and wrap the currently configured collection as a
# LangChain vector store that embeds queries with the same model.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client=client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


  qdrant = Qdrant(


In [10]:
# Sanity check: list up to k=20 hits (score_threshold=0.1) for a sample location question.
hits = qdrant.similarity_search_with_score(
    'Di mana ada lokasi Rumah Sakit Siloam',
    k=20,
    score_threshold=0.1,
)
for hit_doc, hit_score in hits:
    print(f"{hit_score:.4f} | {hit_doc.metadata['question']}")


0.8789 | Alamat Siloam Hospitals Yogyakarta dimana?
0.8783 | Alamat Siloam Hospitals TB Simatupang dimana?
0.8774 | Alamat Siloam Hospitals Bogor dimana?
0.8756 | Di mana ada lokasi Rumah Sakit Siloam?
0.8754 | Alamat Siloam Hospitals Jambi dimana?
0.8750 | Alamat Siloam Hospitals Kebon Jeruk dimana?
0.8748 | Alamat Siloam Hospitals Medan dimana?
0.8746 | Alamat Siloam Hospitals Jember dimana?
0.8735 | Alamat Siloam Hospitals ASRI dimana?
0.8734 | Alamat Siloam Hospitals Semarang dimana?
0.8732 | Alamat Siloam Hospitals Balikpapan dimana?
0.8728 | Alamat Siloam Hospitals Mampang dimana?
0.8717 | Alamat Siloam Hospitals Surabaya dimana?
0.8715 | Alamat Siloam Hospitals Denpasar dimana?
0.8713 | Alamat Siloam Hospitals Bekasi Timur dimana?
0.8711 | Alamat Siloam Hospitals Sentosa dimana?
0.8706 | Alamat Siloam Hospitals Mataram dimana?
0.8700 | Alamat Siloam Hospitals Kelapa Dua dimana?
0.8696 | Alamat Siloam Hospitals Putera Bahagia dimana?
0.8689 | Alamat Siloam Hospitals Labuan Bajo d

In [15]:
# With question + answer (category is stored in the payload but is not part of the embedded text).
# Collect the test cases whose expected question was not among the hits.
results = []
for case in test_list:
    found, hit = testing(qdrant, case["query"], case["expected"])
    if found:
        continue
    # testing() returns the "No result" marker instead of a hit when the search came back empty.
    missing = hit == "No result"
    results.append({
        "query": case["query"],
        "result": "No result" if missing else hit[0].metadata['question'],
        "score": "No result" if missing else hit[1],
    })
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    df = pd.DataFrame(data=results)
    display(df)
    

Unnamed: 0,query,result,score
0,siloam ada dimana saja,Apakah saya bisa melakukan pemesanan homeservi...,0.835366
1,test,Produk yang saya cari tidak ada di website,0.755827


# **query: Question + passage: Answer** 

In [12]:
# NOTE(review): this section is titled "query: Question + passage: Answer", so this run
# presumably had collection_name = "faq-query-passage" and the "query: ...\npassage: ..."
# text line enabled inside moveEmbedding — confirm both before re-running.
moveEmbedding(faq_data)

Processed batch 1/4 in 51.67 seconds
Processed batch 2/4 in 19.55 seconds
Processed batch 3/4 in 7.29 seconds
Processed batch 4/4 in 3.03 seconds
Added: 349 FAQs in 81.55 seconds, Peak Memory: 4455.13 MB


In [13]:
# Reconnect to Qdrant and wrap the currently configured collection as a
# LangChain vector store that embeds queries with the same model.
client = QdrantClient(url=url)
qdrant = Qdrant(
    client=client,
    collection_name=collection_name,
    embeddings=embeddings,
    distance_strategy=distance,
)


  qdrant = Qdrant(


In [14]:
# Sanity check: list up to k=20 hits (score_threshold=0.1) for a sample location question.
hits = qdrant.similarity_search_with_score(
    'Di mana ada lokasi Rumah Sakit Siloam',
    k=20,
    score_threshold=0.1,
)
for hit_doc, hit_score in hits:
    print(f"{hit_score:.4f} | {hit_doc.metadata['question']}")


0.8291 | Alamat Siloam Hospitals Putera Bahagia dimana?
0.7610 | Di mana ada lokasi Rumah Sakit Siloam?
0.7573 | Alamat Siloam Hospitals Labuan Bajo dimana?
0.7409 | Alamat Siloam Hospitals Makassar dimana?
0.7383 | Alamat Siloam Hospitals Buton dimana?
0.7254 | Alamat Siloam Hospitals Banjarmasin dimana?
0.7123 | Alamat Siloam Hospitals Palangkaraya dimana?
0.7117 | Rumah sakit Siloam di Tangerang apa saja?
0.7105 | Alamat Siloam Hospitals Sentosa dimana?
0.7054 | Alamat Siloam Hospitals Purwakarta dimana?
0.7005 | Alamat Siloam Hospitals Lubuk Linggau dimana?
0.6994 | Alamat MRCCC Siloam Hospitals Semanggi dimana?
0.6979 | Alamat Siloam Hospitals Jember dimana?
0.6944 | Apa misi Siloam Hospitals?
0.6934 | Apa visi Siloam Hospitals?
0.6931 | Alamat Siloam Hospitals Bekasi Sepanjang Jaya dimana?
0.6924 | Alamat Siloam Hospitals Semarang dimana?
0.6914 | Alamat Siloam Hospitals Surabaya dimana?
0.6914 | Alamat Siloam Hospitals Bekasi Timur dimana?
0.6911 | Rumah sakit Siloam Hospitals M

In [15]:
# With "query: <question>" + "passage: <answer>" as the embedded text.
# Collect the test cases whose expected question was not among the hits.
results = []
for case in test_list:
    found, hit = testing(qdrant, case["query"], case["expected"])
    if found:
        continue
    # testing() returns the "No result" marker instead of a hit when the search came back empty.
    missing = hit == "No result"
    results.append({
        "query": case["query"],
        "result": "No result" if missing else hit[0].metadata['question'],
        "score": "No result" if missing else hit[1],
    })
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    df = pd.DataFrame(data=results)
    display(df)
    

Unnamed: 0,query,result,score
0,Fasilitas apa yang dimiliki pusat keunggulan R...,Apa misi Siloam Hospitals?,0.625438
1,Rumah Sakit Siloam mana saja yang menerima pas...,Data apa yang dibutuhkan saat menggunakan BPJS...,0.65062
2,siloam ada dimana saja,Apa nama lain Siloam Cempaka Putih?,0.386341
3,test,Berapa lama hasil dari MCU / Medical Check Up ...,0.421545
