# **1. Framework, Library, and API Key**

In [6]:
import pandas as pd
import json

from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import Qdrant

import qdrant_client
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

# Load API key.
from dotenv import load_dotenv
load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

# **2. Dataset**

In [7]:
# Load FAQ json.
with open("full_faq.json", "r", encoding="utf-8") as file:
    faq = json.load(file)

faq[0]


{'question': 'Di mana ada lokasi Rumah Sakit Siloam?',
 'answer': 'Ada 40 Rumah Sakit modern yang terdiri dari 13 Rumah Sakit di Jabodetabek dan 27 rumah sakit yang tersebar di Jawa, Sumatera, Kalimantan, Sulawesi, serta Bali dan Nusa Tenggara.',
 'category': 'FAQ Website'}

# **3. Embedding Model**

In [None]:
# Initiate OpenAI embeddings.
# NOTE(review): the API key is presumably read from the environment populated by
# load_dotenv() in the import cell — confirm.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small", 
)

# Qdrant connection and collection settings shared by the cells below.
url = "http://localhost:6333"
collection_name = "test_faq"
# Similarity metric passed to VectorParams when the collection is created.
distance = Distance.DOT
# Vector size passed to VectorParams; it has to equal the length of the vectors
# returned by the embedding model above (presumably 1536 for this model — confirm).
dimension = 1536 


In [12]:
# Initiate client.
# Connect through the configured `url` instead of repeating the host and port
# here, so this cell targets the same Qdrant instance as moveEmbedding and the
# testing section.
client = qdrant_client.QdrantClient(url=url)

# Show collections.
collections = client.get_collections()
print(collections)


collections=[CollectionDescription(name='test_faq'), CollectionDescription(name='test_salesItem')]


In [10]:
# Show collection details.
# Use the configured `collection_name` rather than a hard-coded literal so the
# lookup targets the collection this notebook actually creates and queries.
collection_info = client.get_collection(collection_name=collection_name)
print(collection_info)


ResponseHandlingException: [Errno 111] Connection refused

In [None]:
# Delete collection (destructive): uncomment the line below to run it.
# client.delete_collection(collection_name="test_faq")


True

# **4. Embedding Process**

## **4.1. Single Upsert (runtime: 2m 2.47s)**

In [None]:
def moveEmbedding(faq_data):
    """Embed every FAQ entry and upsert it into Qdrant, one point per request.

    Each entry must provide the keys 'question', 'answer' and 'category'. The
    three fields are concatenated with newlines to form the embedded text, and
    the entry's position in ``faq_data`` is used as the point id.
    """
    qdrant = qdrant_client.QdrantClient(url=url)

    # Make sure the target collection is there before writing to it.
    collection_missing = not qdrant.collection_exists(collection_name=collection_name)
    if collection_missing:
        qdrant.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=dimension, distance=distance),
        )

    for position, entry in enumerate(faq_data):
        question = entry['question']
        answer = entry['answer']
        category = entry['category']

        # Alternative kept from the experiments: embed the question alone.
        # content = question
        content = question + "\n" + answer + "\n" + category
        vector = embeddings.embed_query(content)

        print(f"Processing FAQ {position + 1}/{len(faq_data)}")

        # The payload layout (page_content + metadata) is what the LangChain
        # wrapper in the testing section reads back as a Document.
        point = PointStruct(
            id=position,
            vector=vector,
            payload={
                "page_content": content,
                "metadata": {
                    "question": question,
                    "answer": answer,
                    "category": category,
                },
            },
        )
        qdrant.upsert(collection_name=collection_name, points=[point])
        print(f"Added: {question}")
        

In [94]:
# Ingest the full FAQ list.
# NOTE(review): moveEmbedding is defined twice in this notebook (single and batch
# variants), so this call runs whichever definition was executed last.
moveEmbedding(faq)


Processing FAQ 1/349
Added: Di mana ada lokasi Rumah Sakit Siloam?
Processing FAQ 2/349
Added: Apa jenis kamar rawat inap yang tersedia?
Processing FAQ 3/349
Added: Apa standar keamanan suplai darah yang diambil di rumah sakit?
Processing FAQ 4/349
Added: Fasilitas apa yang dimiliki pusat keunggulan Rumah Sakit?
Processing FAQ 5/349
Added: Apa standar kualitas yang dipatuhi oleh Siloam Hospitals?
Processing FAQ 6/349
Added: Metode pembayaran apa yang tersedia di Siloam Hospitals?
Processing FAQ 7/349
Added: Fasilitas apa yang disediakan di dalam kamar?
Processing FAQ 8/349
Added: Apakah merokok diperbolehkan di area rumah sakit?
Processing FAQ 9/349
Added: Berapa banyak pasien yang dirawat Siloam Hospitals setiap tahun?
Processing FAQ 10/349
Added: Apakah rumah sakit Siloam menawarkan perkiraan biaya perawatan?
Processing FAQ 11/349
Added: Jenis asuransi kesehatan apa saja yang diterima di Siloam Hospitals?
Processing FAQ 12/349
Added: Bagaimana jika dokter pilihan saya tidak tersedia?

## **4.2. Batch Upsert (runtime: 11.1s)**

In [7]:
def moveEmbedding(faq_data, batch_size=100):
    """
    Embed FAQ entries and upsert them into Qdrant in batches.

    Each FAQ must have the keys 'question', 'answer', and 'category'. The text
    for embedding is built by concatenating these fields with newlines, and the
    entry's position in ``faq_data`` is used as the point id.

    Args:
        faq_data: Sequence (supports ``len`` and slicing) of FAQ dictionaries.
        batch_size: Maximum number of FAQs embedded and upserted per request.
            Must be a positive integer.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
        RuntimeError: If the embedding call returns a different number of
            vectors than texts submitted for a batch.
    """
    # Validate before touching Qdrant: a zero batch size used to fail with
    # ZeroDivisionError after the collection was created, and a negative one
    # silently processed nothing while still reporting every FAQ as added.
    if not isinstance(batch_size, int) or batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    client = qdrant_client.QdrantClient(url=url)

    # Create collection if it doesn't exist.
    if not client.collection_exists(collection_name=collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=dimension, distance=distance),
        )

    total = len(faq_data)
    total_batches = (total + batch_size - 1) // batch_size  # ceiling division
    added = 0

    for batch_num, start in enumerate(range(0, total, batch_size), start=1):
        batch_faq = faq_data[start:start + batch_size]

        texts = []
        payloads = []
        for entry in batch_faq:
            # Concatenate question, answer, and category with newline delimiters.
            # text = entry['question']
            text = entry['question'] + "\n" + entry['answer'] + "\n" + entry['category']
            texts.append(text)
            payloads.append({
                "page_content": text,
                "metadata": {
                    "question": entry['question'],
                    "answer": entry['answer'],
                    "category": entry['category']
                }
            })

        # Batch embed the texts.
        batch_embeddings = embeddings.embed_documents(texts)
        if len(batch_embeddings) != len(texts):
            # Pairing vectors with payloads by position is only valid when the
            # counts agree; fail loudly instead of storing misaligned points.
            raise RuntimeError(
                f"Expected {len(texts)} embeddings for batch {batch_num}, "
                f"got {len(batch_embeddings)}"
            )

        # Point ids are the global positions in faq_data, so re-running the
        # function overwrites the same points instead of duplicating them.
        points = [
            PointStruct(id=start + offset, vector=emb, payload=payload)
            for offset, (emb, payload) in enumerate(zip(batch_embeddings, payloads))
        ]

        # Upsert the current batch of points into Qdrant.
        client.upsert(
            collection_name=collection_name,
            points=points
        )
        added += len(points)
        print(f"Processed batch {batch_num}/{total_batches}")

    print(f"Added: {added} FAQs")


In [8]:
# Ingest the full FAQ list with the batch variant defined above, using its
# default batch_size of 100.
moveEmbedding(faq)


Processed batch 1/4
Processed batch 2/4
Processed batch 3/4
Processed batch 4/4
Added: 349 FAQs


# **5. Testing** 

In [9]:
# Connect to Qdrant and wrap the collection in the LangChain `Qdrant` vector
# store, passing the same embedding object that was used at ingestion time.
client = QdrantClient(
    url=url,
)
# NOTE(review): the recorded output of this cell ends with a warning pointing at
# `qdrant = Qdrant(` — presumably a deprecation notice for this class; confirm
# against the installed langchain_qdrant version.
qdrant = Qdrant(
    client,
    embeddings=embeddings,
    collection_name=collection_name,
    distance_strategy=distance,
)


  qdrant = Qdrant(


In [10]:
# Sanity check: fetch the single best match (k=1) together with its score for a
# query that is the first FAQ question without its trailing question mark.
qdrant.similarity_search_with_score(
    query="Di mana ada lokasi Rumah Sakit Siloam",
    k=1
)


[(Document(metadata={'question': 'Di mana ada lokasi Rumah Sakit Siloam?', 'answer': 'Ada 40 Rumah Sakit modern yang terdiri dari 13 Rumah Sakit di Jabodetabek dan 27 rumah sakit yang tersebar di Jawa, Sumatera, Kalimantan, Sulawesi, serta Bali dan Nusa Tenggara.', 'category': 'FAQ Website', '_id': 0, '_collection_name': 'test_faq'}, page_content='Di mana ada lokasi Rumah Sakit Siloam?\nAda 40 Rumah Sakit modern yang terdiri dari 13 Rumah Sakit di Jabodetabek dan 27 rumah sakit yang tersebar di Jawa, Sumatera, Kalimantan, Sulawesi, serta Bali dan Nusa Tenggara.\nFAQ Website'),
  0.8097149)]

In [11]:
# Same query, this time passing score_threshold=0.5, and print only the
# question stored in each hit's metadata.
doc = qdrant.similarity_search_with_score('Di mana ada lokasi Rumah Sakit Siloam', score_threshold=0.5, k=1)
for d in doc:
    # Each hit is a (Document, score) pair; index 0 is the Document.
    print(d[0].metadata['question'])

Di mana ada lokasi Rumah Sakit Siloam?


In [12]:
# Baseline test cases: every FAQ question is used verbatim as the query and is
# expected to retrieve itself.
test_list = [{'query': x['question'], 'expected': x['question']} for x in faq]
print(test_list[0])

{'query': 'Di mana ada lokasi Rumah Sakit Siloam?', 'expected': 'Di mana ada lokasi Rumah Sakit Siloam?'}


In [13]:
# Hand-written paraphrase queries, each paired with the FAQ question it is
# expected to retrieve.
paraphrase_cases = [
    {
        "query": "siloam ada dimana saja",
        "expected": "Di mana ada lokasi Rumah Sakit Siloam?"
    },
    {
        "query": "jam buka mcu",
        "expected": "Jam buka MCU / Medical Check Up?"
    },
    {
        "query": "layanan kesehatan di rumah",
        "expected": "Apa itu Siloam at Home? Apakah Siloam memberikan layanan kesehatan di rumah atau Homecare?"
    },
    {
        "query": "cek obat beli di siloam dari aplikasi",
        "expected": "Halo saya mau cek obat yang saya beli dari Siloam, bisakah dari aplikasi MySiloam?"
    },
    {
        "query": "asuransi kerjasa sama dengan siloam",
        "expected": "Asuransi apa saja yang bekerja sama dengan Siloam?"
    },
]

# Rebuild the full list from `faq` instead of extending `test_list` in place:
# the in-place form appended the paraphrases again on every re-run of this
# cell, duplicating test cases.
test_list = [{'query': x['question'], 'expected': x['question']} for x in faq] + paraphrase_cases


In [None]:
def testing(qdrant, q, expected):
    doc = qdrant.similarity_search_with_score(q, score_threshold=0.5, k=3) # rang sesuaikan, k bebas
    i = 0
    for d in doc:
        if(d[0].metadata["question"] == expected):
            return [ True, d]
        i += 1
    if len(doc) == 0:
        return [False, "No result"]
    return [False, doc[0]]


## **5.1. Question Only** 

In [19]:
# With question only.
# NOTE(review): assumes the collection currently holds question-only embeddings
# (the commented-out `text = faq['question']` variant in moveEmbedding) — confirm
# before comparing with the other evaluation section, which runs the same code.
results = []
for t in test_list:
    status, data = testing(qdrant, t["query"], t["expected"])
    if status:
        continue  # expected question was retrieved; only misses are reported

    no_hit = data == "No result"
    _data = {
        "query": t["query"],
        "result": "No result" if no_hit else data[0].metadata['question'],
        "score": "No result" if no_hit else data[1],
    }
    results.append(_data)

# One row per failed test case; lift the row/column limits so nothing is elided.
df = pd.DataFrame(data=results)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

Unnamed: 0,query,result,score
0,siloam ada dimana saja,Apa nama lain Siloam Babakan?,0.668434
1,layanan kesehatan di rumah,Apakah saya bisa melakukan pemesanan homeservi...,0.648123


## **5.2. Question + Answer + Category** 

In [17]:
# With question + answer + category.
# NOTE(review): assumes the collection was re-populated with the concatenated
# question/answer/category text before this cell ran — confirm, since the code
# itself is identical to the question-only evaluation.
results = []
for t in test_list:
    status, data = testing(qdrant, t["query"], t["expected"])
    if status:
        continue  # retrieved as expected; keep only the failures

    no_hit = data == "No result"
    _data = {
        "query": t["query"],
        "result": "No result" if no_hit else data[0].metadata['question'],
        "score": "No result" if no_hit else data[1],
    }
    results.append(_data)

# Show every failed case without pandas truncating rows or columns.
df = pd.DataFrame(data=results)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)
    

Unnamed: 0,query,result,score
0,Waiting List,No result,No result
1,siloam ada dimana saja,Bagaimana cara download aplikasi MySiloam?,0.574024
