## Task 1: Search by name and address separately
Given structured data about hospitals (objects with their metadata), search by name or by address and return the matching results.

### Approach:
Since the data is well structured with clear fields, we can approach the problem with keyword search.
For this problem, I use the BM25 algorithm, slightly adapted to the given database, because it is well suited to keyword search — in particular for the Vietnamese text in this problem.

For experimentation, I might try two methods: one with keyword search and one with semantic search (for semantic search, the available context is quite limited, since the data comes as separate objects with no surrounding context).

### Reference resources:
- BM25: 
- Read more about BM25 on Elastic Search

In [None]:
#import data
import json

# pyvi: Vietnamese word segmentation
from pyvi.ViTokenizer import tokenize 
import re, os, string
import pandas as pd
import math 
import numpy as np

fileName = "data/20250612.json"

# Bind `data` up front so the name always exists after this cell, even when
# loading fails; previously a failed load only printed a message and left
# `data` undefined, so the next use raised an unrelated NameError.
data = []

try:
    with open(fileName, "r", encoding="utf-8") as file:
        data = json.load(file)
except FileNotFoundError:
    print(f"Error: {fileName} not found.")
except json.JSONDecodeError:
    print(f"Error: Invalid JSON format in {fileName}")

ModuleNotFoundError: No module named 'pyvi'

In [33]:
# Define cleaning, normalization, and segmentation function 
# Remove leftover markup and redundant whitespace from the text
def cleanText(text):
    """Remove HTML-like tags and collapse runs of whitespace.

    Params:
    ---------
    text: str
    Raw text, possibly containing ``<...>`` tags.

    Returns
    ---------
    str
    The text with tags removed, leading/trailing whitespace stripped, and
    every run of whitespace reduced to one whitespace character (the last
    character of the run).
    """
    # Raw strings keep the regex escapes away from Python's own escape
    # processing ('\s' in a plain literal is an invalid escape sequence).
    text = re.sub(r'<.*?>', '', text).strip()
    text = re.sub(r'(\s)+', r'\1', text)
    return text


# Normalize the text: drop ASCII punctuation (keeping '_') and lowercase it
def normalizeText(text):
    """Strip ASCII punctuation except the underscore and lowercase the text.

    Params:
    ---------
    text: str
    Text to normalize.

    Returns
    ---------
    str
    Lowercased text with every character of ``string.punctuation`` removed,
    except ``_`` which is left in place. Removed characters are deleted, not
    replaced by a space, so the characters on either side become adjacent.
    """
    # '_' is deliberately excluded from the characters to delete
    # NOTE(review): presumably so underscore-joined compound words survive - confirm
    unwanted = ''.join(ch for ch in string.punctuation if ch != '_')
    deletionTable = str.maketrans('', '', unwanted)
    return text.translate(deletionTable).lower()

# Word segmentation (Vietnamese)
def tokenizeText(sent):
    """Segment a sentence with pyvi's ``tokenize`` and split it into tokens.

    Params:
    ---------
    sent: str
    Sentence to segment.

    Returns
    ---------
    list[str]
    The whitespace-separated pieces of the segmented string.
    """
    # UTF-8 round trip: yields an equal string for any encodable input
    roundTripped = sent.encode('utf-8').decode('utf-8')
    segmented = tokenize(roundTripped)
    return segmented.split()

# Combine each record's label and diachi into one tokenized document
def preprocessData(data):
    """Turn every record into a list of tokens built from its name and address.

    Params:
    ---------
    data: iterable of dict-like records
    Each record is read through ``.get('label')`` and ``.get('diachi')``;
    a missing or falsy value is treated as an empty string.

    Returns
    ---------
    list
    One entry per record, in input order: the result of cleaning,
    normalizing and tokenizing ``label + ' ' + diachi``.
    """
    def _recordToTokens(record):
        # Falsy values (None, '') fall back to '' so concatenation cannot fail on None
        name = record.get('label') or ''
        address = record.get('diachi') or ''
        return tokenizeText(normalizeText(cleanText(name + ' ' + address)))

    return [_recordToTokens(record) for record in data]

### Some notes
- Uppercase vs. lowercase text (e.g., DS and ds, Phòng khám and phòng khám)
- Abbreviations (E.g., BV and Bệnh Viện, Phòng khám đa khoa and PVDK/ PKĐK, Dược sĩ and DS)

In [34]:
# Build the BM25 model:
class BM25:
    '''
    BM25 keyword-ranking model over a pre-tokenized corpus.

    Params:
    ---------
    k1: float
    Multiplier applied to the term frequency in the score (default 1.5)
    b: float
    Weight of the document-length normalization in the score (default 0.75)
    '''

    def __init__(self, k1=1.5, b=0.75):
        self.k1 = k1
        self.b = b

    def fit(self, corpus):
        '''
        Fit the statistics that are required to calculate the BM25 ranking score using the given corpus

        Params:
        ---------
        corpus: list[list[str]]
        Each element in the list represents a document, and each document is a list of terms.
        An empty corpus is accepted; searching it returns an empty list.

        Returns
        ---------
        self
        '''
        tf = []  # list[dict[str, int]]: term frequency in each document, in corpus order
        df = {}  # dict[str, int]: number of documents that contain the term
        docLen = []  # list[int]: number of terms in each document
        corpusSize = len(corpus)  # int: number of documents in the corpus

        for document in corpus:
            docLen.append(len(document))

            # Compute tf - term frequency per document
            frequencies = {}
            for term in document:
                frequencies[term] = frequencies.get(term, 0) + 1
            tf.append(frequencies)

            # Compute df - each distinct term counts once per document
            for term in frequencies:
                df[term] = df.get(term, 0) + 1

        # IDF of every term; the "1 +" inside the log keeps the value positive
        idf = {
            term: math.log(1 + (corpusSize - freq + 0.5) / (freq + 0.5))
            for term, freq in df.items()
        }

        self.tf_ = tf
        self.df_ = df
        self.idf_ = idf
        self.docLen_ = docLen
        self.corpus_ = corpus
        self.corpusSize_ = corpusSize
        # float: average number of terms per document; 0.0 for an empty corpus
        # instead of dividing by zero
        self.avgDocLen_ = sum(docLen) / corpusSize if corpusSize else 0.0

        return self

    def scoreCalc(self, query, index):
        '''
        BM25 score of the query against a single document

        Params:
        ---------
        query: list[str]
        Query terms; a term that appears several times contributes several times
        index: int
        Position of the document in the fitted corpus

        Returns
        ---------
        float
        Sum of the per-term scores; 0.0 when no query term occurs in the document
        '''
        frequencies = self.tf_[index]
        if not frequencies:
            # An empty document matches nothing; returning here also keeps the
            # division by avgDocLen_ below away from an all-empty corpus.
            return 0.0

        # The length normalization depends only on the document, so compute it once
        lengthNorm = self.k1 * (1 - self.b + self.b * self.docLen_[index] / self.avgDocLen_)
        tfWeight = self.k1 + 1

        score = 0.0
        for term in query:
            freq = frequencies.get(term)
            if freq is None:
                continue

            # fit() stores an idf for every term of every document, so the
            # lookup cannot fail for a term found in this document.
            score += self.idf_[term] * freq * tfWeight / (freq + lengthNorm)

        return score

    def search(self, query):
        '''
        Score the query against every document

        Params:
        ---------
        query: list[str]
        Query terms

        Returns
        ---------
        list[float]
        One score per document, in corpus order
        '''
        return [self.scoreCalc(query, index) for index in range(self.corpusSize_)]


In [35]:
# Preprocess every record of `data` into one token list per record
docs = preprocessData(data)

# Fit the BM25 statistics on the processed documents (docs already contains tokenized text)
bm25 = BM25()
bm25.fit(docs)

<__main__.BM25 at 0x2e2d41425e0>

In [None]:
limit = 10
query = "Nha khoa Viet Smile"

# Run the query through the same clean -> normalize -> tokenize steps that are
# applied to each document's text (preprocessData is not used for the query)
cleanedQuery = cleanText(query)
normalizedQuery = normalizeText(cleanedQuery)
queryTokens = tokenizeText(normalizedQuery)

print(f"Original query: '{query}'")
print(f"Processed tokens: {queryTokens}")

# One BM25 score per document, in corpus order
scores = bm25.search(queryTokens)

print(f"Scores for all documents: {scores}")

# (document index, score) pairs ordered from highest to lowest score
results = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

# Top k results
print(f"Top {limit} most relevant results: \n")

for rank, (docIndex, score) in enumerate(results[:limit], 1):
    # Only documents with a positive score are reported
    if not score > 0:
        continue

    docData = data[docIndex]
    label = docData.get('label', 'N/A')
    diachi = docData.get('diachi', 'N/A')

    print(f"Rank {rank}: Score = {score:.4f}")
    print(f"label: {label}")
    print(f"diachi: {diachi}")


Original query: 'Nha khoa Viet Smile'
Processed tokens: ['nha_khoa', 'viet', 'smile']
Scores for all documents: [2.4935080701543106, 0.0, 2.360567086561884, 3.0004387285340606, 0.0, 0.0, 2.902105625529104, 3.2185492864155862, 2.6423164428879966, 0.0, 0.0, 0.0, 3.2185492864155862, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7235859602387795, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7235859602387795, 2.6423164428879966, 2.8100133170532375, 2.902105625529104, 3.1056692521171163, 2.8100133170532375, 0.0, 0.0, 3.2185492864155862, 0.0, 0.0, 0.0, 2.902105625529104, 0.0, 2.4252171177607713, 2.8100133170532375, 3.0004387285340606, 0.0, 0.0, 0.0, 0.0, 2.6423164428879966, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.6423164428879966, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.082938283682268, 0.0, 3.1056692521171163, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7235859602387795, 2.4252171177607713, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.6423164428879966, 0.0, 0.0, 2.