In [1]:
!pip install sentence-transformers pandas tqdm



In [2]:
!python -m spacy download en_core_web_md
!pip install keybert


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [3]:
!pip install "elasticsearch<9"



In [4]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import pandas as pd
from tqdm import tqdm
import re
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Client for the Elasticsearch node listening on localhost:9200.
es = Elasticsearch(hosts=["http://localhost:9200"])


In [6]:
# Sanity check: print the cluster banner to confirm Elasticsearch is reachable.
import requests

# Without a timeout, requests waits indefinitely when the server does not answer.
print(requests.get("http://localhost:9200", timeout=10).text)


{
  "name" : "941bc3af368d",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "6KQrMvN8QuyudDcEoPEIBA",
  "version" : {
    "number" : "8.13.0",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "09df99393193b2c53d92899662a8b8b3c55b45cd",
    "build_date" : "2024-03-22T03:35:46.757803203Z",
    "build_snapshot" : false,
    "lucene_version" : "9.10.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}



In [7]:
# Name of the index that holds the places.
index_name = "places_danang"

# Index mapping, written grouped by field type:
#   * keyword fields are stored as-is (exact matching / filtering),
#   * text fields are analyzed for full-text search,
#   * vector_search holds the embedding used for cosine-similarity search.
mapping = {
    "mappings": {
        "properties": {
            **{field: {"type": "keyword"} for field in ("type", "time", "price", "area", "id")},
            **{
                field: {"type": "text"}
                for field in ("name", "description", "location", "note", "full_text")
            },
            "vector_search": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        }
    }
}


In [8]:
# Try to create the index with the mapping defined above. Any failure is
# printed instead of raised so the notebook keeps running.
try:
    es.indices.create(index=index_name, body=mapping)
except Exception as e:
    # Prefer the exception's `info` attribute when it has one; otherwise show its message.
    print("Mapping error details:", getattr(e, 'info', str(e)))

Mapping error details: {'error': {'root_cause': [{'type': 'resource_already_exists_exception', 'reason': 'index [places_danang/ultuLYPyRlejd7t699_KKw] already exists', 'index_uuid': 'ultuLYPyRlejd7t699_KKw', 'index': 'places_danang'}], 'type': 'resource_already_exists_exception', 'reason': 'index [places_danang/ultuLYPyRlejd7t699_KKw] already exists', 'index_uuid': 'ultuLYPyRlejd7t699_KKw', 'index': 'places_danang'}, 'status': 400}


In [9]:
# Delete the previous index (if it exists) and create a new, empty one.
# NOTE: this removes the existing index before recreating it with `mapping`.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, body=mapping)
print(f"Index `{index_name}` đã được tạo!")


Index `places_danang` đã được tạo!


In [10]:
# Sentence-embedding model; its `encode` is used both when indexing documents and when embedding queries.
# NOTE(review): the index mapping declares 384 dims — confirm this model's output size matches.
model = SentenceTransformer('all-MiniLM-L6-v2')


In [11]:
df = pd.read_csv("data_danang_ok.csv")  # path to your dataset file


def embed(text):
    """Return the embedding of `text` as a plain Python list.

    The array produced by ``model.encode`` is converted with ``tolist()`` so
    the value can be serialized when the document is sent to Elasticsearch.
    """
    return model.encode(text).tolist()


# Create an embedding for every record (skip this step if the CSV already
# contains a `vector_search` column).
tqdm.pandas()
df["vector_search"] = df["full_text"].progress_apply(embed)

# Index the data into Elasticsearch, using each record's own `id` as the
# document id.
for _row_index, row in tqdm(df.iterrows(), total=len(df)):
    # Empty CSV cells are read by pandas as NaN, which is not valid JSON and
    # is rejected by Elasticsearch; send them as null instead.
    doc = {
        field: None if isinstance(value, float) and pd.isna(value) else value
        for field, value in row.to_dict().items()
    }
    es.index(index=index_name, id=doc["id"], document=doc)

# Make the freshly indexed documents visible to searches immediately instead
# of waiting for the next periodic refresh of the index.
es.indices.refresh(index=index_name)

100%|████████████████████████████████| 299/299 [00:05<00:00, 58.76it/s]
100%|███████████████████████████████| 299/299 [00:02<00:00, 128.82it/s]


**Query processing for better search**

In [31]:
# Extract keywords (noun chunks) from the query for a better keyword search.
nlp = spacy.load("en_core_web_md")


def preprocess_bm25_query(query):
    """Reduce a free-text query to its noun chunks for keyword (BM25) search.

    Args:
        query: The user's question as free text.

    Returns:
        The stripped noun chunks longer than two characters, joined by single
        spaces. If no such chunk is found, the original query is returned
        unchanged so the keyword search never receives an empty string (the
        same fallback ``preprocess_query_for_vector`` applies).
    """
    chunk_texts = [chunk.text.strip() for chunk in nlp(query).noun_chunks]
    keywords = " ".join(text for text in chunk_texts if len(text) > 2)
    return keywords or query


# Example:
# preprocess_bm25_query("suggest a noodle soup for breakfast near center")
#   -> 'a noodle soup breakfast center'

'a noodle soup breakfast center'

In [13]:
# Words dropped from a query before it is embedded.
STOP_WORDS = {
    # articles, conjunctions, prepositions, auxiliaries and fillers
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
    "into", "is", "no", "not", "of", "on", "or", "such", "that", "the", "then",
    "there", "these", "this", "to", "was", "will", "with", "so", "too",
    # pronouns and possessives
    "it", "its", "their", "they", "them", "me", "my", "you", "your", "we",
    "our", "us", "he", "him", "his", "she", "her", "hers",
}


def preprocess_query_for_vector(query):
    """Strip punctuation and stop words from a query before embedding it.

    Word order and casing of the remaining words are preserved. If every
    word is a stop word, the original query is returned untouched.

    Example:
        preprocess_query_for_vector("Where to eat Bun Bo Hue in the evening?")
        -> "Where eat Bun Bo Hue evening"
    """
    # Drop everything that is neither a word character nor whitespace.
    without_punctuation = re.sub(r'[^\w\s]', '', query)

    kept_words = []
    for word in without_punctuation.split():
        if word.lower() in STOP_WORDS:
            continue
        kept_words.append(word)

    # Nothing left to embed: fall back to the query exactly as it was given.
    if not kept_words:
        return query
    return " ".join(kept_words)

'Where eat Bun Bo Hue evening'

**Search**

In [49]:
'''
def bm25_search(query, top_k=10):
    processed_query = preprocess_bm25_query(query)  # extract keyword
    body = {
        "size": top_k,
        "query": {
            "multi_match": {
                "query": processed_query,
                "fields": ["name^3", "description^2", "note", "full_text"],
                "operator": "or",  
                "type": "most_fields"
            }
        }
    }
    res = es.search(index=index_name, body=body)
    return [
        {
            "id": hit["_source"]["id"],
            "score": hit["_score"],
            "full_text": hit["_source"]["full_text"]
        }
        for hit in res["hits"]["hits"]
    ]
'''
def bm25_search(query, top_k=10, type_filter=None):
    """Run a keyword (BM25) search over the places index.

    Args:
        query: Free-text question; reduced to keywords with
            ``preprocess_bm25_query`` before it is sent to Elasticsearch.
        top_k: Maximum number of hits to return.
        type_filter: Optional exact value of the ``type`` field (e.g. "eat").
            When given, only documents of that type are returned.

    Returns:
        A list of dicts in the order Elasticsearch returned the hits, each
        with the document ``id``, its ``score`` and the fields ``name``,
        ``description``, ``time``, ``price``, ``location``, ``area``,
        ``note`` and ``type`` (``None`` for a field the document lacks).
    """
    # Fall back to the raw query when keyword extraction leaves nothing, so
    # the multi_match never runs on an empty string.
    processed_query = preprocess_bm25_query(query) or query

    bool_query = {
        "must": [
            {
                "multi_match": {
                    "query": processed_query,
                    "fields": ["name^3", "description^2", "note", "full_text"],
                    "operator": "or",
                    "type": "most_fields",
                }
            }
        ]
    }

    # The type restriction is a yes/no condition, so it goes in filter
    # context: it narrows the hits without contributing to the score.
    if type_filter:
        bool_query["filter"] = [{"term": {"type": type_filter}}]

    body = {"size": top_k, "query": {"bool": bool_query}}
    res = es.search(index=index_name, body=body)

    display_fields = ("name", "description", "time", "price", "location", "area", "note", "type")
    results = []
    for hit in res["hits"]["hits"]:
        source = hit["_source"]
        entry = {"id": source["id"], "score": hit["_score"]}
        # .get(): a document missing an optional field yields None instead of
        # aborting the whole search with a KeyError.
        entry.update((field, source.get(field)) for field in display_fields)
        results.append(entry)
    return results
# Example: bm25_search(query, type_filter="eat") searches only within the "eat" entries.

In [56]:
'''
def vector_search(query, top_k=10):
    query_vec = model.encode(query).tolist()
    body = {
        "size": top_k,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'vector_search') + 1.0",
                    "params": {"query_vector": query_vec}
                }
            }
        }
    }
    res = es.search(index=index_name, body=body)
    return [
        {
            "id": hit["_source"]["id"],
            "score": hit["_score"],
            "full_text": hit["_source"]["full_text"]
        }
        for hit in res["hits"]["hits"]
    ]
'''
def vector_search(query, top_k=10, type_filter=None):
    """Run a semantic (embedding) search over the places index.

    The query is cleaned with ``preprocess_query_for_vector``, embedded with
    ``model`` and compared against the ``vector_search`` field of every
    candidate document with cosine similarity.

    Args:
        query: Free-text question.
        top_k: Maximum number of hits to return.
        type_filter: Optional exact value of the ``type`` field (e.g. "eat").
            When given, only documents of that type are scored.

    Returns:
        A list of dicts in the order Elasticsearch returned the hits, each
        with the document ``id``, its ``score`` (cosine similarity + 1.0) and
        the fields ``name``, ``description``, ``time``, ``price``,
        ``location``, ``area``, ``note`` and ``type`` (``None`` for a field
        the document lacks).
    """
    query_vec = model.encode(preprocess_query_for_vector(query)).tolist()

    # Candidate set: every document, or only those of the requested type.
    # The restriction sits in filter context because the final score comes
    # from the script alone.
    if type_filter:
        inner_query = {"bool": {"filter": [{"term": {"type": type_filter}}]}}
    else:
        inner_query = {"match_all": {}}

    body = {
        "size": top_k,
        "query": {
            "script_score": {
                "query": inner_query,
                "script": {
                    # + 1.0 keeps the score non-negative (cosine similarity lies in [-1, 1]).
                    "source": "cosineSimilarity(params.query_vector, 'vector_search') + 1.0",
                    "params": {"query_vector": query_vec},
                },
            }
        },
    }
    res = es.search(index=index_name, body=body)

    display_fields = ("name", "description", "time", "price", "location", "area", "note", "type")
    results = []
    for hit in res["hits"]["hits"]:
        source = hit["_source"]
        entry = {"id": source["id"], "score": hit["_score"]}
        # .get(): a document missing an optional field yields None instead of
        # aborting the whole search with a KeyError.
        entry.update((field, source.get(field)) for field in display_fields)
        results.append(entry)
    return results
# Example: vector_search(query, type_filter="eat")

In [59]:
def reciprocal_rank_fusion(lexical_hits, semantic_hits, k=60, top_k=5):
    """Merge two ranked hit lists with Reciprocal Rank Fusion (RRF).

    Each hit adds ``1 / (k + rank)`` (rank starting at 1) to the fused score
    of its document, identified by ``hit["id"]``.

    Args:
        lexical_hits: Ranked hits of the keyword search; each is a dict with
            at least ``"id"`` and ``"score"``.
        semantic_hits: Ranked hits of the vector search, same shape.
        k: RRF smoothing constant added to every rank.
        top_k: Number of fused results to return.

    Returns:
        Up to ``top_k`` dicts sorted by descending ``rrf_score``. Each is a
        copy of the first hit seen for that document, extended with
        ``lexical_score``, ``semantic_score`` (0 when the document is absent
        from that list) and ``rrf_score``.
    """
    fused = {}
    ranked_lists = (
        ("lexical_score", lexical_hits),
        ("semantic_score", semantic_hits),
    )

    for score_field, hits in ranked_lists:
        for position, hit in enumerate(hits, start=1):
            doc_id = hit["id"]
            contribution = 1 / (k + position)
            entry = fused.get(doc_id)
            if entry is None:
                # First sighting: copy the hit and start both raw scores at 0,
                # then record the one that belongs to the current list.
                fused[doc_id] = {
                    **hit,
                    "lexical_score": 0,
                    "semantic_score": 0,
                    score_field: hit["score"],
                    "rrf_score": contribution,
                }
            else:
                entry["rrf_score"] += contribution
                entry[score_field] = hit["score"]

    ordered = list(fused.values())
    ordered.sort(key=lambda entry: entry["rrf_score"], reverse=True)
    return ordered[:top_k]


In [60]:
'''
def hybrid_search(query, top_k=5, alpha=0.5):
    # BM25
    bm25_results = bm25_search(query, top_k=top_k*2)
    bm25_ids = {doc["id"]: doc for doc in bm25_results}

    # Vector
    vec_results = vector_search(query, top_k=top_k*2)
    vec_ids = {doc["id"]: doc for doc in vec_results}

    # Gộp tất cả id
    all_ids = set(bm25_ids.keys()) | set(vec_ids.keys())

    # Tính điểm hybrid
    hybrid_results = []
    for id_ in all_ids:
        bm25_score = bm25_ids.get(id_, {}).get("score", 0)
        vec_score = vec_ids.get(id_, {}).get("score", 0)
        score = (1 - alpha) * bm25_score + alpha * vec_score
        hybrid_results.append({
            "id": id_,
            "hybrid_score": score,
            "full_text": bm25_ids.get(id_, vec_ids.get(id_, {})).get("full_text", "")
        })

    # Sort theo điểm hybrid
    hybrid_results = sorted(hybrid_results, key=lambda x: x["hybrid_score"], reverse=True)[:top_k]
    return hybrid_results
'''
def hybrid_search(query, top_k=5, k_rrf=60, type_filter=None):
    """Combine keyword and vector search results with Reciprocal Rank Fusion.

    Args:
        query: Free-text question.
        top_k: Number of fused results to return.
        k_rrf: Smoothing constant passed to ``reciprocal_rank_fusion``.
        type_filter: Optional ``type`` value forwarded to both searches.

    Returns:
        The ``top_k`` fused hits as returned by ``reciprocal_rank_fusion``.
    """
    # Fetch twice as many candidates from each search so the fusion has more to work with.
    candidate_count = top_k * 2
    lexical_hits = bm25_search(query, top_k=candidate_count, type_filter=type_filter)
    semantic_hits = vector_search(query, top_k=candidate_count, type_filter=type_filter)
    return reciprocal_rank_fusion(lexical_hits, semantic_hits, k=k_rrf, top_k=top_k)


**Testing search**

In [71]:
#query = "Where to eat beef noodles"
#print("BM25:", bm25_search(query, type_filter = 'eat'))
#print("Vector:", vector_search(query, type_filter="see"))
#print("Hybrid:", hybrid_search(query, type_filter="eat"))


**Build Prompt**

In [67]:
# Text rendering of a single search hit; each placeholder is filled from the
# hit's field of the same name.
entry_template = """
Name: {name}
Type: {type}
Description: {description}
Time: {time}
Price: {price}
Location: {location}
Area: {area}
Note: {note}
""".strip()

# Full prompt: instructions, then the user's question ({question}) and the
# rendered search hits ({context}).
prompt_template = """
You are a helpful local travel assistant for Da Nang. Answer the QUESTION based on the CONTEXT from our database of places to eat, see, and stay.
Only use the facts from the CONTEXT when answering the QUESTION. If you don't know, say you don't know.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [68]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the question and the retrieved places.

    Args:
        query: The user's question, inserted as ``{question}``.
        search_results: Iterable of hit dicts; each must provide every field
            referenced by ``entry_template``.

    Returns:
        ``prompt_template`` filled with the question and one rendered entry
        per hit (entries separated by a blank line), stripped of leading and
        trailing whitespace.
    """
    rendered_entries = [entry_template.format(**hit) + "\n\n" for hit in search_results]
    filled = prompt_template.format(question=query, context="".join(rendered_entries))
    return filled.strip()

In [70]:
# Build and print the prompt for a sample question, restricted to "eat" places.
sample_query = "Where can I eat grilled fish in Da Nang?"
search_results = hybrid_search(sample_query, top_k=2, type_filter="eat")  # or the merged results from RRF
prompt = build_prompt(sample_query, search_results)
print(prompt)


You are a helpful local travel assistant for Da Nang. Answer the QUESTION based on the CONTEXT from our database of places to eat, see, and stay.
Only use the facts from the CONTEXT when answering the QUESTION. If you don't know, say you don't know.

QUESTION: Where can I eat grilled fish in Da Nang?

CONTEXT:
Name: Bun Cha Ca Ba Lu
Type: eat
Description: grilled fish noodle soup
Time: morning
Price: 35k VND (~1.4$)
Location: 319 Hung Vuong
Area: center
Note: Bun Cha Ca Ba Lu is known for its rich fish-based broth, crispy fried fish cakes, and fresh herbs.

Name: Bun Cha Ca Ong Ta
Type: eat
Description: grilled fish cake noodle soup
Time: morning
Price: 35k VND (~1.4$)
Location: 113 Nguyen Van Thoai
Area: beach
Note: Ong Ta’s Bun Cha Ca is known for tender fish cake slices in a mildly sweet and spicy broth with green onions.


In [73]:
from openai import OpenAI

# OpenAI-compatible client pointed at a server on localhost:11434 instead of the OpenAI API.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  # placeholder value, not a real secret — presumably ignored by the local server; verify
)

In [84]:
def llm(prompt, model='llama3.1'):
    """Send a single-turn prompt to the chat model and return its reply text.

    Args:
        prompt: Full prompt text, sent as one ``user`` message.
        model: Name of the model to query; defaults to the model this
            notebook was written against.

    Returns:
        The content of the first choice in the chat completion response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content


In [80]:
#test llm
#print(llm("Hello, what is the capital of Viet Nam?"))

The capital of Vietnam is Hanoi.


In [85]:
def rag(query, type_filter=None, top_k=3):
    """Answer a question with retrieval-augmented generation.

    Retrieves the ``top_k`` best places with ``hybrid_search``, renders them
    into a prompt with ``build_prompt`` and returns the reply of ``llm``.

    Args:
        query: The user's question.
        type_filter: Optional ``type`` value forwarded to the search.
        top_k: Number of retrieved places placed in the prompt.

    Returns:
        The answer text produced by ``llm``.
    """
    retrieved = hybrid_search(query, top_k=top_k, type_filter=type_filter)
    return llm(build_prompt(query, retrieved))

# End-to-end test
#print(rag("Suggest a noodle soup for breakfast in the center", type_filter="eat"))
