In [1]:
!pip install elasticsearch sentence-transformers pandas tqdm

Collecting elasticsearch
  Downloading elasticsearch-9.0.2-py3-none-any.whl.metadata (8.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.17.1-py3-none-any.whl.metadata (3.8 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading hf_xet-1.1.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading regex-2024.11.6-cp312-c

In [2]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
es = Elasticsearch(
    hosts=["http://localhost:9200"],
    # Nếu không cần auth thì không cần user/pass
)
# Test kết nối
assert es.ping(), "Elasticsearch không kết nối được!"


AssertionError: Elasticsearch không kết nối được!

In [None]:
index_name = "places_danang"

mapping = {
    "mappings": {
        "properties": {
            "type": {"type": "keyword"},
            "name": {"type": "text"},
            "description": {"type": "text"},
            "time": {"type": "keyword"},
            "price": {"type": "keyword"},
            "location": {"type": "text"},
            "area": {"type": "keyword"},
            "note": {"type": "text"},
            "id": {"type": "keyword"},
            "full_text": {"type": "text"},  # Dùng cho BM25
            "vector_search": {
                "type": "dense_vector",
                "dims": 384,  # all-MiniLM-L6-v2 output 384 dimensions
                "index": True,           # Bật index cho vector search (Elastic >=8.5)
                "similarity": "cosine"   # hoặc "l2_norm"
            }
        }
    }
}


In [None]:
#Delete pervious index and create a new one: 
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, body=mapping)
print(f"Index `{index_name}` đã được tạo!")


In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')


In [None]:
df = pd.read_csv("data_danang_ok.csv")  # Đường dẫn file của bạn

# Tạo embedding cho từng record
def embed(text):
    return model.encode(text).tolist()

# Nếu cột vector_search đã có, bỏ qua đoạn này, còn không:
tqdm.pandas()
df["vector_search"] = df["full_text"].progress_apply(embed)

#indexing data to elasticsearch
for i, row in tqdm(df.iterrows(), total=len(df)):
    doc = row.to_dict()
    # Nếu vector_search dạng numpy, cần chuyển sang list
    es.index(index=index_name, id=doc["id"], document=doc)