In [1]:
# !pip install -U sentence-transformers qdrant-client llama-index llama-index-vector-stores-qdrant -q

In [2]:
import os
from typing import List

import pandas as pd
import torch
from llama_index.core import VectorStoreIndex, StorageContext, Document
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.qdrant import QdrantVectorStore
from pydantic import PrivateAttr
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

In [3]:
# Custom embedding class
class BGEEmbedding(BaseEmbedding):
    """LlamaIndex embedding adapter backed by a SentenceTransformer model.

    Queries and documents are encoded by the same model, with no instruction
    prefix added to either.
    """

    # The loaded SentenceTransformer; kept private so it is not a pydantic field.
    _model: SentenceTransformer = PrivateAttr()

    def __init__(self, model_name: str = "BAAI/bge-small-en", device: str = "cpu"):
        """Load the SentenceTransformer checkpoint ``model_name`` onto ``device``.

        Args:
            model_name: Name or path handed to ``SentenceTransformer``.
            device: Device string handed to ``SentenceTransformer`` (e.g. "cpu", "cuda").
        """
        # Forward the name so the base class's ``model_name`` field describes the
        # checkpoint that is actually loaded instead of staying at its default.
        super().__init__(model_name=model_name)
        self._model = SentenceTransformer(model_name, device=device)

    def _get_query_embedding(self, query: str) -> List[float]:
        """Return the embedding of a single query string as a list of floats."""
        return self._model.encode(query).tolist()

    def _get_text_embedding(self, text: str) -> List[float]:
        """Return the embedding of a single document text as a list of floats."""
        return self._model.encode(text).tolist()

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts with one ``encode`` call.

        Overrides the base-class hook so that a whole batch goes through the
        model together rather than one text per call.
        """
        if not texts:
            return []
        return self._model.encode(texts).tolist()

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Async entry point; delegates to the synchronous query encoder."""
        # NOTE: encoding runs synchronously, so the event loop is blocked meanwhile.
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Async entry point; delegates to the synchronous text encoder."""
        return self._get_text_embedding(text)

# Checkpoint used to build the embedding model below.
# Other candidates kept for reference:
#   "BAAI/bge-m3"
#   "AITeamVN/Vietnamese_Embedding"
#   "AITeamVN/Vietnamese_Reranker"
model_name = "BAAI/bge-small-en"

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

embed_model = BGEEmbedding(model_name=model_name, device=device)

In [4]:
# Credentials are read from the environment so the API key is never stored in
# the notebook. Set QDRANT_API_KEY (and optionally QDRANT_URL) before starting
# the kernel; a missing QDRANT_API_KEY raises KeyError here.
# NOTE(review): the key that was previously hardcoded in this cell should be
# treated as leaked and rotated.
qdrant_client = QdrantClient(
    url=os.environ.get(
        "QDRANT_URL",
        "https://09a6d049-00c4-4b77-8e95-1dcc9ea5df34.eu-west-1-0.aws.cloud.qdrant.io:6333",
    ),
    api_key=os.environ["QDRANT_API_KEY"],
)

In [5]:
def addIntoVectorDB(qdrant_client, embed_model, collection_name, filename,
                    chunk_size=512, chunk_overlap=10):
    """Read a CSV of descriptions, chunk them, embed them and store them in Qdrant.

    The CSV must have ``id`` and ``description`` columns (surrounding whitespace
    in the header names is ignored). Each non-empty description becomes one
    Document whose metadata carries the row's ``id`` as a string.

    Args:
        qdrant_client: Client passed to ``QdrantVectorStore``.
        embed_model: Embedding model passed to ``VectorStoreIndex``.
        collection_name: Name of the Qdrant collection to write into.
        filename: Path of the CSV file to load.
        chunk_size: Chunk size given to ``SentenceSplitter``.
        chunk_overlap: Chunk overlap given to ``SentenceSplitter``.

    Returns:
        The ``VectorStoreIndex`` built over the parsed nodes.

    Raises:
        KeyError: If the CSV lacks the ``id`` or ``description`` column.
    """
    df = pd.read_csv(filename)
    df.columns = df.columns.str.strip()

    # Fail up front with a readable message instead of a bare KeyError mid-iteration.
    missing = [col for col in ("id", "description") if col not in df.columns]
    if missing:
        raise KeyError(f"{filename} is missing required column(s): {missing}")

    documents = []
    for row_id, description in zip(df["id"], df["description"]):
        if pd.isna(description):
            continue
        text = str(description).strip()
        # Whitespace-only descriptions would otherwise be indexed as empty documents.
        if not text:
            continue
        documents.append(Document(text=text, metadata={"id": str(row_id)}))

    node_parser = SentenceSplitter.from_defaults(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)

    vector_store = QdrantVectorStore(collection_name=collection_name, client=qdrant_client)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Constructing the index embeds the nodes and writes them through the storage context.
    index = VectorStoreIndex(nodes, storage_context=storage_context, embed_model=embed_model, show_progress=True)
    return index

In [None]:
# Ingest the description CSV into the target Qdrant collection.
collection_name = "MSVD"
filename = "features/image_descriptions.csv"

res = addIntoVectorDB(
    qdrant_client=qdrant_client,
    embed_model=embed_model,
    collection_name=collection_name,
    filename=filename,
)

Parsing nodes:   0%|          | 0/1970 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1970 [00:00<?, ?it/s]

In [7]:
# Attach to the "MSVD" collection and wrap it in an index that uses the same
# embedding model for queries.
vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name="MSVD",
)
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model,
)

In [8]:
# Fetch the three most similar nodes for a sample query.
query = "a plane flying in the sky"
retriever = index.as_retriever(similarity_top_k=3)
nodes = retriever.retrieve(query)

In [9]:
# Print each retrieved node: 1-based rank, similarity score, metadata and content.
for rank, hit in enumerate(nodes, start=1):
    print(f"🔎 Kết quả {rank}:")
    print(f"Score: {hit.score:.4f}")
    print("Metadata:", hit.metadata)
    print(f"Nội dung: {hit.get_content()}\n")

🔎 Kết quả 1:
Score: 0.8941
Metadata: {'id': 'Gn4Iv5ARIXc_37_40'}
Nội dung: The image captures a moment of an airplane taking off from an airport runway. The plane is in the process of ascending into the sky, with its wings and tail clearly visible against the backdrop of a cloudy sky. The runway extends into the distance, flanked by a line of trees on either side. In the bottom right

🔎 Kết quả 2:
Score: 0.8787
Metadata: {'id': 'ZbzDGXEwtGc_6_15'}
Nội dung: The image captures a South African Airways Boeing 747-400 aircraft in flight against a clear blue sky. The aircraft is a large, white, four-engine jet with the airline's logo on the tail and the words "South African Airways" written on the fuselage. The plane is captured mid-flight

🔎 Kết quả 3:
Score: 0.8740
Metadata: {'id': '3chNlP5TeO8_0_10'}
Nội dung: The image captures a fighter jet soaring through a clear blue sky, leaving a trail of smoke behind it. The jet is moving at high speed, as indicated by the smoke trail. The sky is 