In [21]:
from dotenv import load_dotenv
load_dotenv(override=True)

import json
from langchain_community.document_loaders import TextLoader
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from uuid import uuid4
from qdrant_client import QdrantClient

# start docker:
# docker pull qdrant/qdrant
# docker run -p 6333:6333 -v C:\Users\Justyna\Documents\GitHub\AI_DEV2\qdrant_data:/qdrant/storage qdrant/qdrant

In [24]:
# połączenie z Qdrant
MEMORY_PATH = "memory.md"
COLLECTION_NAME = "ai_devs"

qdrant = QdrantClient()
embeddings = OpenAIEmbeddings()
result = qdrant.get_collections()
indexed = next((collection for collection in result.collections if collection.name == COLLECTION_NAME), None)
print(result)

# stworzenie kolekcji jeśli nie istnieje
if not indexed:
    qdrant.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={"size": 1536, "distance": "Cosine", "on_disk": True},
    )

collection_info = qdrant.get_collection(collection_name=COLLECTION_NAME)

collections=[]


In [25]:
# jeśli nie ma dokumentów w kolekcji
if not collection_info.points_count:
    # wczytanie pliku
    loader = TextLoader(MEMORY_PATH)
    memory = loader.load()
    documents = [Document(page_content=content) for content in memory[0].page_content.split("\n\n")]

    # dodanie metadanych
    for document in documents:
        document.metadata["source"] = COLLECTION_NAME
        document.metadata["content"] = document.page_content
        document.metadata["uuid"] = str(uuid4()) # unikalny identyfikator (będzie przydatny później np. do filtrowania)

    # generowanie embeddingów
    points = []
    for document in documents:
        embedding = embeddings.embed_documents([document.page_content])[0]
        points.append(
            {
                "id": document.metadata["uuid"],
                "payload": document.metadata,
                "vector": embedding,
            }
        )

    # index
    qdrant.upsert(
        collection_name=COLLECTION_NAME,
        wait=True,
        points=points,
    )

# collection_info = qdrant.get_collection(collection_name=COLLECTION_NAME)
# json.loads(collection_info.json())

In [20]:
# przeszukiwanie dokumentów w wybranej kolekcji powiązanych z pytaniem
query = "Do you know the name of Adam's dog?"
query_embedding = embeddings.embed_query(query)

search_result = qdrant.search(
    collection_name=COLLECTION_NAME,
    query_vector=query_embedding,
    limit=2,
    query_filter={"must": [{"key": "source", "match": {"value": COLLECTION_NAME}}]},
)

for result in search_result:
    print("ID: ", result.id)
    print("Score: ", result.score)
    print(json.dumps(result.payload,indent=4))

ID:  f876900e-659a-4ae3-bc13-3761b8506d37
Score:  0.9010632
{
    "content": "Adam have a dog named Alexa.",
    "source": "ai_devs",
    "uuid": "f876900e-659a-4ae3-bc13-3761b8506d37"
}
ID:  24a37b68-baba-4caa-b656-30e8092a304f
Score:  0.84295344
{
    "content": "Adam lives in Krakow with his fianc\u0102\u00a9e and dog.",
    "source": "ai_devs",
    "uuid": "24a37b68-baba-4caa-b656-30e8092a304f"
}
