In [1]:
import os
import pandas as pd
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
# from semantic_router.encoders import OpenAIEncoder
# from encoder import OpenAIEncoder

In [2]:
class Settings(BaseSettings):
    """Connection settings for the embedding service.

    pydantic-settings populates the fields from the environment and from
    ``../.env`` (UTF-8); entries in that file that are not declared here
    are ignored.
    """

    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # None of the fields has a default, so all three must be supplied.
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str


settings = Settings()
# Echo only the model name; the API key should never end up in cell output.
print(settings.embedding_model)

baai/bge-m3


# Prepare Embedder & DB

In [3]:
import os
from langchain_openai import OpenAIEmbeddings

# Publish the configured endpoint through the OPENAI_API_BASE environment
# variable for the rest of this kernel session.
os.environ.update(OPENAI_API_BASE=settings.embedding_base_url)

In [4]:
# Embedding client for the OpenAI-compatible endpoint described by `settings`.
embeddings = OpenAIEmbeddings(
    model=settings.embedding_model,
    api_key=settings.embedding_api_key,
    # Pass the endpoint explicitly so this client does not depend on a
    # process-wide environment variable having been set beforehand.
    base_url=settings.embedding_base_url,
    # The configured model is not an OpenAI model. With the default (True),
    # inputs are pre-tokenised with tiktoken and sent as token ids, which a
    # non-OpenAI backend cannot interpret correctly; send the raw text instead.
    check_embedding_ctx_length=False,
)

In [5]:
# Smoke test: embed two short strings and show the length of the first vector.
vectors = embeddings.embed_documents(
    [
        "hello",
        "goodbye",
    ]
)
len(vectors[0])

1024

In [6]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

In [7]:
# In-memory Qdrant instance: the collection lives only as long as this kernel.
client = QdrantClient(":memory:")

# Size the collection from the embedder itself instead of hard-coding it, so
# switching `embedding_model` in the .env file cannot cause a dimension mismatch.
embedding_dim = len(embeddings.embed_query("dimension probe"))

client.create_collection(
    collection_name="demo_collection",
    vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
)

# LangChain wrapper that embeds texts with `embeddings` and stores them in
# the collection created above.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="demo_collection",
    embedding=embeddings,
)

In [8]:
# Show the store's repr as a quick check that it was constructed.
vector_store

<langchain_qdrant.qdrant.QdrantVectorStore at 0x12bb8d6f0>

# Test Document

In [9]:
from uuid import uuid4
from langchain_core.documents import Document

In [10]:
# Two unrelated sample documents, each tagged with the kind of source it is.
document_1 = Document(
    metadata={"source": "tweet"},
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
)
document_2 = Document(
    metadata={"source": "news"},
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
)
documents = [document_1, document_2]

# One freshly generated UUID string per document.
uuids = [str(uuid4()) for _ in documents]

In [11]:
# Add the sample documents to the store under the pre-generated ids; the call's
# return value (displayed below) is the list of ids.
vector_store.add_documents(documents=documents, ids=uuids)

['45d27a41-87bc-40f7-b84e-3bc2dd54f7e0',
 '05c36f54-8575-4d65-bac4-ac51c7a8aa9b']

In [12]:
## Optional: delete the most recently added document by its id (left disabled)
# vector_store.delete(ids=[uuids[-1]])

# Test Search

In [13]:
# Fetch the two nearest documents for an unrelated query and list each one
# together with its metadata.
results = vector_store.similarity_search(
    query="LangChain provides abstractions to make working with LLMs easy",
    k=2,
)
for res in results:
    line = f"* {res.page_content} [{res.metadata}]"
    print(line)

* I had chocalate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet', '_id': '45d27a41-87bc-40f7-b84e-3bc2dd54f7e0', '_collection_name': 'demo_collection'}]
* The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news', '_id': '05c36f54-8575-4d65-bac4-ac51c7a8aa9b', '_collection_name': 'demo_collection'}]


## Dense Vector Search

In [14]:
from langchain_qdrant import RetrievalMode

# Build a second, independent in-memory store straight from the documents,
# using dense retrieval only.
qdrant = QdrantVectorStore.from_documents(
    documents,
    collection_name="my_documents",
    location=":memory:",
    retrieval_mode=RetrievalMode.DENSE,
    embedding=embeddings,
)

In [15]:
# Query the dense store with the default number of results and display them.
query = "What did the president say about Ketanji Brown Jackson"
found_docs = qdrant.similarity_search(query=query)
found_docs

[Document(metadata={'source': 'news', '_id': 'a7684053ae924b239437250f183bc5b8', '_collection_name': 'my_documents'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(metadata={'source': 'tweet', '_id': '0de4d6bf562e442b89381b48fdc7a372', '_collection_name': 'my_documents'}, page_content='I had chocalate chip pancakes and scrambled eggs for breakfast this morning.')]

## Hybrid

In [16]:
from langchain_qdrant import FastEmbedSparse, RetrievalMode

# Sparse encoder that is combined with the dense embeddings in hybrid mode.
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# Rebind `qdrant` to a new in-memory store that indexes both the dense and
# the sparse representation of every document.
qdrant = QdrantVectorStore.from_documents(
    documents,
    collection_name="my_documents",
    location=":memory:",
    retrieval_mode=RetrievalMode.HYBRID,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
)

In [17]:
# Run the same query against the hybrid store and display the matches.
query = "What did the president say about Ketanji Brown Jackson"
found_docs = qdrant.similarity_search(query=query)
found_docs

[Document(metadata={'source': 'news', '_id': 'eaa6f9687f164751a4f0c8990cea7527', '_collection_name': 'my_documents'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(metadata={'source': 'tweet', '_id': 'bc663eaffdd24c5281fd4763b08f27ee', '_collection_name': 'my_documents'}, page_content='I had chocalate chip pancakes and scrambled eggs for breakfast this morning.')]

## Metadata Filtering

In [18]:
from qdrant_client import models

# Restrict the search to documents whose metadata marks them as news.
# The previous filter exact-matched `page_content` against a sentence that is
# not in this collection, so it could never return a result and did not
# exercise metadata at all. Document metadata is stored under the "metadata"
# payload key by default, hence the dotted key path.
results = vector_store.similarity_search(
    query="What will the weather be like tomorrow?",
    k=1,
    filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.source",
                match=models.MatchValue(value="news"),
            ),
        ]
    ),
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")