In [1]:
import os
import pandas as pd
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

In [2]:
class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str

settings = Settings()
print(settings.embedding_model)

baai/bge-m3


In [3]:
class DBSettings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file="database/weaviate_langchain/.env", env_file_encoding="utf-8", extra="ignore"
    )
    weaviate_port: str
    weaviate_grpc_port: str

db_settings = DBSettings()
print(db_settings.weaviate_port)

6026


# Prepare Embedder

In [4]:
import os
from langchain_openai import OpenAIEmbeddings

os.environ["OPENAI_API_BASE"] = "{}/v1/".format(settings.embedding_base_url)
embeddings = OpenAIEmbeddings(
    model=settings.embedding_model,
    api_key=settings.embedding_api_key
)

In [5]:
vectors = embeddings.embed_documents(["hello", "goodbye"])
len(vectors[0])

1024

# Prepare DB

In [6]:
import weaviate
from langchain_weaviate.vectorstores import WeaviateVectorStore

In [8]:
# Initialize Cliane
# https://weaviate-python-client.readthedocs.io/en/stable/
weaviate_client = weaviate.connect_to_local(
    host = "localhost",
    port = db_settings.weaviate_port,
    grpc_port = db_settings.weaviate_grpc_port,
)

In [9]:
# https://python.langchain.com/api_reference/weaviate/vectorstores/langchain_weaviate.vectorstores.WeaviateVectorStore.html
# text_key: Key to use for uploading/retrieving text to/from vectorstore.
vector_store = WeaviateVectorStore(
    client = weaviate_client,
    embedding=embeddings,
    text_key="text",
    index_name=None
)

In [10]:
vector_store

<langchain_weaviate.vectorstores.WeaviateVectorStore at 0x12b8b8ac0>

# Test Docs

In [11]:
from uuid import uuid4
from langchain_core.documents import Document

In [12]:
document_1 = Document(
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)
documents = [
    document_1,
    document_2
]
uuids = [str(uuid4()) for _ in range(len(documents))]

In [13]:
vector_store.add_documents(documents=documents, ids=uuids)

['9fbff2ba-8475-43b2-93bc-b6b19bc2c9a8',
 'a36f59ec-7348-47d9-8b78-64ac30f8a768']

In [14]:
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy", k=2
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* I had chocalate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]
* The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]


In [15]:
## filtering -> no filter support
print("FILTERING WITH TWEET")
results = vector_store.similarity_search(
    "kitty", k=10, filter={"source": {"$in": ["tweet"]}}
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")
    
print("FILTERING WITH NEWS")
results = vector_store.similarity_search(
    "kitty", k=10, filter={"source": {"$in": ["news"]}}
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

FILTERING WITH TWEET


TypeError: _HybridQueryAsync.hybrid() got an unexpected keyword argument 'filter'