In [1]:
import os
import pandas as pd
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
# from semantic_router.encoders import OpenAIEncoder
# from encoder import OpenAIEncoder

In [2]:
class Settings(BaseSettings):
    """Embedding-service configuration read from the environment and ``../.env``.

    Entries that do not match a declared field are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Endpoint, credential and model name of the embedding API.
    # None of them has a default, so all three must be provided.
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str


# Instantiate once for the whole notebook and echo the model name as a
# quick check that the configuration was picked up.
settings = Settings()
print(settings.embedding_model)

baai/bge-m3


In [3]:
class DBSettings(BaseSettings):
    """PostgreSQL connection settings read from the pgvector project's ``.env``.

    Entries that do not match a declared field are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file="database/pgvector_langchain/.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # All fields are required strings (no defaults).
    postgres_user: str
    postgres_password: str
    postgres_db: str
    # NOTE(review): not referenced by any code visible in this notebook — confirm
    # whether it is meant to hold the host name or a full URL.
    postgres_url: str
    postgres_port: str


# Instantiate once and echo the database name as a quick sanity check.
db_settings = DBSettings()
print(db_settings.postgres_db)

pgvector_langchain


# Prepare Embedder & DB

In [4]:
import os
from langchain_openai import OpenAIEmbeddings

# Export the configured embedding endpoint through the OPENAI_API_BASE
# environment variable for the current process.
os.environ.update(OPENAI_API_BASE=settings.embedding_base_url)

In [5]:
# Embedding client for the OpenAI-compatible endpoint described by `settings`.
embeddings = OpenAIEmbeddings(
    model=settings.embedding_model,
    api_key=settings.embedding_api_key,
    # Pass the endpoint explicitly so this cell does not silently depend on the
    # OPENAI_API_BASE environment variable having been set by another cell.
    base_url=settings.embedding_base_url,
    # The configured model (printed above as baai/bge-m3) is not an OpenAI
    # model. With the default length check enabled, inputs are tokenized
    # client-side with an OpenAI tokenizer and sent as token ids; disabling it
    # sends the raw text to the server instead.
    # NOTE(review): inputs longer than the model's context are then no longer
    # split automatically — confirm this is acceptable for the real corpus.
    check_embedding_ctx_length=False,
)

In [6]:
# Smoke test: embed two short strings and show the length of the first vector.
sample_texts = ["hello", "goodbye"]
vectors = embeddings.embed_documents(sample_texts)
len(vectors[0])

1024

## Prepare pgvector

In [7]:
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

In [10]:
# use psycopg3
from urllib.parse import quote

# Percent-encode the credentials before embedding them in the URL: characters
# such as "@", ":", "/" or "%" in a user name or password would otherwise be
# parsed as URL delimiters and corrupt the connection string.
# NOTE(review): the host is hard-coded to localhost although DBSettings also
# loads `postgres_url`, which is unused here — confirm which one is intended.
connection = "postgresql+psycopg://{user}:{password}@localhost:{port}/{database}".format(
    user=quote(db_settings.postgres_user, safe=""),
    password=quote(db_settings.postgres_password, safe=""),
    port=db_settings.postgres_port,
    database=db_settings.postgres_db,
)
collection_name = "demo_collection"

# Vector store over the database above, embedding texts with `embeddings`.
vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [11]:
# Bare expression: shows the object's repr as the cell output, confirming that
# the store was constructed.
vector_store

<langchain_postgres.vectorstores.PGVector at 0x12ca04280>

# Test Docs

In [12]:
from uuid import uuid4
from langchain_core.documents import Document

In [13]:
# Two small sample documents, each tagged with the kind of source it came from.
documents = [
    Document(
        page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
        metadata={"source": "tweet"},
    ),
    Document(
        page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
        metadata={"source": "news"},
    ),
]
# Keep the individual names available as well.
document_1, document_2 = documents

# One freshly generated random id (as a string) per document.
uuids = [str(uuid4()) for _ in documents]

In [14]:
# Store the sample documents under the pre-generated ids and display what
# the call returns.
inserted_ids = vector_store.add_documents(ids=uuids, documents=documents)
inserted_ids

['aaef2cd5-0f0f-45c5-b9f3-2806806bdf89',
 'f8894303-0376-404c-b517-d50326d5bac4']

In [15]:
# Plain similarity search: fetch the two closest documents for a free-text
# query and print each one's text followed by its metadata.
query_text = "LangChain provides abstractions to make working with LLMs easy"
results = vector_store.similarity_search(query_text, k=2)
for res in results:
    print("* {} [{}]".format(res.page_content, res.metadata))

* I had chocalate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]
* The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]


In [16]:
## filtering
# Run the same query once per source value, restricting the candidates with a
# metadata filter on "source", and print every hit under a heading.
for source in ("tweet", "news"):
    print(f"FILTERING WITH {source.upper()}")
    results = vector_store.similarity_search(
        "kitty", k=10, filter={"source": {"$in": [source]}}
    )
    for doc in results:
        print(f"* {doc.page_content} [{doc.metadata}]")

FILTERING WITH TWEET
* I had chocalate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]
FILTERING WITH NEWS
* The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]


In [17]:
# Fetch the single best match for the query together with its score.
# NOTE(review): the label says "SIM", but the score may be a distance (lower =
# closer) depending on the store's distance strategy — confirm before
# interpreting the number.
results = vector_store.similarity_search_with_score(query="cats", k=1)
for doc, score in results:
    # ".3f" = three decimal places; the previous "3f" only set a minimum field
    # width of 3 and therefore printed the default six decimals.
    print(f"* [SIM={score:.3f}] {doc.page_content} [{doc.metadata}]")

* [SIM=0.438491] I had chocalate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]
