In [1]:
import os
import pandas as pd
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict

In [2]:
class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str

settings = Settings()
print(settings.embedding_model)

baai/bge-m3


In [3]:
class DBSettings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file="database/pgvector_langchain/.env", env_file_encoding="utf-8", extra="ignore"
    )
    postgres_user: str
    postgres_password: str
    postgres_db: str
    postgres_url: str
    postgres_port: str

db_settings = DBSettings()
print(db_settings.postgres_db)

pgvector_langchain


# Prepare Embedder & Vector Store

In [4]:
## Embedder
import os
from langchain_openai import OpenAIEmbeddings

os.environ["OPENAI_API_BASE"] = "{}/v1/".format(settings.embedding_base_url)
embeddings = OpenAIEmbeddings(
    model=settings.embedding_model,
    api_key=settings.embedding_api_key
)
vectors = embeddings.embed_documents(["hello", "goodbye"])
len(vectors[0])

1024

In [5]:
## Prepare Connection
from langchain_core.documents import Document

# use psycopg3
connection = "postgresql+psycopg://{}:{}@localhost:{}/{}".format(
    db_settings.postgres_user,
    db_settings.postgres_password,
    db_settings.postgres_port,
    db_settings.postgres_db
)

# Make 2 Collections

In [6]:
from uuid import uuid4
from langchain_core.documents import Document

In [7]:
document_1 = Document(
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)
documents = [document_1, document_2]
uuids_1 = [str(uuid4()) for _ in range(len(documents))]
print(uuids_1)
uuids_2 = [str(uuid4()) for _ in range(len(documents))]
print(uuids_2)

['c332e63d-fc81-402c-bf62-99418a4d1345', '30a011d9-aabd-425b-a2f8-09fa3e203c1b']
['ca53bd93-4704-4f5a-8578-8261ea064339', '084bd6d1-6289-46c2-80a3-897cfd03a1f6']


In [8]:
from langchain_postgres.vectorstores import PGVector
## collection 1
collection1_name = "demo_collection"
collection1_vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection1_name,
    connection=connection,
    use_jsonb=True,
)
print(collection1_vector_store)

## collection2
collection2_name = "demo_collection2"
collection2_vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection2_name,
    connection=connection,
    use_jsonb=True,
)
print(collection2_vector_store)

<langchain_postgres.vectorstores.PGVector object at 0x1247e83d0>
<langchain_postgres.vectorstores.PGVector object at 0x1247e87f0>


In [9]:
## Insert to Collection1
collection1_ids = collection1_vector_store.add_documents(documents=documents, ids=uuids_1)

In [10]:
## Insert to Collection2
collection2_ids = collection2_vector_store.add_documents(documents=documents, ids=uuids_2)

In [11]:
print(collection1_ids)
print(collection2_ids)

['c332e63d-fc81-402c-bf62-99418a4d1345', '30a011d9-aabd-425b-a2f8-09fa3e203c1b']
['ca53bd93-4704-4f5a-8578-8261ea064339', '084bd6d1-6289-46c2-80a3-897cfd03a1f6']


# Try Retrieval Directly

In [19]:
from sqlalchemy.orm import (
    Session,
    declarative_base,
    relationship,
    scoped_session,
    sessionmaker,
)
from sqlalchemy import create_engine, Column, String, JSON, ForeignKey, Index, select
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from pgvector.sqlalchemy import Vector

In [28]:
# Define the Base and Engine
Base = declarative_base()
engine = create_engine(connection)
Session = sessionmaker(bind=engine)

# Define the Model
class LangChainEmbedding(Base):
    __tablename__ = "langchain_pg_embedding"

    id = Column(String, primary_key=True)
    collection_id = Column(UUID, ForeignKey("langchain_pg_collection.uuid"))
    embedding = Column(Vector(1024))  # Replace 1536 with your embedding dimension
    document = Column(String)
    cmetadata = Column(JSON)

    # Index for vector similarity
    __table_args__ = (
        Index("ix_embedding_vector", "embedding", postgresql_using="ivfflat"),
    )

  Base = declarative_base()


In [35]:
# Perform Similarity Search
def search_similar_embeddings(query_vector, collection_id, top_k=5):
    """
    Perform similarity search on the langchain_pg_embedding table.
    Args:
        query_vector (list): The query embedding vector.
        collection_id (str): The UUID of the collection to search within.
        top_k (int): Number of top results to return.
    Returns:
        list: List of matching rows with similarity scores.
    """
    with Session() as session:
        # SQLAlchemy query
        stmt = (
            select(
                LangChainEmbedding.id,
                LangChainEmbedding.document,
                LangChainEmbedding.cmetadata,
                LangChainEmbedding.embedding.cosine_distance(query_vector).label("similarity") # 0~2
            )
            .where(LangChainEmbedding.collection_id == collection_id)
            .order_by("similarity")
            .limit(top_k)
        )

        results = session.execute(stmt).fetchall()

    # Parse results
    return [
        {
            "id": row.id,
            "document": row.document,
            "cmetadata": row.cmetadata,
            "similarity": row.similarity,
        }
        for row in results
    ]

In [36]:
# Example Usage
query_vector = [0.1]*1024
print(len(query_vector))
collection_id = "054bd89a-e570-4fb4-8466-e7ff6cd644ea"  # Replace with your collection UUID
results = search_similar_embeddings(query_vector, collection_id)

# Print results
for result in results:
    print(f"ID: {result['id']}, Similarity: {result['similarity']}, Document: {result['document']}")

1024
ID: f5de7a22-5f19-4ce3-a667-aec3f62744ac, Similarity: 1.021492688781572, Document: The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.
ID: 30a011d9-aabd-425b-a2f8-09fa3e203c1b, Similarity: 1.021492688781572, Document: The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.
ID: 1f5ee93e-30b5-4b97-aeb5-b6800303a751, Similarity: 1.0296706098358088, Document: I had chocalate chip pancakes and scrambled eggs for breakfast this morning.
ID: c332e63d-fc81-402c-bf62-99418a4d1345, Similarity: 1.0296706098358088, Document: I had chocalate chip pancakes and scrambled eggs for breakfast this morning.


In [37]:
# https://python.langchain.com/api_reference/_modules/langchain_postgres/vectorstores.html#PGVector
session_maker = scoped_session(sessionmaker(bind=collection1_vector_store._engine))
collection1_vector_store.EmbeddingStore

langchain_postgres.vectorstores._get_embedding_collection_store.<locals>.EmbeddingStore

In [33]:
with session_maker() as session:
    pass

In [34]:
session = collection1_vector_store._make_sync_session()
with session:
    print(session)

<contextlib._GeneratorContextManager object at 0x12d76aaa0>
