In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres.vectorstores import PGVector
from langchain_core.documents import Document
import uuid

In [2]:
# Load the document, split it into chunks

raw_document = TextLoader('data/PlainText.txt').load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
documents = text_splitter.split_documents(raw_document)

In [3]:
# embed each chunk and insert it into the vector store

embedding_model = OpenAIEmbeddings()
connection = 'postgresql+psycopg://langchain:langchain@localhost:6024/langchain'
db = PGVector.from_documents(documents, embedding_model, connection=connection)

#### Q: “How does PGVector store embeddings internally?”

#### A: PGVector stores embeddings as:

A row in a Postgres table

A single vector column = contiguous float array

Optional ANN index (IVFFLAT, HNSW)

Text + metadata stored in normal columns

This gives Postgres native vector search capabilities without external engines.

#### Q: “How does from_documents() batch embeddings?”

#### A: from_documents() batching pipeline:

Split list of docs into batches

Embed each batch with one model call

Insert each batch into Postgres in one DB operation

Return the final PGVector object

This batching behavior is essential for speed and avoiding rate limits.

#### Q: “What does the table schema created by PGVector look like?”

#### A: Main Table

CREATE TABLE langchain_pg_embedding (
    id SERIAL PRIMARY KEY,
    collection_id INTEGER REFERENCES langchain_pg_collection(id) ON DELETE CASCADE,
    embedding vector(<dim>),        -- pgvector column
    document TEXT,                  -- chunk content
    metadata JSONB,                 -- metadata for the chunk
    ctime TIMESTAMPTZ DEFAULT now() -- insertion timestamp
);

#### Supporting table:

CREATE TABLE langchain_pg_collection (
    id SERIAL PRIMARY KEY,
    name TEXT UNIQUE NOT NULL
);

#### Q: “How do I query similar vectors from this database?”

#### A: Using LangChain PGVector wrapper:

results = db.similarity_search("What is this text about?", k=5) # db is the PGVector instance created earlier

results -> list of Document objects (page_content, metadata)

with scores (if supported version)

results_with_scores = db.similarity_search_with_score("What is this text about?", k=5)

returns [(Document, score), ...]

### Things to try:

the actual internal code path in LangChain

how to override batch size

how to manually batch embeddings using LCEL runnables

how to profile embedding throughput for large corpora

#### Exploring the tables

In [4]:
from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg://langchain:langchain@localhost:6024/langchain")

with engine.connect() as conn:
    print(conn.execute(text("SELECT table_name FROM information_schema.tables WHERE table_schema='public'")).fetchall())

[('langchain_pg_collection',), ('langchain_pg_embedding',)]


In [5]:
with engine.connect() as conn:
    print(conn.execute(text("SELECT column_name, data_type FROM information_schema.columns WHERE table_name='langchain_pg_embedding'")).fetchall())

[('collection_id', 'uuid'), ('embedding', 'USER-DEFINED'), ('cmetadata', 'jsonb'), ('id', 'character varying'), ('document', 'character varying')]


In [6]:
with engine.connect() as conn:
    rows = conn.execute(text("""
    SELECT * FROM langchain_pg_embedding LIMIT 5;
    """)).fetchall()

for row in rows:
    print(row)

('44778751-7166-4ee5-8af7-73d661893dd1', UUID('ca89a2e2-361a-4ae5-a07a-1dd2b64fedf2'), '[-0.0048253546,0.014394158,-0.028047597,-0.019942425,-0.0017111313,0.038944706,-0.0046864697,-0.05162239,0.0045369016,-0.015113509,0.02340386,0.01800 ... (19113 characters truncated) ... 027007742,0.02321868,0.018047895,-0.0015410865,-0.006816036,-0.029030474,0.013361425,-0.0032353024,0.0074499203,-0.016124874,0.011879988,-0.04754844]', 'A TXT file is\xa0a type of file that stores plain text without any special formatting, styling, or', {'source': 'data/PlainText.txt'})
('2ea11aae-37a0-4f7d-996a-497150e89ac4', UUID('ca89a2e2-361a-4ae5-a07a-1dd2b64fedf2'), '[-0.006112092,-0.003568447,0.002396768,-0.022445498,-0.002253229,0.026664877,-0.03223286,-0.03306071,0.0022982934,-0.013753042,-0.00085205433,0.02615 ... (19058 characters truncated) ... ,-0.01187702,0.033514693,0.010147875,-0.0072103324,-0.012217508,-0.0154221,0.007310476,0.0038955824,0.010975729,-0.015662445,0.008078243,-0.03132489]', 'almost an

In [7]:
for r in rows:
    print("ID:", r.id)
    print("Document:", r.document[:200], "...")
    print("Metadata:", r.cmetadata)
    print("Embedding length:", len(r.embedding))
    print("-----")

ID: 44778751-7166-4ee5-8af7-73d661893dd1
Document: A TXT file is a type of file that stores plain text without any special formatting, styling, or ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19411
-----
ID: 2ea11aae-37a0-4f7d-996a-497150e89ac4
Document: almost any device using a basic text editor like Notepad on Windows or TextEdit on Mac. These files ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19356
-----
ID: bc5ef998-a969-40b5-8a1d-38e79ea5444e
Document: on Mac. These files are used for storing simple text documents, source code, and configuration ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19388
-----
ID: bc290200-5ee3-49f2-8d54-070cf5685258
Document: and configuration data. ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19480
-----
ID: e86f69ca-c119-4178-ad2e-ebe016e23c74
Document: Key characteristics ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19451
-----


#### Here the length of embeddings are different because DB driver didn’t know how to decode the vector column, so it returned the raw textual representation.

In [8]:
row = rows[0]
print(type(row.embedding))
print(repr(row.embedding)[:200])

<class 'str'>
'[-0.0048253546,0.014394158,-0.028047597,-0.019942425,-0.0017111313,0.038944706,-0.0046864697,-0.05162239,0.0045369016,-0.015113509,0.02340386,0.01800516,-0.013803007,-0.00502834,-0.016908327,0.014230


In [9]:
import ast

for row in rows:
    vec = ast.literal_eval(row.embedding)
    print(len(vec))

1536
1536
1536
1536
1536


#### Search documents

In [10]:
similar = db.similarity_search("computer", k=4)
similar

[Document(id='e281f62b-dd83-441c-bed8-3a65f2d7b3fa', metadata={'source': 'data/PlainText.txt'}, page_content='computer code or scripts.'),
 Document(id='37bae833-ec9a-49cb-8bf6-8a364a565eb6', metadata={'source': 'data/PlainText.txt'}, page_content='computer code or scripts.'),
 Document(id='4bde5211-7a6d-42b9-8af0-d1fd8868f90b', metadata={'source': 'data/PlainText.txt'}, page_content='computer code or scripts.'),
 Document(id='1f0a2634-6f39-4747-9677-da40af3ff6bd', metadata={'source': 'data/PlainText.txt'}, page_content='computer code or scripts.')]

#### How it works:

• The search query—in this case, the word query—will be sent to the embeddings
model to retrieve its embedding.
• Then, it will run a query on Postgres to find the N (in this case 4) previously
stored embeddings that are most similar to your query.
• Finally, it will fetch the text content and metadata that relates to each of those
embeddings.
• The model can now return a list of Document sorted by how similar they are to
the query—the most similar first, the second most similar after, and so on.

In [11]:
ids = [str(uuid.uuid4()), str(uuid.uuid4())]
db.add_documents([
    Document(
        page_content="there are three cats in the bed",
        metadata={"location": "bed", "topic": "animals"},
    ),
    Document(
        page_content="there are also dogs in the bed",
        metadata={"location": "bed", "topic": "animals"},
        ),
    ],
    ids=ids,)

['4d4e656e-5688-41ec-b889-47badaed899c',
 'e7635ea7-eb5c-43e5-88b5-7473fae867ee']

In [12]:
# delete operation
db.delete(ids=["6f44c5f4-4db2-4b3f-98da-402e2a073f57"])

In [13]:
similar = db.similarity_search("cats", k=2)
similar

[Document(id='a80f40cc-c5e3-4afb-b570-ba317a9339a6', metadata={'topic': 'animals', 'location': 'bed'}, page_content='there are three cats in the bed'),
 Document(id='79077df0-32ef-40fd-a65c-c0da5c2a0eba', metadata={'topic': 'animals', 'location': 'bed'}, page_content='there are three cats in the bed')]

### Tracking changes to the documents

In [14]:
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "my_docs"
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Example docs
docs = [
    Document(page_content="there are cats in the pond", metadata={"source":"cats.txt"}),
    Document(page_content="there are also ducks in the pond", metadata={"source":"ducks.txt"}),
]

# Determine the set of source IDs you will index
sources = [d.metadata.get("source") for d in docs if d.metadata.get("source")]
sources

# Connect to DB and delete any existing vectors with the same source metadata
engine = create_engine(connection)
placeholders = ", ".join([f":s{i}" for i in range(len(sources))]) or "NULL"

sql = text(f"""
    DELETE FROM langchain_pg_embedding
    WHERE cmetadata->>'source' IN ({placeholders})
    """)

params = {f"s{i}": src for i, src in enumerate(sources)}

with engine.begin() as conn:
    # if there are no sources this will be a no-op
    if sources:
        conn.execute(sql, params)

db = PGVector.from_documents(docs, embedding_model, connection=connection, collection_name=collection_name)

results = db.similarity_search("pond animals", k=3)
for r in results:
    print(r.metadata, r.page_content[:120])

{'source': 'cats.txt'} there are cats in the pond
{'source': 'ducks.txt'} there are also ducks in the pond


In [15]:
docs = [
    Document(page_content="there are buffalos in the pond", metadata={"source":"cats.txt"}),
    Document(page_content="there are also ducks in the pond", metadata={"source":"ducks.txt"}),
]

sources = [d.metadata.get("source") for d in docs if d.metadata.get("source")]
engine = create_engine(connection)
placeholders = ", ".join([f":s{i}" for i in range(len(sources))]) or "NULL"

sql = text(f"""
    DELETE FROM langchain_pg_embedding
    WHERE cmetadata->>'source' IN ({placeholders})
""")

params = {f"s{i}": src for i, src in enumerate(sources)}

with engine.begin() as conn:
    # if there are no sources this will be a no-op
    if sources:
        conn.execute(sql, params)

db = PGVector.from_documents(docs, embedding_model, connection=connection, collection_name=collection_name)

results = db.similarity_search("pond animals", k=3)
for r in results:
    print(r.metadata, r.page_content[:120])

{'source': 'cats.txt'} there are buffalos in the pond
{'source': 'ducks.txt'} there are also ducks in the pond


In [16]:
# Notice how cats got changed with buffalo when we deleted the old source and added the new

In [17]:
collection_name = "black_holes"

docs = [
    Document(page_content="Ton618 is the largest know black hole in the observable univers", metadata={"id":"1", "source":"ton.txt"}),
    Document(page_content="Sagittarius a* is pretty small as far as black holes go", metadata={"id":"2", "source":"nei.txt"}),
]

sources = [d.metadata.get("id") for d in docs if d.metadata.get("id")]
placeholders = ", ".join([f":id{i}" for i in range(len(sources))]) or "NULL"

sql = text(f"""
    DELETE FROM langchain_pg_embedding
    WHERE cmetadata->>'id' IN ({placeholders})
""")

params = {f"id{i}": src for i, src in enumerate(sources)}

with engine.begin() as conn:
    if sources:
        conn.execute(sql, params)

db = PGVector.from_documents(docs, embedding_model, connection=connection, collection_name=collection_name)

db.similarity_search("black hole", k=2)

[Document(id='cae2e737-ae51-47e7-a186-6337cd5587e8', metadata={'id': '2', 'source': 'nei.txt'}, page_content='Sagittarius a* is pretty small as far as black holes go'),
 Document(id='c536fa96-5a01-44cb-a661-21a3e9c694d8', metadata={'id': '1', 'source': 'ton.txt'}, page_content='Ton618 is the largest know black hole in the observable univers')]

In [18]:
docs = [
    Document(page_content="Ton618 is the largest know black hole in the observable univers", metadata={"id":"1", "source":"ton.txt"}),
    Document(page_content="Messier 87 is the black holes to be picuted using the event horizon telescope", metadata={"id":"2", "source":"nei.txt"}),
]

sources = [d.metadata.get("id") for d in docs if d.metadata.get("id")]
placeholders = ", ".join([f":id{i}" for i in range(len(sources))]) or "NULL"

sql = text(f"""
    DELETE FROM langchain_pg_embedding
    WHERE cmetadata->>'id' IN ({placeholders})
""")

params = {f"id{i}": src for i, src in enumerate(sources)}

with engine.begin() as conn:
    if sources:
        conn.execute(sql, params)

db = PGVector.from_documents(docs, embedding_model, connection=connection, collection_name=collection_name)

db.similarity_search("black hole", k=2)

[Document(id='d7c01236-ec2f-4fa6-bfcb-a8fe38634a98', metadata={'id': '1', 'source': 'ton.txt'}, page_content='Ton618 is the largest know black hole in the observable univers'),
 Document(id='587f30ae-5961-4125-8388-4ee004a86fca', metadata={'id': '2', 'source': 'nei.txt'}, page_content='Messier 87 is the black holes to be picuted using the event horizon telescope')]

Using the id instead of the source to replace the old documents

### Indexing Optimization

#### MultiVectorRetriever

In [19]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_postgres.vectorstores import PGVector
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel
from langchain_core.runnables import RunnablePassthrough
from langchain_core.documents import Document
import uuid

connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "harry_potter_summaries"
embeddings_model = OpenAIEmbeddings()

# load document
docs = TextLoader('data/HP1.txt', encoding='utf-8').load()
print('Length of loaded docs: ', len(docs[0].page_content))

# split the document
splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=500)
chunks = splitter.split_documents(docs)

prompt_text = "Summarize the following document:\n\n{doc}"

prompt = ChatPromptTemplate.from_template(prompt_text)

llm = ChatOpenAI(temperature=0., model='gpt-3.5-turbo')
summarize_chain = {"doc": lambda x: x.page_content} | prompt | llm | StrOutputParser()

# Batch the chain across chunks
summaries = summarize_chain.batch(chunks, {"max_concurracny": 5})

Length of loaded docs:  457730


In [20]:
len("".join(summaries))

22012

In [21]:
summaries

["Mr. and Mrs. Dursley, a normal couple living on Privet Drive, are disturbed by strange occurrences in their town, including people in cloaks, owls flying in daylight, and shooting stars. Mr. Dursley becomes increasingly worried when he hears whispers about the Potters, his wife's estranged sister. The couple tries to ignore the strange events, but Mr. Dursley eventually brings up the topic to his wife, who reacts angrily.",
 'Mr. Dursley overhears his wife, Mrs. Dursley, discussing strange occurrences in town, including owls and shooting stars. They mention the name "Potter" and their son, Harry. Meanwhile, Albus Dumbledore and Professor McGonagall discuss the disappearance of Voldemort and the attack on the Potter family, resulting in the death of Lily and James Potter but the survival of their son, Harry. They speculate on how Harry survived and await the arrival of Hagrid.',
 "The document describes a scene where Professor McGonagall and Dumbledore discuss bringing Harry Potter to

In [22]:
id_key = "doc_id"
doc_ids = [str(uuid.uuid4()) for _ in chunks]

# Each summary is linked to the original document by the doc_id
summary_docs = [Document(page_content=s, metadata={id_key:doc_ids[i]}) for i, s in enumerate(summaries)]

vectorstore = PGVector.from_documents(
    documents=summary_docs,
    embedding=embeddings_model,
    connection=connection,
    collection_name=collection_name,
)

In [23]:
class SimpleInMemoryStore:
    def __init__(self):
        self._store = {}

    def mset(self, pairs):
        # pairs of (doc_id, document)
        for k, v in pairs:
            self._store[k] = v

    def mget(self, keys):
        return [self._store.get(key) for key in keys]

    def get(self, key):
        return self._store.get(key)

store = SimpleInMemoryStore()
store.mset(list(zip(doc_ids, chunks)))

In [24]:
store.get(doc_ids[-1])

Document(metadata={'source': 'data/HP1.txt'}, page_content='It took quite a while for them all to get off the platform. A wizened old guard was up by the ticket barrier, letting them go through the gate in twos and threes so they didnâ€™t attract attention by all bursting out of a solid wall at once and alarming the Muggles.\n\nâ€œYou must come and stay this summer,â€\x9d said Ron, â€œboth of you â€” Iâ€™ll send you an owl.â€\x9d\n\nâ€œThanks,â€\x9d said Harry, â€œIâ€™ll need something to look forward to.â€\x9d People jostled them as they moved forward toward the gateway back to the Muggle world. Some of them called:\n\nâ€œBye, Harry!â€\x9d\n\nâ€œSee you, Potter!â€\x9d\n\nâ€œStill famous,â€\x9d said Ron, grinning at him.\n\nâ€œNot where Iâ€™m going, I promise you,â€\x9d said Harry.\n\nHe, Ron, and Hermione passed through the gateway together. â€œThere he is, Mom, there he is, look!â€\x9d\n\nIt was Ginny Weasley, Ronâ€™s younger sister, but she wasnâ€™t pointing at Ron.\n\nâ€œHarry Pott

In [25]:
class SimpleMultiRetriever:
    def __init__(self, vectorstore: PGVector, docstore: SimpleInMemoryStore, id_key: str = "doc_id"):
        self.vectorstore = vectorstore
        self.docstore = docstore
        self.id_key = id_key

    def similarity_search(self, query: str, k: int = 4):
        # return similar summaries from the vector store
        return self.vectorstore.similarity_search(query, k=k)

    def invoke(self, query:str, k: int = 4):
        similar_docs = self.similarity_search(query, k)

        doc_ids = []
        for s in similar_docs:
            mid = s.metadata or {}
            doc_id = mid.get(self.id_key)
            if doc_id:
                ids.append(doc_id)
            original_docs = self.docstore.mget(ids)
            return [d for d in original_docs if d is not None]

        
retriever = SimpleMultiRetriever(vectorstore, store, id_key=id_key)
retriever.similarity_search("Nicolas Flamel", 4)

[Document(id='7c32e1a1-4ead-4570-9a5c-d37dd0c12a1e', metadata={'doc_id': '1d1bd47a-329f-429a-b92b-b7f264f1a6c7'}, page_content="The document describes a scene from Harry Potter and the Sorcerer's Stone where Harry, Ron, and Hermione are trying to figure out who Nicolas Flamel is and why Snape is trying to steal something. They search the library for information on Flamel, but are unable to find anything. The trio spends their holidays at Hogwarts, enjoying their time together and plotting ways to get Malfoy expelled. Ron teaches Harry wizard chess, using an old set that belonged to his grandfather."),
 Document(id='fab4b63a-c59b-46cd-be9e-e18848f2c985', metadata={'id': 'bdaee4f1-9b88-4523-8182-bbef65542081'}, page_content="The document describes a scene from Harry Potter and the Sorcerer's Stone where Harry, Ron, and Hermione are trying to figure out who Nicolas Flamel is and why Snape is trying to steal something. They search the library for information on Flamel, but are unable to fi

In [26]:
retriever.invoke("Nicolas Flamel", 4)

[Document(metadata={'source': 'data/HP1.txt'}, page_content="â€œBut Snapeâ€™s trying to steal it.â€\x9d\n\nâ€œRubbish,â€\x9d said Hagrid again. â€œSnapeâ€™s a Hogwarts teacher, heâ€™d do nothinâ€™ of the sort.â€\x9d\n\nâ€œSo why did he just try and kill Harry?â€\x9d cried Hermione.\n\nThe afternoonâ€™s events certainly seemed to have changed her mind about Snape.\n\nâ€œI know a jinx when I see one, Hagrid, Iâ€™ve read all about them! Youâ€™ve got to keep eye contact, and Snape wasnâ€™t blinking at all, I saw him!â€\x9d\n\nâ€œIâ€™m tellinâ€™ yeh, yer wrong!â€\x9d said Hagrid hotly. â€œI donâ€™ know why Harryâ€™s broom acted like that, but Snape wouldnâ€™ try anâ€™ kill a student! Now, listen to me, all three of yeh â€” yer meddlinâ€™ in things that donâ€™ concern yeh. Itâ€™s dangerous. You forget that dog, anâ€™ you forget what itâ€™s guardinâ€™, thatâ€™s between Professor Dumbledore anâ€™ Nicolas Flamel â€”â€\x9d\n\nâ€œAha!â€\x9d said Harry, â€œso thereâ€™s someone called Nicolas Flame