In [3]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres.vectorstores import PGVector
from langchain_core.documents import Document
import uuid

In [4]:
# Load the document, split it into chunks

raw_document = TextLoader('data/PlainText.txt').load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
documents = text_splitter.split_documents(raw_document)

In [5]:
# embed each chunk and insert it into the vector store

embedding_model = OpenAIEmbeddings()
connection = 'postgresql+psycopg://langchain:langchain@localhost:6024/langchain'
db = PGVector.from_documents(documents, embedding_model, connection=connection)

#### Q: “How does PGVector store embeddings internally?”

#### A: PGVector stores embeddings as:

A row in a Postgres table

A single vector column = contiguous float array

Optional ANN index (IVFFLAT, HNSW)

Text + metadata stored in normal columns

This gives Postgres native vector search capabilities without external engines.

#### Q: “How does from_documents() batch embeddings?”

#### A: from_documents() batching pipeline:

Split list of docs into batches

Embed each batch with one model call

Insert each batch into Postgres in one DB operation

Return the final PGVector object

This batching behavior is essential for speed and avoiding rate limits.

#### Q: “What does the table schema created by PGVector look like?”

#### A: Main Table

CREATE TABLE langchain_pg_embedding (
    id SERIAL PRIMARY KEY,
    collection_id INTEGER REFERENCES langchain_pg_collection(id) ON DELETE CASCADE,
    embedding vector(<dim>),        -- pgvector column
    document TEXT,                  -- chunk content
    metadata JSONB,                 -- metadata for the chunk
    ctime TIMESTAMPTZ DEFAULT now() -- insertion timestamp
);

#### Supporting table:

CREATE TABLE langchain_pg_collection (
    id SERIAL PRIMARY KEY,
    name TEXT UNIQUE NOT NULL
);

#### Q: “How do I query similar vectors from this database?”

#### A: Using LangChain PGVector wrapper:

results = db.similarity_search("What is this text about?", k=5) # db is the PGVector instance created earlier

results -> list of Document objects (page_content, metadata)

with scores (if supported version)

results_with_scores = db.similarity_search_with_score("What is this text about?", k=5)

returns [(Document, score), ...]

### Things to try:

the actual internal code path in LangChain

how to override batch size

how to manually batch embeddings using LCEL runnables

how to profile embedding throughput for large corpora

#### Exploring the tables

In [6]:
from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg://langchain:langchain@localhost:6024/langchain")

with engine.connect() as conn:
    print(conn.execute(text("SELECT table_name FROM information_schema.tables WHERE table_schema='public'")).fetchall())

[('langchain_pg_collection',), ('langchain_pg_embedding',)]


In [7]:
with engine.connect() as conn:
    print(conn.execute(text("SELECT column_name, data_type FROM information_schema.columns WHERE table_name='langchain_pg_embedding'")).fetchall())

[('collection_id', 'uuid'), ('embedding', 'USER-DEFINED'), ('cmetadata', 'jsonb'), ('id', 'character varying'), ('document', 'character varying')]


In [8]:
with engine.connect() as conn:
    rows = conn.execute(text("""
    SELECT * FROM langchain_pg_embedding LIMIT 5;
    """)).fetchall()

for row in rows:
    print(row)

('44778751-7166-4ee5-8af7-73d661893dd1', UUID('ca89a2e2-361a-4ae5-a07a-1dd2b64fedf2'), '[-0.0048253546,0.014394158,-0.028047597,-0.019942425,-0.0017111313,0.038944706,-0.0046864697,-0.05162239,0.0045369016,-0.015113509,0.02340386,0.01800 ... (19113 characters truncated) ... 027007742,0.02321868,0.018047895,-0.0015410865,-0.006816036,-0.029030474,0.013361425,-0.0032353024,0.0074499203,-0.016124874,0.011879988,-0.04754844]', 'A TXT file is\xa0a type of file that stores plain text without any special formatting, styling, or', {'source': 'data/PlainText.txt'})
('2ea11aae-37a0-4f7d-996a-497150e89ac4', UUID('ca89a2e2-361a-4ae5-a07a-1dd2b64fedf2'), '[-0.006112092,-0.003568447,0.002396768,-0.022445498,-0.002253229,0.026664877,-0.03223286,-0.03306071,0.0022982934,-0.013753042,-0.00085205433,0.02615 ... (19058 characters truncated) ... ,-0.01187702,0.033514693,0.010147875,-0.0072103324,-0.012217508,-0.0154221,0.007310476,0.0038955824,0.010975729,-0.015662445,0.008078243,-0.03132489]', 'almost an

In [9]:
for r in rows:
    print("ID:", r.id)
    print("Document:", r.document[:200], "...")
    print("Metadata:", r.cmetadata)
    print("Embedding length:", len(r.embedding))
    print("-----")

ID: 44778751-7166-4ee5-8af7-73d661893dd1
Document: A TXT file is a type of file that stores plain text without any special formatting, styling, or ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19411
-----
ID: 2ea11aae-37a0-4f7d-996a-497150e89ac4
Document: almost any device using a basic text editor like Notepad on Windows or TextEdit on Mac. These files ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19356
-----
ID: bc5ef998-a969-40b5-8a1d-38e79ea5444e
Document: on Mac. These files are used for storing simple text documents, source code, and configuration ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19388
-----
ID: bc290200-5ee3-49f2-8d54-070cf5685258
Document: and configuration data. ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19480
-----
ID: e86f69ca-c119-4178-ad2e-ebe016e23c74
Document: Key characteristics ...
Metadata: {'source': 'data/PlainText.txt'}
Embedding length: 19451
-----


#### Here the length of embeddings are different because DB driver didn’t know how to decode the vector column, so it returned the raw textual representation.

In [10]:
row = rows[0]
print(type(row.embedding))
print(repr(row.embedding)[:200])

<class 'str'>
'[-0.0048253546,0.014394158,-0.028047597,-0.019942425,-0.0017111313,0.038944706,-0.0046864697,-0.05162239,0.0045369016,-0.015113509,0.02340386,0.01800516,-0.013803007,-0.00502834,-0.016908327,0.014230


In [11]:
import ast

for row in rows:
    vec = ast.literal_eval(row.embedding)
    print(len(vec))

1536
1536
1536
1536
1536


#### Search documents

In [12]:
similar = db.similarity_search("computer", k=4)
similar

[Document(id='e281f62b-dd83-441c-bed8-3a65f2d7b3fa', metadata={'source': 'data/PlainText.txt'}, page_content='computer code or scripts.'),
 Document(id='4bde5211-7a6d-42b9-8af0-d1fd8868f90b', metadata={'source': 'data/PlainText.txt'}, page_content='computer code or scripts.'),
 Document(id='1f0a2634-6f39-4747-9677-da40af3ff6bd', metadata={'source': 'data/PlainText.txt'}, page_content='computer code or scripts.'),
 Document(id='2ea11aae-37a0-4f7d-996a-497150e89ac4', metadata={'source': 'data/PlainText.txt'}, page_content='almost any device using a basic text editor like Notepad on Windows or TextEdit on Mac.\xa0These files')]

#### How it works:

• The search query—in this case, the word query—will be sent to the embeddings
model to retrieve its embedding.
• Then, it will run a query on Postgres to find the N (in this case 4) previously
stored embeddings that are most similar to your query.
• Finally, it will fetch the text content and metadata that relates to each of those
embeddings.
• The model can now return a list of Document sorted by how similar they are to
the query—the most similar first, the second most similar after, and so on.

In [13]:
ids = [str(uuid.uuid4()), str(uuid.uuid4())]
db.add_documents([
    Document(
        page_content="there are three cats in the bed",
        metadata={"location": "bed", "topic": "animals"},
    ),
    Document(
        page_content="there are also dogs in the bed",
        metadata={"location": "bed", "topic": "animals"},
        ),
    ],
    ids=ids,)

['5e155954-4d7f-4bc3-b18b-8f25ff48e27e',
 'f3bf9ecb-3256-44e6-8d03-7e5246dda7e0']

In [14]:
# delete operation
db.delete(ids=["6f44c5f4-4db2-4b3f-98da-402e2a073f57"])

In [15]:
similar = db.similarity_search("cats", k=2)
similar

[Document(id='a80f40cc-c5e3-4afb-b570-ba317a9339a6', metadata={'topic': 'animals', 'location': 'bed'}, page_content='there are three cats in the bed'),
 Document(id='79077df0-32ef-40fd-a65c-c0da5c2a0eba', metadata={'topic': 'animals', 'location': 'bed'}, page_content='there are three cats in the bed')]

### Tracking changes to the documents

In [16]:
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "my_docs"
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Example docs
docs = [
    Document(page_content="there are cats in the pond", metadata={"source":"cats.txt"}),
    Document(page_content="there are also ducks in the pond", metadata={"source":"ducks.txt"}),
]

In [18]:
# Determine the set of source IDs you will index
sources = [d.metadata.get("source") for d in docs if d.metadata.get("source")]
sources

['cats.txt', 'ducks.txt']

In [22]:
# Connect to DB and delete any existing vectors with the same source metadata
engine = create_engine(connection)
placeholders = ", ".join([f":s{i}" for i in range(len(sources))]) or "NULL"

sql = text(f"""
    DELETE FROM langchain_pg_embedding
    WHERE cmetadata->>'source' IN ({placeholders})
    """)

params = {f"s{i}": src for i, src in enumerate(sources)}

with engine.begin() as conn:
    # if there are no sources this will be a no-op
    if sources:
        conn.execute(sql, params)

In [24]:
db = PGVector.from_documents(docs, embedding_model, connection=connection, collection_name=collection_name)

results = db.similarity_search("pond animals", k=3)
for r in results:
    print(r.metadata, r.page_content[:120])

{'source': 'cats.txt'} there are cats in the pond
{'source': 'ducks.txt'} there are also ducks in the pond
