In [11]:
# pip install langchain openai python-dotenv
# pg vector: https://github.com/pgvector/pgvector

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by
# OpenAIEmbeddings comes from — confirm against the project's .env.
load_dotenv()

True

In [4]:
# Read the speech transcript from disk, decoding it as UTF-8.
loader = TextLoader(
    file_path="state_of_the_union.txt",
    encoding="utf-8",
)
documents = loader.load()

# Show the loaded Document list to confirm the file was read.
print(documents)

[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citize

In [6]:
# Split the transcript into chunks of at most 1000 characters, with an
# 80-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=80,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents)

# First chunk, then the total number of chunks (one per line).
print(texts[0], len(texts), sep="\n")

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.' metadata={'source': 'state_of

In [8]:
# Create the embedding model and run a one-off smoke test on a short string.
embeddings = OpenAIEmbeddings()
vector = embeddings.embed_query(text="Testing embed model")

# Cell result: the dimensionality of the returned embedding.
len(vector)

1536

In [13]:
# pip install tiktoken

In [17]:
# Embed only the first five chunks to keep this demo call small.
doc_vectors = embeddings.embed_documents(
    [chunk.page_content for chunk in texts[:5]]
)

# A bare `len(doc_vectors)` in the middle of a cell is evaluated and then
# discarded (only a cell's last expression is displayed), so print it.
print(len(doc_vectors))
# Embedding of the first chunk.
print(doc_vectors[0])

[-0.0035596876505259024, -0.010357264458449957, -0.01851170328993628, -0.018020711889863743, 0.005898532008571962, 0.020011216458314753, 0.014928792645285766, -0.009527887244487844, -0.0029957110612126338, -0.006641653850720983, 0.01535343383098843, 0.008950640472602214, -0.020887039660526095, 0.0008190101384859731, 0.001997140707475089, 0.006379570829922891, 0.01829938223142366, -0.014782822732465593, 0.029592184433092042, -0.0224661753213226, 0.00974684304504068, -0.013197053260951453, 0.01539324414587487, 0.012520280701849726, -0.0005730997670721906, 0.011113656716001983, 0.03686416532032681, -0.03407745817943734, 0.019055774488975687, -0.02440360009080674, -0.012414120172593416, -0.02969834589367093, -0.024018768288667936, -0.004750673672502205, -0.024775160546219963, -0.020887039660526095, -0.013349657915811839, -0.01220180004540337, 0.0035099249897484986, -0.018498433805855856, 0.007417951265716229, -0.0023521143077877764, -0.011053941243672325, 0.005696163994442238, -0.024841511

In [19]:
# !pip install psycopg2-binary pgvector

In [22]:
import os

from langchain.vectorstores.pgvector import PGVector

# Prerequisite (run once in a SQL editor against the target database):
#   CREATE EXTENSION IF NOT EXISTS vector;

# Let the environment (e.g. the .env file loaded by load_dotenv() above)
# override the connection string, so real credentials never have to be
# committed with the notebook. The fallback is the original local demo DSN.
CONNECTION_STRING = os.environ.get(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql+psycopg2://postgres:test@localhost:5432/vector_db",
)
COLLECTION_NAME = "state_of_union_vectors"

# Embed every chunk in `texts` and store the vectors in Postgres.
# pre_delete_collection=True drops any existing collection with this name
# first; without it every re-run of this cell appends the same chunks again,
# which duplicates search results and skews aggregates over the table.
db = PGVector.from_documents(
    embedding=embeddings,
    documents=texts,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    pre_delete_collection=True,
)

# On first use this creates two tables and inserts the data:
#   langchain_pg_collection
#   langchain_pg_embedding

In [23]:
# Semantic search: embed the query and retrieve the stored chunks that are
# nearest to it in the embedding vector space.

query = "What did the president say about Russia"

# Fetch the 2 closest chunks; each result is a (Document, score) tuple.
similar_docs = db.similarity_search_with_score(query, k=2)
for doc in similar_docs:
    # NOTE: the trailing score is a *distance*, not a similarity —
    # a smaller value means the chunk is closer to the query.
    print(doc)

(Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', metadata={'source'

In [24]:
# Embed the query text. The resulting vector is what replaces the
# 'THIS_IS_VECTORS' placeholder in the SQL below.
THIS_IS_VECTORS = embeddings.embed_query(query)

# Calculate the cosine distance and return the top 2 documents.
# NOTE: smaller distance means greater similarity.

# SELECT document, (embedding <=> 'THIS_IS_VECTORS') as cosine_distance
# FROM langchain_pg_embedding
# ORDER BY cosine_distance
# LIMIT 2


# Find the average vector in the table:
# SELECT AVG(embedding) FROM langchain_pg_embedding;

# Display the vector as the cell result so it can be copied into the SQL
# editor (an assignment alone produces no output).
THIS_IS_VECTORS



[-0.020218923687934875,
 -0.011762093752622604,
 0.00525162648409605,
 -0.003708739299327135,
 -0.0037184227257966995,
 -0.012246263213455677,
 -0.013750417158007622,
 -0.0190956499427557,
 -0.010838943533599377,
 -0.005964969750493765,
 0.03906926140189171,
 0.006035981234163046,
 -0.006875208579003811,
 -0.030108895152807236,
 0.013750417158007622,
 -0.0028678979724645615,
 0.027629945427179337,
 -0.034576166421175,
 0.054846733808517456,
 -0.015971142798662186,
 -0.004322020802646875,
 -0.01279498916119337,
 0.003182608401402831,
 -0.0072883665561676025,
 -0.016306832432746887,
 -0.0009981964249163866,
 0.026054780930280685,
 -0.02487986348569393,
 0.0007528837886638939,
 -0.01288536749780178,
 0.008347083814442158,
 -0.028043104335665703,
 -0.02153586409986019,
 -0.006294204853475094,
 -0.020593347027897835,
 -0.027242610231041908,
 -0.007378744892776012,
 -0.02049005776643753,
 0.009612380526959896,
 -0.012924101203680038,
 0.0224138256162405,
 0.0069784983061254025,
 0.0163713879

![Problem](pg_vector.png)