# Postgres + PGVECTOR examples (with llama-index framework)

In [159]:
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore
import textwrap
import openai
import os
from getpass import getpass

In [160]:
# you need to have you openAI key exported like 
#! export OPENAI_API_KEY = ....

In [161]:
!mkdir -p 'data/paul_graham/'
!curl -L 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -o 'data/paul_graham/paul_graham_essay.txt'

91646.50s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
91651.83s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 75042  100 75042    0     0   278k      0 --:--:-- --:--:-- --:--:--  277k


In [162]:
with open("./data/paul_graham/paul_graham_essay.txt", 'r', encoding='utf-8') as file:
    content = file.read()

lines = content.splitlines()
print(len(lines))

353


In [163]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Node, TextNode
documents = SimpleDirectoryReader("./data/paul_graham").load_data()
#print(documents)
sentence_splitter = SentenceSplitter(chunk_size=500, chunk_overlap=100)
#nodes = sentence_splitter.get_nodes_from_documents(documents)
nodes = [TextNode(text=line) for line in lines]
print(f'nodes size: {len(nodes)}')
# for node in nodes:
#     print(node.text) 


#print("Document ID:", documents[0].doc_id)

nodes size: 353


In [164]:
import psycopg2

connection_string = "postgresql://postgres:postgres@localhost:5432"
db_name = "vector_db_llama_index"
conn = psycopg2.connect(connection_string)
conn.autocommit = True

with conn.cursor() as c:
    kill_connection_query  = f"""
                                SELECT 
                                    pg_terminate_backend(pid) 
                                FROM 
                                    pg_stat_activity 
                                WHERE 
                                    -- don't kill my own connection!
                                    pid <> pg_backend_pid()
                                    -- don't kill the connections to other databases
                                    AND datname = '{db_name}'
                                    ;
                            """
    
    c.execute(kill_connection_query)

with conn.cursor() as c:
    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    c.execute(f"CREATE DATABASE {db_name}")

In [165]:
from sqlalchemy import make_url
from llama_index.embeddings.ollama import OllamaEmbedding

embed_model = OllamaEmbedding(
    model_name="all-minilm",  # or your preferred Ollama embedding model
    ollama_base_url="http://localhost:11434"  # default Ollama endpoint
)

url = make_url(connection_string)

vector_store_hnsw = PGVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="paul_graham_essay_hnsw",
    #llama 3.1 4096
    # all-minilm 284
    embed_dim=384, 
    hnsw_kwargs={
        "hnsw_m": 16,
        "hnsw_ef_construction": 64,
        "hnsw_ef_search": 40,
        "hnsw_dist_method": "vector_cosine_ops",
    },    
)




In [166]:
storage_context_hnsw = StorageContext.from_defaults(vector_store=vector_store_hnsw)
index_hnsw = VectorStoreIndex(nodes=nodes, storage_context=storage_context_hnsw, show_progress=True, embed_model=embed_model)
# .from_documents(
#     documents, storage_context=storage_context, show_progress=True, embed_model=embed_model, 
# )
query_engine_hnsw = index_hnsw.as_query_engine()
print(type(query_engine_hnsw))
retriever_hnsw = index_hnsw.as_retriever()

Some nodes are missing content, skipping them...


Generating embeddings: 100%|██████████| 171/171 [00:27<00:00,  6.23it/s]


<class 'llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine'>


In [167]:
retriever_response = retriever_hnsw.retrieve("What did the author do ?",)
#print(textwrap.fill(str(retriever_response), 100))
for result in retriever_response:
    print("Text:", result.node.text)
    print("Score:", result.score)


# response = query_engine.query("What did the author do?",)
# print(textwrap.fill(str(response), 100))
# for result in response.source_nodes:
#     print("Text:", result.node.text)
#     print("Score:", result.score)

Text: Thanks to Trevor Blackwell, John Collison, Patrick Collison, Daniel Gackle, Ralph Hazell, Jessica Livingston, Robert Morris, and Harj Taggar for reading drafts of this.
Score: 0.4695280188853659
Text: In the print era, the channel for publishing essays had been vanishingly small. Except for a few officially anointed thinkers who went to the right parties in New York, the only people allowed to publish essays were specialists writing about their specialties. There were so many essays that had never been written, because there had been no way to publish them. Now they could be, and I was going to write them. [12]
Score: 0.37026984668241836


### resultados anteriores

```log
Text: I wanted to go back to RISD, but I was now broke and RISD was very expensive, so I decided to get a job for a year and then return to RISD the next fall. I got one at a company called Interleaf, which made software for creating documents. You mean like Microsoft Word? Exactly. That was how I learned that low end software tends to eat high end software. But Interleaf still had a few years to live yet. [5]
Score: 0.5052353369340625
Text: The problem with systems work, though, was that it didn't last. Any program you wrote today, no matter how good, would be obsolete in a couple decades at best. People might mention your software in footnotes, but no one would actually use it. And indeed, it would seem very feeble work. Only people with a sense of the history of the field would even realize that, in its time, it had been good.
Score: 0.5035586739636687
```