In [4]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Name of the Hugging Face model used to produce embeddings.
EMBED_MODEL_NAME = 'BAAI/bge-small-en'

# Initialise the embedding model once; later cells reuse `embed_model`.
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from llama_index.llms.ollama import Ollama

# Initialise the LLM that will answer the questions, served through Ollama.
llm = Ollama(
    model="llama2",
    request_timeout=300,  # presumably seconds — confirm against the Ollama integration docs
)

In [8]:
import os

import psycopg2
from psycopg2 import sql

# Connection parameters, reused by later cells. Host, user, password and port can be
# overridden through the standard libpq environment variables so that real
# credentials do not have to be written into the notebook; the fallbacks keep the
# original local development setup working unchanged.
db_name = 'md_vector_sore'
host = os.environ.get('PGHOST', 'localhost')
user = os.environ.get('PGUSER', 'vineet')
password = os.environ.get('PGPASSWORD', 'password')
port = os.environ.get('PGPORT', '5432')

# Connect to the existing maintenance database named 'postgres' (not db_name),
# because db_name is about to be dropped and recreated.
db = psycopg2.connect(
    host=host,
    database="postgres",
    user=user,
    password=password,
    port=port
)

try:
    # autocommit = True, because CREATE DATABASE and DROP DATABASE
    # cannot be executed inside a transaction.
    db.autocommit = True

    # Remove any old database with the same name, then create a fresh one.
    # sql.Identifier quotes the name as an SQL identifier instead of pasting
    # raw text into the statement.
    with db.cursor() as c:
        c.execute(sql.SQL("DROP DATABASE IF EXISTS {}").format(sql.Identifier(db_name)))
        c.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name)))
finally:
    # This admin connection is only needed for the setup above; close it so it
    # does not stay open for the lifetime of the kernel.
    db.close()

In [9]:
from sqlalchemy import make_url
from llama_index.vector_stores.postgres import PGVectorStore

# Settings for the vector store that lives inside the database created earlier.
pg_store_params = dict(
    database=db_name,
    host=host,
    user=user,
    password=password,
    port=port,
    table_name="research paper",
    # Dimension of the stored vectors; it has to agree with the size of the vectors
    # produced by embed_model (presumably 384 for BAAI/bge-small-en — confirm).
    embed_dim=384,
)

# Create the vector store on top of our regular Postgres database via PGVectorStore.
vector_store = PGVectorStore.from_params(**pg_store_params)

In [10]:
from llama_index.core import SimpleDirectoryReader

# Folder that holds the notes to be indexed.
notes_dir = "~/obs_brain/BRAIN/Daily Writings"

# Load the files from that folder into `documents`.
reader = SimpleDirectoryReader(notes_dir)
documents = reader.load_data()

In [11]:
from llama_index.core.node_parser import SentenceSplitter
from transformers import AutoTokenizer

# Split every loaded document into smaller chunks.

splitter = SentenceSplitter(chunk_size=1024)

chunks = []     # chunk texts, in document order
chunk_idx = []  # chunk_idx[i] is the position in `documents` that chunks[i] came from

# The loop variable is named `doc_idx` rather than `id`: at notebook scope, `id`
# would overwrite the builtin id() for every cell that runs afterwards.
for doc_idx, doc in enumerate(documents):
    cur_chunks = splitter.split_text(doc.text)
    chunks.extend(cur_chunks)
    chunk_idx.extend([doc_idx] * len(cur_chunks))

In [12]:
from llama_index.core.schema import TextNode

# Wrap each chunk in a TextNode that carries the metadata of its source document.

nodes = []

# chunks and chunk_idx are parallel lists, so walk them together.
for chunk, src_idx in zip(chunks, chunk_idx):
    src_doc = documents[src_idx]
    node = TextNode(text=chunk)
    # Give each node its own copy of the metadata. Assigning the document's dict
    # directly would make all nodes of that document (and the document itself)
    # share one object, so editing one node's metadata would silently change the rest.
    node.metadata = dict(src_doc.metadata)
    nodes.append(node)

In [13]:
from llama_index.core.schema import MetadataMode

# Embedding part: compute an embedding for every node and attach it to the node.
# The metadata mode is given as a MetadataMode member — a bare `all` would be
# Python's builtin all() function, not a metadata mode.

for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode=MetadataMode.ALL)
    )
    node.embedding = node_embedding

In [14]:
# Spot-check: show the metadata attached to one of the nodes.
sample_node = nodes[6]
print(sample_node.metadata)

{'file_path': '/Users/vineetdorikar/obs_brain/BRAIN/Daily Writings/Feb 20, 2025.md', 'file_name': 'Feb 20, 2025.md', 'file_type': 'text/markdown', 'file_size': 129, 'creation_date': '2025-02-23', 'last_modified_date': '2025-02-23'}


In [15]:
# Add all nodes (with the text, metadata and embedding set in the cells above)
# to our vector database. The value returned by add() is the last expression
# of the cell, so it is what the notebook displays as this cell's output.
vector_store.add(nodes)

['5f8e5e82-3310-415c-8475-c92f0796c569',
 '9b449d1b-ceea-45d8-8dcc-063e180770cd',
 '828afe97-c438-45e5-b7ce-f837c41ebee4',
 '5459c4eb-9531-4d36-912b-fec6b5c70ed2',
 '3ce6090c-c29f-4e10-9dc6-76bee3756229',
 '532ee323-1595-47b0-b63f-c2947f128e78',
 'aa9dd5c4-b199-42aa-9b92-5294a67ccb93',
 'c6f80c3d-c896-4f41-a113-09db2d4ed009',
 '195b9533-4513-4e3c-8a5b-db8d62f7d4ea',
 '62dcb80e-46ce-4e06-bda8-068b4687e88b',
 '02e39ee8-18cd-42cf-951e-e2c1e122bee5',
 '579908ce-81a3-460e-823e-1a86f216709a',
 '7c200e9a-b1f0-4a37-a7c4-87d5ca1558c8',
 'fda06b3e-d275-4b08-85a8-627c206cdb80',
 '44763a03-6648-4b54-9517-9210c2cb2f63',
 'f0027a48-2270-46cd-be38-2e03951f994c',
 '90469dd0-7691-4069-b0e0-410e9ba54f8b',
 '26d64152-fe66-43d3-ab7d-05f475548966',
 '5dad613f-4857-4848-a876-769518e3fdd3',
 '735f64ad-fcec-4a48-937c-1025f6a06658',
 '502dcc67-8aa6-4609-892e-8c074879983d',
 '17ad981f-6df7-4291-9c1d-12e485c4706b',
 '45539063-af91-4884-8f1a-b37ec5e9f50c',
 '1a6b519a-49b1-48c0-946c-ddf93fc26fe6',
 'c857d0ed-baeb-

In [21]:
from llama_index.core.retrievers import BaseRetriever
from typing import Any, List, Optional
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
from llama_index.core.vector_stores import VectorStoreQuery

# Building a custom retriever by inheriting from the BaseRetriever class.

class Retriever(BaseRetriever):
    """Retriever that embeds the query and looks up similar nodes in a vector store.

    Args:
        vector_store: Store that is queried for the nearest nodes.
        embed_model: Object providing ``get_query_embedding(str)``; used to embed
            the incoming query text.
        query_mode: Value passed through as ``mode`` of the ``VectorStoreQuery``.
        similarity_top_k: Number of results requested from the vector store.
    """

    def __init__(self,
                 vector_store: PGVectorStore,
                 embed_model: Any,
                 query_mode: str = "default",
                 similarity_top_k: int = 2) -> None:
        self._vector_store = vector_store
        self._embed_model = embed_model
        self._query_mode = query_mode
        self._similarity_top_k = similarity_top_k
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Return the nodes matching ``query_bundle``, each paired with its score.

        The score is ``None`` for every node when the store reports no similarities.
        """
        # Use the model and store handed to this instance, not whatever notebook
        # globals happen to be called `embed_model` / `vector_store`.
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )
        vector_query_obj = VectorStoreQuery(
            query_embedding=query_embedding,
            similarity_top_k=self._similarity_top_k,
            mode=self._query_mode
        )
        query_result = self._vector_store.query(vector_query_obj)

        nodes_with_scores = []
        # `nodes` may be missing on the result; treat that as "no matches".
        for idx, node in enumerate(query_result.nodes or []):
            # Default to None so `score` is always bound, even without similarities.
            score: Optional[float] = None
            if query_result.similarities is not None:
                score = query_result.similarities[idx]

            nodes_with_scores.append(NodeWithScore(node=node, score=score))

        return nodes_with_scores

In [23]:
# Instantiate the custom retriever; it asks the vector store for the top 2 matches.
retriever = Retriever(
    vector_store=vector_store,
    embed_model=embed_model,
    query_mode="default",
    similarity_top_k=2,
)

In [24]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Combine the custom retriever with the LLM into a query engine.
query_engine = RetrieverQueryEngine.from_args(
    llm=llm,
    retriever=retriever,
)


In [None]:
# Question to ask over the indexed notes.
# NOTE(review): "why " looks like an unfinished question — confirm the intended query.
query_str = "why "

# Run the question through the retriever + LLM pipeline.
query_response = query_engine.query(query_str)

# print() converts its argument with str(), so this shows the response text.
print(query_response)

Vineet did not directly state what he did when he met his old friends, but based on the context, it can be inferred that he hung out with them outside in the front yard of their house while he was feeling unwell due to a cold. He described the experience as "pretty chill" and good to see his old friends, but also mentioned that one line said by one of his friends, Shivam, made him uncomfortable and left a bitter taste in his mouth.
