In [1]:
import copy
import itertools
import os
from typing import List

import matplotlib.pyplot as plt
import pandas as pd
from pydantic import BaseModel, Field
from transformers import AutoTokenizer

from semantic_chunkers import StatisticalChunker
# from semantic_router.encoders import OpenAIEncoder
from src.encoder import OpenAIEncoder

In [2]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class EnvSettings(BaseSettings):
    """Notebook-wide settings loaded from the ``../.env`` file.

    The env-file path is relative, so it is resolved against the directory the
    kernel was started in. Keys in the file that are not declared below are
    ignored (``extra="ignore"``); every declared field is required.
    """

    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    # Embedding service configuration. `embedding_base_url` and
    # `embedding_model` are passed to the embedder further down the notebook.
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str
    embedding_model_dir: str

    # Project directories. `pipeline_src_dir` is added to sys.path below.
    sample_data_dir: str
    pipeline_src_dir: str


settings = EnvSettings()

import sys

# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path (the original appended unconditionally on every run).
if settings.pipeline_src_dir not in sys.path:
    sys.path.append(settings.pipeline_src_dir)

In [3]:
class DBSettings(BaseSettings):
    """PostgreSQL connection settings loaded from ``database/pgvector_llamaindex/.env``.

    Keys in that file which are not declared here are ignored (``extra="ignore"``).
    """
    # NOTE(review): this env-file path is relative to the working directory, while
    # EnvSettings uses "../.env" — confirm both resolve from the same directory.
    model_config = SettingsConfigDict(
        env_file="database/pgvector_llamaindex/.env", env_file_encoding="utf-8", extra="ignore"
    )
    postgres_user: str
    postgres_password: str
    postgres_db: str
    # NOTE(review): postgres_url is declared (and therefore required) but is not
    # read anywhere in the cells shown; the connection string hard-codes localhost.
    postgres_url: str
    # Declared as a string; it is only interpolated into the connection URI.
    postgres_port: str

db_settings = DBSettings()
# Sanity check that the env file was found: prints only the database name.
print(db_settings.postgres_db)

pgvector_llamaindex


# 1. Prepare Embedder

In [4]:
from llama_index.core import Settings

In [5]:
## Option A (not used): LlamaIndex's OpenAIEmbedding class
from llama_index.embeddings.openai import OpenAIEmbedding, OpenAIEmbeddingModelType
# Source of the constructor parameters quoted in the note below:
# https://github.com/run-llama/llama_index/blob/a8d27fa9c7f7b039768cb0a0685e70de389087be/llama-index-integrations/embeddings/llama-index-embeddings-openai/llama_index/embeddings/openai/base.py#L271
# The bare string that follows is a reference note (constructor keyword
# arguments and their defaults), not executable configuration.
'''
mode: str = OpenAIEmbeddingMode.TEXT_SEARCH_MODE,
model: str = OpenAIEmbeddingModelType.TEXT_EMBED_ADA_002,
embed_batch_size: int = 100,
dimensions: Optional[int] = None,
additional_kwargs: Optional[Dict[str, Any]] = None,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
api_version: Optional[str] = None,
max_retries: int = 10,
timeout: float = 60.0,
reuse_client: bool = True,
callback_manager: Optional[CallbackManager] = None,
default_headers: Optional[Dict[str, str]] = None,
http_client: Optional[httpx.Client] = None,
async_http_client: Optional[httpx.AsyncClient] = None,
num_workers: Optional[int] = None,'''

# Why this option was abandoned: the error recorded when 'baai/bge-m3' was
# passed as the model name. This string is the last expression in the cell,
# so Jupyter echoes it as the cell's output.
'''
ValueError: 'baai/bge-m3' is not a valid OpenAIEmbeddingModelType
-> https://github.com/run-llama/llama_index/blob/a8d27fa9c7f7b039768cb0a0685e70de389087be/llama-index-integrations/embeddings/llama-index-embeddings-openai/llama_index/embeddings/openai/base.py#L27
embedding model names are fixed in code
'''

# The attempt that produced the ValueError above, kept for reference only.
# embed_model = OpenAIEmbedding(
#     model = settings.embedding_model,
#     dimensions = 1024,
#     api_key = settings.embedding_api_key,
#     api_base = settings.embedding_base_url,
#     embed_batch_size=10
# )
# Settings.embed_model = embed_model

"\nValueError: 'baai/bge-m3' is not a valid OpenAIEmbeddingModelType\n-> https://github.com/run-llama/llama_index/blob/a8d27fa9c7f7b039768cb0a0685e70de389087be/llama-index-integrations/embeddings/llama-index-embeddings-openai/llama_index/embeddings/openai/base.py#L27\nembedding model names are fixed in code\n"

In [6]:
# text-embedding-inference
from llama_index.embeddings.text_embeddings_inference import (
    TextEmbeddingsInference,
)
# https://docs.llamaindex.ai/en/stable/examples/embeddings/text_embedding_inference/
# Build the text-embeddings-inference client from the env-file settings and
# register it as LlamaIndex's global embedding model in the same statement.
Settings.embed_model = embed_model = TextEmbeddingsInference(
    base_url=settings.embedding_base_url,
    model_name=settings.embedding_model,
    embed_batch_size=10,
    timeout=60,
)

In [7]:
## LlamaIndex can also wrap a LangChain embedder
# (requires the llama-index-embeddings-langchain package to be installed)
import os
from langchain_openai import OpenAIEmbeddings

# embed_model = OpenAIEmbeddings(
#     model=settings.embedding_model,
#     api_key=settings.embedding_api_key
# )
# Settings.embed_model = embed_model

# 2. Prepare DB
* Connect to a pre-initialized PostgreSQL database (pgvector Docker container).

In [8]:
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore
import textwrap
import openai

## 2-1. Connect to the DB (creation is skipped; the database already exists)

In [9]:
import psycopg2
from urllib.parse import quote

# Percent-encode the credentials before placing them in the URI: a raw "@",
# ":" or "/" inside the user name or password would otherwise be read as URI
# structure and break the connection. Plain alphanumeric values are unchanged.
# NOTE(review): the host is hard-coded to localhost although DBSettings also
# declares postgres_url — confirm which one is intended.
connection_string = "postgresql://{}:{}@localhost:{}/{}".format(
    quote(db_settings.postgres_user, safe=""),
    quote(db_settings.postgres_password, safe=""),
    db_settings.postgres_port,
    db_settings.postgres_db,
)

db_name = db_settings.postgres_db
print(f"DB: {db_name}")

# The connection is opened here only to confirm the server is reachable (the
# create step is disabled), so release it explicitly instead of leaving it open.
conn = psycopg2.connect(connection_string)
try:
    # Autocommit is required by the (disabled) DROP/CREATE DATABASE statements.
    conn.autocommit = True

    # We already have database created - skip creation
    # with conn.cursor() as c:
    #     c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    #     c.execute(f"CREATE DATABASE {db_name}")
finally:
    conn.close()

DB: pgvector_llamaindex


## 2-2. create index


In [10]:
from sqlalchemy import make_url

In [11]:
## Split the connection URI into its parts for PGVectorStore
url = make_url(connection_string)

## HNSW index parameters
hnsw_config = dict(
    hnsw_m=16,
    hnsw_ef_construction=64,
    hnsw_ef_search=40,
    hnsw_dist_method="vector_cosine_ops",
)

## Vector store bound to the "test_documents" table
vector_store = PGVectorStore.from_params(
    host=url.host,
    port=url.port,
    user=url.username,
    password=url.password,
    database=db_name,
    table_name="test_documents",
    hnsw_kwargs=hnsw_config,
    embed_dim=1024,  # bge-m3
)

## Storage context that routes index writes to the vector store above
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# 3. Insert documents

In [12]:
from llama_index.core import Document

In [13]:
# Two toy documents, each tagged with a "source" metadata value that the
# metadata-filter cells later in the notebook select on.
# text_template='{content}' leaves only the raw text in the rendered node text.
document1, document2 = (
    Document(
        text=body,
        metadata={"source": origin},
        text_template="{content}",
    )
    for body, origin in (
        ("I had chocalate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
        ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
    )
)
documents = [document1, document2]

In [14]:
## Insert Documents
# The bare string below is a note, not code: it records a ProgrammingError from
# an earlier run in which the target relation did not exist.
# The relation is named "data_test_documents", i.e. the "test_documents"
# table name given to PGVectorStore with a "data_" prefix.
'''
ProgrammingError: (psycopg2.errors.UndefinedTable) relation "public.data_test_documents" does not exist
LINE 1: INSERT INTO public.data_test_documents (text, metadata_, nod...
'''

# Parse the documents into nodes, embed them and write them through the
# Postgres-backed storage context (progress bars are shown for both steps).
# NOTE(review): re-running this cell presumably inserts the same documents
# again — confirm before running it more than once against the same table.
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, show_progress=True
)

Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
## Check DB Schema
'''
Table: public.data_test_documents
  - Column: id
    Data Type: bigint
    Nullable:  NO
    Default:   nextval('data_test_documents_id_seq'::regclass)
  - Column: text
    Data Type: character varying
    Nullable:  NO
    Default:   None
  - Column: metadata_
    Data Type: json
    Nullable:  YES
    Default:   None
  - Column: node_id
    Data Type: character varying
    Nullable:  YES
    Default:   None
  - Column: embedding
    Data Type: USER-DEFINED
    Nullable:  YES
    Default:   None
'''

from contextlib import closing

# closing() guarantees the connection is released even if one of the queries
# below raises; the original only reached conn.close() on the success path.
# The cursor's own context manager closes it, so no explicit cur.close() is needed.
with closing(psycopg2.connect(connection_string)) as conn, conn.cursor() as cur:
    # 1. Print out all schema names
    print("All Schemas in the Database:")
    cur.execute("SELECT schema_name FROM information_schema.schemata;")
    schemas = cur.fetchall()
    for schema in schemas:
        print(f"  - {schema[0]}")

    # 2. List every table outside the system schemas
    cur.execute("""
        SELECT table_schema, table_name
        FROM information_schema.tables
        WHERE table_schema NOT IN ('information_schema', 'pg_catalog')
        ORDER BY table_schema, table_name;
    """)
    tables = cur.fetchall()

    # 3. Print the schema (columns) of each table
    for schema_name, table_name in tables:
        print(f"\nTable: {schema_name}.{table_name}")

        # Fetch column details from information_schema.columns; the identifiers
        # are passed as bound parameters rather than formatted into the SQL.
        cur.execute("""
            SELECT column_name, data_type, is_nullable, column_default
            FROM information_schema.columns
            WHERE table_schema = %s
            AND table_name   = %s
            ORDER BY ordinal_position;
        """, (schema_name, table_name))

        columns = cur.fetchall()
        if not columns:
            print("  (No columns found)")
        else:
            for col_name, col_type, is_nullable, default_val in columns:
                print(f"  - Column: {col_name}")
                print(f"    Data Type: {col_type}")
                print(f"    Nullable:  {is_nullable}")
                print(f"    Default:   {default_val}")

All Schemas in the Database:
  - public
  - information_schema
  - pg_catalog
  - pg_toast

Table: public.data_test_documents
  - Column: id
    Data Type: bigint
    Nullable:  NO
    Default:   nextval('data_test_documents_id_seq'::regclass)
  - Column: text
    Data Type: character varying
    Nullable:  NO
    Default:   None
  - Column: metadata_
    Data Type: json
    Nullable:  YES
    Default:   None
  - Column: node_id
    Data Type: character varying
    Nullable:  YES
    Default:   None
  - Column: embedding
    Data Type: USER-DEFINED
    Nullable:  YES
    Default:   None

Table: public.paper_information
  - Column: id
    Data Type: integer
    Nullable:  NO
    Default:   nextval('paper_information_id_seq'::regclass)
  - Column: created_at
    Data Type: timestamp without time zone
    Nullable:  YES
    Default:   now()
  - Column: updated_at
    Data Type: timestamp without time zone
    Nullable:  YES
    Default:   now()
  - Column: paper_id
    Data Type: text
   

## 3-1. Test Retrieval

In [21]:
# Retriever options are kept in one dict so the filtered retrievers built
# later in the notebook can reuse exactly the same settings.
retriever_args = dict(similarity_top_k=10)
retriever = index.as_retriever(**retriever_args)

In [22]:
# Display the retriever's repr to see which retriever class as_retriever() returned.
retriever

<llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever at 0x172193bb0>

In [24]:
## Simple query
# The query text is unrelated to both sample documents, so this cell only
# demonstrates the retrieval round trip rather than a meaningful match.
query = "LangChain provides abstractions to make working with LLMs easy"
nodes = retriever.retrieve(query)
# Last expression: show the returned NodeWithScore objects and their scores.
nodes

[NodeWithScore(node=TextNode(id_='539851ee-16da-4e77-896a-d3616d0c808c', embedding=None, metadata={'source': 'tweet'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='188cc4ce-27e6-48a7-b1e4-03541436f117', node_type='4', metadata={'source': 'tweet'}, hash='c211cf902096529c230ab1394516787d54390c9228e4f376fa12fa2c9699a6d9')}, metadata_template='{key}: {value}', metadata_separator='\n', text='I had chocalate chip pancakes and scrambled eggs for breakfast this morning.', mimetype='text/plain', start_char_idx=0, end_char_idx=76, metadata_seperator='\n', text_template='{content}'), score=0.37611465142790657),
 NodeWithScore(node=TextNode(id_='29822fbf-3667-4186-a3ae-fc14ed13716b', embedding=None, metadata={'source': 'news'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='78feaa43-4954-4ef9-9778-8bdaec765188', node_typ

In [40]:
## metadata filtering
from llama_index.core.vector_stores.types import (
    MetadataFilter,
    MetadataFilters,
)

# Restrict retrieval to nodes whose "source" metadata equals "tweet".
# There is only one filter in the group, combined under the "or" condition.
filters = MetadataFilters(
    condition="or",
    filters=[MetadataFilter(key="source", value="tweet")],
)

# Same retriever settings as before, now with the metadata filter attached.
retriever = index.as_retriever(**retriever_args, filters=filters)

In [41]:
# Re-run the same query through the metadata-filtered retriever and display the result.
nodes = retriever.retrieve(query)
nodes

[NodeWithScore(node=TextNode(id_='539851ee-16da-4e77-896a-d3616d0c808c', embedding=None, metadata={'source': 'tweet'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='188cc4ce-27e6-48a7-b1e4-03541436f117', node_type='4', metadata={'source': 'tweet'}, hash='c211cf902096529c230ab1394516787d54390c9228e4f376fa12fa2c9699a6d9')}, metadata_template='{key}: {value}', metadata_separator='\n', text='I had chocalate chip pancakes and scrambled eggs for breakfast this morning.', mimetype='text/plain', start_char_idx=0, end_char_idx=76, metadata_seperator='\n', text_template='{content}'), score=0.37611465142790657)]

In [None]:
## nested filters
# One inner filter group per source value, OR-ed together by the outer group:
# a node matches when its "source" metadata is "tweet" or "news".
filters = MetadataFilters(
    condition="or",
    filters=[
        MetadataFilters(
            condition="or",
            filters=[MetadataFilter(key="source", value=source_name)],
        )
        for source_name in ("tweet", "news")
    ],
)
retriever = index.as_retriever(**retriever_args, filters=filters)

In [39]:
# Re-run the same query through the nested-filter retriever and display the result.
nodes = retriever.retrieve(query)
nodes

[NodeWithScore(node=TextNode(id_='539851ee-16da-4e77-896a-d3616d0c808c', embedding=None, metadata={'source': 'tweet'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='188cc4ce-27e6-48a7-b1e4-03541436f117', node_type='4', metadata={'source': 'tweet'}, hash='c211cf902096529c230ab1394516787d54390c9228e4f376fa12fa2c9699a6d9')}, metadata_template='{key}: {value}', metadata_separator='\n', text='I had chocalate chip pancakes and scrambled eggs for breakfast this morning.', mimetype='text/plain', start_char_idx=0, end_char_idx=76, metadata_seperator='\n', text_template='{content}'), score=0.37611465142790657),
 NodeWithScore(node=TextNode(id_='29822fbf-3667-4186-a3ae-fc14ed13716b', embedding=None, metadata={'source': 'news'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='78feaa43-4954-4ef9-9778-8bdaec765188', node_typ

## 3-2. Test 'Query'
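* Builds the prompt that an LLM would answer. With `Settings.llm = None`, LlamaIndex substitutes a MockLLM, so the output below is the assembled prompt rather than a generated answer.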

In [25]:
# Explicitly disable the LLM; LlamaIndex then reports that it is using MockLLM
# (see the message printed by this cell), so no real model is called.
Settings.llm=None
query_engine = index.as_query_engine()

LLM is explicitly disabled. Using MockLLM.


In [28]:
# Run the query through the engine. With the LLM disabled, the printed response
# is the assembled prompt (retrieved context plus the query), not an answer.
response = query_engine.query(query)
print(response)

Context information is below.
---------------------
I had chocalate chip pancakes and scrambled eggs for breakfast this morning.

The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.
---------------------
Given the context information and not prior knowledge, answer the query.
Query: LangChain provides abstractions to make working with LLMs easy
Answer: 
