In [7]:
import copy
import itertools
import os
from typing import List

import matplotlib.pyplot as plt
import pandas as pd
from pydantic import BaseModel, Field
from transformers import AutoTokenizer

In [8]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class EnvSettings(BaseSettings):
    """Embedding-service and local-path settings loaded from the environment / ``../.env``.

    Every field is a string declared without a default value.
    """

    model_config = SettingsConfigDict(
        env_file="../.env",  # relative path, so it depends on the working directory
        env_file_encoding="utf-8",
        extra="ignore",  # do not fail on unrelated keys found in the .env file
    )

    # Embedding service
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str
    embedding_model_dir: str

    # Local directories
    sample_data_dir: str
    pipeline_src_dir: str


settings = EnvSettings()

import sys

# Make the pipeline sources importable. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if settings.pipeline_src_dir not in sys.path:
    sys.path.append(settings.pipeline_src_dir)

In [9]:
class DBSettings(BaseSettings):
    """Postgres connection settings loaded from ``database/pgvector_llamaindex/.env``."""

    model_config = SettingsConfigDict(
        env_file="database/pgvector_llamaindex/.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    postgres_user: str
    postgres_password: str
    postgres_db: str
    postgres_url: str
    postgres_port: str  # declared as a string, not an int


db_settings = DBSettings()
# Echo only the database name (not the credentials) as a quick sanity check.
print(db_settings.postgres_db)

pgvector_llamaindex_test


# Prepare Embedder & VectorStore

In [10]:
from llama_index.core import Settings
from llama_index.embeddings.text_embeddings_inference import (
    TextEmbeddingsInference,
)

from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore

from sqlalchemy import make_url

In [11]:
# Embedding client for a Text Embeddings Inference endpoint; usage follows
# https://docs.llamaindex.ai/en/stable/examples/embeddings/text_embedding_inference/
embed_model = TextEmbeddingsInference(
    base_url=settings.embedding_base_url,
    model_name=settings.embedding_model,
    embed_batch_size=10,
    timeout=60,
)

# Register the client on the global LlamaIndex Settings object.
Settings.embed_model = embed_model

In [12]:
from urllib.parse import quote

import psycopg2

# Build the Postgres connection URL from the loaded DB settings. The user name
# and password are percent-encoded so that reserved characters such as "@", ":"
# or "/" inside the credentials cannot corrupt the URL structure.
# NOTE(review): the host is hard-coded to localhost although DBSettings also
# defines postgres_url, which is unused here — confirm which one is intended.
connection_string = "postgresql://{}:{}@localhost:{}/{}".format(
    quote(db_settings.postgres_user, safe=""),
    quote(db_settings.postgres_password, safe=""),
    db_settings.postgres_port,
    db_settings.postgres_db,
)

db_name = db_settings.postgres_db
print(f"DB: {db_name}")

# Open a raw psycopg2 connection with autocommit switched on.
conn = psycopg2.connect(connection_string)
conn.autocommit = True

DB: pgvector_llamaindex_test


In [13]:
# Parse the connection string so its individual parts can be read back.
url = make_url(connection_string)

# HNSW index settings (cosine operator class).
hnsw_config = dict(
    hnsw_m=16,
    hnsw_ef_construction=64,
    hnsw_ef_search=40,
    hnsw_dist_method="vector_cosine_ops",
)

In [14]:
# Vector store on the "test_documents" table; host, port and credentials come
# from the parsed connection URL, the database name from db_name.
vector_store = PGVectorStore.from_params(
    host=url.host,
    port=url.port,
    user=url.username,
    password=url.password,
    database=db_name,
    table_name="test_documents",
    embed_dim=1024,  # bge-m3
    hnsw_kwargs=hnsw_config,
)

# Storage context built around the vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Prepare Documents

In [15]:
from llama_index.core import Document

# Two short sample documents, each tagged with its source in the metadata.
# NOTE(review): text_template='{content}' presumably keeps the metadata out of
# the rendered text — confirm against the Document API.
documents = [
    Document(text=text, metadata={"source": source}, text_template="{content}")
    for text, source in (
        (
            "I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
            "tweet",
        ),
        (
            "The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
            "news",
        ),
    )
]
document1, document2 = documents

In [22]:
from llama_index.core.schema import TextNode
from llama_index.core.indices.utils import embed_nodes
from llama_index.core.ingestion.pipeline import run_transformations

In [25]:
# Inspect the transformations currently configured on the global Settings object.
Settings.transformations

[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x166e3ae90>, id_func=<function default_id_func at 0x163d3cdc0>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')]

In [24]:
# Run document1 through the transformation pipeline with an empty transformation
# list. The previous call ended in a dangling `Settings.` argument, which is a
# SyntaxError; it has been removed.
# NOTE(review): if splitting into chunks is wanted, pass
# transformations=Settings.transformations instead — confirm the intent.
node = run_transformations([document1], transformations=[])
node

[Document(id_='1cd166df-6982-4de4-b955-3ca09faedfec', embedding=None, metadata={'source': 'tweet'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='I had chocalate chip pancakes and scrambled eggs for breakfast this morning.', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{content}')]

In [None]:
# Hand-built TextNode carrying the same text as document1. The previous call
# used dict-style `"text_template":` syntax with no value, which is a
# SyntaxError; it now passes the same '{content}' template the Document
# objects use.
node1 = TextNode(
    text="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    text_template="{content}",
)

In [17]:
## Document -> TextNodes

In [28]:
# Embed the prepared nodes with the configured embedding client, showing a
# progress bar while it runs.
ids_to_embed_map = embed_nodes(nodes=node, embed_model=embed_model, show_progress=True)

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
# Attach each computed embedding to a copy (model_copy) of its node. The loop
# variable has its own name so that the list `node` is not rebound to its last
# element, which made this cell and the embedding cell fail when re-run.
results = []
for source_node in node:
    embedded = source_node.model_copy()
    embedded.embedding = ids_to_embed_map[source_node.node_id]
    results.append(embedded)

In [31]:
# Inspect the embedded copies; the repr includes each node's full embedding vector.
results

[Document(id_='1cd166df-6982-4de4-b955-3ca09faedfec', embedding=[0.015847126, 0.04540542, -0.02274002, 0.013086238, -0.0272731, -0.028056595, 0.0338955, 0.032701604, -0.0047102994, -0.046935104, 0.0075831157, -0.009737728, -0.033149317, -0.029250493, 0.031992726, -0.0433161, 0.024232391, -0.020781282, -0.05458351, -0.034902852, -0.02626575, 0.011575211, 0.00434187, -0.018635998, -0.024474902, -0.005633705, -0.027608884, 0.01708766, 0.036152713, -0.011304718, 0.030332463, -0.028989328, 0.015268832, -0.018887835, -0.020874554, -0.0074898424, 0.0008476207, 0.0061373804, -0.018458778, -0.013879061, -0.0040620505, 0.01028804, 0.024120465, -0.024176428, -0.0023446565, 0.014774484, -0.024456248, -0.020557426, -0.026079202, -0.02005375, 0.0050973836, 0.0288774, 0.024456248, -0.030612282, 0.04387574, 0.00805881, -0.021807287, 0.030649591, -0.06148573, 4.080705e-06, -0.0007636748, -0.015697889, 0.073014304, -0.025482254, -0.0016894118, 0.086781435, 0.028728163, 0.013076911, -0.026806735, -0.0124