# Step 1: Setup and Install Required Libraries
# Install necessary libraries if they are not already installed

In [1]:
!pip install pgvector psycopg2 langchain openai

# Step 2: Database Connection Setup

In [2]:
import os
from urllib.parse import quote

import psycopg2
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores.pgvector import PGVector

# PostgreSQL settings: each value is read from the environment and falls back
# to a local-development default when the variable is not set.
POSTGRES_HOST = os.environ.get('POSTGRES_HOST', 'localhost')
POSTGRES_USER = os.environ.get('POSTGRES_USER', 'user')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD', 'password')
POSTGRES_DB = os.environ.get('POSTGRES_DB', 'vector_db')

# Open a psycopg2 connection with the settings above.
# NOTE(review): the cells visible here never use `conn` (PGVector is given its
# own connection string) - confirm whether a later cell relies on it.
_connection_settings = {
    'host': POSTGRES_HOST,
    'user': POSTGRES_USER,
    'password': POSTGRES_PASSWORD,
    'dbname': POSTGRES_DB,
}
conn = psycopg2.connect(**_connection_settings)

# Step 3: Load and Process Document

In [3]:
def process_and_store_document(file_path):
    """
    Load a PDF, split it into chunks, embed the chunks, and store them in
    the 'documents' collection of the PostgreSQL vector database.

    Args:
        file_path: Path of the PDF file, passed to PyPDFLoader.

    Returns:
        None. The processed path and the number of chunks are printed.
    """
    # Initialize the document loader for PDF (adjust this for other formats)
    loader = PyPDFLoader(file_path)

    # Split the document into smaller chunks (chunk_size=1000, no overlap)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = loader.load_and_split(text_splitter)

    # Initialize the OpenAI embeddings model (or use VertexAI if needed)
    embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # in the user name or password cannot corrupt the connection URL.
    # safe='' makes quote() escape '/' as well.
    encoded_user = quote(POSTGRES_USER, safe='')
    encoded_password = quote(POSTGRES_PASSWORD, safe='')
    connection_string = (
        f'postgresql://{encoded_user}:{encoded_password}'
        f'@{POSTGRES_HOST}/{POSTGRES_DB}'
    )

    # Initialize PGVector with connection details
    vector_search = PGVector(
        collection_name='documents',
        connection_string=connection_string,
        embedding_function=embeddings
    )

    # Strip NUL characters from every chunk before it is inserted
    for document in documents:
        document.page_content = document.page_content.replace('\x00', '')

    # Store documents in the PostgreSQL vector database
    vector_search.add_documents(documents)

    print(f"{file_path} was successfully processed and embedded.")
    print(f"Number of document chunks: {len(documents)}")

# Step 4: Process a Local File and Send It to the Vector Database

In [4]:
# Location of the PDF to ingest - point this at your own document.
file_path = (
    "/home/jovyan/work/sample_document.pdf"  # adjust path if needed
)

# Chunk, embed, and store the document in the vector database.
process_and_store_document(file_path=file_path)

# Step 5: Perform a Semantic Search Query

In [5]:
def perform_semantic_search(query_text, top_k=2):
    """
    Run a similarity search against the 'documents' collection and print
    the matching chunks.

    Args:
        query_text: Text to search for.
        top_k: Number of results requested from the vector store (default 2).

    Returns:
        None. The page content of each result is printed.
    """
    # Initialize the embedding model for query embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # in the user name or password cannot corrupt the connection URL.
    # safe='' makes quote() escape '/' as well.
    encoded_user = quote(POSTGRES_USER, safe='')
    encoded_password = quote(POSTGRES_PASSWORD, safe='')
    connection_string = (
        f'postgresql://{encoded_user}:{encoded_password}'
        f'@{POSTGRES_HOST}/{POSTGRES_DB}'
    )

    # Reinitialize PGVector for search
    vector_search = PGVector(
        collection_name='documents',
        connection_string=connection_string,
        embedding_function=embeddings
    )

    # Perform the similarity search on the query text
    results = vector_search.similarity_search(query_text, k=top_k)

    # Display results, numbered from 1
    print(f"Top {top_k} search results for query: '{query_text}'\n")
    for idx, result in enumerate(results):
        print(f"Result {idx + 1}:\n")
        print(result.page_content)
        print("-" * 40)

# Step 6: Run a Semantic Search Query

In [6]:
# Example question to look up in the embedded document.
query_text = (
    "Describe the key points of semantic search in AI."
)

# Print the two chunks returned for the query.
perform_semantic_search(query_text=query_text, top_k=2)