In [2]:
import docx
import PyPDF2
import os

def read_text_file(file_path: str):
    """Return the entire contents of a text file as a single string.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def read_pdf_file(file_path: str):
    """Extract the text of every page of a PDF into one string.

    The file is opened in binary mode and parsed with ``PyPDF2.PdfReader``.
    Pages are visited in reader order and each page's extracted text is
    followed by a newline.
    """
    with open(file_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # Extraction has to happen while the underlying file is still open.
        page_texts = [page.extract_text() + "\n" for page in reader.pages]
    return "".join(page_texts)

def read_docx_file(file_path: str):
    """Return the text of a Word document, one paragraph per line.

    Only ``doc.paragraphs`` is read; the paragraph texts are joined with
    newline characters.
    """
    document = docx.Document(file_path)
    lines = []
    for paragraph in document.paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

In [3]:
def read_document(file_path: str):
    """Read a document by dispatching on its (case-insensitive) file extension.

    Supported extensions are ``.txt``, ``.pdf`` and ``.docx``; each is handed
    to the matching ``read_*_file`` helper.

    Raises:
        ValueError: If the extension is anything else (including no extension).
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.txt':
        return read_text_file(file_path)
    if file_extension == '.pdf':
        return read_pdf_file(file_path)
    if file_extension == '.docx':
        return read_docx_file(file_path)

    raise ValueError(f"Unsupported file format: {file_extension}")

In [4]:
def split_text(text: str, chunk_size: int = 500):
    """Split text into chunks of at most ``chunk_size`` characters on sentence boundaries.

    Whitespace runs (newlines, tabs, carriage returns, repeated spaces) are
    collapsed to single spaces, the text is split on ``'. '``, and consecutive
    sentences are packed into a chunk until adding the next one would push the
    chunk past ``chunk_size``.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters, counting the single
            spaces that join sentences inside a chunk.

    Returns:
        A list of chunk strings. A single sentence longer than ``chunk_size``
        is never cut; it becomes a chunk of its own. Empty or whitespace-only
        input yields an empty list.
    """
    # Collapsing all whitespace keeps the '. ' delimiter reliable even when a
    # period is followed by "\r\n" or a tab instead of a plain space.
    normalized = ' '.join(text.split())

    chunks = []
    current_chunk = []
    current_size = 0

    for sentence in normalized.split('. '):
        sentence = sentence.strip()
        if not sentence:
            continue

        # split('. ') swallowed the period; put it back unless the piece
        # already ends with terminal punctuation (avoids "Why?." / "Stop!.").
        if not sentence.endswith(('.', '!', '?')):
            sentence += '.'

        # Length the chunk would have after ' '.join(), i.e. including the
        # separator space in front of every sentence but the first.
        projected_size = current_size + len(sentence) + (1 if current_chunk else 0)

        if current_chunk and projected_size > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_size = len(sentence)
        else:
            current_chunk.append(sentence)
            current_size = projected_size

    # Flush whatever is left over.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


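# Create the `documents` table in the Postgres vector database (the VECTOR column type requires the pgvector extension) before storing any chunks.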

In [None]:
# CREATE TABLE documents (
#     id SERIAL PRIMARY KEY,
#     document_id TEXT,
#     chunk_id INT,
#     content TEXT,
#     embedding VECTOR(768)  -- Must equal the embedding size: the search query casts to VECTOR(768), matching all-mpnet-base-v2
# );


In [5]:
import psycopg2
from pgvector.psycopg2 import register_vector
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# lookups below can find them.
load_dotenv()

# Database connection settings. Indexing os.environ (rather than .get) makes a
# missing variable fail immediately with KeyError.
DB_NAME= os.environ['DB_NAME']
DB_PORT= os.environ['DB_PORT']
DB_HOST= os.environ['DB_HOST']
DB_PASSWORD= os.environ['DB_PASSWORD']
DB_USER= os.environ['DB_USER']

# Initialize the embedding model
# NOTE(review): the search query casts its parameter to VECTOR(768), so this
# model is presumably expected to emit 768-dimensional vectors — confirm the
# table's embedding column uses the same dimension.
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# One module-level connection shared by the insert and search functions below.
connection = psycopg2.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT
)
# Register pgvector's vector type with psycopg2 for this connection.
register_vector(connection)

In [6]:
def insert_document_chunk(document_id, chunk_id, content, embedding):
    """Insert one document chunk and its embedding into the ``documents`` table.

    The row is committed immediately on success.

    Args:
        document_id: Identifier of the source document.
        chunk_id: Index of the chunk within its document.
        content: Text of the chunk.
        embedding: Embedding vector computed for ``content``.

    Raises:
        Exception: Any error raised by the database driver is re-raised
            unchanged, after the transaction has been rolled back.
    """
    try:
        with connection.cursor() as cursor:
            cursor.execute(
                "INSERT INTO documents (document_id, chunk_id, content, embedding) VALUES (%s, %s, %s, %s)",
                (document_id, chunk_id, content, embedding)
            )
        connection.commit()
    except Exception:
        # A failed statement leaves the shared connection inside an aborted
        # transaction; without a rollback every later insert or search on the
        # same connection would fail too.
        connection.rollback()
        raise



def get_embedding(text):
    """Encode ``text`` with the module-level embedding model.

    Returns the model output converted with ``.tolist()``.
    """
    vector = embedding_model.encode(text)
    return vector.tolist()

In [7]:
def process_and_store_document(file_path: str):
    """Read a document, chunk it, embed each chunk and store it in PostgreSQL.

    The file's base name is used as the ``document_id`` and each chunk's
    position in the chunk list as its ``chunk_id``. Chunks are embedded and
    inserted one at a time, in order.
    """
    chunks = split_text(read_document(file_path))
    document_id = os.path.basename(file_path)

    for chunk_index, chunk_text in enumerate(chunks):
        chunk_embedding = get_embedding(chunk_text)
        insert_document_chunk(document_id, chunk_index, chunk_text, chunk_embedding)



In [14]:
def semantic_search_pg(query: str, n_results: int = 2):
    """Return the ``n_results`` stored chunks closest to ``query``.

    The query is embedded with the module-level model and rows are ordered by
    the pgvector ``<->`` operator against the stored embeddings.

    Returns:
        A dict with two keys, each holding a one-element outer list:
        ``"documents"`` (chunk texts) and ``"metadatas"`` (dicts with
        ``"source"`` and ``"chunk"``). On any database error the error is
        printed, the transaction is rolled back, and the same structure is
        returned with empty inner lists.
    """
    embedding = embedding_model.encode(query).tolist()

    try:
        with connection.cursor() as cursor:
            cursor.execute(
                """
                SELECT content, document_id, chunk_id
                FROM documents
                ORDER BY embedding <-> %s::VECTOR(768)
                LIMIT %s;
                """,
                (embedding, n_results)
            )
            rows = cursor.fetchall()

            # End the transaction opened by the SELECT.
            connection.commit()

            # Split each row into its text and its provenance metadata.
            documents = []
            metadatas = []
            for content, document_id, chunk_id in rows:
                documents.append(content)
                metadatas.append({"source": document_id, "chunk": chunk_id})

            return {"documents": [documents], "metadatas": [metadatas]}

    except Exception as e:
        print(f"Error during semantic search: {e}")
        connection.rollback()  # Leave the shared connection usable
        return {"documents": [[]], "metadatas": [[]]}



In [None]:
# Ingest every file in the docs folder: read, chunk, embed and store it.
# The path is a raw string because "\C", "\p" and "\d" are not valid escape
# sequences in a normal string literal; Python warns about them and newer
# versions are stricter. The resulting path value is unchanged.
folder_path = r"E:\Coding\python/rag\docs"
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    # os.listdir also returns sub-directories; they cannot be read as
    # documents and would otherwise abort the whole ingestion run.
    if not os.path.isfile(file_path):
        continue
    process_and_store_document(file_path)

def get_context_with_sources_pg(results):
    """Format context text and source labels from PostgreSQL search results.

    Args:
        results: Either the dict returned by ``semantic_search_pg``
            (``{"documents": [[text, ...]], "metadatas": [[{"source": ...,
            "chunk": ...}, ...]]}``) or an iterable of
            ``(document_id, chunk_id, content)`` rows.

    Returns:
        A ``(context, sources)`` tuple: the chunk texts joined by blank lines,
        and a list of ``"<source> (chunk <n>)"`` labels.
    """
    if isinstance(results, dict):
        # Iterating the dict directly would yield its keys, not the hits, so
        # unwrap the single inner list of each entry instead.
        documents = (results.get("documents") or [[]])[0]
        metadatas = (results.get("metadatas") or [[]])[0]
        context = "\n\n".join(documents)
        sources = [f"{meta['source']} (chunk {meta['chunk']})" for meta in metadatas]
        return context, sources

    # Row form: (document_id, chunk_id, content).
    context = "\n\n".join([row[2] for row in results])  # Extract content
    sources = [f"{row[0]} (chunk {row[1]})" for row in results]  # Format source info
    return context, sources

In [15]:
def get_context_with_sources_pg(results):
    """Format context text and source labels from PostgreSQL search results.

    Args:
        results: Either the dict returned by ``semantic_search_pg``
            (``{"documents": [[text, ...]], "metadatas": [[{"source": ...,
            "chunk": ...}, ...]]}``) or an iterable of
            ``(document_id, chunk_id, content)`` rows.

    Returns:
        A ``(context, sources)`` tuple: the chunk texts joined by blank lines,
        and a list of ``"<source> (chunk <n>)"`` labels.
    """
    if isinstance(results, dict):
        # Iterating the dict directly would yield its keys, not the hits, so
        # unwrap the single inner list of each entry instead.
        documents = (results.get("documents") or [[]])[0]
        metadatas = (results.get("metadatas") or [[]])[0]
        context = "\n\n".join(documents)
        sources = [f"{meta['source']} (chunk {meta['chunk']})" for meta in metadatas]
        return context, sources

    # Row form: (document_id, chunk_id, content).
    context = "\n\n".join([row[2] for row in results])  # Extract content
    sources = [f"{row[0]} (chunk {row[1]})" for row in results]  # Format source info
    return context, sources

In [16]:
import google.generativeai as genai
import os
from dotenv import load_dotenv

# Load variables from a local .env file so GEMINI_API_KEY can be read below.
load_dotenv()

# Configure the Gemini client. Indexing os.environ raises KeyError right here
# if GEMINI_API_KEY is not set.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [17]:
def get_prompt(context: str, conversation_history: str, query: str):
    """Build the LLM prompt from retrieved context, chat history and the question.

    The template states the instructions first, then the document context,
    then the previous conversation, then the question as the ``Human:`` turn,
    and ends with an open ``Assistant:`` turn.
    """
    return f"""Based on the following context and conversation history, 
    please provide a relevant and contextual response.Look through every part of the document like tables if they exists and give answers based on that. If the answer cannot 
    be derived from the context, only use the conversation history or say 
    "I cannot answer this based on the provided information."

    Context from documents:
    {context}

    Previous conversation:
    {conversation_history}

    Human: {query}

    Assistant:"""


In [18]:
def generate_response(query: str, context: str, conversation_history: str = ""):
    """Answer ``query`` with Gemini, grounded in ``context`` and the chat history.

    The prompt is built with ``get_prompt`` and sent to the
    ``gemini-1.5-flash`` model, requesting one candidate, at most 800 output
    tokens and temperature 0.

    Returns:
        The response text, ``"No content generated."`` if the response object
        is falsy, or an ``"Error generating response: ..."`` string if the
        model call raises; model errors are reported in the return value
        rather than raised.
    """
    prompt = get_prompt(context, conversation_history, query)

    try:
        model = genai.GenerativeModel("gemini-1.5-flash")

        # Single candidate, bounded length, temperature 0.
        generation_config = genai.types.GenerationConfig(
            candidate_count=1,
            max_output_tokens=800,
            temperature=0
        )
        response = model.generate_content(prompt, generation_config=generation_config)

        if not response:
            return "No content generated."
        return response.text

    except Exception as e:
        return f"Error generating response: {str(e)}"





In [20]:
def rag_query_pg(questions: list, n_chunks: int = 2):
    """Answer each question with retrieval-augmented generation.

    For every question the ``n_chunks`` nearest chunks are retrieved, turned
    into a context string plus source labels, and passed to the LLM.

    Returns:
        ``(responses, sources_used)``: two lists of ``(question, answer)`` and
        ``(question, sources)`` tuples, in input order. A question whose
        processing raises is reported with ``print``, the database transaction
        is rolled back, and that question is left out of both lists.
    """
    responses, sources_used = [], []

    for question in questions:
        try:
            search_hits = semantic_search_pg(question, n_chunks)
            context, sources = get_context_with_sources_pg(search_hits)
            answer = generate_response(question, context)
        except Exception as e:
            print(f"Error processing question '{question}': {e}")
            connection.rollback()  # Keep the shared connection usable
        else:
            # Record results only when every step succeeded.
            responses.append((question, answer))
            sources_used.append((question, sources))

    return responses, sources_used


# Demo run: ask a fixed set of questions against the ingested documents.
questions = [
    "what are data structures?",
    "What is a stack data structure?",
    # "How has cloud computing evolved in the past decade?",
    "what is case processing in pharmacovigilance?",
    "what is the role of Mr. Ramesh patel in the leadership team?",
    "what is the budget strategy of adrta?"
]
# rag_query_pg returns two parallel lists of (question, answer) and
# (question, sources) tuples; questions that raised are absent from both.
responses, sources = rag_query_pg(questions)

# Print results
for question, response in responses:
    print("\nQuestion:", question)
    print("Answer:", response)

# Then list, per question, which document chunks were used as context.
print("\n------------------------------------------")
print("\nSources used for each question:")
for question, source_list in sources:
    print(f"\nQuestion: {question}")
    for source in source_list:
        print(f"- {source}") 



Question: what are data structures?
Answer: Data structures are ways of organizing and storing data in a computer so that it can be accessed and used efficiently. They provide a framework for managing data, allowing for operations like searching, sorting, and inserting new data. 

Here are some common examples of data structures:

* **Arrays:** A collection of elements of the same data type stored in contiguous memory locations.
* **Linked Lists:** A linear data structure where elements are linked together using pointers.
* **Stacks:** A LIFO (Last-In, First-Out) data structure where elements are added and removed from the top.
* **Queues:** A FIFO (First-In, First-Out) data structure where elements are added at the rear and removed from the front.
* **Trees:** A hierarchical data structure where elements are organized in a parent-child relationship.
* **Graphs:** A non-linear data structure consisting of nodes (vertices) connected by edges.

The choice of data structure depends on th