In [1]:
import docx
import PyPDF2
import os

def read_text_file(file_path: str):
    """Read content from a text file"""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def read_pdf_file(file_path: str):
    """Read content from a PDF file"""
    text = ""
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"
    return text

def read_docx_file(file_path: str):
    """Read content from a Word document"""
    doc = docx.Document(file_path)
    return "\n".join([paragraph.text for paragraph in doc.paragraphs])

In [2]:
def read_document(file_path: str):
    """Read document content based on file extension"""
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == '.txt':
        return read_text_file(file_path)
    elif file_extension == '.pdf':
        return read_pdf_file(file_path)
    elif file_extension == '.docx':
        return read_docx_file(file_path)
    else:
        raise ValueError(f"Unsupported file format: {file_extension}")

In [16]:
def split_text(text: str, chunk_size: int = 500):
    """Split text into chunks while preserving sentence boundaries"""
    sentences = text.replace('\n', ' ').split('. ')
    chunks = []
    current_chunk = []
    current_size = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # Ensure proper sentence ending
        if not sentence.endswith('.'):
            sentence += '.'

        sentence_size = len(sentence)

        # Check if adding this sentence would exceed chunk size
        if current_size + sentence_size > chunk_size and current_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_size = sentence_size
        else:
            current_chunk.append(sentence)
            current_size += sentence_size

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


# improved chunking function

In [3]:
import re
import nltk
from typing import List

def split_text(text: str, chunk_size: int = 500, overlap: int = 50, token_based: bool = False) -> List[str]:
    """
    Split text into chunks while preserving sentence and paragraph boundaries, 
    with optional overlap and token-based chunking.
    """
    # Load nltk sentence tokenizer if needed
    nltk.download('punkt', quiet=True)
    
    # Tokenize by paragraphs first
    paragraphs = text.split("\n\n")
    chunks = []
    current_chunk = []
    current_size = 0

    def get_token_count(text):
        """Helper to get the token count (rough approximation)."""
        return len(re.findall(r'\w+', text))

    # Adjust sentence or token counting function
    count_func = get_token_count if token_based else len

    for paragraph in paragraphs:
        sentences = nltk.sent_tokenize(paragraph.strip())
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            sentence_size = count_func(sentence)
            
            # Add sentence to current chunk if it fits
            if current_size + sentence_size <= chunk_size:
                current_chunk.append(sentence)
                current_size += sentence_size
            else:
                # Finalize the current chunk
                chunks.append(' '.join(current_chunk))
                
                # Begin new chunk, optionally with overlap from last few sentences
                if overlap > 0 and len(current_chunk) > 0:
                    overlap_sentences = current_chunk[-overlap:]
                    current_chunk = overlap_sentences + [sentence]
                    current_size = count_func(' '.join(current_chunk))
                else:
                    current_chunk = [sentence]
                    current_size = sentence_size

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


In [4]:
from dotenv import load_dotenv
# import google.generativeai as gen_ai
from sentence_transformers import SentenceTransformer
import psycopg2

load_dotenv()

model = SentenceTransformer('all-mpnet-base-v2')

DB_HOST = os.environ['DB_HOST']
DB_PORT = os.environ['DB_PORT']
DB_NAME = os.environ['DB_NAME']
DB_USER = os.environ['DB_USER']
DB_PASSWORD = os.environ['DB_PASSWORD']

conn = psycopg2.connect(
    host=DB_HOST,
    port=DB_PORT,
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD
)
cursor = conn.cursor()

def create_embedding_table():
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS document_embeddings (
        id SERIAL PRIMARY KEY,
        document_text TEXT,
        embedding VECTOR(768),
        source_file TEXT,
        chunk_number INT,
        UNIQUE(document_text, chunk_number, source_file)  -- Ensure uniqueness
    );
    """)
    conn.commit()
 
create_embedding_table()

  from tqdm.autonotebook import tqdm, trange


In [17]:
def add_to_pgvector(collection_name, chunks, source_file):
    embeddings = [model.encode(chunk).tolist() for chunk in chunks]
    for i, embedding in enumerate(embeddings):
        cursor.execute(
            """
            INSERT INTO document_embeddings (document_text, embedding, source_file, chunk_number)
            VALUES (%s, %s, %s, %s)
            """,
            (chunks[i], embedding, source_file, i)
        )
    conn.commit()

In [18]:
def process_and_index_document(file_path):
    text = read_document(file_path)
    # text = clean_text(text)
    chunks = split_text(text)
    add_to_pgvector("document_collection", chunks, file_path)

In [19]:
def add_documents_from_folder(folder_path):
    files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(('.pdf', '.txt', '.docx'))]
    for file_path in files:
        print(f"Processing {os.path.basename(file_path)}...")
        process_and_index_document(file_path)

In [20]:
def semantic_search(query, n_results=2, similarity_threshold=0.7):
    query_embedding = model.encode(query).tolist()
    cursor.execute(
        """
        SELECT document_text, source_file, chunk_number,
               (embedding <=> %s::vector(768)) AS similarity
        FROM document_embeddings
        ORDER BY similarity ASC
        LIMIT %s;
        """,
        (query_embedding, n_results)
    )
    results = cursor.fetchall()
    relevant_results = [
        {"text": result[0], "source": result[1], "chunk": result[2], "similarity": result[3]}
        for result in results if result[3] <= similarity_threshold
    ]
    return relevant_results

In [21]:
def get_context_with_sources(results):
    context = "\n\n".join([result['text'] for result in results])
    sources = [f"{result['source']} (chunk {result['chunk']})" for result in results]
    return context, sources

In [22]:
def format_response(query, response, sources):
    formatted_sources = "\n".join([f"- {source}" for source in sources])
    return f"""Query: {query}\n\nAnswer: {response}\n\nSources used:\n{formatted_sources}\n"""
 

In [11]:
import google.generativeai as gen_ai
import os
from dotenv import load_dotenv

load_dotenv()

gen_ai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [12]:
def generate_response(query, context, conversation_history=""):
    prompt = f"""Based on the following context and conversation history,
    please provide a relevant and contextual response. If the answer cannot
    be derived from the context, only use the conversation history or say
    "I cannot answer this based on the provided information."
 
    Context from documents:
    {context}
 
    Previous conversation:
    {conversation_history}
 
    Human: {query}
 
    Assistant:"""
   
    try:
        model = gen_ai.GenerativeModel("gemini-1.5-flash")
        response = model.generate_content(
            prompt,
            generation_config=gen_ai.types.GenerationConfig(
                candidate_count=1,
                # stop_sequences=["x"],
                max_output_tokens=900,
                temperature=0.2
            ),
        )
        if response and response.text:
            return response.text
        else:
            return "Unable to generate content due to response restrictions or empty result."
    except Exception as e:
        return f"Error generating response: {str(e)}"

# using llama3 model instead of gemini

In [23]:
import ollama

def generate_response(query, context, conversation_history=""):
    prompt = f"""Based on the following context and conversation history,
    please provide a relevant and contextual response. If the answer cannot
    be derived from the context, only use the conversation history or say
    "I cannot answer this based on the provided information."
 
    Context from documents:
    {context}
 
    Previous conversation:
    {conversation_history}
 
    Human: {query}
 
    Assistant:"""
    
    try:
        response = ollama.generate(
            model="llama3",  # Replace with the specific LLaMA model name if different
            prompt=prompt
        )
        # Extract the response text
        if response and "response" in response:
            return response["response"]
        else:
            return "Unable to generate content due to response restrictions or empty result."
    except Exception as e:
        return f"Error generating response with LLaMA: {str(e)}"


In [24]:
def rag_query(query, n_chunks=2):
    results = semantic_search(query, n_chunks)
    context, sources = get_context_with_sources(results)
    response = generate_response(query, context)
    return format_response(query, response, sources)

In [25]:
def process_multiple_queries(queries, n_chunks=2):
    responses = {}
    for i, query in enumerate(queries):
        formatted_response = rag_query(query, n_chunks)
        responses[f"Question {i+1}"] = formatted_response
    return responses

# adding document embeddings in postgres

In [15]:
folder_path = "E:\Coding\python/rag\docs"
add_documents_from_folder(folder_path)

Processing 2180712_CIS_GTU_Study_Material_e-Notes_All-Units_17062020050424AM.pdf...
Processing 3140705_OOP---I_GTU_Study_Material_e-Notes_Unit-1-to-5_11062022015400PM (1).pdf...
Processing e-Notes_PDF_All-Units_24042019090707AM.pdf...
Processing major project final report.pdf...
Processing Services_Proposal Document - Adrta.docx...


# adding a single file

In [26]:
file_path = "E:\Coding\Github desktop\zentixs_assistant_api\pdf\Templates\services poc.docx"
process_and_index_document(file_path)

# testing code

In [27]:

 
queries = [
    # "What are characteristics of cloud computing?",
    # "Explain arrays in detail.",
    # "Explain the benefits of CloudTrail.",
    # "Describe Amazon Simple Storage Service.",
    # "what are data structures?",
    # "What is a stack data structure?",
    # "How has cloud computing evolved in the past decade?",
    # "what is case processing in pharmacovigilance?",
    # "what is the role of Mr. Ramesh patel in the leadership team?",
    # "what is the budget strategy of adrta?",
    # "What are the various differentiators of ADRTA?",
    # "how can mental health counselling be refined based on the main topic of major project final report?",
    # "Summarize the literature review of the major project final report?",
    # "Who are the authors of the major project final report?",
    # "What is the scope of the mental health counselling refining project?"
    'Please give all the details about introduction and executive summary from the services poc',
    'give a brief about QPPV and PSMF in pharmacovigilance'
]
responses = process_multiple_queries(queries)
 
# Display formatted results for each question
for question, formatted_response in responses.items():
    print(formatted_response)

Query: Please give all the details about introduction and executive summary from the services poc

Answer: I cannot provide a response based on the provided information. The context appears to be related to a business proposal or contract, and I do not have enough information to provide an introduction and executive summary for the services proposed. If you'd like to provide more details or clarify what you're looking for, I'll do my best to assist you!

Sources used:
- E:\Coding\python/rag\docs\Services_Proposal Document - Adrta.docx (chunk 45)
- E:\Coding\python/rag\docs\Services_Proposal Document - Adrta.docx (chunk 16)

Query: give a brief about QPPV and PSMF in pharmacovigilance

Answer: Here's a brief overview of QPPV and PSMF in the context of pharmacovigilance:

QPPV and PSMF are service delivery records that demonstrate the successful handling of clients with varying needs. This implies that both QPPV and PSMF have a proven track record of adapting to different client requirem