In [1]:
print('Hie')

Hie


In [2]:
%pwd

'c:\\Users\\TOBBY\\Documents\\Medical-Chatbot-Generative-AI\\research'

In [3]:
import os

# Step from the notebook folder up to the project root so that relative paths
# such as 'data/' resolve. The guard keeps the cell idempotent: without it,
# every re-run of this cell would climb one directory higher.
# NOTE(review): assumes the notebook lives in a folder named "research", as
# the %pwd output above shows - adjust the name if the folder is renamed.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\TOBBY\\Documents\\Medical-Chatbot-Generative-AI'

In [5]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from typing import List

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_pinecone import PineconeVectorStore
from langchain_google_genai import ChatGoogleGenerativeAI
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load environment variables from .env file
load_dotenv()

True

In [7]:
def validate_environment(required_vars=('PINECONE_API_KEY', 'GEMINI_API_KEY')):
    """Validate required environment variables.

    Args:
        required_vars: Name, or iterable of names, of the environment
            variables that must be present. Defaults to the two API keys
            this notebook reads.

    Raises:
        ValueError: If any listed variable is unset, empty, or contains only
            whitespace. The message lists every offending name in the order
            given.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(required_vars, str):
        required_vars = (required_vars,)

    # Whitespace-only values (e.g. `KEY= ` in a .env file) are as unusable as
    # missing ones, so strip before testing for emptiness.
    missing_vars = [
        var for var in required_vars
        if not (os.environ.get(var) or "").strip()
    ]

    if missing_vars:
        raise ValueError(f"Missing environment variables: {', '.join(missing_vars)}")

    print("✓ Environment variables validated")

# Validate environment before proceeding
validate_environment()

✓ Environment variables validated


In [8]:
# Enhanced PDF Loading with Error Handling
def load_pdf_file(data):
    """Load the files matching ``*.pdf`` in ``data`` using ``PyPDFLoader``.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces. If anything raises, the
        error is printed and an empty list is returned instead.
    """
    try:
        pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
        loaded = pdf_loader.load()
        print(f"✓ Successfully loaded {len(loaded)} PDF documents")
    except Exception as err:
        # Best-effort by design: report the problem and hand back nothing.
        print(f"✗ Error loading PDF files: {err}")
        return []
    return loaded
# load_pdf_file already reports its own failures and returns []; this outer
# guard only ensures `extracted_data` is bound even if the call itself fails.
try:
    extracted_data = load_pdf_file(data='data/')
    print(f"Total documents: {len(extracted_data)}")
except Exception as load_error:
    print(f"Failed to load documents: {load_error}")
    extracted_data = []


✓ Successfully loaded 1396 PDF documents
Total documents: 1396


In [9]:
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build a fresh list with one new Document per input document. Each new
    Document keeps the original page_content, and its metadata holds only the
    'source' entry (None when the input metadata has no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

# Reduce every loaded document to its text plus 'source' metadata, with error handling
# Fall back to an empty list on failure so later cells still find `minimal_docs`.
try:
    minimal_docs = filter_to_minimal_docs(extracted_data)
    print(f"✓ Filtered to {len(minimal_docs)} minimal documents")
except Exception as filter_error:
    print(f"✗ Error filtering documents: {filter_error}")
    minimal_docs = []

✓ Filtered to 1396 minimal documents


In [10]:
# Text splitting with error handling
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split the Data into Text Chunks - Enhanced with error handling

    Args:
        minimal_docs: Documents passed to ``split_documents``.
        chunk_size: Upper bound on chunk length, measured with ``len``.
            Defaults to 500.
        chunk_overlap: Overlap between neighbouring chunks, in the same
            units. Defaults to 20.

    Returns:
        The chunks produced by the splitter. If anything raises, the error is
        printed and an empty list is returned instead.
    """
    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            # Tried in order: paragraph break, line break, space, then any character.
            separators=["\n\n", "\n", " ", ""]
        )
        text_chunks = text_splitter.split_documents(minimal_docs)
        print(f"✓ Created {len(text_chunks)} text chunks")
        return text_chunks
    except Exception as e:
        print(f"✗ Error splitting text: {e}")
        return []

# Split the filtered documents into chunks and report how many were produced
text_chunks = text_split(minimal_docs)
print("Length of Text Chunks", len(text_chunks))

✓ Created 12832 text chunks
Length of Text Chunks 12832


In [11]:
#Enhanced Embeddings 
def download_hugging_face_embeddings():
    """Create the HuggingFaceEmbeddings object for all-MiniLM-L6-v2.

    Returns:
        The initialised ``HuggingFaceEmbeddings`` instance.

    Raises:
        Exception: Whatever the constructor raised; it is printed first and
            then re-raised unchanged.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    try:
        hf_embeddings = HuggingFaceEmbeddings(model_name=model_id)
        print("✓ HuggingFace embeddings initialized successfully")
    except Exception as err:
        print(f"✗ Error initializing embeddings: {err}")
        raise
    return hf_embeddings
    
embeddings = download_hugging_face_embeddings()

# Smoke test: embed one short string and print the vector length. A failure
# here is only reported; it does not stop the notebook.
try:
    query_result = embeddings.embed_query("Hello world")
    print("Length", len(query_result))
    print("✓ Embeddings test successful")
except Exception as smoke_error:
    print(f"✗ Embeddings test failed: {smoke_error}")

  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


✓ HuggingFace embeddings initialized successfully
Length 384
✓ Embeddings test successful


In [12]:
#Enhanced API Key Management
# Read both API keys from the process environment and stop the notebook,
# naming the missing variable, if either is unset or empty.
try:
    PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
    GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

    pinecone_key_missing = not PINECONE_API_KEY
    if pinecone_key_missing:
        raise ValueError("PINECONE_API_KEY environment variable not set")

    gemini_key_missing = not GEMINI_API_KEY
    if gemini_key_missing:
        raise ValueError("GEMINI_API_KEY environment variable not set")

    print("✓ API keys retrieved from environment")
except Exception as key_error:
    print(f"✗ Error getting API keys: {key_error}")
    raise

# Lower-case alias used when the Pinecone client is created.
pinecone_api_key = PINECONE_API_KEY


✓ API keys retrieved from environment


In [13]:
# Enhanced Pinecone Setup
# Connect to Pinecone with the key read earlier; abort the cell on failure.
try:
    pc = Pinecone(api_key=pinecone_api_key)
except Exception as client_error:
    print(f"✗ Error initializing Pinecone: {client_error}")
    raise
else:
    print("✓ Pinecone client initialized")

index_name = "medicalbot"

# Reuse the index when it already exists; otherwise create it first.
try:
    if pc.has_index(index_name):
        print(f"✓ Using existing index: {index_name}")
    else:
        pc.create_index(
            name=index_name,
            dimension=384,  # same length as the vector printed by the embeddings test cell
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        print(f"✓ Created new index: {index_name}")

    index = pc.Index(index_name)
    print("✓ Index ready")
except Exception as index_error:
    print(f"✗ Error with index: {index_error}")
    raise

# Write both keys back into os.environ - presumably so that libraries which
# read them from the environment can find them; verify whether still needed.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY


✓ Pinecone client initialized
✓ Using existing index: medicalbot
✓ Index ready


In [None]:
# Enhanced Vector Store Creation 
def create_or_load_docsearch(text_chunks, index_name, embeddings):
    """Create or load vector store with fallback.

    When ``text_chunks`` is empty, or when uploading them fails, the existing
    Pinecone index named ``index_name`` is opened instead.

    Each chunk is uploaded under an id derived from its position, its
    'source' metadata and its text. Re-running the cell with the same chunks
    therefore writes to the same ids instead of storing one more copy of
    every chunk under a fresh random id.
    NOTE(review): copies already stored under random ids by earlier runs are
    not removed by this function.

    Args:
        text_chunks: Chunked documents to embed and upload; each must expose
            ``page_content`` and a ``metadata`` mapping.
        index_name: Name of the Pinecone index to write to or load.
        embeddings: Embedding model handed to the vector store.

    Returns:
        The ``PineconeVectorStore`` that was created or loaded.

    Raises:
        Exception: Whatever ``from_existing_index`` raises when the fallback
            load fails as well.
    """
    import hashlib  # local import keeps the cell self-contained

    if not text_chunks:
        print("⚠ No text chunks available, loading existing vector store")
        return PineconeVectorStore.from_existing_index(
            index_name=index_name,
            embedding=embeddings
        )

    try:
        # The position is part of the hash so that two chunks with identical
        # text from the same source still get distinct ids within one upload.
        chunk_ids = [
            hashlib.sha256(
                f"{position}|{chunk.metadata.get('source')}|{chunk.page_content}"
                .encode("utf-8", errors="replace")
            ).hexdigest()
            for position, chunk in enumerate(text_chunks)
        ]
        docsearch = PineconeVectorStore.from_documents(
            documents=text_chunks,
            index_name=index_name,
            embedding=embeddings,
            ids=chunk_ids,
        )
        print(f"✓ Vector store created with {len(text_chunks)} documents")
        return docsearch
    except Exception as e:
        print(f"✗ Error creating vector store: {e}")
        print("⚠ Attempting to load existing vector store...")
        return PineconeVectorStore.from_existing_index(
            index_name=index_name,
            embedding=embeddings
        )

# Build the vector store from the chunks, or load the existing index when there are none
docsearch = create_or_load_docsearch(text_chunks, index_name, embeddings)

In [14]:
#Enhanced existing vector store loading
try:
    # Open the index that already exists in Pinecone.
    docsearch_existing = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings
    )
    print("✓ Existing vector store loaded successfully")
    # Prefer the existing store so the following cells work against stored data.
    docsearch = docsearch_existing
except Exception as e:
    print(f"⚠ Could not load existing vector store: {e}")
    # Carrying on is only safe when an earlier cell already bound `docsearch`.
    # That creation cell may not have been run, and then there is nothing to
    # fall back on, so fail here rather than with a NameError further down.
    if "docsearch" not in globals():
        raise

✓ Existing vector store loaded successfully


In [15]:
try:
    dswith = Document(
        page_content="dswithbappy is a youtube channel that provides tutorials on various topics.",
        metadata={"source": "Youtube"}
    )
    # A fixed id keeps this cell idempotent: running it again writes to the
    # same id instead of storing another copy of the document in the index.
    docsearch.add_documents(documents=[dswith], ids=["custom-dswithbappy-youtube"])
    print("✓ Added custom document successfully")
except Exception as e:
    print(f"✗ Error adding custom document: {e}")

✓ Added custom document successfully


In [16]:
# Similarity retriever over the vector store, returning the top 3 matches per query.
try:
    retriever = docsearch.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )
except Exception as retriever_error:
    print(f"✗ Error setting up retriever: {retriever_error}")
    raise
else:
    print("✓ Retriever configured successfully")

✓ Retriever configured successfully


In [17]:
# Run one sample query through the retriever to confirm it returns documents.
try:
    retrieved_docs = retriever.invoke("What is Cardiac shunt?")
    print(f"✓ Retrieved {len(retrieved_docs)} documents for test query")
except Exception as e:
    print(f"✗ Error testing retrieval: {e}")
    # Bind an empty result so the cell that displays `retrieved_docs` does not
    # fail with a NameError or show a stale value from an earlier run.
    retrieved_docs = []

✓ Retrieved 3 documents for test query


In [18]:
retrieved_docs

[Document(id='1f8bb4cb-79cb-418b-aea3-ef0b51957ec1', metadata={'source': 'data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf'}, page_content='weak spot in the wall of an artery or heart chamber.\nCardiac shunt —A defect in the wall of the heart\nthat allows blood from different chambers to mix.\nCoronary occlusive artery disease —Blockage of\nthe arteries that supply blood to the heart; fre-\nquently a precursor to a heart attack.\nElectrocardiogram (ECG)—A graph that shows the\nelectrical charges that trigger the heart to contract.\nHeart abnormalities alter the graph, giving clues to\nthe source of the abnormality.'),
 Document(id='eb0b9689-66c0-466d-a6da-7f8d108527cc', metadata={'source': 'data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf'}, page_content='weak spot in the wall of an artery or heart chamber.\nCardiac shunt —A defect in the wall of the heart\nthat allows blood from different chambers to mix.\nCoronary occlusive artery disease —Blockage of\nthe arteries that supply bl

In [19]:

#Enhanced Chat Model Setup
# Gemini chat model used to generate answers; a setup error aborts the cell.
try:
    chatModel = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash-latest",
        google_api_key=os.environ["GEMINI_API_KEY"],
        temperature=0.1,
        max_output_tokens=1000,
    )
    print("✓ Chat model initialized successfully")
except Exception as model_error:
    print(f"✗ Error initializing chat model: {model_error}")
    raise

# System message for the prompt template. It ends with the {context}
# placeholder - presumably filled with the retrieved documents by the chain
# built in a later cell.
system_prompt = "".join(
    [
        "You are NthanziLanga+ AI assistant, created by TecNix to help with health-related questions and information. ",
        "You are designed to provide helpful, accurate health guidance while encouraging users to consult healthcare professionals for medical advice. ",
        "Use the following pieces of retrieved context to answer the question. ",
        "If you don't know the answer, say that you don't know. ",
        "Use three sentences maximum and keep the answer concise.",
        "\n\n",
        "{context}",
    ]
)



✓ Chat model initialized successfully


In [21]:
# Two-message chat prompt: the system instructions, then the user's question
# substituted for {input}.
try:
    prompt = ChatPromptTemplate.from_messages(
        [("system", system_prompt), ("human", "{input}")]
    )
except Exception as prompt_error:
    print(f"✗ Error creating prompt: {prompt_error}")
    raise
else:
    print("✓ Prompt template created")

✓ Prompt template created


In [22]:
# Combine the chat model and prompt into a document-stuffing chain, then put
# the retriever in front of it to form the full RAG chain.
try:
    question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)
except Exception as chain_error:
    print(f"✗ Error creating RAG chain: {chain_error}")
    raise
else:
    print("✓ RAG chain created successfully")

✓ RAG chain created successfully


In [23]:
# Query execution with error handling
def execute_query(rag_chain, query):
    """Run ``query`` through ``rag_chain`` and return the chain's response.

    Args:
        rag_chain: Object with an ``invoke`` method that accepts
            ``{"input": query}``.
        query: Question text to send.

    Returns:
        The value returned by ``rag_chain.invoke``. If anything raises, the
        error is printed and a dict with a generic apology under "answer" is
        returned instead.
    """
    payload = {"input": query}
    try:
        result = rag_chain.invoke(payload)
        print(f"✓ Query processed successfully: {query}")
    except Exception as err:
        print(f"✗ Error processing query '{query}': {err}")
        return {"answer": "Sorry, I encountered an error processing your question."}
    return result

In [24]:
# Sample end-to-end question. On failure a placeholder response is bound so
# that `response` is still defined afterwards.
try:
    response = rag_chain.invoke({"input": "what is malaria?"})
    print(response["answer"])
except Exception as query_error:
    print(f"✗ Error in original query: {query_error}")
    response = {"answer": "Error occurred during query processing"}

Malaria is a disease caused by Plasmodium parasites, transmitted through the bite of infected Anopheles mosquitoes.  It's characterized by severe, recurring chills and fever.  For diagnosis and treatment, consult a healthcare professional.
