<a href="https://colab.research.google.com/github/jay6-dev/conversational-health-risk-analyzer/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader
# The following imports are removed as they depend on OpenAI or the RAG chain
# from langchain_community.vectorstores import Chroma
# from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_core.runnables import RunnablePassthrough

# 1. Load the PDF document
# PDF_PATH is an absolute path under /content; the file must already exist
# there before this cell runs (presumably uploaded to the Colab session --
# TODO confirm how it gets there).
PDF_PATH = "/content/National ART Clinical Guideline 2023_06_06 version 3 Web.pdf"
loader = PyPDFLoader(PDF_PATH)
# `documents` is iterated further down and each item's `.page_content` is
# joined into one raw text string for the NLTK section.
documents = loader.load()

# 2. Split the document into chunks (this is still useful for context if needed)
# Splitter configured with chunk_size=1000 and chunk_overlap=200 (presumably
# measured in characters -- verify against the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# NOTE(review): `chunks` is not referenced again in the visible part of this
# file; the keyword search below works on NLTK sentences instead.
chunks = text_splitter.split_documents(documents)

# Removed: 3. Create embeddings and a vector store (requires API key)
# Removed: 4. Define the RAG prompt template
# Removed: 5. Set up the Language Model (LLM) and RAG chain


# ------------------------------------------------------------
# --- NLTK SECTION ---
# ------------------------------------------------------------
import nltk
# Fetch the tokenizer models and the stop-word list used by the search code.
nltk.download('punkt')
nltk.download('punkt_tab')  # Added to resolve LookupError
nltk.download('stopwords')  # Stop words are filtered out of user queries
from nltk.corpus import stopwords  # Used by query_pdf

# Extract raw text for NLTK section: one string with pages separated by "\n".
extracted_text = "\n".join(doc.page_content for doc in documents)

# 1. Lowercased copy of the full text. Kept for backward compatibility with
#    any other cell that reads `lowercase_text`.
lowercase_text = extracted_text.lower()

# 2. Split into sentences FIRST, then lowercase each sentence.
#    The Punkt sentence tokenizer uses capitalization of the following word
#    as evidence when deciding whether a period (e.g. after an abbreviation)
#    ends a sentence. Lowercasing before tokenizing throws that evidence
#    away, so the split is done on the original-case text.
processed_text_chunks = [
    sentence.lower() for sentence in nltk.sent_tokenize(extracted_text)
]

print("Sample of processed text chunks (first 5):")
for i, chunk in enumerate(processed_text_chunks[:5]):
    print(f"Chunk {i+1}: {chunk[:200]}...")
print(f"Total number of chunks: {len(processed_text_chunks)}")

# ------------------------------------------------------------
# --- QUERY FUNCTION ---
# ------------------------------------------------------------
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset.

    Cached so the set is built once instead of on every query.
    """
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _alnum_word_set(text):
    """Return the distinct purely-alphanumeric word tokens of ``text``.

    Memoized per chunk string so that repeated queries over the same chunks
    do not re-run ``nltk.word_tokenize`` on the whole document each time.
    The cache is unbounded; it holds one entry per distinct chunk searched.
    """
    return frozenset(
        token for token in nltk.word_tokenize(text) if token.isalnum()
    )


def query_pdf(query: str, text_chunks, *, debug: bool = True) -> list:
    """Return the chunks that contain every significant word of ``query``.

    The query is lowercased and tokenized; English stop words and tokens
    that are not purely alphanumeric are discarded. A chunk matches when all
    remaining query words appear among its alphanumeric tokens (whole-token
    match, order irrelevant).

    Chunks are NOT lowercased here, so callers should pass already
    lowercased chunks for the comparison to be case-insensitive.

    Args:
        query: The user's question.
        text_chunks: Iterable of text chunks (strings) to search.
        debug: When True (the default, matching the previous behavior),
            print diagnostic lines for the query and the first five chunks.

    Returns:
        A list of matching chunks in their original order; an empty list
        when nothing matches or the query has no significant words.
    """
    stop_words = _english_stop_words()

    # Keep the list (order and duplicates) for the debug output below.
    significant_query_words = [
        word
        for word in nltk.word_tokenize(query.lower())
        if word.isalnum() and word not in stop_words
    ]

    if debug:
        print(f"DEBUG: Query: '{query}'")
        print(f"DEBUG: Significant query words: {significant_query_words}")

    # Queries made only of stop words / punctuation cannot match anything.
    if not significant_query_words:
        if debug:
            print("DEBUG: No significant query words found after filtering.")
        return []

    required_words = frozenset(significant_query_words)
    relevant_snippets = []

    for i, chunk in enumerate(text_chunks):
        chunk_word_set = _alnum_word_set(chunk)
        is_match = required_words <= chunk_word_set

        # Diagnostics are limited to the first five chunks to keep output short.
        if debug and i < 5:
            # Sorted so the preview is deterministic across runs.
            print(f"DEBUG: Chunk {i} word set: {sorted(chunk_word_set)[:10]}...")
            if is_match:
                print(f"DEBUG:   All significant words found in chunk {i} ({chunk[:50]}...)")
            else:
                # Report the first query word (in query order) that is absent.
                missing = next(
                    word for word in significant_query_words
                    if word not in chunk_word_set
                )
                print(f"DEBUG:   Significant word '{missing}' NOT found in chunk {i} ({chunk[:50]}...)")

        if is_match:
            relevant_snippets.append(chunk)

    return relevant_snippets

# Smoke-test the search with a fixed question and show up to five hits.
sample_query = "what is hiv"
found_snippets = query_pdf(sample_query, processed_text_chunks)

print(f"Searching for: '{sample_query}'")
if not found_snippets:
    print("No relevant snippets found for this query.")
else:
    print(f"Found {len(found_snippets)} relevant snippets:")
    # Each hit is truncated to its first 200 characters for display.
    for i, snippet in enumerate(found_snippets[:5]):
        print(f"Snippet {i+1}: {snippet[:200]}...")

# ------------------------------------------------------------
# --- BASIC CHATBOT INTERACTION ---
# ------------------------------------------------------------
print("\n--- Basic Chatbot Interaction ---")
print("Enter your question about the PDF, or type 'exit' to quit.")

# Read questions until the user types 'exit' or the input stream ends.
while True:
    try:
        # Strip surrounding whitespace so ' exit ' and padded questions work.
        user_question = input("\nYour Question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input closed or the cell was interrupted: leave the loop cleanly
        # instead of surfacing a traceback.
        print("\nExiting chatbot. Goodbye!")
        break

    if user_question.lower() == 'exit':
        print("Exiting chatbot. Goodbye!")
        break

    # Nothing to search for; prompt again rather than scanning the document.
    if not user_question:
        continue

    found_snippets = query_pdf(user_question, processed_text_chunks)

    if found_snippets:
        print(f"\nFound {len(found_snippets)} relevant snippets for '{user_question}':")
        # Show at most five hits, each truncated to 200 characters.
        for i, snippet in enumerate(found_snippets[:5]):
            print(f"Snippet {i+1}: {snippet[:200]}...")
    else:
        print(f"No relevant information found for '{user_question}' in the document.\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Sample of processed text chunks (first 5):
Chunk 1: 2023 art clinical guidelines
for the management of hiv in adults, pregnancy  
and breastfeeding, adolescents, children, infants  
and neonates
june 2023 version 3
republic of south africa national dep...
Chunk 2: __________________________________________________________ 2
overview ___________________________________________________________________________ 3
the goals of art 3
art eligibility  ________________...
Chunk 3: the principal 
goal of art is to attain and maintain viral suppression, which will 
prevent new hiv infections, increase life expectancy, decrease 
morbidity and mortality as well as improve the quali...
Chunk 4: the “test and treat all” approach has allowed people living with 
hiv (plhiv) to access art timeously....
Chunk 5: south is committed to using available technology and evidence 
to continue the fight against hiv....
Total number of chunks: 569
Searching for: 'what is hiv'
No relevant snippets found for this 