<a href="https://colab.research.google.com/github/karthik6717/GenAI/blob/master/Q%26A_Logic_%26_Prompt_The_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party libraries used by the rest of this notebook.
# The leading "!" hands the line to the notebook's shell; "-q" asks pip for quiet output.
# NOTE(review): the success message below is printed unconditionally after the
# pip command returns - presumably it does not reflect pip's exit status; confirm.
print("Installing necessary libraries...")
!pip install -q transformers accelerate bitsandbytes torch pypdf gradio
print("Libraries installed successfully!")

In [None]:
# !pip uninstall -y transformers accelerate bitsandbytes
# !rm -rf ~/.cache/huggingface/modules
# !rm -rf ~/.cache/huggingface/hub


In [None]:
# Let's import these libraries
import torch  # PyTorch, the backend for transformers
import pypdf  # For reading PDFs
import gradio as gr  # For building the UI
from IPython.display import display, Markdown  # For nicer printing in notebooks
print("Core libraries imported.")

In [None]:
import os
from huggingface_hub import login, notebook_login
print("Attempting Hugging Face login...")

# Use notebook_login() for an interactive prompt in Colab/Jupyter.
# This is generally preferred for notebooks because the token is typed into a
# widget instead of being written into the notebook source.

notebook_login()
# notebook_login() only renders the token widget and then returns; the actual
# login happens when the widget is submitted. Reporting "successful" here would
# be misleading, so tell the reader what to do next instead.
print("Login prompt displayed (or token already present). Submit your token above before loading gated models.")

In [None]:
# Upgrade bitsandbytes to the newest release ("-U"); the 4-bit quantized model
# loading done later in this notebook goes through BitsAndBytesConfig.
!pip install -U bitsandbytes

In [None]:
# Import AutoModelForCausalLM and the bitsandbytes quantization config class
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Choose a small instruction-tuned model suitable for Colab.
# model_id is the Hugging Face repository id that the tokenizer and model cells load.
# Alternatives you could try (might need login/agreement):
# NOTE(review): the "-GGUF" repository below presumably ships GGUF files only;
# confirm it can be loaded with from_pretrained() before switching to it.
# model_id = "unsloth/gemma-3-4b-it-GGUF"
model_id = "Qwen/Qwen2.5-3B-Instruct"
#model_id = "microsoft/Phi-4-mini-instruct"
# model_id = "unsloth/Llama-3.2-3B-Instruct"

In [None]:
# Load the tokenizer that belongs to model_id.
# The tokenizer prepares text input for the model.
# trust_remote_code=True lets custom code shipped with the model repository run;
# it is sometimes needed for newer models.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code = True)
print("Tokenizer loaded successfully.")

In [None]:
import torch

# (Disabled) Force a complete reinstallation of transformers to resolve module caching issues.
# Uncomment the four lines below only when the installed packages are in a broken state.
#print("Uninstalling existing transformers...")
#!pip uninstall -y transformers
#print("Reinstalling latest transformers...")
#!pip install transformers accelerate bitsandbytes torch

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

print(f"Loading model: {model_id}")
print("This might take a few minutes, especially the first time...")

# Describe how the weights should be quantized to 4 bits when they are loaded
quantization_config = BitsAndBytesConfig(load_in_4bit = True,
                                         bnb_4bit_compute_dtype = torch.float16,  # dtype used for computation; torch.bfloat16 is an option if available
                                         bnb_4bit_quant_type = "nf4",  # normal float 4 quantization
                                         bnb_4bit_use_double_quant = True  # nested quantization for more efficient memory usage
                                         )

# Load the model with the quantization config; device_map="auto" leaves
# device placement to the library. The result is bound to the global `model`
# that later cells use.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config = quantization_config,
                                             device_map = "auto",
                                             trust_remote_code = True)

In [None]:
# Prompt used for a quick smoke test of the loaded model
prompt = "Explain how Electric Vehicles work in a funny way!"

#prompt = "What is the capital of France?"

# Method 1: test the model and tokenizer directly with the .generate() method

# Encode the prompt as PyTorch tensors and move them to the model's device
inputs = tokenizer(prompt, return_tensors = "pt").to(model.device)

# Generate a continuation of at most 1000 new tokens
outputs = model.generate(**inputs, max_new_tokens = 1000)

# Decode the first returned sequence back to text, dropping special tokens
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Plain print() is used here because print_markdown() is only defined in a later cell.
print(response)

In [None]:
import requests
from pathlib import Path

# --- Get the PDF File ---
pdf_url = "https://www.arpnjournals.org/jeas/research_papers/rp_2025/jeas_0525_9600.pdf"
pdf_filename = "jeas_0525_9600.pdf"
pdf_path = Path(pdf_filename)

# Define headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Download the file if it doesn't exist
if not pdf_path.exists():
    # requests applies no timeout by default; without one a stalled server
    # would block this cell indefinitely.
    response = requests.get(pdf_url, headers=headers, timeout=60)
    response.raise_for_status()  # Check for download errors
    # A server can answer 200 with an HTML error/blocking page. Writing that to
    # disk would make every later run reuse the broken file, so verify the PDF
    # signature (which may be preceded by a few bytes) before saving.
    if b"%PDF-" not in response.content[:1024]:
        raise ValueError(f"Downloaded content from {pdf_url} is not a PDF file")
    pdf_path.write_bytes(response.content)
    print(f"PDF downloaded successfully to {pdf_path}")
else:
    print(f"PDF file already exists at {pdf_path}")


# --- Read Text from PDF using pypdf ---
# Bind pdf_text up front so it exists (empty) even if reading the file fails below;
# later cells check it for emptiness.
pdf_text = ""

print(f"Reading text from {pdf_path}...")
reader = pypdf.PdfReader(pdf_path)
num_pages = len(reader.pages)
print(f"PDF has {num_pages} pages.")

# Extract text from each page, keeping only pages that yielded some text
all_pages_text = [
    page_text
    for page_text in (page.extract_text() for page in reader.pages)
    if page_text
]

# Join the text from all pages
pdf_text = "\n".join(all_pages_text)
print(f"Successfully extracted text. Total characters: {len(pdf_text)}")

In [None]:
def print_markdown(text):
    """Render ``text`` as Markdown in the notebook output (Colab/Jupyter)."""
    rendered = Markdown(text)
    display(rendered)

In [None]:
# Preview the first 1000 characters of the extracted PDF text as Markdown
print("\n--- Snippet of Extracted Text ---")
print_markdown(pdf_text[:1000])

In [None]:
# Upper bound on how many characters of the document are placed in the prompt,
# to avoid overwhelming the model.

MAX_CONTEXT_CHARS = 6000

def answer_question_from_pdf(document_text, question, llm_pipeline):
    """
    Answers a question based on the provided document text using the loaded LLM pipeline.

    The document is truncated to MAX_CONTEXT_CHARS characters, embedded in a
    chat-style prompt together with the question, and sent to ``llm_pipeline``.
    Only the text produced after the prompt is returned.

    Args:
        document_text (str): The text extracted from the PDF.
        question (str): The user's question.
        llm_pipeline: The initialized text-generation pipeline. It is called as
            ``llm_pipeline(prompt, **generation_kwargs)`` and must return a list
            whose first item is a dict with a ``'generated_text'`` entry.

    Returns:
        str: The model's generated answer, stripped of surrounding whitespace
        and cut at the first end-of-turn marker, if any.
    """
    # Truncate context if necessary
    if len(document_text) > MAX_CONTEXT_CHARS:
        print(f"Warning: Document text ({len(document_text)} chars) exceeds limit ({MAX_CONTEXT_CHARS} chars). Truncating.")
        context = document_text[:MAX_CONTEXT_CHARS] + "..."
    else:
        context = document_text

    # Let's define the Prompt Template
    # We instruct the model to use only the provided document.
    # <|system|> provides context/instructions, <|user|> is the question.
    # Note: Different models might prefer different prompt structures.
    prompt_template = f"""<|system|>
    You are an AI assistant. Answer the following question based *only* on the provided document text. If the answer is not found in the document, say "The document does not contain information on this topic." Do not use any prior knowledge.

    Document Text:
    ---
    {context}
    ---
    <|end|>
    <|user|>
    Question: {question}<|end|>
    <|assistant|>
    Answer:""" # We prompt the model to start generating the answer

    print(f"\n--- Generating Answer for: '{question}' ---")

    # Run Inference on the chosen model
    outputs = llm_pipeline(prompt_template,
                           max_new_tokens = 500,  # Limit answer length
                           do_sample = True,
                           temperature = 0.2,   # Lower temperature for more factual Q&A
                           top_p = 0.9)

    # Let's extract the answer
    full_generated_text = outputs[0]['generated_text']
    if full_generated_text.startswith(prompt_template):
        # The output echoes the prompt; everything after it is the answer.
        # Slicing by prompt length (instead of searching for "Answer:") stays
        # correct even when the document or the question contains "Answer:".
        raw_answer = full_generated_text[len(prompt_template):]
    else:
        # The prompt was not echoed verbatim. Keep what follows the assistant
        # cue when it is present; rpartition() yields the whole text otherwise.
        assistant_cue = "<|assistant|>\n    Answer:"
        raw_answer = full_generated_text.rpartition(assistant_cue)[2]

    # The model may keep writing past its turn; cut at the first end-of-turn
    # marker it emitted as plain text.
    for end_token in ("<|end|>", "<|im_end|>"):
        raw_answer = raw_answer.split(end_token)[0]

    print("--- Generation Complete ---")
    return raw_answer.strip()


In [None]:
# Build the text-generation pipeline around the already-loaded model and tokenizer
from transformers import pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Try the Q&A function on the extracted PDF text with a sample question
test_question = "What is this document about?"
generated_answer = answer_question_from_pdf(pdf_text, test_question, pipe)

# Show the question and the model's answer as Markdown
print("\nTest Question:")
print_markdown(f"**Q:** {test_question}")
print("\nGenerated Answer:")
print_markdown(f"**A:** {generated_answer}")

In [None]:
# Configuration: models available for selection in the UI.
# Maps the display name shown in the dropdown to the Hugging Face repository id.
# Use models known to fit in Colab free tier with 4-bit quantization.
# Every id must be a standard Transformers checkpoint repository: a "-GGUF"
# repository holds GGUF files, which from_pretrained() cannot load without an
# explicit gguf_file argument.

available_models = {
    "Llama 3.2": "unsloth/Llama-3.2-3B-Instruct",
    "Microsoft Phi-4 Mini": "microsoft/Phi-4-mini-instruct",
    "Google Gemma 3": "unsloth/gemma-3-4b-it",
    }

In [None]:
# --- Global State (or use gr.State in Blocks) ---
# Tracks the currently loaded model/pipeline:
#   current_model_id  - repository id of the loaded model, or None when nothing is loaded
#   current_pipeline  - the pipeline object used to answer questions, or None
current_model_id = None
current_pipeline = None
print(f"Models available for selection: {list(available_models.keys())}")


# Define a function to Load/Switch Models
def load_llm_model(model_name):
    """Loads the selected LLM, unloading the previous one.

    Args:
        model_name (str): Display name of the model; must be a key of
            ``available_models``.

    Returns:
        tuple: ``(status_message, pipeline)``. ``pipeline`` is the loaded
        text-generation pipeline, or ``None`` when the name is unknown or
        loading failed. On success the globals ``current_model_id``,
        ``current_pipeline``, ``model`` and ``tokenizer`` refer to the new model.
    """
    global current_model_id, current_pipeline, tokenizer, model
    import gc

    new_model_id = available_models.get(model_name)
    if not new_model_id:
        return "Invalid model selected.", None  # Return error message and None pipeline

    if new_model_id == current_model_id and current_pipeline is not None:
        print(f"Model {model_name} is already loaded.")
        # Indicate success but don't reload
        return f"{model_name} already loaded.", current_pipeline

    print(f"Switching to model: {model_name} ({new_model_id})...")

    # Unload previous model (important for memory).
    # The names are module-level globals, so they never show up in locals();
    # rebind them to None to drop the references to the old objects.
    current_pipeline = None
    current_model_id = None
    model = None
    tokenizer = None
    # A notebook-level `pipe` (if one was created) also keeps the old model alive.
    globals().pop("pipe", None)
    # Collect first so the old weights are actually freed, then hand the cached
    # GPU blocks back to the driver.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("Previous model unloaded (if any).")

    # --- Load the new model ---
    try:
        # Load Tokenizer
        tokenizer = AutoTokenizer.from_pretrained(new_model_id, trust_remote_code=True)

        # Load Model (Quantized). Passing load_in_4bit directly to
        # from_pretrained() is deprecated; the config object is the supported form.
        bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        model = AutoModelForCausalLM.from_pretrained(new_model_id,
                                                     torch_dtype="auto",
                                                     quantization_config=bnb_config,
                                                     device_map="auto",
                                                     trust_remote_code=True)

        # Create Pipeline. The model is already instantiated and placed on its
        # device(s), so no dtype/device arguments are passed here.
        loaded_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

    except Exception as e:
        # Boundary handler: report the failure to the UI instead of crashing it.
        print(f"Error loading model {model_name}: {e}")
        current_model_id = None
        current_pipeline = None
        # Release anything that was partially loaded.
        model = None
        tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return f"Error loading {model_name}: {e}", None  # Error message and None pipeline

    print(f"Model {model_name} loaded successfully!")
    current_model_id = new_model_id
    current_pipeline = loaded_pipeline  # Update global state
    return f"{model_name} loaded successfully!", loaded_pipeline  # Status message and the pipeline object

In [None]:
# --- Q&A submission handler ---
# Reads the module-level 'current_pipeline' and 'pdf_text'.
# A more robust Gradio app would pass the pipeline in through gr.State.
def handle_submit(question):
    """Validate the app state and the input, then answer ``question``.

    Returns the generated answer, or a message describing what is missing
    (no loaded model, no PDF text, or an empty question - checked in that order).
    """
    if not current_pipeline:
        result = "Error: No model is currently loaded. Please select a model."
    elif not pdf_text:
        result = "Error: PDF text is not loaded. Please run Section 4."
    elif not question:
        result = "Please enter a question."
    else:
        print(f"Handling submission for question: '{question}' using {current_model_id}")
        # Delegate to the Q&A function with the currently loaded pipeline
        result = answer_question_from_pdf(pdf_text, question, current_pipeline)
    return result

In [None]:
# --- Build Gradio Interface using Blocks ---
print("Building Gradio interface...")


def _load_model_status(model_name):
    """Load ``model_name`` and return only the status message.

    load_llm_model() returns ``(status, pipeline)``, but the dropdown event
    updates a single Textbox. Binding load_llm_model directly would display the
    whole tuple (including the pipeline object) in the status box.
    """
    status_message, _loaded = load_llm_model(model_name)
    return status_message


# --- Initial Model Load ---
# Load the default model before the UI is built so its status can be passed to
# the status textbox as the initial value.
initial_model_name = list(available_models.keys())[0]
print(f"Performing initial load of default model: {initial_model_name}...")
status = _load_model_status(initial_model_name)
print("Initial load complete.")

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        f"""
    # PDF Q&A Bot Using Hugging Face Open-Source Models
    Ask questions about the document ('{pdf_filename}' if loaded, {len(pdf_text)} chars).
    Select an open-source LLM to answer your question.
    **Note:** Switching models takes time as the new model needs to be downloaded and loaded into the GPU.
    """
    )

    # Store the pipeline in Gradio state for better practice (optional for this simple version)
    # llm_pipeline_state = gr.State(None)

    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=list(available_models.keys()),
            label="🤖 Select LLM Model",
            value=initial_model_name,  # Default to the first model, which was loaded above
        )
        status_textbox = gr.Textbox(label="Model Status", interactive=False, value=status)

    question_textbox = gr.Textbox(
        label="❓ Your Question", lines=2, placeholder="Enter your question about the document here..."
    )
    submit_button = gr.Button("Submit Question", variant="primary")
    answer_textbox = gr.Textbox(label="💡 Answer", lines=5, interactive=False)

    # --- Event Handlers ---
    # When the dropdown changes, load the selected model and show its status
    model_dropdown.change(
        fn=_load_model_status,
        inputs=[model_dropdown],
        outputs=[status_textbox],  # One output, so the handler must return one value
        # outputs=[status_textbox, llm_pipeline_state] # If using gr.State
    )

    # When the button is clicked, call the submit handler
    submit_button.click(
        fn=handle_submit,
        inputs=[question_textbox],
        outputs=[answer_textbox],
        # inputs=[question_textbox, llm_pipeline_state], # Pass state if using it
    )


# --- Launch the Gradio App ---
print("Launching Gradio demo...")
demo.launch(debug=True)  # debug=True provides more detailed logs

