In [None]:
!pip install PyMuPDF requests faiss-cpu numpy

In [None]:
# Import libraries (make sure these are installed: pip install pymupdf faiss-cpu requests pillow numpy sentence-transformers)
import fitz  # PyMuPDF
import requests
import faiss
import numpy as np
import json
from PIL import Image
import io
import base64

# --- Helper Functions ---

def read_pdf_as_images(file_path):
    """
    Render every page of a PDF to an in-memory PIL image.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        dict mapping each zero-based page number to the ``PIL.Image.Image``
        rendered from that page, inserted in page order.
    """
    pages = {}
    # The context manager closes the document (and its file handle) even if
    # rendering a page raises; previously the document was never closed.
    with fitz.open(file_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x scale for better quality
            # The pixmap bytes are copied into a BytesIO buffer, so the image
            # stays usable after the document has been closed.
            pages[page_num] = Image.open(io.BytesIO(pix.tobytes()))
    return pages


def generate_markdown(text, api_key):
    """
    Ask the Gemini API to rewrite ``text`` as readable markdown.

    Args:
        text: Raw text to restructure; it is appended to a fixed instruction prompt.
        api_key: Gemini API key.

    Returns:
        The stripped text of the first part of the first candidate, or an
        ``"Error: ..."`` string when the response does not have that structure.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a 4xx/5xx status. No retry is attempted here.
    """
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"  # Or your preferred model
    # Send the key as a header instead of a query parameter so it is not part
    # of the URL that requests embeds in its exception messages.
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}

    payload = {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": "BASED ON THIS GIVEN TEXT, PROVIDE ME READABLE INFORMATION IN A MARKDOWN FORMAT, WHICH INCLUDES HEADINGS, TEXTS, PARAGRAPHS, BULLETS AND TABLES -> IT IS GOING TO BE USED TO CREATE RAG: --> TEXT STARTS HERE: \n" + text},
                ],
            }
        ]
    }

    # (connect, read) timeout: without one a stalled connection blocks forever.
    response = requests.post(api_url, json=payload, headers=headers, timeout=(10, 120))
    response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    response_data = response.json()

    # Walk candidates[0].content.parts[0].text defensively; any missing level
    # falls through to the error string instead of raising.
    candidates = response_data.get("candidates") or []
    content = candidates[0].get("content", {}) if candidates else {}
    parts = content.get("parts") or []
    if parts and "text" in parts[0]:
        return parts[0]["text"].strip()
    return "Error: Could not process the page due to unexpected API response."


def generate_markdown_from_image(image, prompt, api_key):
    """
    Send a page image plus a text prompt to the Gemini API and return its reply.

    The request is attempted up to three times (one initial try and up to two
    retries) when it fails with a request error or a parsing error.

    Args:
        image: PIL image of the page; it is re-encoded as PNG and base64 before sending.
        prompt: Instruction text sent alongside the image.
        api_key: Gemini API key.

    Returns:
        The stripped text of the first part of the first candidate. An
        ``"Error: ..."`` string is returned instead when the response has an
        unexpected structure (no retry in that case) or when every attempt failed.
    """
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"  # Or your preferred model
    # The key travels in a header rather than the query string: the exceptions
    # printed below include the request URL and must not expose the key.
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}

    # Convert image to base64
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

    payload = {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": prompt},
                    {"inline_data": {"mime_type": "image/png", "data": img_base64}},
                ],
            }
        ]
    }

    for attempt in range(3):  # One initial attempt plus up to 2 retries
        try:
            # (connect, read) timeout so a stalled connection counts as a
            # failed attempt instead of hanging forever.
            response = requests.post(api_url, json=payload, headers=headers, timeout=(10, 120))
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            response_data = response.json()

            # Check for the expected structure in the response
            candidates = response_data.get("candidates") or []
            content = candidates[0].get("content", {}) if candidates else {}
            parts = content.get("parts")
            if parts:
                # A part without "text" raises KeyError, which is retried below.
                return parts[0]["text"].strip()

            print(f"Unexpected API response structure (attempt {attempt + 1}):", response_data)
            return "Error: Could not process the page due to unexpected API response."

        except requests.exceptions.RequestException as e:
            print(f"Request error (attempt {attempt + 1}):", e)
        except (KeyError, IndexError, json.JSONDecodeError) as e:
            print(f"JSON parsing error (attempt {attempt + 1}):", e)

    return "Error: Could not process the page after multiple attempts."


def process_pdf(pdf_path, gemini_api_key):
    """
    Process the PDF by converting pages to images and generating markdown for each page.

    Each page image is first sent to ``generate_markdown_from_image`` with a
    fixed extraction prompt; the result is printed and then passed through
    ``generate_markdown`` to obtain the final markdown for the page.

    Args:
        pdf_path: Path of the PDF to process.
        gemini_api_key: Gemini API key forwarded to both API helpers.

    Returns:
        dict mapping each page number to its final markdown string. Pages whose
        extraction failed keep the ``"Error: ..."`` string from the first pass.
    """
    # The prompt is identical for every page, so it is built once here rather
    # than on each loop iteration. Its text (including the leading whitespace
    # and the indentation of the closing quotes) is kept exactly as before.
    prompt = """

        **Prompt:**

"Analyze the provided **image of a PDF page** and generate a structured markdown output that includes the following:

1. **Headings and Text**: Extract all headings, subheadings, and body text. Format them as markdown headings (`##`, `###`, etc.) and bullet points or paragraphs where applicable. Include any annotations, references, or footnotes.

2. **Tables**: Identify and extract all tables. Format them as markdown tables, ensuring that headers, rows, and columns are properly structured. Include any metadata or descriptions associated with the table.

3. **Figures and Charts**: Extract descriptions of figures, charts, and graphs. Provide a detailed markdown description of the visual, including axes, data points, trends, and legends. If applicable, include the type of chart (e.g., bar chart, line graph, pie chart).

4. **Page Metadata**: Extract page numbers, footers, and any other metadata. Format them as markdown text with appropriate labels (e.g., `### Page Number`, `### Page Footer`).

5. **Grounding Information**: For each extracted element (text, table, figure, etc.), include grounding information such as the bounding box coordinates (`l`, `t`, `r`, `b`) and the page number where the element is located.

6. **Chunking**: Organize the output into chunks, where each chunk represents a distinct element (e.g., a paragraph, table, figure). Assign a unique `chunk_id` to each chunk and specify its type (e.g., `text`, `table`, `figure`, `page_number`, `page_footer`).

7. **Output Format**: Return the output as a JSON object with two keys:
   - `"markdown"`: A single markdown string containing all the extracted content, formatted as described above.
   - `"chunks"`: A list of objects, where each object represents a chunk of content. Each chunk should include:
     - `"text"`: The extracted content as a string.
     - `"grounding"`: The bounding box and page number information.
     - `"chunk_type"`: The type of chunk (e.g., `text`, `table`, `figure`).
     - `"chunk_id"`: A unique identifier for the chunk.

**Example Output:**

```json
{
  "markdown": "## Heading 1\n\nThis is a paragraph of text.\n\n### Subheading\n\n- Bullet point 1\n- Bullet point 2\n\n## Table 1\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Data 1   | Data 2   |\n\n## Figure 1\n\nThis is a description of a bar chart showing...\n\n### Page Number\n\n1\n\n### Page Footer\n\n2024 Annual Report",
  "chunks": [
    {
      "text": "## Heading 1\n\nThis is a paragraph of text.",
      "grounding": [
        {
          "box": {
            "l": 0.1,
            "t": 0.2,
            "r": 0.9,
            "b": 0.3
          },
          "page": 0
        }
      ],
      "chunk_type": "text",
      "chunk_id": "12345"
    },
    {
      "text": "## Table 1\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Data 1   | Data 2   |",
      "grounding": [
        {
          "box": {
            "l": 0.1,
            "t": 0.4,
            "r": 0.9,
            "b": 0.5
          },
          "page": 0
        }
      ],
      "chunk_type": "table",
      "chunk_id": "67890"
    }
  ]
}
```

**Instructions:**
- Ensure the output is well-structured and easy to read.
- Handle edge cases (e.g., missing data, overlapping elements) gracefully.
- Use consistent formatting for markdown and JSON.


        DO NOT PROVIDE ANY OTHER TEXT THAN THE DATA OF THE IMAGE I PROVIDED



        """
    pages_dict = read_pdf_as_images(pdf_path)
    markdowns = {}
    for page_num, image in pages_dict.items():
        markdown = generate_markdown_from_image(image, prompt, gemini_api_key)
        print(markdown)
        if markdown.startswith("Error: Could not process the page"):
            # Extraction failed. Sending the error message through the second
            # pass would waste an API call and make the model invent content
            # for the page, so the error marker is stored as-is. The page key
            # is still recorded so page numbering stays contiguous.
            markdowns[page_num] = markdown
            continue
        markdowns[page_num] = generate_markdown(markdown, gemini_api_key)
    return markdowns


# SentenceTransformer instances keyed by model name, filled on first use.
_EMBEDDING_MODEL_CACHE = {}


def get_embeddings_from_text(texts, model_name='all-mpnet-base-v2'):
    """
    Convert a list of texts (markdowns) to vector embeddings.

    The model is loaded once per model name and reused afterwards; previously
    it was re-instantiated on every call, including every chat query.

    Args:
        texts: List of strings to embed.
        model_name: sentence-transformers model to use. Defaults to the model
            this notebook has always used.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``
        (callers in this file treat it as a 2-D array, one row per text).
    """
    model = _EMBEDDING_MODEL_CACHE.get(model_name)
    if model is None:
        # Imported lazily so the rest of the notebook works without the package.
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODEL_CACHE[model_name] = model
    return model.encode(texts)


def create_faiss_index(markdowns):
    """
    Create a FAISS index to store and retrieve markdown data.

    Args:
        markdowns: dict whose values are the markdown strings to index. Row
            ``i`` of the index corresponds to the ``i``-th value in the dict's
            iteration order.

    Returns:
        Tuple ``(index, embeddings)``: the populated ``faiss.IndexFlatL2`` and
        the float32 embedding matrix that was added to it.

    Raises:
        ValueError: If ``markdowns`` is empty, since the embedding dimension
            cannot be determined without at least one text.
    """
    texts = list(markdowns.values())
    if not texts:
        raise ValueError("Cannot build a FAISS index: no markdown pages were provided.")

    # FAISS expects a C-contiguous float32 matrix; coerce explicitly so the
    # index does not depend on the embedding backend's default dtype/layout.
    embeddings = np.ascontiguousarray(get_embeddings_from_text(texts), dtype=np.float32)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # Flat index for simplicity
    index.add(embeddings)
    return index, embeddings

# --- Main Processing (Run this part once) ---

# Replace with your actual PDF path. The API key is read from the
# GEMINI_API_KEY environment variable so it does not have to be pasted into
# (and saved with) the notebook; it falls back to an empty string as before.
import os

pdf_path = "Test.pdf"
gemini_api_key = os.environ.get("GEMINI_API_KEY", "")

# Process the PDF and create the RAG system
markdowns = process_pdf(pdf_path, gemini_api_key)
index, embeddings = create_faiss_index(markdowns)

print("PDF processed and RAG system ready!")
# Now the 'index', 'embeddings', and 'markdowns' are available for the chatbot.

In [6]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- Chatbot Functions ---

def retrieve_relevant_markdowns(query, index, embeddings, markdowns, k=3):
    """
    Retrieve the most relevant markdown(s) for the user query from FAISS index.

    Args:
        query: The user's question.
        index: FAISS index whose rows were added in the iteration order of
            ``markdowns.values()``.
        embeddings: Unused; kept so existing callers keep working.
        markdowns: dict whose values are the indexed markdown strings.
        k: Maximum number of results to return.

    Returns:
        List of up to ``k`` markdown strings, nearest first.
    """
    # FAISS returns row positions, not dict keys. Map positions onto the
    # values in insertion order so retrieval stays correct even when the keys
    # are not exactly 0..n-1.
    documents = list(markdowns.values())

    # Never ask for more neighbours than the index holds, and skip the search
    # entirely when nothing can be returned.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = get_embeddings_from_text([query])  # Get embedding for the query
    _, indices = index.search(query_embedding, k)  # Search for k nearest neighbors
    # FAISS pads missing results with -1; keep only valid row positions.
    return [documents[i] for i in indices[0] if 0 <= i < len(documents)]


def generate_response(prompt, api_key):
    """
    Send a text-only prompt to the Gemini API and return the model's reply.

    A single request is made; there is no retry.

    Args:
        prompt: Full prompt text to send as the user message.
        api_key: Gemini API key.

    Returns:
        The stripped text of the first part of the first candidate, or an
        ``"Error: ..."`` string when the response does not have that structure.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a 4xx/5xx status.
    """
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"  # Or your preferred model
    # Send the key as a header instead of a query parameter so it is not part
    # of the URL that requests embeds in its exception messages.
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}

    payload = {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": prompt},
                ],
            }
        ]
    }

    # (connect, read) timeout: without one a stalled connection blocks the UI forever.
    response = requests.post(api_url, json=payload, headers=headers, timeout=(10, 120))
    response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    response_data = response.json()

    # Walk candidates[0].content.parts[0].text defensively; any missing level
    # falls through to the error string instead of raising.
    candidates = response_data.get("candidates") or []
    content = candidates[0].get("content", {}) if candidates else {}
    parts = content.get("parts") or []
    if parts and "text" in parts[0]:
        return parts[0]["text"].strip()
    return "Error: Could not process the page due to unexpected API response."


def generate_response_from_markdowns(query, relevant_markdowns, api_key):
    """
    Answer ``query`` with the Gemini API, grounded in the retrieved markdowns.

    Returns a fixed "nothing found" message without calling the API when
    ``relevant_markdowns`` is empty; otherwise returns whatever
    ``generate_response`` produces for the assembled prompt.
    """
    if relevant_markdowns:
        # All retrieved pages become one context block, separated by blank lines.
        merged_context = "\n\n".join(relevant_markdowns)

        # The prompt is assembled line by line; the leading empty entry and the
        # trailing four-space entry reproduce the layout of the prompt text.
        prompt_lines = [
            "",
            "    **Instructions:**",
            "    - You are a helpful assistant that answers questions based on the provided context.",
            "    - The context is extracted from a PDF document and may include text, tables, figures, and metadata.",
            "    - Your task is to provide a concise and accurate answer to the user's question using ONLY the information in the context.",
            "    - If the context does not contain enough information to answer the question, say \"I don't have enough information to answer that.\"",
            "    - Do not make up or guess any information. Stick strictly to the context.",
            "",
            "    **Context:**",
            f"    {merged_context}",
            "",
            "    **Question:**",
            f"    {query}",
            "",
            "    **Answer:**",
            "    ",
        ]
        return generate_response("\n".join(prompt_lines), api_key)

    return "I couldn't find any relevant information to answer your question."


def chatbot(query, index, embeddings, markdowns, api_key):
    """
    Run one retrieve-then-generate round for ``query``.

    Looks up the three closest markdowns via ``retrieve_relevant_markdowns``
    and returns the answer produced by ``generate_response_from_markdowns``.
    """
    top_matches = retrieve_relevant_markdowns(query, index, embeddings, markdowns, k=3)
    return generate_response_from_markdowns(query, top_matches, api_key)

# --- UI Elements ---

# Response pane: whatever the click handler prints is rendered here
output_area = widgets.Output()

# Green "Ask" button that submits the question
submit_button = widgets.Button(
    icon="check",
    button_style="success",
    description="Ask",
    disabled=False,
)

# Full-width, multi-line box where the user types a question
user_input = widgets.Textarea(
    layout=widgets.Layout(height="150px", width="100%"),
    description="Your Question:",
    placeholder="Type your question here...",
    value="",
    disabled=False,
)

# Function to handle button click
def on_button_click(b):
    """
    Handle a click on the "Ask" button.

    Reads the question from ``user_input``, runs it through ``chatbot`` and
    prints the exchange into ``output_area``, replacing the previous output.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    with output_area:
        clear_output()  # Clear the previous output
        # Ignore surrounding whitespace/newlines from the text area.
        user_query = user_input.value.strip()
        if not user_query:
            # Do not spend an API call on an empty question.
            print("Please type a question first.")
            return
        if user_query.lower() == 'exit':
            print("Exiting chatbot...")
            return
        print("You: " + user_query)
        try:
            response = chatbot(user_query, index, embeddings, markdowns, gemini_api_key)
        except requests.exceptions.RequestException as e:
            # Show a short message instead of a traceback. Only the exception
            # type is printed because the message text may contain the request URL.
            print(f"Chatbot: Sorry, the request to the model failed ({type(e).__name__}). Please try again.")
            return
        print("Chatbot: " + response)

# Bind button click event: on_button_click is registered as the click callback
submit_button.on_click(on_button_click)

# Display widgets in this order: question box, "Ask" button, response area
display(user_input, submit_button, output_area)


Textarea(value='', description='Your Question:', layout=Layout(height='150px', width='100%'), placeholder='Typ…

Button(button_style='success', description='Ask', icon='check', style=ButtonStyle())

Output()


You are an expert in extracting and structuring financial data from images. Your task is to analyze the provided image of a financial document and convert it into a well-structured Markdown format. The output should include all details such as text, tables, charts, and images, exactly as they appear in the image. Follow these guidelines:

1. **Text**: Convert all text into Markdown format. Use headings (`#`, `##`, `###`) for titles and sections, and paragraphs for regular text.
2. **Tables**: Convert tables into Markdown tables. Ensure that row and column spans are preserved. Use `|` for columns and `-` for row separators. If the table has borders or special formatting, replicate it in Markdown.
3. **Charts/Images**: If the image contains charts or graphs, describe them in Markdown format and include any relevant data points or labels. If the image is purely visual (e.g., a logo), mention it as `[Image: Description]`.
4. **Years and Values**: Ensure that all years, monetary values (in $), and percentages are accurately captured and formatted.
5. **Structure**: Maintain the original structure of the document, including sections, subsections, and any hierarchical relationships.

**Output Format**:
- Only provide the Markdown output. Do not include any additional text or explanations.
- Ensure the output is clean, well-formatted, and ready to be fed into a RAG system.

**Example Input (Image of a Financial Document)**:
- The image contains a table with financial data, text describing the data, and a chart.

**Example Output**:
```markdown
# Financial Performance Overview

## Net Income
Net income was $7,892 million in 2024 compared to $7,450 million in 2023, an increase of 6%.

## Table: Impact of Foreign Currency Translation
| Currency Pair               | 2024 Avg. Rate | % Change | 2023 Avg. Rate | % Change |
|-----------------------------|----------------|----------|----------------|----------|
| U.S. Dollar/Canadian Dollar | 0.735          | (0.9)%   | 0.742          | (4.5)%   |
| Mexican Peso/Canadian Dollar | 13.091         | (2.5)%   | 13.424         | (15.0)%  |

## Chart: Revenue Growth
- **Description**: A line chart showing revenue growth from 2020 to 2024.
- **Data Points**:
  - 2020: $10,000M
  - 2021: $11,500M
  - 2022: $12,300M
  - 2023: $13,000M
  - 2024: $14,200M

[Image: Company Logo]
```
