In [None]:
!pip install PyMuPDF requests faiss-cpu numpy

In [18]:
# Import libraries
import fitz  # PyMuPDF
import requests
import faiss
import numpy as np
import json

# Step 1: Read PDF and Split into Pages
def read_pdf(file_path):
    """
    Read the PDF file and split it into individual pages.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.

    Returns:
        dict mapping the zero-based page number to the text extracted from
        that page.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        return {
            page_num: doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        }


def generate_markdown_from_page(text, prompt, api_key):
    """
    Generate markdown using Gemini API based on the extracted text from the page.

    The request body is a single user turn whose text is
    ``prompt + " \\n " + text``.

    Args:
        text: Text appended after the prompt (e.g. the raw page text).
        prompt: Instructions placed before the text.
        api_key: Gemini API key.

    Returns:
        The stripped text of the first part of the first candidate, or an
        empty string when the request fails, the API answers with a non-200
        status, the body is not valid JSON, or the reply has no candidate text.
    """
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
    # The key travels in a header rather than the query string so it does not
    # end up in URLs that get logged or echoed in error messages.
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}
    payload = {"contents": [{"role": "user", "parts": [{"text": prompt + " \n " + text}]}]}

    try:
        # A timeout prevents one stalled request from hanging the whole run.
        response = requests.post(api_url, json=payload, headers=headers, timeout=120)
    except requests.RequestException as exc:
        print(f"Error: request to Gemini API failed: {exc}")
        return ""

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)  # Debug: Print the error message returned by the API
        return ""

    try:
        response_data = response.json()
    except ValueError:  # JSON decode errors are ValueError subclasses
        print("Error decoding JSON response")
        return ""

    try:
        return response_data["candidates"][0]["content"]["parts"][0]["text"].strip()
    except (KeyError, IndexError, TypeError, AttributeError):
        # E.g. no candidates, or a candidate without content.
        print(f"Unexpected response structure: {response_data}")
        return ""



def process_pdf(pdf_path, gemini_api_key):
    """
    Convert every page of a PDF into Gemini-generated markdown.

    The text of each page (as returned by read_pdf) is sent to
    generate_markdown_from_page together with one fixed extraction prompt.
    The replies are returned in a dict keyed by page number.
    """
    # The prompt is the same for every page, so it is defined once up front.
    extraction_prompt = """

**Prompt:**

"Analyze the provided  **textual information** and generate a structured markdown output that includes the following:

1. **Headings and Text**: Extract all headings, subheadings, and body text. Format them as markdown headings (`##`, `###`, etc.) and bullet points or paragraphs where applicable. Include any annotations, references, or footnotes.

2. **Tables**: Identify and extract all tables. Format them as markdown tables, ensuring that headers, rows, and columns are properly structured. Include any metadata or descriptions associated with the table.

3. **Figures and Charts**: Extract descriptions of figures, charts, and graphs. Provide a detailed markdown description of the visual, including axes, data points, trends, and legends. If applicable, include the type of chart (e.g., bar chart, line graph, pie chart).

4. **Page Metadata**: Extract page numbers, footers, and any other metadata. Format them as markdown text with appropriate labels (e.g., `### Page Number`, `### Page Footer`).

5. **Grounding Information**: For each extracted element (text, table, figure, etc.), include grounding information such as the bounding box coordinates (`l`, `t`, `r`, `b`) and the page number where the element is located.

6. **Chunking**: Organize the output into chunks, where each chunk represents a distinct element (e.g., a paragraph, table, figure). Assign a unique `chunk_id` to each chunk and specify its type (e.g., `text`, `table`, `figure`, `page_number`, `page_footer`).

7. **Output Format**: Return the output as a JSON object with two keys:
   - `"markdown"`: A single markdown string containing all the extracted content, formatted as described above.
   - `"chunks"`: A list of objects, where each object represents a chunk of content. Each chunk should include:
     - `"text"`: The extracted content as a string.
     - `"grounding"`: The bounding box and page number information.
     - `"chunk_type"`: The type of chunk (e.g., `text`, `table`, `figure`).
     - `"chunk_id"`: A unique identifier for the chunk.

**Example Output:**

```json
{
  "markdown": "## Heading 1\n\nThis is a paragraph of text.\n\n### Subheading\n\n- Bullet point 1\n- Bullet point 2\n\n## Table 1\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Data 1   | Data 2   |\n\n## Figure 1\n\nThis is a description of a bar chart showing...\n\n### Page Number\n\n1\n\n### Page Footer\n\n2024 Annual Report",
  "chunks": [
    {
      "text": "## Heading 1\n\nThis is a paragraph of text.",
      "grounding": [
        {
          "box": {
            "l": 0.1,
            "t": 0.2,
            "r": 0.9,
            "b": 0.3
          },
          "page": 0
        }
      ],
      "chunk_type": "text",
      "chunk_id": "12345"
    },
    {
      "text": "## Table 1\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Data 1   | Data 2   |",
      "grounding": [
        {
          "box": {
            "l": 0.1,
            "t": 0.4,
            "r": 0.9,
            "b": 0.5
          },
          "page": 0
        }
      ],
      "chunk_type": "table",
      "chunk_id": "67890"
    }
  ]
}
```

**Instructions:**
- Ensure the output is well-structured and easy to read.
- Handle edge cases (e.g., missing data, overlapping elements) gracefully.
- Use consistent formatting for markdown and JSON.

      **TEXT STARTS HERE**:
        """
    page_texts = read_pdf(pdf_path)
    return {
        number: generate_markdown_from_page(page_text, extraction_prompt, gemini_api_key)
        for number, page_text in page_texts.items()
    }


# Step 3: Create the RAG System (Using FAISS)
def get_embeddings_from_text(texts):
    """
    Convert a list of texts (markdowns) to vector embeddings.

    This is a dependency-free stand-in for a real embedding model: each text
    becomes an L2-normalised hashed bag-of-words vector. Unlike random
    vectors, the result is deterministic and texts that share words end up
    close together, so nearest-neighbour search reflects the query.
    Replace it with an actual embedding model for semantic retrieval.

    Args:
        texts: List of strings to embed.

    Returns:
        float32 array of shape (len(texts), 512). A text with no word tokens
        maps to the zero vector.
    """
    import re
    import zlib

    dim = 512
    vectors = np.zeros((len(texts), dim), dtype=np.float32)
    for row, text in enumerate(texts):
        for token in re.findall(r"\w+", text.lower()):
            # crc32 is stable across runs, unlike the salted built-in hash().
            vectors[row, zlib.crc32(token.encode("utf-8")) % dim] += 1.0

    # Normalise non-empty rows to unit length so long pages do not dominate.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    np.divide(vectors, norms, out=vectors, where=norms > 0)
    return vectors


def create_faiss_index(markdowns):
    """
    Create a FAISS index to store and retrieve markdown data.

    Vectors are added in the iteration order of ``markdowns.values()``, so
    row ``i`` of the index corresponds to the i-th value of the dict.

    Args:
        markdowns: dict mapping page number to markdown text.

    Returns:
        Tuple ``(index, embeddings)``: the populated flat L2 index and the
        float32 embedding matrix that was added to it.
    """
    # FAISS works on C-contiguous float32 matrices; coerce explicitly rather
    # than relying on whatever dtype the embedding function produces.
    embeddings = np.ascontiguousarray(
        get_embeddings_from_text(list(markdowns.values())), dtype=np.float32
    )
    index = faiss.IndexFlatL2(embeddings.shape[1])  # Flat index for simplicity
    if embeddings.shape[0]:  # skip the call when there are no vectors to add
        index.add(embeddings)
    return index, embeddings


def retrieve_relevant_markdowns(query, index, embeddings, markdowns, k=1):
    """
    Retrieve the most relevant markdown(s) for the user query from FAISS index.

    Args:
        query: User question to embed and search with.
        index: FAISS index whose rows follow the order of
            ``markdowns.values()``.
        embeddings: Unused; kept so existing callers keep working.
        markdowns: dict mapping page number to markdown text.
        k: Maximum number of markdowns to return.

    Returns:
        List of up to ``k`` markdown strings, nearest first. Empty when there
        is nothing to search or ``k`` is not positive.
    """
    # FAISS returns row positions, not dict keys, so look results up by
    # position in the same order the vectors were added.
    documents = list(markdowns.values())
    if not documents or k <= 0:
        return []

    query_embedding = np.ascontiguousarray(
        get_embeddings_from_text([query]), dtype=np.float32
    )
    _, indices = index.search(query_embedding, min(k, len(documents)))
    # FAISS pads missing neighbours with -1; drop anything out of range.
    return [documents[i] for i in indices[0] if 0 <= i < len(documents)]


# Step 4: Build the Chatbot
def generate_response_from_markdowns(query, relevant_markdowns, api_key):
    """
    Generate a response from the relevant markdowns by feeding them back to Gemini API.

    Args:
        query: The user's question.
        relevant_markdowns: List of markdown strings used as context.
        api_key: Gemini API key.

    Returns:
        Whatever generate_markdown_from_page returns for the combined prompt.
    """
    context = " ".join(relevant_markdowns)
    prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
    # generate_markdown_from_page sends prompt + text. Passing the prompt in
    # both positions would send the context and question twice, so the text
    # part is left empty.
    return generate_markdown_from_page("", prompt, api_key)


def chatbot(query, index, embeddings, markdowns, api_key):
    """
    Answer a query with the RAG pipeline.

    Looks up the markdown(s) closest to the query via
    retrieve_relevant_markdowns (default number of results), then asks Gemini
    for an answer through generate_response_from_markdowns.
    """
    return generate_response_from_markdowns(
        query,
        retrieve_relevant_markdowns(query, index, embeddings, markdowns),
        api_key,
    )


# Full Pipeline to read PDF, generate markdown, create RAG system, and use the chatbot
def main(pdf_path, gemini_api_key, user_query):
    """
    Run the whole pipeline for one question.

    PDF -> per-page markdown -> FAISS index -> chatbot answer.
    Returns the chatbot's response for user_query.
    """
    # Per-page markdown produced by Gemini
    page_markdowns = process_pdf(pdf_path, gemini_api_key)

    # Vector index over those markdowns
    faiss_index, page_vectors = create_faiss_index(page_markdowns)

    # Retrieve context and answer the question
    return chatbot(user_query, faiss_index, page_vectors, page_markdowns, gemini_api_key)



In [19]:
# Example usage
import os

pdf_path = 'sample.pdf'  # Path to your PDF
# Never hard-code a real API key in a notebook or source file: anyone who can
# read the file can use it. Read it from the environment instead, and revoke
# any key that has already been committed.
gemini_api_key = os.environ.get("GEMINI_API_KEY", "")
if not gemini_api_key:
    raise RuntimeError("Set the GEMINI_API_KEY environment variable before running this cell.")
user_query = "What is the Card revenues for 2024?"  # Example user query

response = main(pdf_path, gemini_api_key, user_query)
print(f"Chatbot Response: {response}")

Chatbot Response: The table provides the average balance of credit cards in 2024 which is $17.3 billions and the average rate of 21.53%. It does not provide the revenues.
