In [1]:
import os
import re
import json
import time
import requests

from pathlib import Path
from io import BytesIO

from dotenv import load_dotenv
from pydantic import BaseModel, Field

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.docstore.document import Document
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI

import fitz  # PyMuPDF
from PIL import Image
import pytesseract

# Load environment variables from the .env file so that later cells can read
# the Mathpix credentials (MATHPIX_API_KEY, MATHPIX_APP_ID) through os.getenv.
load_dotenv()


True

In [2]:
MATHPIX_API_KEY = os.getenv("MATHPIX_API_KEY")
MATHPIX_APP_ID = os.getenv("MATHPIX_APP_ID")

# Both Mathpix endpoints used below authenticate with the same header pair.
headers = {
    "app_id": MATHPIX_APP_ID,
    "app_key": MATHPIX_API_KEY,
}
request_timeout = 60  # seconds; keeps a stalled connection from hanging the cell

# Upload the PDF to Mathpix. The file handle is only needed for the upload
# itself, so it is closed before the polling loop starts.
with open("pdfs/example.pdf", "rb") as file:
    r = requests.post(
        "https://api.mathpix.com/v3/pdf",
        headers=headers,
        files={"file": file},
        timeout=request_timeout,
    )

# Surface a rejected upload (bad credentials, quota, ...) with the server's
# answer instead of an opaque KeyError on the missing "pdf_id".
r.raise_for_status()
upload_result = r.json()
if "pdf_id" not in upload_result:
    raise RuntimeError(f"Mathpix upload did not return a pdf_id: {upload_result}")
pdf_id = upload_result["pdf_id"]
print("PDF ID:", pdf_id)
print("Response:", upload_result)

# Poll the Markdown rendition of the uploaded PDF until it is available
# (HTTP 200) or the retry budget is used up.
url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.md"

max_retries = 10
retry_delay = 5  # seconds
for attempt in range(max_retries):
    response = requests.get(url, headers=headers, timeout=request_timeout)
    if response.status_code == 200:
        # Save the result if the request is successful
        with open("pdfs/example.md", "w") as f:
            f.write(response.text)
        print("Downloaded MD successfully.")
        break
    print(f"Attempt {attempt + 1}/{max_retries}: Processing not complete. Retrying in {retry_delay} seconds...")
    time.sleep(retry_delay)
else:
    # Reached only when the loop never hit `break`, i.e. every attempt failed.
    print("Failed to retrieve processed PDF after multiple attempts:", response.status_code, response.text)

PDF ID: 2025_07_04_b41bcc42f6f303f464c5g
Response: {'pdf_id': '2025_07_04_b41bcc42f6f303f464c5g'}
Attempt 1/10: Processing not complete. Retrying in 5 seconds...
Downloaded MD successfully.


In [3]:
# Read the Markdown file produced from the PDF into memory.
md_content = Path("pdfs/example.md").read_text()

# Print the character count first, then the full text, each under the same header.
for summary in (f"{len(md_content)} characters", md_content):
    print("Markdown text: ")
    print(f"  pdfs/example.md: {summary}")

Markdown text: 
  pdfs/example.md: 4405 characters
Markdown text: 
  pdfs/example.md: ## BIOE95039: Mechanics 2 Fluids

## Problem Set 1: Fluid Properties

1. Which of the following statements correctly describes a fluid?
a) A fluid flows under shear force
b) A fluid resists shear force
c) A fluid deforms indefinitely under shear force
d) A fluid is one of the three states of matter
2. Using diagrams write a brief explanation of the difference between a solid and a fluid
3. Draw a graph showing the changes in denisty, $\rho$ of a fluid as the size of the element volume, $V$ increases. Mark the region where the continuum assumption is valid.
4. If the continuum assumption holds for a particular fluid which of the following statements are true?
a) The density of the fluid does not vary as a function of position, but can vary as a function of time
b) The density of the fluid varies as a function of position, and time
c) The density of the fluid does not vary in the element volume $V$
d) T

In [4]:
# Fetch the figures from the paper and answers.
def extract_figures_from_text(text):
    """
    Extract the figures referenced by Markdown image links in ``text``.

    Every ``![alt](url)`` link is collected. For each link the text directly
    after it is searched for a caption of the form
    ``Figure Q<n> - <description>``. Links whose URL starts with ``http`` are
    downloaded and added to the result; all other links are ignored.

    Args:
        text: Markdown content to scan.

    Returns:
        A dict mapping a figure name to a dict with the keys ``"image"``
        (the opened PIL image), ``"title"`` (e.g. ``"Q1"``, or ``""`` when no
        caption was found), ``"label"`` (the caption description, or ``""``)
        and ``"url"`` (the stripped URL as it appears in ``text``).
        The figure name is derived from the last path segment of the URL
        including its query string, with every run of characters outside
        ``[A-Za-z0-9._-]`` replaced by ``_`` so that it is usable as a file
        name and inside a relative Markdown link.

    Raises:
        requests.HTTPError: if a figure download answers with an error status.
    """
    figures = {}
    # Regex to match Markdown image links; the capture group is the URL.
    pattern = r'!\[.*?\]\((.*?)\)'
    matches = re.findall(pattern, text)
    print(f"Matches found: {matches}")

    for match in matches:
        url = match.strip()
        # The caption may be the very last line of the text, so accept either a
        # newline or the end of the string as its terminator.
        figure_caption_pattern = rf'\({re.escape(url)}\)\s*-?\s*Figure\s+(Q\d+)\s*-\s*(.+?)(?:\n|$)'
        caption_match = re.search(figure_caption_pattern, text)

        if caption_match:
            title, description = caption_match.groups()
            print("Caption match found")
        else:
            title, description = "", ""

        if not url.startswith("http"):
            continue

        # Download the whole body before handing it to PIL: `.content` is
        # decoded by requests (unlike `.raw`), and checking the status first
        # keeps an HTML error page from being parsed as an image.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))

        # Different URLs can sanitize to the same name; add a numeric suffix
        # instead of silently overwriting the earlier figure.
        base_name = _figure_name_from_url(url)
        fig_name, suffix = base_name, 1
        while fig_name in figures and figures[fig_name]["url"] != url:
            fig_name = f"{base_name}_{suffix}"
            suffix += 1

        figures[fig_name] = {
            "image": image,
            "title": title.strip(),
            "label": description.strip(),
            "url": url,
        }
    return figures


def _figure_name_from_url(url):
    """Return a file-system- and link-safe name built from the URL's last segment."""
    # The query string is kept (sanitized) because crops of the same page image
    # differ only in their query parameters.
    safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', os.path.basename(url)).strip('_')
    return safe_name or "figure"


figures = extract_figures_from_text(md_content)

set_path = "pdfs/"

# Save every extracted figure as "<set_path>media/<figure name>.jpg".
# parents=True creates set_path as well when it does not exist yet.
Path(f"{set_path}media/").mkdir(parents=True, exist_ok=True)

# Image modes Pillow's JPEG writer accepts. Anything else (e.g. RGBA or
# palette images from PNG sources) makes save() raise OSError, so such images
# are converted to RGB first.
jpeg_writable_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

for fig_name, fig_info in figures.items():
    print(f"FIGURE Title='{fig_info['title']}', Label='{fig_info['label']}', URL='{fig_info['url']}'")
    image_name = f"{fig_name}.jpg"
    image = fig_info["image"]
    if image.mode not in jpeg_writable_modes:
        image = image.convert("RGB")
    image.save(f"{set_path}media/{image_name}")

Matches found: ['https://cdn.mathpix.com/cropped/2025_07_04_b41bcc42f6f303f464c5g-2.jpg?height=398&width=706&top_left_y=335&top_left_x=681', 'https://cdn.mathpix.com/cropped/2025_07_04_b41bcc42f6f303f464c5g-2.jpg?height=312&width=706&top_left_y=832&top_left_x=681']
FIGURE Title='', Label='', URL='https://cdn.mathpix.com/cropped/2025_07_04_b41bcc42f6f303f464c5g-2.jpg?height=398&width=706&top_left_y=335&top_left_x=681'
FIGURE Title='', Label='', URL='https://cdn.mathpix.com/cropped/2025_07_04_b41bcc42f6f303f464c5g-2.jpg?height=312&width=706&top_left_y=832&top_left_x=681'


In [None]:
# Pair every remote figure URL with the path of its locally saved copy.
replacements = [
    (info["url"], f"{set_path}media/{name}.jpg")
    for name, info in figures.items()
]

# Rewrite the markdown so that the image links point at the local files.
for remote_url, image_path in replacements:
    md_content = md_content.replace(remote_url, image_path)

# Write the rewritten markdown back to disk.
with open(f"{set_path}example.md", "w") as md_file:
    md_file.write(md_content)