In [1]:
import os
import re
import json

from pathlib import Path
from io import BytesIO
from dotenv import load_dotenv
from pydantic import BaseModel, Field

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.docstore.document import Document
from langchain_openai import AzureChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI


import fitz  # PyMuPDF
from PIL import Image
import pytesseract

In [32]:
def convert_math_to_markdown(text: str, llm) -> str:
    """
    Uses the LLM to convert scientific text (including mathematics)
    into markdown (e.g. wrapping inline math with $ or display math with $$).
    """
    prompt = (
        "Convert the following scientific text with mathematics into markdown format. "
        "Ensure that all mathematical expressions are properly formatted using inline ($...$) "
        "markdown syntax as appropriate.\n\n"
        f"{text}"
    )
    markdown_text = llm.invoke(prompt)
    return markdown_text.content


def extract_caption_from_bbox(page, bbox, threshold: float = 50) -> str:
    caption_candidates = []
    # Iterate over each block.
    for block in page.get_text("blocks"):
        # Unpack the first five elements and ignore the rest.
        bx0, by0, bx1, by1, text, *_ = block
        if not text.strip():
            continue
        # If the block's top coordinate is just below the image bounding box.
        if by0 >= bbox[3] and (by0 - bbox[3]) < threshold:
            caption_candidates.append((by0, text.strip()))
    if caption_candidates:
        # Return the caption of the block closest to the image.
        caption_candidates.sort(key=lambda t: t[0])
        return caption_candidates[0][1]
    return None


def parse_caption(caption: str) -> (str, str):
    """
    Given a caption string, try to parse out a title and label.
    For example, if the caption includes tokens like “Title: …” or “Label: …”.
    """
    title = ""
    label = ""
    m_title = re.search(r"Title\s*[:\-]\s*(.+?)(,|$)", caption, re.IGNORECASE)
    if m_title:
        title = m_title.group(1).strip()
    m_label = re.search(r"Label\s*[:\-]\s*(.+?)(,|$)", caption, re.IGNORECASE)
    if m_label:
        label = m_label.group(1).strip()
    return title, label


def get_figure_name(caption: str, default_name: str) -> str:
    """
    If the caption contains a reference like “Figure 2”, return a name based on that.
    Otherwise return the provided default name.
    """
    if caption:
        m_fig = re.search(r"Figure\s*(\d+)", caption, re.IGNORECASE)
        if m_fig:
            return f"Figure_{m_fig.group(1)}"
    return default_name


def process_pdf(pdf_path: str, llm):
    """
    Reads a PDF file, extracts its text (with mathematics converted to markdown)
    and extracts images (with associated captions parsed for title and label).
    
    Returns a LangChain Document whose page_content is the markdown text and whose
    metadata includes a dictionary 'figures' mapping figure names to a dict:
      { "image": <PIL.Image>, "title": <str>, "label": <str> }.
    """
    # 1. Extract the main text using a loader optimized for scientific content.
    text_loader = UnstructuredPDFLoader(pdf_path)
    docs = text_loader.load()
    combined_text = "\n".join(doc.page_content for doc in docs)
    
    # Ignire the first page if it contains a title.
    if len(docs) > 1 and docs[0].page_content.strip() == docs[1].page_content.strip():
        combined_text = "\n".join(doc.page_content for doc in docs[1:])
    # Remove any leading or trailing whitespace.
    combined_text = combined_text.strip()

    # Stop processing text after "Appendices"
    appendices_index = combined_text.find("Notes on the Paper")
    if appendices_index != -1:
        combined_text = combined_text[:appendices_index]
    
    # Remove the [number%]
    combined_text = re.sub(r"\[\d+%\]", "", combined_text)
    
    # 2. Use the LLM to convert mathematics in the text to proper markdown.
    markdown_text = convert_math_to_markdown(combined_text, llm)
    
    # 3. Extract images and associated metadata using PyMuPDF.
    pdf_doc = fitz.open(pdf_path)
    figures = {}
    
    for page_num in range(len(pdf_doc)):
        page = pdf_doc[page_num]
        # Use the "dict" interface for richer info.
        page_dict = page.get_text("dict")
        blocks = page_dict.get("blocks", [])

        # Extract embedded images
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = pdf_doc.extract_image(xref)
            image_bytes = base_image["image"]
            img_obj = Image.open(BytesIO(image_bytes))
            img_obj.save(f"media_ans/page_{page_num + 1}_img_{img_index}.png")

        for block in blocks:
            if isinstance(block, dict) and block.get("type") == 1:
                xref = block.get("image")
                bbox = block.get("bbox")
                if not xref:
                    continue

                # Check the type of xref.
                if isinstance(xref, int):
                    # xref is valid; extract the image using PyMuPDF.
                    base_image = pdf_doc.extract_image(xref)
                    image_bytes = base_image["image"]
                elif isinstance(xref, bytes):
                    # xref already contains the image bytes (inline image).
                    image_bytes = xref
                else:
                    continue

                # Open the image with PIL.
                img_obj = Image.open(BytesIO(image_bytes))
                
                # Try to extract a caption near the image.
                caption = extract_caption_from_bbox(page, bbox)
                title, label = ("", "")
                if caption:
                    title, label = parse_caption(caption)
                default_fig_name = f"page{page_num+1}_img_{xref if isinstance(xref, int) else 'inline'}"
                fig_name = get_figure_name(caption, default_fig_name) if caption else default_fig_name
                figures[fig_name] = {"image": img_obj, "title": title, "label": label}
    
    pdf_doc.close()
    
    # 4. Create and return a LangChain Document.
    metadata = {"figures": figures}
    final_doc = Document(page_content=markdown_text, metadata=metadata)
    return final_doc


In [33]:
# Load environment variables from .env file.
load_dotenv()

# Set up the Azure OpenAI LLM via LangChain.
temperature = 0
llm = ChatOpenAI(
            model=os.environ['OPENAI_MODEL'],
            api_key=os.environ["OPENAI_API_KEY"],
        )
# llm = ChatGoogleGenerativeAI(
#             model=os.environ['GOOGLE_AI_MODEL'],
#             temperature=temperature,
#             google_api_key=os.environ['GOOGLE_AI_API_KEY'],
#         )

pdf_file_path = "pastpapers/2023_answers.pdf"
doc = process_pdf(pdf_file_path, llm)

# Print out a summary.
print("Markdown text: ")
print(doc.page_content)

# Needs to be improved to infer name/label/caption based on context.
figures = doc.metadata.get("figures", {})
print("\nExtracted figures:")

Path("media").mkdir(exist_ok=True)
for idx, (fig_name, fig_info) in enumerate(figures.items()):
    print(f"  {fig_name}: Title='{fig_info['title']}', Label='{fig_info['label']}'")
    # Save each image as a PNG file with a sequential name: figure_0.png, figure_1.png, etc.
    fig_info["image"].save(f"media_ans/figure_{idx}.png")


CropBox missing from /Page, defaulting to MediaBox


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Markdown text: 
Below is the converted text in Markdown format with all mathematical expressions formatted using inline ($…$) or display ($$ … $$) mathematics as appropriate.

---

# ME2–FMX 2023 Exam-Paper Solutions  
A. Giusti, P. Johnson

## Question 1

### Soap Dispenser

**(a)**

The fluid is Newtonian, the density is constant and uniform, therefore the flow is incompressible, and the dynamic viscosity is uniform. Considering the fluid as a continuum, the aforementioned assumptions allow us to model the flow using the Navier–Stokes equations for incompressible flow. The formulation in cylindrical coordinates is used as suggested in the text of the question.

**(b)**

In general, the velocity is a function of time and space, i.e.  
$$
\vec{u} = \vec{u}(t, r, \theta, z).
$$  
Since the flow is steady, and assumed uniform along $z$ and $\theta$ (i.e. $\partial \vec{u}/\partial \theta = 0$, $\partial \vec{u}/\partial z = 0$), the velocity also does not depend on $\theta$ and $z$. This

In [34]:
# Define the schema for the tutorial output.
class Exercise(BaseModel):
    title: str = Field(..., description="Title of the exercise (only the text, no numbering)")
    workedSolutions: list[str] = Field(..., description="List of worked solution to subquestions within the exercise (no numbering or counting)")
    
class Tutorial(BaseModel):
    name: str = Field(..., description="Title of the tutorial")
    year: str = Field(..., description="Year of the tutorial")
    exercises: list[Exercise] = Field(..., description="List of tutorial questions")

def extract_tutorial_questions(doc_page_content: str) -> dict:
    """
    Extracts the title and individual exercises from a tutorial sheet.

    This function takes the content of a tutorial sheet (doc.page_content), constructs a prompt
    instructing the LLM to infer the tutorial title and to split the text into separate questions.
    The output must be a valid JSON string with the following structure:
    
    {
        "name": "<title of tutorial>",
        "year": "<year of tutorial>",
        "exercise": [
            { title: "exercise text 1", workedSolutions: ["workedSolution text 1", "workedSolution text 2", ...] },
            { title: "exercise text 2", workedSolutions: ["workedSolution text 1", "workedSolution text 2", ...] },
            ...
        ]
    }
    
    The tutorial sheet (IMPORTED_TUTORIAL) may contain reference solutions; do not alter
    the original text of the exercises. The function returns a dictionary parsed from the JSON output.
    
    Args:
        doc_page_content (str): The content of the tutorial sheet.
        
    Returns:
        dict: A dictionary containing the keys "name" and "exercise".
              If parsing fails, returns None.
    """
    # Initialize the output parser with the Tutorial schema.
    parser = PydanticOutputParser(pydantic_object=Tutorial)

    # Construct the prompt, appending the parser's format instructions.
    prompt = f"""
        IMPORTED_TUTORIAL
        ```markdown
        {doc_page_content}
        ```

        IMPORTED_TUTORIAL is a tutorial sheet with several exercises. It may or may
        not include reference solutions. Please infer the title of the tutorial from
        the content, and extract each individual question as a separate string. Do
        not modify the text of the exercises. Only use $...$ for math expressions.

        Return a valid JSON string with the following structure:
        {parser.get_format_instructions()}
        """

    # Call the LLM
    response = llm.invoke(prompt)

    # Debug: print the raw LLM response
    print("Raw LLM Response:")
    print(response)

    try:
        # Parse the response using the output parser.
        parsed_output = parser.parse(response.content)
        # For Pydantic v2, use model_dump() to convert the model to a dictionary.
        return parsed_output.model_dump()
    except Exception as e:
        print("Error parsing LLM response as JSON:", e)
        return None


In [35]:
imported_tutorial = extract_tutorial_questions(doc.page_content)

Raw LLM Response:
content='{\n  "name": "ME2–FMX 2023 Exam-Paper Solutions",\n  "year": "2023",\n  "exercises": [\n    {\n      "title": "Soap Dispenser",\n      "workedSolutions": [\n        " **(a)**\\n\\nThe fluid is Newtonian, the density is constant and uniform, therefore the flow is incompressible, and the dynamic viscosity is uniform. Considering the fluid as a continuum, the aforementioned assumptions allow us to model the flow using the Navier–Stokes equations for incompressible flow. The formulation in cylindrical coordinates is used as suggested in the text of the question.",\n        " **(b)**\\n\\nIn general, the velocity is a function of time and space, i.e.  $\\\\vec{u} = \\\\vec{u}(t, r, \\\\theta, z)$.  Since the flow is steady, and assumed uniform along $z$ and $\\\\theta$ (i.e. $\\\\partial \\\\vec{u}/\\\\partial \\\\theta = 0$, $\\\\partial \\\\vec{u}/\\\\partial z = 0$), the velocity also does not depend on $\\\\theta$ and $z$. This means that the velocity must be 

In [36]:
# Extract title
title = imported_tutorial["name"] + " " + imported_tutorial["year"]

# Print the title
print(f"Title: {title}\n")

# Extract questions
questions = imported_tutorial["exercises"]

# Loop over and print each question
for idx, question in enumerate(questions, start=1):
    print(f"**Question {idx}**:\n{question.get("title")}\n")
    print("Subquestions:")
    for subquestion in question.get("workedSolutions", []):
        print(f"- {subquestion}")
    print("-" * 40)  # Separator for readability


Title: ME2–FMX 2023 Exam-Paper Solutions 2023

**Question 1**:
Soap Dispenser

Subquestions:
-  **(a)**

The fluid is Newtonian, the density is constant and uniform, therefore the flow is incompressible, and the dynamic viscosity is uniform. Considering the fluid as a continuum, the aforementioned assumptions allow us to model the flow using the Navier–Stokes equations for incompressible flow. The formulation in cylindrical coordinates is used as suggested in the text of the question.
-  **(b)**

In general, the velocity is a function of time and space, i.e.  $\vec{u} = \vec{u}(t, r, \theta, z)$.  Since the flow is steady, and assumed uniform along $z$ and $\theta$ (i.e. $\partial \vec{u}/\partial \theta = 0$, $\partial \vec{u}/\partial z = 0$), the velocity also does not depend on $\theta$ and $z$. This means that the velocity must be a function of $r$ only.
-  **(c)**

The equation for mass conservation, with the assumptions of uniform velocity in $z$ and $\theta$, reduces to

$\frac

In [37]:

# Define the nested Pydantic models based on the JSON schema.

class Part(BaseModel):
    answer: str = Field(..., description="Part answer text")
    content: str = Field(..., description="Part content text")
    orderNumber: int = Field(..., description="The order number of this part")

class QuestionJson(BaseModel):
    orderNumber: int = Field(..., description="The order number of the question")
    parts: list[Part] = Field(..., description="List of question parts")
    title: str = Field(..., description="Question title")

def create_question_json(question: str) -> dict:
    # Initialize the output parser using the defined Pydantic model.
    parser = PydanticOutputParser(pydantic_object=QuestionJson)

    # Minimum JSON template to guide the model. (Used as context.)
    minimum_json_template = r'''{
      "orderNumber": 0,
      "parts": [
        {
          "answer": "",
          "content": "Part text here",
          "orderNumber": 0,
        }
      ],
      "title": "Question title here"
    }'''

    # Construct the prompt, appending the parser's format instructions.
    question_prompt = f'''
      JSON_TEMPLATE
      ```json
      {minimum_json_template}
      ```

      IMPORTED_QUESTION
      ```markdown
      {question}
      ```

      If you see something like "HII 5\u201310 mins\n\n", drop it from the text. Preserve the Katex
      math formatting. Do not modify the original text of the question.

      From the content of each part, infer the final answer and store it in the "answer" field.

      Carefully map IMPORTED_QUESTION into the JSON_TEMPLATE and return valid JSON.

      {parser.get_format_instructions()}
      '''

    # Invoke the language model.
    response = llm.invoke(question_prompt)

    try:
        # Parse the response using the output parser.
        parsed_output = parser.parse(response.content)
        return parsed_output.model_dump()  # Return as a dictionary.
    except Exception as e:
        print("Error parsing JSON from LLM response:", e)
        print("LLM response:", response.content)
        return None


In [38]:
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser

# Define a Pydantic model representing the expected output schema.
class QuestionName(BaseModel):
    question_name: str = Field(..., description="A short tag representing the question’s topic.")

def create_question_name(question: str) -> str:
    # Initialize the output parser with the schema.
    parser = PydanticOutputParser(pydantic_object=QuestionName)

    # Build a prompt that includes the parser's format instructions.
    question_name_prompt = f'''
      IMPORTED_QUESTION
      ```markdown
      {question}
      ```
      
      QUERY:
      Based on the above markdown content, infer a suitable short name tag that represents the question’s topic.
      Follow these rules:
      
        1. Look for the heading text (for example, if the text starts with "Question 1:" followed by "Hydraulic scale", then the main topic is "Hydraulic scale").
        2. Normalize the name by replacing spaces with underscores and removing punctuation.
        3. Return only a valid JSON object with a single property "question_name".
      
      {parser.get_format_instructions()}
    '''
    # Invoke the language model.
    response = llm.invoke(question_name_prompt)

    try:
        # Parse the response using the output parser.
        parsed_output = parser.parse(response.content)
        return parsed_output.question_name
    except Exception as e:
        print("Error parsing JSON from LLM response:", e)
        print("LLM response:", response.content)
        return None


In [39]:
def create_tutorial_metadata(tutorial_title: str) -> dict:
    """
    Args:
        tutorial_title (str): The full tutorial title.

    Returns:
        str: name of the tutorial.
    """
    # Generate a short name for the tutorial (could name fancier using LLM).
    # - convert to lower-case,
    # - replace spaces with underscores,
    # - remove any characters except letters, numbers, underscores, and hyphens.
    normalized_name = tutorial_title.lower()  # convert to lower-case
    normalized_name = re.sub(r'\s+', '_', normalized_name)         # replace spaces with underscores
    normalized_name = re.sub(r'[^a-z0-9_-]', '', normalized_name)   # remove other characters
    tutorial_normalized_title = normalized_name
    
    return tutorial_normalized_title

tutorial_title = imported_tutorial["name"] + " " + imported_tutorial["year"]
tutorial_normalized_title = create_tutorial_metadata(tutorial_title)

set_filename = f"ans_{tutorial_normalized_title}.json"
set_filepath = set_filename.split(".")[0]
print(f"Set filename: {set_filename}, set filepath: {set_filepath}")

Set filename: ans_me2fmx_2023_exam-paper_solutions_2023.json, set filepath: ans_me2fmx_2023_exam-paper_solutions_2023


In [40]:
# Loop over and print each question
for idx, question in enumerate(questions, start=1):
    print(f"**Question {idx}**:\n{question}\n")

    print("INFO: Mapping question in markdown into JSON")
    question_json = create_question_json(question)
    question_json["orderNumber"] = idx-1
    print(f"INFO: JSON {idx}:\n{question_json}\n")
    
    print("INFO: Get question name.")
    question_name = create_question_name(question)

    if f"{set_filepath}" not in os.listdir("pastpapers/json/"):
        os.mkdir(f"pastpapers/json/{set_filepath}")
    
    question_index = f"{(idx-1):03}" 
    filename = f"pastpapers/json/{set_filepath}/question_{question_index}_{question_name}.json"
    print(f"INFO: writing {filename}")
    open(filename, "w").write(json.dumps(question_json, indent=2))
    
    # break # breaking here as just doing quick test

**Question 1**:
{'title': 'Soap Dispenser', 'workedSolutions': [' **(a)**\n\nThe fluid is Newtonian, the density is constant and uniform, therefore the flow is incompressible, and the dynamic viscosity is uniform. Considering the fluid as a continuum, the aforementioned assumptions allow us to model the flow using the Navier–Stokes equations for incompressible flow. The formulation in cylindrical coordinates is used as suggested in the text of the question.', ' **(b)**\n\nIn general, the velocity is a function of time and space, i.e.  $\\vec{u} = \\vec{u}(t, r, \\theta, z)$.  Since the flow is steady, and assumed uniform along $z$ and $\\theta$ (i.e. $\\partial \\vec{u}/\\partial \\theta = 0$, $\\partial \\vec{u}/\\partial z = 0$), the velocity also does not depend on $\\theta$ and $z$. This means that the velocity must be a function of $r$ only.', ' **(c)**\n\nThe equation for mass conservation, with the assumptions of uniform velocity in $z$ and $\\theta$, reduces to\n\n$\\frac{\\par

In [41]:
# MERGE ANSWERS INTO QUESTIONS
# Loop over and print each question
for idx, question in enumerate(questions, start=1):
    print(f"**Question {idx}**:\n{question}\n")

    # read the JSON file
    question_index = f"{(idx-1):03}" 
    print("INFO: Get question name.")
    question_name = create_question_name(question)
  
    ans_filename = f"pastpapers/json/{set_filepath}/question_{question_index}_{question_name}.json"
    print(f"INFO: reading {ans_filename}")
    ans_json = json.loads(open(ans_filename).read())

    # in the set_fluid_mechanics_{year} folder, find the file with same question_{question_index}_{question_name}.json name
    q_filename = f"pastpapers/json/set_fluid_mechanics_{imported_tutorial['year']}/question_{question_index}_{question_name}.json"
    question_json = json.loads(open(q_filename).read())

    # find the part with the same orderNumber in both json files under parts
    for part in question_json["parts"]:
        for ans_part in ans_json["parts"]:
            if part["orderNumber"] == ans_part["orderNumber"]:
                # merge the workedSolution into the question
                print(f"INFO: ans_workedSolution: {ans_part['content']}")
                part["workedSolution"]["content"] = ans_part["content"]
                print(f"INFO:orderNumber {part['orderNumber']} matched, merged workedSolution: {part['workedSolution']}")
                break
        else:
            # If no match was found, set workedSolution to None
            part["workedSolution"] = None

    print(f"INFO: JSON {idx}:\n{str(question_json)}\n")

    # SAVE TO NEW JSON FILE
    combined_set_filepath = f"set_fluid_mechanics_{imported_tutorial['year']}"
    if f"{combined_set_filepath}" not in os.listdir("pastpapers/json_combined/"):
        os.mkdir(f"pastpapers/json_combined/{combined_set_filepath}")
    
    question_index = f"{(idx-1):03}" 
    filename = f"pastpapers/json_combined/{combined_set_filepath}/question_{question_index}_{question_name}.json"
    print(f"INFO: writing {filename}")
    open(filename, "w").write(json.dumps(question_json, indent=2))

**Question 1**:
{'title': 'Soap Dispenser', 'workedSolutions': [' **(a)**\n\nThe fluid is Newtonian, the density is constant and uniform, therefore the flow is incompressible, and the dynamic viscosity is uniform. Considering the fluid as a continuum, the aforementioned assumptions allow us to model the flow using the Navier–Stokes equations for incompressible flow. The formulation in cylindrical coordinates is used as suggested in the text of the question.', ' **(b)**\n\nIn general, the velocity is a function of time and space, i.e.  $\\vec{u} = \\vec{u}(t, r, \\theta, z)$.  Since the flow is steady, and assumed uniform along $z$ and $\\theta$ (i.e. $\\partial \\vec{u}/\\partial \\theta = 0$, $\\partial \\vec{u}/\\partial z = 0$), the velocity also does not depend on $\\theta$ and $z$. This means that the velocity must be a function of $r$ only.', ' **(c)**\n\nThe equation for mass conservation, with the assumptions of uniform velocity in $z$ and $\\theta$, reduces to\n\n$\\frac{\\par