In [56]:
import os
import re
import json

from pathlib import Path
from io import BytesIO
from dotenv import load_dotenv
from pydantic import BaseModel, Field

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.docstore.document import Document
from langchain_openai import AzureChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI


import fitz  # PyMuPDF
from PIL import Image
import pytesseract

In [57]:
def convert_math_to_markdown(text: str, llm) -> str:
    """
    Ask the LLM to rewrite scientific text as markdown.

    The instruction sent to the model requests inline ``$...$`` delimiters for
    mathematical expressions; the text to convert is appended after a blank line.

    Args:
        text: The text (possibly containing mathematics) to convert.
        llm: Any object exposing ``invoke(prompt)`` whose result has a
            ``content`` attribute.

    Returns:
        The ``content`` attribute of the object returned by ``llm.invoke``.
    """
    instruction = (
        "Convert the following scientific text with mathematics into markdown format. "
        "Ensure that all mathematical expressions are properly formatted using inline ($...$) "
        "markdown syntax as appropriate.\n\n"
    )
    # The instruction and the payload are sent as one single prompt string.
    reply = llm.invoke(f"{instruction}{text}")
    return reply.content


def extract_caption_from_bbox(page, bbox, threshold: float = 50) -> str:
    """
    Find the text block that sits directly underneath an image.

    Args:
        page: Object whose ``get_text("blocks")`` yields sequences of at least
            five items; the second item is used as the block's top coordinate
            and the fifth as its text.
        bbox: Bounding box of the image; only ``bbox[3]`` (compared against the
            block tops as the image's lower edge) is read.
        threshold: Maximum distance below ``bbox[3]`` at which a block still
            counts as a caption candidate (exclusive).

    Returns:
        The stripped text of the qualifying block with the smallest top
        coordinate (the first one encountered wins ties), or ``None`` when no
        block qualifies.
    """
    nearest = None  # (top, stripped text) of the best candidate seen so far
    for block in page.get_text("blocks"):
        # Only the top coordinate and the text are needed from each block.
        _x0, top, _x1, _y1, raw_text, *_rest = block
        candidate = raw_text.strip()
        if not candidate:
            continue
        # Keep blocks starting at or below the image's lower edge, within the threshold.
        if top >= bbox[3] and (top - bbox[3]) < threshold:
            # Strict comparison so that the earliest block wins a tie.
            if nearest is None or top < nearest[0]:
                nearest = (top, candidate)
    if nearest is None:
        return None
    return nearest[1]


def parse_caption(caption: str) -> tuple[str, str]:
    """
    Given a caption string, try to parse out a title and a label.

    The caption is searched (case-insensitively) for tokens of the form
    ``Title: ...`` / ``Title - ...`` and ``Label: ...`` / ``Label - ...``.
    Each value runs up to the next comma or the end of its line.

    Args:
        caption: Caption text; may span several lines. ``None`` or an empty
            string is accepted and yields two empty strings.

    Returns:
        A ``(title, label)`` tuple; a component is ``""`` when its token is
        not present in the caption.
    """
    if not caption:
        return "", ""

    def _field(keyword: str) -> str:
        # re.MULTILINE lets `$` match at every line end. Without it, a value
        # followed by a newline (e.g. "Title: X\nLabel: Y") can never match,
        # because `.` does not cross the newline and `$` only matches at the
        # very end of the string.
        found = re.search(
            rf"{keyword}\s*[:\-]\s*(.+?)(,|$)",
            caption,
            re.IGNORECASE | re.MULTILINE,
        )
        return found.group(1).strip() if found else ""

    return _field("Title"), _field("Label")


def get_figure_name(caption: str, default_name: str) -> str:
    """
    Derive a figure name from a caption.

    Args:
        caption: Caption text; may be empty or ``None``.
        default_name: Name to fall back on.

    Returns:
        ``"Figure_<n>"`` when the caption contains "Figure" (any case)
        followed by optional whitespace and a number ``<n>``; otherwise
        ``default_name``.
    """
    if not caption:
        return default_name
    figure_ref = re.search(r"Figure\s*(\d+)", caption, re.IGNORECASE)
    if figure_ref is None:
        return default_name
    return f"Figure_{figure_ref.group(1)}"


def process_pdf(pdf_path: str, llm, media_dir: str = "media"):
    """
    Reads a PDF file, extracts its text (with mathematics converted to markdown)
    and extracts images (with associated captions parsed for title and label).

    Side effect: every embedded image reported by ``page.get_images`` is saved
    as ``<media_dir>/page_<page>_img_<index>.png``. The directory is created
    if it does not exist yet.

    Args:
        pdf_path: Path of the PDF file to process.
        llm: Chat model handed to ``convert_math_to_markdown``.
        media_dir: Directory that receives the per-page image dumps.

    Returns:
        A LangChain Document whose page_content is the markdown text and whose
        metadata includes a dictionary 'figures' mapping figure names to a dict:
          { "image": <PIL.Image>, "title": <str>, "label": <str> }.
        Figures that resolve to the same name overwrite each other (last wins).
    """
    # 1. Extract the main text using the unstructured-based PDF loader.
    text_loader = UnstructuredPDFLoader(pdf_path)
    docs = text_loader.load()

    # Ignore the first document when its content merely duplicates the second one.
    if len(docs) > 1 and docs[0].page_content.strip() == docs[1].page_content.strip():
        text_docs = docs[1:]
    else:
        text_docs = docs
    # Join the pages and remove any leading or trailing whitespace.
    combined_text = "\n".join(d.page_content for d in text_docs).strip()

    # Stop processing text at the first occurrence of "Appendices".
    appendices_index = combined_text.find("Appendices")
    if appendices_index != -1:
        combined_text = combined_text[:appendices_index]

    # Remove the mark allocations of the form [number%].
    combined_text = re.sub(r"\[\d+%\]", "", combined_text)

    # 2. Use the LLM to convert mathematics in the text to proper markdown.
    markdown_text = convert_math_to_markdown(combined_text, llm)

    # 3. Extract images and associated metadata using PyMuPDF.
    # The image dumps below need the target directory; create it up front so a
    # fresh checkout (no media/ folder yet) does not fail on the first save.
    os.makedirs(media_dir, exist_ok=True)
    figures = {}

    pdf_doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(pdf_doc)):
            page = pdf_doc[page_num]
            # Use the "dict" interface for richer info.
            blocks = page.get_text("dict").get("blocks", [])

            # Dump every embedded image of the page to disk.
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                image_bytes = pdf_doc.extract_image(xref)["image"]
                img_obj = Image.open(BytesIO(image_bytes))
                img_obj.save(
                    os.path.join(media_dir, f"page_{page_num + 1}_img_{img_index}.png")
                )

            # Collect image blocks together with a caption found underneath them.
            for block in blocks:
                if not (isinstance(block, dict) and block.get("type") == 1):
                    continue
                xref = block.get("image")
                bbox = block.get("bbox")
                if not xref:
                    continue

                if isinstance(xref, int):
                    # An integer is treated as an xref; fetch the bytes via PyMuPDF.
                    image_bytes = pdf_doc.extract_image(xref)["image"]
                elif isinstance(xref, bytes):
                    # The block already carries the image bytes (inline image).
                    image_bytes = xref
                else:
                    continue

                # Open the image with PIL.
                img_obj = Image.open(BytesIO(image_bytes))

                # Try to extract a caption near the image.
                caption = extract_caption_from_bbox(page, bbox)
                title, label = ("", "")
                if caption:
                    title, label = parse_caption(caption)
                default_fig_name = (
                    f"page{page_num+1}_img_{xref if isinstance(xref, int) else 'inline'}"
                )
                # get_figure_name falls back to the default for a missing caption.
                fig_name = get_figure_name(caption, default_fig_name)
                figures[fig_name] = {"image": img_obj, "title": title, "label": label}
    finally:
        # Release the PDF handle even when extraction fails half-way.
        pdf_doc.close()

    # 4. Create and return a LangChain Document.
    return Document(page_content=markdown_text, metadata={"figures": figures})


In [78]:
# Load environment variables from .env file.
load_dotenv()

# Set up the OpenAI chat model via LangChain.
# NOTE: `temperature` is only consumed by the commented-out Gemini
# configuration below; it is not passed to ChatOpenAI.
temperature = 0
llm = ChatOpenAI(
    model=os.environ['OPENAI_MODEL'],
    api_key=os.environ["OPENAI_API_KEY"],
)
# llm = ChatGoogleGenerativeAI(
#             model=os.environ['GOOGLE_AI_MODEL'],
#             temperature=temperature,
#             google_api_key=os.environ['GOOGLE_AI_API_KEY'],
#         )

pdf_file_path = "pastpapers/2023_paper.pdf"

# process_pdf() saves page images into media/, so the directory has to exist
# before the call (previously it was only created afterwards).
Path("media").mkdir(exist_ok=True)
doc = process_pdf(pdf_file_path, llm)

# Print out a summary.
print("Markdown text: ")
print(doc.page_content)

# Needs to be improved to infer name/label/caption based on context.
figures = doc.metadata.get("figures", {})
print("\nExtracted figures:")

for idx, (fig_name, fig_info) in enumerate(figures.items()):
    print(f"  {fig_name}: Title='{fig_info['title']}', Label='{fig_info['label']}'")
    # Save each image as a PNG file with a sequential name: figure_0.png, figure_1.png, etc.
    fig_info["image"].save(f"media/figure_{idx}.png")


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Markdown text: 
```markdown
# IMPERIAL COLLEGE LONDON

## MEng EXAMINATIONS 2023

### Part II

*for Internal Students of the Imperial College of Science, Technology and Medicine*  
This paper is also taken for the relevant examination for the Associateship or Diploma

# FLUID MECHANICS

**Tuesday, 23rd May 14:00-15:30 (duration: 1 hour and 30 minutes)**

This paper contains **THREE** questions. Attempt every question. The numbers shown by each question are for your guidance; they indicate approximately how the examiners intend to distribute the marks for this paper. An appendix is included, and a Data and Formulæ Book is provided.

© 2023 Imperial College London

Page 1 of 8

---

Turn over

## Q1. Soap dispenser

A soap dispenser uses a hand-driven pump. The tube labelled ($14$) in **Figure Q1** is flooded with soap. The tube is pushed downwards against a spring and soap leaves through the top opening of the tube. You are required to estimate the resistance that the soap exerts on the

In [79]:
# Define the schema for the tutorial output.
class Exercise(BaseModel):
    # One exercise of the tutorial sheet. The field descriptions below are part
    # of the schema handed to PydanticOutputParser, so they end up in the prompt.
    # Heading of the exercise, without its number.
    title: str = Field(..., description="Title of the exercise (only the text, no numbering)")
    # Body text of the exercise, excluding the heading and the subquestions.
    content: str = Field(..., description="Content of the exercise (no exercise title, no subquestions)")
    # Subquestion texts in order, without their numbering.
    subquestions: list[str] = Field(..., description="List of subquestions within the exercise (only the text, no numbering)")
    
class Tutorial(BaseModel):
    # Top-level schema the LLM output is parsed into by extract_tutorial_questions.
    # Title inferred for the whole sheet.
    name: str = Field(..., description="Title of the tutorial")
    # Year kept as a string (it is concatenated with the name later in the notebook).
    year: str = Field(..., description="Year of the tutorial")
    # All exercises found in the sheet.
    exercises: list[Exercise] = Field(..., description="List of tutorial questions")

def extract_tutorial_questions(doc_page_content: str) -> dict:
    """
    Split a tutorial sheet into its name, year and individual exercises.

    The sheet content is embedded in a prompt that asks the LLM (the
    module-level ``llm``) to infer the tutorial title and separate the
    exercises without altering their text. The reply is validated against the
    ``Tutorial`` schema, i.e. it is expected to look like:

    {
        "name": "<title of tutorial>",
        "year": "<year of tutorial>",
        "exercises": [
            { "title": "...", "content": "...", "subquestions": ["...", "..."] },
            ...
        ]
    }

    The raw LLM response is printed for debugging.

    Args:
        doc_page_content (str): The content of the tutorial sheet.

    Returns:
        dict: The validated ``Tutorial`` as a dictionary with the keys
              "name", "year" and "exercises".
              If parsing fails, the error is printed and None is returned.
    """
    # Output parser bound to the Tutorial schema; it also supplies the format
    # instructions appended to the prompt.
    parser = PydanticOutputParser(pydantic_object=Tutorial)
    format_instructions = parser.get_format_instructions()

    prompt = f"""
        IMPORTED_TUTORIAL
        ```markdown
        {doc_page_content}
        ```

        IMPORTED_TUTORIAL is a tutorial sheet with several exercises. It may or may
        not include reference solutions. Please infer the title of the tutorial from
        the content, and extract each individual question as a separate string. Do
        not modify the text of the exercises. Only use $...$ for math expressions.

        Return a valid JSON string with the following structure:
        {format_instructions}
        """

    llm_reply = llm.invoke(prompt)

    # Debug: show the raw LLM response.
    print("Raw LLM Response:")
    print(llm_reply)

    try:
        # Validate the reply and convert the model to a plain dictionary
        # (model_dump() is the Pydantic v2 API).
        tutorial_dict = parser.parse(llm_reply.content).model_dump()
    except Exception as parse_error:
        print("Error parsing LLM response as JSON:", parse_error)
        return None
    return tutorial_dict


In [80]:
# Split the converted paper into name / year / exercises.
# extract_tutorial_questions returns None when the LLM reply cannot be parsed,
# in which case the cells below that index into imported_tutorial will fail.
imported_tutorial = extract_tutorial_questions(doc.page_content)

Raw LLM Response:
content='{\n  "name": "FLUID MECHANICS",\n  "year": "2023",\n  "exercises": [\n    {\n      "title": "Soap dispenser",\n      "content": "A soap dispenser uses a hand-driven pump. The tube labelled ($14$) in **Figure Q1** is flooded with soap. The tube is pushed downwards against a spring and soap leaves through the top opening of the tube. You are required to estimate the resistance that the soap exerts on the tube ($14$) as the tube is depressed.\\n\\nUse the cylindrical coordinate system in **Figure Q1**, where the origin is fixed in space. You may make the following assumptions:\\n- The tube is moving steadily downwards at a speed $U_0$ relative to the origin.\\n- The tube is not rotating.\\n- The flow is steady, with no azimuthal velocity component ($u_\\\\theta = 0$).\\n- The fluid velocity is uniform in the axial direction, $z$, and in the azimuthal direction, $\\\\theta$.\\n- The fluid is Newtonian with density, $\\\\rho$, and dynamic viscosity, $\\\\mu$, both

In [81]:
# Display title: tutorial name followed by its year.
title = " ".join((imported_tutorial["name"], imported_tutorial["year"]))
print(f"Title: {title}\n")

# One dict per exercise (keys: title, content, subquestions).
questions = imported_tutorial["exercises"]

# Show every exercise, separated by a rule for readability.
for idx, question in enumerate(questions, start=1):
    print(f"**Question {idx}**:\n{question}\n")
    print("-" * 40)


Title: FLUID MECHANICS 2023

**Question 1**:
{'title': 'Soap dispenser', 'content': 'A soap dispenser uses a hand-driven pump. The tube labelled ($14$) in **Figure Q1** is flooded with soap. The tube is pushed downwards against a spring and soap leaves through the top opening of the tube. You are required to estimate the resistance that the soap exerts on the tube ($14$) as the tube is depressed.\n\nUse the cylindrical coordinate system in **Figure Q1**, where the origin is fixed in space. You may make the following assumptions:\n- The tube is moving steadily downwards at a speed $U_0$ relative to the origin.\n- The tube is not rotating.\n- The flow is steady, with no azimuthal velocity component ($u_\\theta = 0$).\n- The fluid velocity is uniform in the axial direction, $z$, and in the azimuthal direction, $\\theta$.\n- The fluid is Newtonian with density, $\\rho$, and dynamic viscosity, $\\mu$, both constant and uniform.\n- The axial pressure gradient, $\\frac{\\partial p}{\\partial 

In [82]:
def create_tutorial_metadata(
    tutorial_title: str,
    released_at: str = "2024-09-30T11:00:00.000Z",
) -> tuple[dict, str]:
    """
    Creates the metadata object and a normalized short name for a tutorial.

    The short name is generated by lowercasing the title, replacing each run
    of whitespace with one underscore, and removing every character that is
    not a lowercase letter, digit, underscore or hyphen.

    Args:
        tutorial_title (str): The full tutorial title.
        released_at (str): Release date written to ``releasedAt``
            (ISO 8601 string).

    Returns:
        tuple[dict, str]: ``(metadata, normalized_name)`` where ``metadata``
        holds the title, an empty description, the release date and several
        fixed visibility settings, and ``normalized_name`` is the short name
        described above.
    """
    # Generate a short name for the tutorial (could name fancier using LLM).
    normalized_name = tutorial_title.lower()
    normalized_name = re.sub(r'\s+', '_', normalized_name)         # whitespace -> underscore
    normalized_name = re.sub(r'[^a-z0-9_-]', '', normalized_name)   # drop everything else

    # Build the metadata dictionary.
    metadata = {
        "name": tutorial_title,
        "description": "",  # Optional description of the tutorial
        "releasedAt": released_at,  # ISO 8601 release date
        "manuallyHidden": True,  # Defaults to true
        "finalAnswerVisibility": "OPEN_WITH_WARNINGS",
        "workedSolutionVisibility": "OPEN_WITH_WARNINGS",
        "structuredTutorialVisibility": "OPEN",
        "chatbotVisibility": "HIDE"
    }

    return metadata, normalized_name

tutorial_title = imported_tutorial["name"] + " " + imported_tutorial["year"]
metadata, tutorial_normalized_title = create_tutorial_metadata(tutorial_title)

# The set directory shares the file's stem. The normalized title cannot
# contain ".", so the stem is simply the name without the extension.
set_filepath = f"set_{tutorial_normalized_title}"
set_filename = f"{set_filepath}.json"

# makedirs creates missing parents (pastpapers/json/) as well and is a no-op
# when the directory already exists.
os.makedirs(f"pastpapers/json/{set_filepath}", exist_ok=True)

print(f"Saving metadata to pastpapers/json/{set_filepath}/{set_filename}...")
# Use a context manager so the file is flushed and closed deterministically.
with open(f"pastpapers/json/{set_filepath}/{set_filename}", "w") as metadata_file:
    json.dump(metadata, metadata_file, indent=4)


Saving metadata to pastpapers/json/set_fluid_mechanics_2023/set_fluid_mechanics_2023.json...


In [83]:

# Define the nested Pydantic models based on the JSON schema.
class WorkedSolution(BaseModel):
    # Worked solution attached to one question part (see Part.workedSolution).
    # Text of the worked solution.
    content: str = Field(..., description="Worked solution content")
    # Identifier string; the prompt template in create_question_json uses "N/A".
    id: str = Field(..., description="Identifier for the worked solution")
    # Heading of the worked solution (empty in the prompt template).
    title: str = Field(..., description="Worked solution title")
    # Untyped list, optional with an empty default.
    children: list = []

class Part(BaseModel):
    # One part of a question; all fields are required.
    # Answer text for the part (empty string in the prompt template).
    answerContent: str = Field(..., description="Part answer text")
    # Text of the part itself.
    content: str = Field(..., description="Part content text")
    # Position of the part within its question.
    orderNumber: int = Field(..., description="The order number of this part")
    # Untyped lists; the prompt template in create_question_json leaves both empty.
    responseAreas: list = Field(..., description="List of response areas")
    tutorial: list = Field(..., description="List of tutorial items")
    # Identifier string; the prompt template uses "N/A".
    universalPartId: str = Field(..., description="Universal part identifier")
    # Nested worked solution for this part.
    workedSolution: WorkedSolution = Field(..., description="Worked solution details")

class QuestionJson(BaseModel):
    # Schema of one exported question file; create_question_json validates the
    # LLM reply against it. All fields are required.
    # Position of the question; overwritten by the export loop with idx-1.
    orderNumber: int = Field(..., description="The order number of the question")
    # Display flags for the different kinds of help content.
    displayFinalAnswer: bool = Field(..., description="Flag to display the final answer")
    displayStructuredTutorial: bool = Field(..., description="Flag to display the structured tutorial")
    displayWorkedSolution: bool = Field(..., description="Flag to display the worked solution")
    # Question text shared by all parts.
    masterContent: str = Field(..., description="Top level question content")
    # Parts of the question, each validated as a Part.
    parts: list[Part] = Field(..., description="List of question parts")
    # Publish flag (false in the prompt template).
    publish: bool = Field(..., description="Publish flag")
    # Title of the question.
    title: str = Field(..., description="Question title")

def create_question_json(question: str) -> dict:
    """
    Map one imported question onto the question JSON structure via the LLM.

    The prompt contains a minimal JSON template, the question itself and the
    format instructions of a parser bound to ``QuestionJson``. The reply of
    the module-level ``llm`` is validated against that schema.

    Args:
        question: The question to convert. It is interpolated into the prompt
            with an f-string, so any object with a string form is accepted.

    Returns:
        The validated question as a dictionary, or None when the reply cannot
        be parsed (the error and the raw reply are printed in that case).
    """
    # Parser bound to the target schema.
    parser = PydanticOutputParser(pydantic_object=QuestionJson)

    # Minimum JSON template given to the model as context.
    minimum_json_template = r'''{
      "orderNumber": 0,
      "displayFinalAnswer": true,
      "displayStructuredTutorial": true,
      "displayWorkedSolution": true,
      "masterContent": "Top level question here",
      "parts": [
        {
          "answerContent": "",
          "content": "Part text here",
          "orderNumber": 0,
          "responseAreas": [],
          "tutorial": [],
          "universalPartId": "N/A",
          "workedSolution": {
            "content": "Part worked solution here",
            "id": "N/A",
            "title": "",
            "children": []
          }
        }
      ],
      "publish": false,
      "title": "Question title here"
    }'''

    # Not a raw string: the \u2013 and \n escapes below are interpreted, so the
    # model sees an en dash and real line breaks in the example.
    prompt = f'''
      JSON_TEMPLATE
      ```json
      {minimum_json_template}
      ```

      IMPORTED_QUESTION
      ```markdown
      {question}
      ```

      If you see something like "HII 5\u201310 mins\n\n", drop it from the text. Preserve the Katex
      math formatting. Do not modify the original text of the question.

      Carefully map IMPORTED_QUESTION into the JSON_TEMPLATE and return valid JSON.

      {parser.get_format_instructions()}
      '''

    llm_reply = llm.invoke(prompt)

    try:
        # Validate the reply, then convert the model into a plain dictionary.
        question_model = parser.parse(llm_reply.content)
        question_dict = question_model.model_dump()
    except Exception as parse_error:
        print("Error parsing JSON from LLM response:", parse_error)
        print("LLM response:", llm_reply.content)
        return None
    return question_dict


In [84]:
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser

# Define a Pydantic model representing the expected output schema.
class QuestionName(BaseModel):
    # Single-field schema used by create_question_name to parse the LLM reply.
    # Short tag for the question; the export loop embeds it in the output filename.
    question_name: str = Field(..., description="A short tag representing the question’s topic.")

def create_question_name(question: str) -> str:
    """
    Ask the LLM for a short name tag describing the question's topic.

    Args:
        question: The question to name. It is interpolated into the prompt
            with an f-string.

    Returns:
        The ``question_name`` value of the reply from the module-level
        ``llm``, or None when the reply cannot be parsed against
        ``QuestionName`` (the error and the raw reply are printed in that
        case).
    """
    # Parser bound to the one-field schema; it also provides the format instructions.
    parser = PydanticOutputParser(pydantic_object=QuestionName)

    prompt = f'''
      IMPORTED_QUESTION
      ```markdown
      {question}
      ```
      
      QUERY:
      Based on the above markdown content, infer a suitable short name tag that represents the question’s topic.
      Follow these rules:
      
        1. Look for the heading text (for example, if the text starts with "Question 1:" followed by "Hydraulic scale", then the main topic is "Hydraulic scale").
        2. Normalize the name by replacing spaces with underscores and removing punctuation.
        3. Return only a valid JSON object with a single property "question_name".
      
      {parser.get_format_instructions()}
    '''
    llm_reply = llm.invoke(prompt)

    try:
        name_tag = parser.parse(llm_reply.content).question_name
    except Exception as parse_error:
        print("Error parsing JSON from LLM response:", parse_error)
        print("LLM response:", llm_reply.content)
        return None
    return name_tag


In [85]:
# Convert every question to JSON and write one file per question.
# Create the target directory once, including missing parents.
question_dir = f"pastpapers/json/{set_filepath}"
os.makedirs(question_dir, exist_ok=True)

for idx, question in enumerate(questions, start=1):
    print(f"**Question {idx}**:\n{question}\n")

    print("INFO: Mapping question in markdown into JSON")
    question_json = create_question_json(question)
    if question_json is None:
        # create_question_json returns None when the LLM reply cannot be
        # parsed; skip this question instead of crashing on item assignment.
        print(f"WARNING: skipping question {idx}: no valid JSON was produced.")
        continue
    question_json["orderNumber"] = idx-1
    print(f"INFO: JSON {idx}:\n{question_json}\n")

    print("INFO: Get question name.")
    question_name = create_question_name(question)
    # The name comes from the LLM (or is None on a parse failure), so make it
    # safe to embed in a file name: whitespace -> "_", drop anything else
    # that is not a letter, digit, underscore or hyphen.
    safe_name = re.sub(r"[^A-Za-z0-9_-]", "", re.sub(r"\s+", "_", question_name or ""))
    if not safe_name:
        safe_name = "untitled"

    question_index = f"{(idx-1):03}"
    filename = f"{question_dir}/question_{question_index}_{safe_name}.json"
    print(f"INFO: writing {filename}")
    # Context manager guarantees the file is flushed and closed.
    with open(filename, "w") as question_file:
        question_file.write(json.dumps(question_json, indent=2))
    
    # break # breaking here as just doing quick test

**Question 1**:
{'title': 'Soap dispenser', 'content': 'A soap dispenser uses a hand-driven pump. The tube labelled ($14$) in **Figure Q1** is flooded with soap. The tube is pushed downwards against a spring and soap leaves through the top opening of the tube. You are required to estimate the resistance that the soap exerts on the tube ($14$) as the tube is depressed.\n\nUse the cylindrical coordinate system in **Figure Q1**, where the origin is fixed in space. You may make the following assumptions:\n- The tube is moving steadily downwards at a speed $U_0$ relative to the origin.\n- The tube is not rotating.\n- The flow is steady, with no azimuthal velocity component ($u_\\theta = 0$).\n- The fluid velocity is uniform in the axial direction, $z$, and in the azimuthal direction, $\\theta$.\n- The fluid is Newtonian with density, $\\rho$, and dynamic viscosity, $\\mu$, both constant and uniform.\n- The axial pressure gradient, $\\frac{\\partial p}{\\partial z}$, is constant.\n\n**Figure

KeyboardInterrupt: 