In [2]:
import os
import re
import json

from pathlib import Path
from io import BytesIO
from dotenv import load_dotenv
from pydantic import BaseModel, Field

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.docstore.document import Document
from langchain_openai import AzureChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI


import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import requests

# Load environment variables from .env file.
load_dotenv()

True

In [21]:
# Notebook configuration: exam year, input folder, and the two Markdown
# (.mmd) files holding the question paper and its worked answers.
year = 2024
path = "./pastpapers/"
paper_file_path = f"{year}_paper.mmd"
answers_file_path = f"{year}_answers.mmd"
# Folder that receives the generated question-set JSON files and figures.
set_path = f"{path}mathpixJSON/set_fluid_mechanics_{year}/"

# Load the Markdown files.
# The encoding is passed explicitly so the text decodes the same way on every
# platform instead of following the locale default.
# NOTE(review): assumes the .mmd exports are UTF-8 -- confirm.
with open(path + paper_file_path, "r", encoding="utf-8") as f:
    # Ignore anything after the first occurrence of "Appendices".
    paper_text = f.read().split("Appendices")[0]
with open(path + answers_file_path, "r", encoding="utf-8") as f:
    # Ignore anything after the first occurrence of "Notes on the paper".
    answers_text = f.read().split("Notes on the paper")[0]

# Print out a summary (lengths first, then the full text of both files).
print("Markdown text: ")
print(f"  {paper_file_path}: {len(paper_text)} characters")
print(f"  {answers_file_path}: {len(answers_text)} characters")
print("Markdown text: ")
print(f"  {paper_file_path}: {paper_text}")
print(f"  {answers_file_path}: {answers_text}")

# Fetch the figures from the paper and answers.
figures = {}
def extract_figures_from_text(text, timeout=30):
    """Download every remote image referenced in Markdown ``text``.

    Image references have the form ``![name](url)``. Only targets that start
    with ``http`` are fetched; any other target is skipped.

    Args:
        text: Markdown source to scan for image references.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled download cannot hang the notebook.

    Returns:
        dict mapping ``os.path.basename(url)`` to
        ``{"image": <PIL image>, "title": "", "label": ""}``. Two references
        with the same basename overwrite each other (last one wins).

    Raises:
        requests.HTTPError: if a download answers with an error status.
    """
    # Local name differs from the module-level ``figures`` to avoid shadowing.
    found = {}
    for target in re.findall(r'!\[.*?\]\((.*?)\)', text):
        url = target.strip()
        if not url.startswith("http"):
            continue
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx instead of handing an error page to the image decoder.
        response.raise_for_status()
        # Decode from the fully downloaded (and content-decoded) body, then
        # force the pixel data to load so the image does not depend on the
        # HTTP connection staying open.
        image = Image.open(BytesIO(response.content))
        image.load()
        # Create a figure name based on the URL.
        fig_name = os.path.basename(url)
        found[fig_name] = {"image": image, "title": "", "label": ""}
    return found

# Collect the figures of both documents (answers overwrite same-named paper figures).
figures.update(extract_figures_from_text(paper_text))
figures.update(extract_figures_from_text(answers_text))

# set_path is nested several folders deep; parents=True creates any missing
# intermediate folder so the cell also works on a fresh checkout.
media_dir = Path(set_path) / "media"
media_dir.mkdir(parents=True, exist_ok=True)
for idx, (fig_name, fig_info) in enumerate(figures.items()):
    print(f"  {fig_name}: Title='{fig_info['title']}', Label='{fig_info['label']}'")
    # Files are numbered in insertion order: figure_0.png, figure_1.png, ...
    fig_info["image"].save(media_dir / f"figure_{idx}.png")


Markdown text: 
  2024_paper.mmd: 7201 characters
  2024_answers.mmd: 12318 characters
Markdown text: 
  2024_paper.mmd: \section*{IMPERIAL COLLEGE LONDON}

\section*{MEng EXAMINATIONS 2024}

\section*{Part II}
for Internal Students of the Imperial College of Science, Technology and Medicine This paper is also taken for the relevant examination for the Associateship or Diploma

\section*{FLUID MECHANICS}

Monday, 20th May 14:00-15:30 (duration: 1 hour and 30 minutes)
This paper contains THREE questions. Attempt every question.
The numbers shown by each question are for your guidance; they indicate approximately how the examiners intend to distribute the marks for this paper. An appendix is included, and a Data and Formul√¶Book is provided.

This is a CLOSED BOOK Examination

Q1. Tapered pipe. Two reservoirs are connected by a tube of length \(L\), illustrated in Figure Q1. The liquid level in Reservoir A is higher than in Reservoir B by \(\Delta h\). A diameter, \(D\), must be selected

In [22]:
# Chat model (LangChain wrapper) used by the extraction helpers in this notebook.
# Model name and API key are read from the environment.
# NOTE: `temperature` is only referenced by the commented-out Gemini
# alternative below; it is not passed to ChatOpenAI.
temperature = 0
llm = ChatOpenAI(model=os.environ['OPENAI_MODEL'], api_key=os.environ["OPENAI_API_KEY"])

# Alternative backend (Google Gemini), kept for reference:
# llm = ChatGoogleGenerativeAI(
#             model=os.environ['GOOGLE_AI_MODEL'],
#             temperature=temperature,
#             google_api_key=os.environ['GOOGLE_AI_API_KEY'],
#         )

# Extract Questions and Parts
- get questions

In [23]:
# Define the schema for the tutorial output.
# One exercise of the sheet: its title, its introductory text, and the list of
# its subquestions. Nested inside Tutorial.exercises.
class Exercise(BaseModel):
    # Heading text of the exercise.
    title: str = Field(..., description="Title of the exercise (only the text, no numbering)")
    # Shared statement of the exercise.
    content: str = Field(..., description="Content of the exercise (no exercise title, no subquestions)")
    # One string per subquestion.
    subquestions: list[str] = Field(..., description="List of subquestions within the exercise (only the text, no numbering)")
    
# Top-level schema handed to PydanticOutputParser in extract_tutorial_questions.
# Documented with `#` comments on purpose: a class docstring would presumably
# end up in the schema text that get_format_instructions() adds to the prompt.
class Tutorial(BaseModel):
    name: str = Field(..., description="Title of the tutorial")
    # Kept as a string; later cells concatenate it with `name`.
    year: str = Field(..., description="Year of the tutorial")
    exercises: list[Exercise] = Field(..., description="List of tutorial questions")

def extract_tutorial_questions(doc_page_content: str) -> dict | None:
    """
    Extract the title, year and individual exercises from a question sheet.

    Builds a prompt that embeds ``doc_page_content`` together with the format
    instructions generated from the ``Tutorial`` schema, sends it to the
    module-level ``llm`` and parses the reply with ``PydanticOutputParser``.
    The raw LLM response is printed for debugging.

    The returned dictionary mirrors ``Tutorial`` / ``Exercise``:

    {
        "name": "<title of tutorial>",
        "year": "<year of tutorial>",
        "exercises": [
            {"title": "...", "content": "...", "subquestions": ["...", ...]},
            ...
        ]
    }

    Args:
        doc_page_content (str): The Markdown content of the sheet.

    Returns:
        dict | None: ``Tutorial.model_dump()`` with the keys "name", "year"
            and "exercises", or None when the reply cannot be parsed (the
            error is printed). Errors raised by ``llm.invoke`` itself are not
            caught here.
    """
    # Initialize the output parser with the Tutorial schema.
    parser = PydanticOutputParser(pydantic_object=Tutorial)

    # Construct the prompt, appending the parser's format instructions.
    # The indentation inside the f-string is part of the text sent to the LLM.
    prompt = f"""
        IMPORTED_TUTORIAL
        ```markdown
        {doc_page_content}
        ```

        IMPORTED_TUTORIAL is a tutorial sheet with several exercises. It may or may
        not include reference solutions. Please infer the title of the tutorial from
        the content, and extract each individual question as a separate string. Do
        not modify the text of the exercises. Only use $...$ for math expressions.

        Return a valid JSON string with the following structure:
        {parser.get_format_instructions()}
        """

    # Call the LLM
    response = llm.invoke(prompt)

    # Debug: print the raw LLM response
    print("Raw LLM Response:")
    print(response)

    try:
        # Parse the response using the output parser.
        parsed_output = parser.parse(response.content)
        # For Pydantic v2, use model_dump() to convert the model to a dictionary.
        return parsed_output.model_dump()
    except Exception as e:
        # Best effort: report the problem and let the caller deal with None.
        print("Error parsing LLM response as JSON:", e)
        return None


In [24]:
imported_tutorial = extract_tutorial_questions(paper_text)
# extract_tutorial_questions returns None when the reply cannot be parsed.
# Stop here with a clear message instead of failing in a later cell with
# "'NoneType' object is not subscriptable".
assert imported_tutorial is not None, "Could not parse the questions from the LLM response (see the error printed above)."

Raw LLM Response:
content='{\n  "name": "FLUID MECHANICS",\n  "year": "2024",\n  "exercises": [\n    {\n      "title": "Tapered pipe",\n      "content": "Two reservoirs are connected by a tube of length $L$, illustrated in Figure Q1. The liquid level in Reservoir A is higher than in Reservoir B by $\\\\Delta h$. A diameter, $D$, must be selected for the tube, to achieve a target volume flow rate of $Q_{target}$. The liquid in the reservoirs is Newtonian, with dynamic viscosity, $\\\\mu$, and density, $\\\\rho$, both uniform and constant. Assume steady flow in the tube. Consider a cylindrical frame of reference with $z$-axis aligned with the axis of the tube, as schematically shown in Figure Q1, and origin at the entrance of the tube. In this frame of reference, the velocity is uniform along $\\\\theta$ and pressure is a function of $z$ only. The effect of body forces on the flow in the tube can be neglected. For parts (a) to (f) assume the tube has a constant diameter, $D_{0}$, and tha

In [25]:
# Extract title
title = imported_tutorial["name"] + " " + imported_tutorial["year"]

# Print the title
print(f"Title: {title}\n")

# Extract questions
questions = imported_tutorial["exercises"]

# Loop over and print each question
for idx, question in enumerate(questions, start=1):
    # Single quotes inside the f-string: reusing the outer double quote is
    # only valid syntax from Python 3.12 on.
    print(f"**Question {idx}**:\n{question.get('title')}\n")
    print("Subquestions:")
    for subquestion in question.get("subquestions", []):
        print(f"- {subquestion}")
    print("-" * 40)  # Separator for readability


Title: FLUID MECHANICS 2024

**Question 1**:
Tapered pipe

Subquestions:
- (a) Which set of equations would you use to study the flow in the tube and why? [2%]
- (b) Using the given assumptions, explain why the velocity of the fluid in the tube is a function of the radial coordinate, $r$, only.
- (c) Show that the $r$-component of the velocity is zero everywhere in the tube.
- (d) Derive an expression for the z-component of the velocity in the tube, $u_{z}=u_{z}(r)$, in terms of $D_{0}, \mathrm{d} p/\mathrm{d} z, \rho, \mu$.
- (e) Find an expression for the pressure gradient $\mathrm{d} p/\mathrm{d} z$, in terms of $D_{0}, \rho, \mu, Q_{target}$.
- (f) Suggest a tube diameter, $D_{0}$, to meet flow rate requirements as a function of $\Delta h$.
- (g) In practice, the manufacturing process creates a tapered tube, where $D(z)=D_{0}+\alpha z$, where $D_{0}$ is the minimum diameter and $\alpha$ is a dimensionless constant. Write the simplified continuity and $z$-momentum equations, includi

# Extract Answers
- get question_answers

In [26]:
# Define the schema for the tutorial output.
# Worked solutions of one exercise. Nested inside TutorialAnswers.exercises.
class ExerciseAnswers(BaseModel):
    # Heading text of the exercise the solutions belong to.
    title: str = Field(..., description="Title of the exercise (only the text, no numbering)")
    # One string per subquestion's worked solution.
    workedSolutions: list[str] = Field(..., description="List of worked solution to subquestions within the exercise (no numbering or counting)")
    
# Top-level schema handed to PydanticOutputParser in extract_tutorial_answers.
# Documented with `#` comments on purpose: a class docstring would presumably
# end up in the schema text that get_format_instructions() adds to the prompt.
class TutorialAnswers(BaseModel):
    name: str = Field(..., description="Title of the tutorial")
    year: str = Field(..., description="Year of the tutorial")
    # NOTE(review): the description says "questions" although each item holds
    # the worked solutions of an exercise -- confirm the wording is intended.
    exercises: list[ExerciseAnswers] = Field(..., description="List of tutorial questions")

def extract_tutorial_answers(doc_page_content: str) -> dict | None:
    """
    Extract the title, year and per-exercise worked solutions from an answer sheet.

    Builds a prompt that embeds ``doc_page_content`` together with the format
    instructions generated from the ``TutorialAnswers`` schema, sends it to
    the module-level ``llm`` and parses the reply with
    ``PydanticOutputParser``. The raw LLM response is printed for debugging.

    The returned dictionary mirrors ``TutorialAnswers`` / ``ExerciseAnswers``:

    {
        "name": "<title of tutorial>",
        "year": "<year of tutorial>",
        "exercises": [
            {"title": "...", "workedSolutions": ["...", "...", ...]},
            ...
        ]
    }

    Args:
        doc_page_content (str): The Markdown content of the answer sheet.

    Returns:
        dict | None: ``TutorialAnswers.model_dump()`` with the keys "name",
            "year" and "exercises", or None when the reply cannot be parsed
            (the error is printed). Errors raised by ``llm.invoke`` itself
            are not caught here.
    """
    # Initialize the output parser with the TutorialAnswers schema.
    parser = PydanticOutputParser(pydantic_object=TutorialAnswers)

    # Construct the prompt, appending the parser's format instructions.
    # The wording asks for worked solutions; it used to be a copy of the
    # question-extraction prompt and asked for "each individual question".
    prompt = f"""
        IMPORTED_TUTORIAL
        ```markdown
        {doc_page_content}
        ```

        IMPORTED_TUTORIAL contains the worked solutions to a tutorial sheet with
        several exercises. Please infer the title of the tutorial from the
        content and, for each exercise, extract the worked solution of every
        subquestion as a separate string. Do not modify the text of the worked
        solutions. Only use $...$ for math expressions.

        Return a valid JSON string with the following structure:
        {parser.get_format_instructions()}
        """

    # Call the LLM
    response = llm.invoke(prompt)

    # Debug: print the raw LLM response
    print("Raw LLM Response:")
    print(response)

    try:
        # Parse the response using the output parser.
        parsed_output = parser.parse(response.content)
        # For Pydantic v2, use model_dump() to convert the model to a dictionary.
        return parsed_output.model_dump()
    except Exception as e:
        # Best effort: report the problem and let the caller deal with None.
        print("Error parsing LLM response as JSON:", e)
        return None


In [27]:
imported_tutorial_answers = extract_tutorial_answers(answers_text)
# extract_tutorial_answers returns None when the reply cannot be parsed.
# Stop here with a clear message instead of failing in a later cell with
# "'NoneType' object is not subscriptable".
assert imported_tutorial_answers is not None, "Could not parse the worked solutions from the LLM response (see the error printed above)."

Raw LLM Response:
content='{\n  "name": "Tapered Pipe, Turbine, and Wind Noise Tutorial",\n  "year": "2023",\n  "exercises": [\n    {\n      "title": "Tapered pipe",\n      "workedSolutions": [\n        "(a) The fluid is Newtonian, the density is constant and uniform, therefore the flow is incompressible, and the dynamic viscosity is uniform. Considering the fluid as a continuum, the aforementioned assumptions allow us to model the flow using the Navier-Stokes equations for incompressible flow. The formulation in cylindrical coordinates is used as suggested in the text of the question (which asks to use a cylindrical frame of reference).",\n        "(b) In general, the velocity is a function of time and space, i.e. $\\\\vec{u}=\\\\vec{u}(t, r, \\\\theta, z)$. Since the flow is steady, the velocity does not depend on time. The velocity is also assumed uniform along $z$ and $\\\\theta$ (i.e. $\\\\partial \\\\vec{u} / \\\\partial \\\\theta=0$, $\\\\partial \\\\vec{u} / \\\\partial z=0$), 

In [None]:
# Title as inferred by the LLM from the answer sheet.
title = imported_tutorial_answers["name"] + " " + imported_tutorial_answers["year"]

print(f"Title: {title}\n")

question_answers = imported_tutorial_answers["exercises"]

# Loop over and print the worked solutions of each question
for idx, question in enumerate(question_answers, start=1):
    # Single quotes inside the f-string: reusing the outer double quote is
    # only valid syntax from Python 3.12 on.
    print(f"**Question {idx}**:\n{question.get('title')}\n")
    # These entries are worked solutions, not subquestions.
    print("Worked solutions:")
    for worked_solution in question.get("workedSolutions", []):
        print(f"- {worked_solution}")
    print("-" * 40)  # Separator for readability


Title: Tapered Pipe, Turbine, and Wind Noise Tutorial 2023

**Question 1**:
Tapered pipe

Subquestions:
- (a) The fluid is Newtonian, the density is constant and uniform, therefore the flow is incompressible, and the dynamic viscosity is uniform. Considering the fluid as a continuum, the aforementioned assumptions allow us to model the flow using the Navier-Stokes equations for incompressible flow. The formulation in cylindrical coordinates is used as suggested in the text of the question (which asks to use a cylindrical frame of reference).
- (b) In general, the velocity is a function of time and space, i.e. $\vec{u}=\vec{u}(t, r, \theta, z)$. Since the flow is steady, the velocity does not depend on time. The velocity is also assumed uniform along $z$ and $\theta$ (i.e. $\partial \vec{u} / \partial \theta=0$, $\partial \vec{u} / \partial z=0$), therefore the velocity also does not depend on $\theta$ and $z$. This means that the velocity must be a function of $r$ only.
- (c) The equat

# Form JSON Schemas

In [None]:

# Define the nested Pydantic models based on the JSON schema.
# Worked solution attached to a single question part (see Part.workedSolution).
class WorkedSolution(BaseModel):
    content: str = Field(..., description="Worked solution content")
    # Required by this schema.
    # NOTE(review): the JSON template in create_question_json has no "id"
    # key -- confirm the LLM is meant to invent one.
    id: str = Field(..., description="Identifier for the worked solution")
    title: str = Field(..., description="Worked solution title")
    # The only field with a default: an empty list when omitted.
    children: list = []

# One part of a question in the exported JSON (see QuestionJson.parts).
class Part(BaseModel):
    # Final answer text; create_question_json asks the LLM to infer it from
    # the worked solution.
    answerContent: str = Field(..., description="Part answer text")
    content: str = Field(..., description="Part content text")
    orderNumber: int = Field(..., description="The order number of this part")
    # Untyped lists; the JSON template in create_question_json leaves both empty.
    responseAreas: list = Field(..., description="List of response areas")
    tutorial: list = Field(..., description="List of tutorial items")
    # Required by this schema.
    # NOTE(review): the JSON template in create_question_json has no
    # "universalPartId" key -- confirm the LLM is meant to invent one.
    universalPartId: str = Field(..., description="Universal part identifier")
    workedSolution: WorkedSolution = Field(..., description="Worked solution details")

# Top-level schema of one exported question; handed to PydanticOutputParser
# in create_question_json.
# Documented with `#` comments on purpose: a class docstring would presumably
# end up in the schema text that get_format_instructions() adds to the prompt.
class QuestionJson(BaseModel):
    # The export loop overwrites this with the zero-based question index.
    orderNumber: int = Field(..., description="The order number of the question")
    displayFinalAnswer: bool = Field(..., description="Flag to display the final answer")
    displayStructuredTutorial: bool = Field(..., description="Flag to display the structured tutorial")
    displayWorkedSolution: bool = Field(..., description="Flag to display the worked solution")
    masterContent: str = Field(..., description="Top level question content")
    parts: list[Part] = Field(..., description="List of question parts")
    publish: bool = Field(..., description="Publish flag")
    title: str = Field(..., description="Question title")

def create_question_json(question: str, answers: str) -> dict | None:
    """
    Ask the LLM to merge one question and its answers into the question JSON.

    The prompt contains a minimal JSON template, the question, the answers
    and the format instructions generated from ``QuestionJson``. The reply is
    parsed with ``PydanticOutputParser`` and returned as a plain dictionary.

    Args:
        question: Question content; it is interpolated into the prompt with
            an f-string. NOTE(review): the export loop in this notebook passes
            the exercise dict itself, so its string representation is what
            the LLM sees.
        answers: Worked-solution content for the same question; interpolated
            the same way.

    Returns:
        dict | None: ``QuestionJson.model_dump()``, or None when the reply
            cannot be parsed (the error and the raw reply are printed).
            Errors raised by ``llm.invoke`` itself are not caught here.
    """
    # Initialize the output parser using the defined Pydantic model.
    parser = PydanticOutputParser(pydantic_object=QuestionJson)

    # Minimum JSON template to guide the model. (Used as context.)
    # NOTE(review): the template contains "displayChatbot", which is not a
    # field of QuestionJson, and it omits "id" / "universalPartId", which the
    # schema marks as required -- confirm which of the two is authoritative.
    minimum_json_template = r'''{
      "orderNumber": 0,
      "displayFinalAnswer": true,
      "displayStructuredTutorial": true,
      "displayWorkedSolution": true,
      "displayChatbot": false,
      "masterContent": "Top level question here",
      "parts": [
        {
          "answerContent": "",
          "content": "Part text here",
          "orderNumber": 0,
          "responseAreas": [],
          "tutorial": [],
          "workedSolution": {
            "content": "Part worked solution here",
            "title": "",
            "children": []
          }
        }
      ],
      "publish": false,
      "title": "Question title here"
    }'''

    # Construct the prompt, appending the parser's format instructions.
    # The indentation inside the f-string is part of the text sent to the LLM.
    question_prompt = f'''
      JSON_TEMPLATE
      ```json
      {minimum_json_template}
      ```

      IMPORTED_QUESTION
      ```markdown
      {question}
      ```

      IMPORTED_ANSWERS
      ```markdown
      {answers}
      ```

      Preserve the markdown math formatting. Do not modify the original text of the question.

      From the worked solution content of each part of IMPORTED_ANSWERS, infer the 
      final answer and put it in the answerContent field of the part. The worked solution
      should be the full worked solution for the part, including all steps. The worked
      solution should be in the workedSolution.content field. 

      Carefully map IMPORTED_QUESTION and IMPORTED_ANSWERS into the JSON_TEMPLATE and return valid JSON.

      {parser.get_format_instructions()}
      '''

    # Invoke the language model.
    response = llm.invoke(question_prompt)

    try:
        # Parse the response using the output parser.
        parsed_output = parser.parse(response.content)
        return parsed_output.model_dump()  # Return as a dictionary.
    except Exception as e:
        # Best effort: report the problem and let the caller deal with None.
        print("Error parsing JSON from LLM response:", e)
        print("LLM response:", response.content)
        return None


In [44]:
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser

# Define a Pydantic model representing the expected output schema.
# Single-field schema handed to PydanticOutputParser in create_question_name.
class QuestionName(BaseModel):
    # The export loop uses this value as part of the output file name.
    question_name: str = Field(..., description="A short tag representing the question’s topic.")

def create_question_name(question: str) -> str:
    """
    Ask the LLM for a short name tag describing the topic of ``question``.

    The prompt embeds ``question`` and the format instructions generated from
    ``QuestionName``; the reply is parsed with ``PydanticOutputParser``.

    Args:
        question: Question content, interpolated into the prompt.

    Returns:
        The ``question_name`` value of the parsed reply, or None when the
        reply cannot be parsed (the error and the raw reply are printed).
    """
    # Parser bound to the single-field QuestionName schema.
    name_parser = PydanticOutputParser(pydantic_object=QuestionName)

    # Prompt text, ending with the parser's format instructions.
    question_name_prompt = f'''
      IMPORTED_QUESTION
      ```markdown
      {question}
      ```
      
      QUERY:
      Based on the above markdown content, infer a suitable short name tag that represents the question’s topic.
      Follow these rules:
      
        1. Look for the heading text (for example, if the text starts with "Question 1:" followed by "Hydraulic scale", then the main topic is "Hydraulic scale").
        2. Normalize the name by replacing spaces with underscores and removing punctuation.
        3. Return only a valid JSON object with a single property "question_name".
      
      {name_parser.get_format_instructions()}
    '''
    # Query the model.
    reply = llm.invoke(question_name_prompt)

    try:
        # Validate the reply against the schema and hand back the tag only.
        return name_parser.parse(reply.content).question_name
    except Exception as parse_error:
        print("Error parsing JSON from LLM response:", parse_error)
        print("LLM response:", reply.content)
        return None


In [45]:
questions = imported_tutorial["exercises"]
question_answers = imported_tutorial_answers["exercises"]

# Questions and answers are paired by position. zip() stops at the shorter
# list, so make a length mismatch visible instead of dropping items silently.
if len(questions) != len(question_answers):
    print(f"WARNING: {len(questions)} questions but {len(question_answers)} answer sets; "
          f"only the first {min(len(questions), len(question_answers))} will be exported.")

# Loop over all questions and question_answers, print each pair and export it
for idx, (question, question_ans) in enumerate(zip(questions, question_answers), start=1):
    print(f"**Question {idx}**:\n{question}\n")
    print(f"**Question Answers {idx}**:\n{question_ans}\n")

    print("INFO: Mapping question in markdown into JSON")
    question_json = create_question_json(question, question_ans)
    if question_json is None:
        # create_question_json already printed the parse error and raw reply;
        # skip this question rather than crash on the assignment below.
        print(f"WARNING: skipping question {idx}: no valid JSON was produced.")
        continue
    question_json["orderNumber"] = idx-1
    print(f"INFO: JSON {idx}:\n{question_json}\n")

    print("INFO: Get question name.")
    question_name = create_question_name(question)
    # The name comes from the LLM (or is None on a parse failure) and ends up
    # in a file name: fall back to the exercise title and keep only letters,
    # digits, underscores and hyphens.
    raw_name = question_name or question.get("title") or "untitled"
    safe_name = re.sub(r"[^A-Za-z0-9_-]", "", re.sub(r"\s+", "_", raw_name.strip())) or "untitled"

    question_index = f"{(idx-1):03}"
    filename = f"{set_path}question_{question_index}_{safe_name}.json"
    print(f"INFO: writing {filename}")
    # Context manager so the file is flushed and closed deterministically.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(question_json, f, indent=2)

    # break # breaking here as just doing quick test

**Question 1**:
{'title': 'Tapered pipe', 'content': 'Two reservoirs are connected by a tube of length $L$, illustrated in Figure Q1. The liquid level in Reservoir A is higher than in Reservoir B by $\\Delta h$. A diameter, $D$, must be selected for the tube, to achieve a target volume flow rate of $Q_{target}$. The liquid in the reservoirs is Newtonian, with dynamic viscosity, $\\mu$, and density, $\\rho$, both uniform and constant. Assume steady flow in the tube. Consider a cylindrical frame of reference with $z$-axis aligned with the axis of the tube, as schematically shown in Figure Q1, and origin at the entrance of the tube. In this frame of reference, the velocity is uniform along $\\theta$ and pressure is a function of $z$ only. The effect of body forces on the flow in the tube can be neglected. For parts (a) to (f) assume the tube has a constant diameter, $D_{0}$, and that the velocity is also uniform along $z$.', 'subquestions': ['(a) Which set of equations would you use to st

In [42]:
def create_tutorial_metadata(tutorial_title: str) -> tuple[dict, str]:
    """
    Create the metadata dictionary and a file-safe short name for a tutorial.

    The short name is derived from the title by replacing every run of
    whitespace with one underscore and then removing every character that is
    not an ASCII letter, a digit or an underscore. Case is preserved and
    hyphens are removed.

    Args:
        tutorial_title (str): The full tutorial title.

    Returns:
        tuple[dict, str]: ``(metadata, normalized_name)`` where ``metadata``
            holds the title under "name", an empty "description" and the
            fixed visibility settings, and ``normalized_name`` is the short
            name described above.
    """
    # Generate a short name for the tutorial (could name fancier using LLM).
    normalized_name = re.sub(r'\s+', '_', tutorial_title)  # whitespace runs -> underscore
    normalized_name = re.sub(r'[^a-zA-Z0-9_]', '', normalized_name)  # drop everything else

    # Build the metadata dictionary
    metadata = {
        "name": tutorial_title,
        "description": "",  # Optional description of the tutorial
        "manuallyHidden": True,  # Defaults to true
        "finalAnswerVisibility": "OPEN_WITH_WARNINGS",
        "workedSolutionVisibility": "OPEN_WITH_WARNINGS",
        "structuredTutorialVisibility": "OPEN",
        "chatbotVisibility": "HIDE"
    }

    return metadata, normalized_name

# Title of the set as inferred from the question paper.
tutorial_title = imported_tutorial["name"] + " " + imported_tutorial["year"]
metadata, tutorial_normalized_title = create_tutorial_metadata(tutorial_title)

set_filename = f"set_{tutorial_normalized_title}.json"

print(f"Saving metadata to {set_path}{set_filename}...")
# Context manager so the file is flushed and closed deterministically instead
# of relying on garbage collection of an anonymous handle.
with open(f"{set_path}{set_filename}", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4)


Saving metadata to ./pastpapers/mathpixJSON/set_fluid_mechanics_2024/set_FLUID_MECHANICS_2024.json...
