In [None]:
import os
import re
import json

from pathlib import Path
from io import BytesIO
from dotenv import load_dotenv
from pydantic import BaseModel, Field

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.docstore.document import Document
from langchain_openai import AzureChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI


import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import requests

# Load environment variables from .env file.
load_dotenv()

In [None]:
# --- Configuration: which past paper to process and where its files live ---
year = 2023
path = "./pastpapers/"
paper_file_path = f"{year}_paper.mmd"
answers_file_path = f"{year}_answers.mmd"
# Output folder for the generated question set (question JSON files and media/).
set_path = f"{path}mathpixJSON/set_fluid_mechanics_{year}/"

# Load the Markdown files.
# The encoding is passed explicitly so that non-ASCII characters are decoded the
# same way on every platform instead of with the locale's default encoding.
with open(path + paper_file_path, "r", encoding="utf-8") as f:
    # Ignore anything after "Appendices".
    paper_text = f.read().split("Appendices")[0]
with open(path + answers_file_path, "r", encoding="utf-8") as f:
    # Ignore anything after "Notes on the paper".
    answers_text = f.read().split("Notes on the paper")[0]

# Print out a summary (sizes first, then the full text of both files).
print("Markdown text: ")
print(f"  {paper_file_path}: {len(paper_text)} characters")
print(f"  {answers_file_path}: {len(answers_text)} characters")
print("Markdown text: ")
print(f"  {paper_file_path}: {paper_text}")
print(f"  {answers_file_path}: {answers_text}")


In [None]:
# Fetch the figures from the paper and answers.
figures = {}
def extract_figures_from_text(text, ans=False):
    """
    Collect every remote image referenced in a Markdown text.

    Image links of the form ``![alt](url)`` are located with a regex. For each
    link whose URL starts with "http" the image is downloaded. When the link is
    followed by a caption of the form ``Figure Q<n> - <description>``, the
    caption is split into a title ("Q<n>") and a label (the description);
    otherwise both are empty strings.

    Args:
        text (str): Markdown text to scan.
        ans (bool): Stored unchanged under "answerFile" in every returned
            entry; callers pass True when ``text`` is the answers file.

    Returns:
        dict: Maps the last path component of each image URL (query string
        included) to a dict with the keys "image" (PIL image), "title",
        "label", "url" and "answerFile". Links that do not start with "http"
        are skipped.

    Raises:
        requests.RequestException: If a download fails, times out, or the
            server answers with an HTTP error status.
    """
    found = {}
    # Regex to match Markdown image links; the group captures the URL.
    image_link_pattern = r'!\[.*?\]\((.*?)\)'
    matches = re.findall(image_link_pattern, text)
    print(f"Matches found: {matches}")

    for match in matches:
        url = match.strip()
        # The caption ends at the next newline or at the end of the text, so a
        # caption on the very last line (no trailing newline) is still found.
        figure_caption_pattern = rf'\({re.escape(url)}\)\s*-?\s*Figure\s+(Q\d+)\s*-\s*(.+?)(?:\n|$)'
        caption_match = re.search(figure_caption_pattern, text)

        if caption_match:
            title, description = caption_match.groups()
            print("Caption match found")
        else:
            title, description = "", ""

        if not url.startswith("http"):
            # Only remote images can be downloaded.
            continue

        # Create a figure name based on the URL.
        fig_name = os.path.basename(url)
        if fig_name in found and found[fig_name]["url"] == url:
            # The same link appears more than once; it is already downloaded.
            continue

        # Download the whole body (with a timeout and a status check) and let
        # PIL read it from memory rather than from the undecoded raw stream.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))

        found[fig_name] = {
            "image": image,
            "title": title.strip(),
            "label": description.strip(),
            "url": url,
            "answerFile": ans
        }
    return found

# Collect the figures of both files; answer figures are flagged with ans=True.
figures.update(extract_figures_from_text(paper_text))
figures.update(extract_figures_from_text(answers_text, ans=True))

# parents=True: set_path is nested, and its parent folders may not exist yet.
Path(f"{set_path}media/").mkdir(parents=True, exist_ok=True)

# Names written during THIS run. Checking against these (rather than against the
# directory listing) keeps the cell idempotent: on a re-run the files of the
# previous run are overwritten instead of being treated as name collisions.
saved_image_names = set()
for idx, (fig_name, fig_info) in enumerate(figures.items()):
    print(f"FIGURE Title='{fig_info['title']}', Label='{fig_info['label']}', URL='{fig_info['url']}'")
    # Computed outside the f-string: reusing the outer quote character inside a
    # replacement field is only valid syntax from Python 3.12 onwards.
    answer_suffix = "_ans" if fig_info["answerFile"] else ""
    image_name = f"figure_{fig_info['title']}{answer_suffix}.png"
    if image_name in saved_image_names:
        # Two figures share a title (or have none): disambiguate with the index.
        image_name = f"figure_{fig_info['title']}_{idx}{answer_suffix}.png"
    saved_image_names.add(image_name)
    fig_info["image"].save(f"{set_path}media/{image_name}")

In [None]:
# Set up the LLM via LangChain.
# Chat model shared by every extraction step in this notebook.
# The model name and the API key are read from environment variables.
# NOTE(review): `temperature` is only referenced by the commented-out Gemini
# configuration below; it is not passed to ChatOpenAI. Confirm whether that is
# intended.
temperature = 0
llm = ChatOpenAI(model=os.environ["OPENAI_MODEL"], api_key=os.environ["OPENAI_API_KEY"])

# Alternative backend, kept for reference:
# llm = ChatGoogleGenerativeAI(
#             model=os.environ['GOOGLE_AI_MODEL'],
#             temperature=temperature,
#             google_api_key=os.environ['GOOGLE_AI_API_KEY'],
#         )

# Extract Questions and Parts
- Use the LLM to split the paper into exercises and their subquestions; the result is stored in `questions`.

In [None]:
# Define the schema for the tutorial output.
# Schema for a single exercise as extracted from the paper by the LLM.
# The field descriptions are presumably passed on to the model through the
# parser's format instructions, so they act as extraction instructions.
class Exercise(BaseModel):
    # Heading of the exercise, without its number.
    title: str = Field(..., description="Title of the exercise (only the text, no numbering)")
    # Introductory text shared by all subquestions.
    content: str = Field(..., description="Content of the exercise (no exercise title, no subquestions)")
    # One string per subquestion, in the order the LLM returns them.
    subquestions: list[str] = Field(..., description="List of subquestions within the exercise (only the text, no numbering)")
    
# Schema for a whole paper; extract_tutorial_questions parses the LLM reply
# into this model and returns it as a dictionary.
class Tutorial(BaseModel):
    # Title of the paper; the prompt asks the LLM to infer it from the content.
    name: str = Field(..., description="Title of the tutorial")
    # Kept as a string; later cells concatenate it with `name`.
    year: str = Field(..., description="Year of the tutorial")
    # One entry per exercise found in the paper.
    exercises: list[Exercise] = Field(..., description="List of tutorial questions")

def extract_tutorial_questions(doc_page_content: str) -> dict:
    """
    Ask the LLM to split a paper into its exercises.

    The text is embedded in a prompt together with the format instructions
    generated for the ``Tutorial`` schema. The prompt is sent to the
    module-level ``llm`` and the reply is parsed back into that schema. The raw
    reply is printed for debugging.

    Args:
        doc_page_content (str): Markdown text of the paper.

    Returns:
        dict: The parsed ``Tutorial`` as a plain dictionary with the keys
        "name", "year" and "exercises"; every exercise holds "title",
        "content" and "subquestions". ``None`` is returned (and the error
        printed) when the reply cannot be parsed.
    """
    tutorial_parser = PydanticOutputParser(pydantic_object=Tutorial)
    format_instructions = tutorial_parser.get_format_instructions()

    # The prompt text, including its indentation, is sent to the model verbatim.
    prompt = f"""
        IMPORTED_TUTORIAL
        ```markdown
        {doc_page_content}
        ```

        IMPORTED_TUTORIAL is a tutorial sheet with several exercises. It may or may
        not include reference solutions. Please infer the title of the tutorial from
        the content, and extract each individual question as a separate string. Do
        not modify the text of the exercises. Only use $...$ for math expressions.

        If the exercise mentions figures, then find all the captions of figures (no links). 
        Keep the captions as "Figure Q1 - ...".

        If the exercise mentions tables, then include the table in the content.

        Return a valid JSON string with the following structure:
        {format_instructions}
        """

    llm_reply = llm.invoke(prompt)

    # Debug output: the complete reply object, not only its text.
    print("Raw LLM Response:")
    print(llm_reply)

    try:
        # model_dump() (Pydantic v2) converts the validated model to a dict.
        return tutorial_parser.parse(llm_reply.content).model_dump()
    except Exception as parse_error:
        print("Error parsing LLM response as JSON:", parse_error)
        return None


In [None]:
# Extract the questions from the exam paper (a single LLM call).
imported_tutorial = extract_tutorial_questions(paper_text)

# extract_tutorial_questions returns None when the reply cannot be parsed. Stop
# here with a clear message instead of failing later with a TypeError on None.
if imported_tutorial is None:
    raise RuntimeError(
        "Could not extract the questions from the paper; see the parsing error printed above."
    )

In [None]:
# Extract title
title = imported_tutorial["name"] + " " + imported_tutorial["year"]

# Print the title
print(f"Title: {title}\n")

# Extract questions
questions = imported_tutorial["exercises"]

# Loop over and print each question.
# Single quotes are used inside the replacement fields: reusing the outer double
# quotes there is only valid syntax from Python 3.12 onwards.
for idx, question in enumerate(questions, start=1):
    print(f"**Question {idx}**:\n{question.get('title')}\n")
    print(f"Content: {question.get('content')}\n")
    print("Subquestions:")
    for subquestion in question.get("subquestions", []):
        print(f"- {subquestion}")
    print("-" * 40)  # Separator for readability


# Extract Answers
- Use the LLM to split the answers file into worked solutions per exercise; the result is stored in `question_answers`.

In [None]:
# Define the schema for the tutorial output.
# Schema for the worked solutions of a single exercise as extracted by the LLM.
class ExerciseAnswers(BaseModel):
    # Heading of the exercise, without its number.
    title: str = Field(..., description="Title of the exercise (only the text, no numbering)")
    # One string per worked solution, in the order the LLM returns them.
    workedSolutions: list[str] = Field(..., description="List of worked solution to subquestions within the exercise (no numbering or counting)")
    
# Schema for a whole answers file; extract_tutorial_answers parses the LLM reply
# into this model and returns it as a dictionary.
class TutorialAnswers(BaseModel):
    # Title of the paper; the prompt asks the LLM to infer it from the content.
    name: str = Field(..., description="Title of the tutorial")
    # Kept as a string; a later cell concatenates it with `name`.
    year: str = Field(..., description="Year of the tutorial")
    # One entry per exercise. The driver loop pairs these with the extracted
    # questions by position, so the order matters.
    exercises: list[ExerciseAnswers] = Field(..., description="List of tutorial questions")

def extract_tutorial_answers(doc_page_content: str) -> dict:
    """
    Ask the LLM to split an answers file into worked solutions per exercise.

    The text is embedded in a prompt together with the format instructions
    generated for the ``TutorialAnswers`` schema. The prompt is sent to the
    module-level ``llm`` and the reply is parsed back into that schema. The raw
    reply is printed for debugging.

    NOTE(review): the prompt wording is the same as in
    ``extract_tutorial_questions`` and still talks about extracting questions.
    Only the schema mentions worked solutions. Confirm this is intended.

    Args:
        doc_page_content (str): Markdown text of the answers file.

    Returns:
        dict: The parsed ``TutorialAnswers`` as a plain dictionary with the
        keys "name", "year" and "exercises"; every exercise holds "title" and
        "workedSolutions". ``None`` is returned (and the error printed) when
        the reply cannot be parsed.
    """
    answers_parser = PydanticOutputParser(pydantic_object=TutorialAnswers)
    format_instructions = answers_parser.get_format_instructions()

    # The prompt text, including its indentation, is sent to the model verbatim.
    prompt = f"""
        IMPORTED_TUTORIAL
        ```markdown
        {doc_page_content}
        ```

        IMPORTED_TUTORIAL is a tutorial sheet with several exercises. It may or may
        not include reference solutions. Please infer the title of the tutorial from
        the content, and extract each individual question as a separate string. Do
        not modify the text of the exercises. Only use $...$ for math expressions.

        If the exercise mentions figures, then find all the captions of figures (no links). 
        Keep the captions as "Figure Q1 - ...".

        If the exercise mentions tables, then include the table in the content.

        Return a valid JSON string with the following structure:
        {format_instructions}
        """

    llm_reply = llm.invoke(prompt)

    # Debug output: the complete reply object, not only its text.
    print("Raw LLM Response:")
    print(llm_reply)

    try:
        # model_dump() (Pydantic v2) converts the validated model to a dict.
        return answers_parser.parse(llm_reply.content).model_dump()
    except Exception as parse_error:
        print("Error parsing LLM response as JSON:", parse_error)
        return None


In [None]:
# Extract the worked solutions from the answers file (a single LLM call).
imported_tutorial_answers = extract_tutorial_answers(answers_text)

# extract_tutorial_answers returns None when the reply cannot be parsed. Stop
# here with a clear message instead of failing later with a TypeError on None.
if imported_tutorial_answers is None:
    raise RuntimeError(
        "Could not extract the worked solutions from the answers file; see the parsing error printed above."
    )

In [None]:
# Title as inferred from the answers file.
title = imported_tutorial_answers["name"] + " " + imported_tutorial_answers["year"]

print(f"Title: {title}\n")

question_answers = imported_tutorial_answers["exercises"]

# Loop over and print the worked solutions of each question.
# Single quotes are used inside the replacement field: reusing the outer double
# quotes there is only valid syntax from Python 3.12 onwards.
for idx, question in enumerate(question_answers, start=1):
    print(f"**Question {idx}**:\n{question.get('title')}\n")
    # These entries are worked solutions, not subquestions; label them as such.
    print("Worked solutions:")
    for worked_solution in question.get("workedSolutions", []):
        print(f"- {worked_solution}")
    print("-" * 40)  # Separator for readability


# Form JSON Schemas

In [None]:

# Define the nested Pydantic models based on the JSON schema.
# Worked solution attached to one part of a question in the output JSON.
class WorkedSolution(BaseModel):
    # Full worked solution text; the prompt asks for all steps to be included.
    content: str = Field(..., description="Worked solution content")
    # Title of the worked solution (an empty string in the JSON template).
    title: str = Field(..., description="Worked solution title")
    # Optional, defaults to an empty list (the JSON template uses [] as well).
    children: list = []

# One part (subquestion) of a question in the output JSON.
class Part(BaseModel):
    # Final answer; the prompt asks the LLM to infer it from the worked solution.
    answerContent: str = Field(..., description="Part answer text")
    # Text of the part itself.
    content: str = Field(..., description="Part content text")
    # Position of the part within its question (0 in the JSON template).
    orderNumber: int = Field(..., description="The order number of this part")
    # Untyped lists; the JSON template leaves both empty.
    responseAreas: list = Field(..., description="List of response areas")
    tutorial: list = Field(..., description="List of tutorial items")
    # Full worked solution for this part.
    workedSolution: WorkedSolution = Field(..., description="Worked solution details")

# Top-level schema of one question file written by the driver loop.
# NOTE(review): the JSON template in create_question_json also contains a
# "displayChatbot" key that has no field here, so it is presumably dropped when
# the reply is parsed. Confirm whether that is intended.
class QuestionJson(BaseModel):
    # Overwritten by the driver loop with the zero-based question index.
    orderNumber: int = Field(..., description="The order number of the question")
    # Display flags; the values come from the LLM reply.
    displayFinalAnswer: bool = Field(..., description="Flag to display the final answer")
    displayStructuredTutorial: bool = Field(..., description="Flag to display the structured tutorial")
    displayWorkedSolution: bool = Field(..., description="Flag to display the worked solution")
    # Text shared by all parts of the question.
    masterContent: str = Field(..., description="Top level question content")
    parts: list[Part] = Field(..., description="List of question parts")
    publish: bool = Field(..., description="Publish flag")
    # Also used by the driver loop to build the output file name.
    title: str = Field(..., description="Question title")

def create_question_json(question: str, answers: str) -> dict:
    """
    Ask the LLM to merge one question and its answers into the question JSON.

    A minimal JSON template, the question, the answers and the format
    instructions generated for the ``QuestionJson`` schema are combined into one
    prompt. The prompt is sent to the module-level ``llm`` and the reply is
    parsed back into that schema.

    Args:
        question: The extracted question. It is interpolated into the prompt via
            its string form (the driver loop in this notebook passes the
            exercise dictionary).
        answers: The extracted worked solutions for the same question,
            interpolated the same way.

    Returns:
        dict: The parsed ``QuestionJson`` as a plain dictionary. ``None`` is
        returned (and the error and raw reply printed) when the reply cannot be
        parsed.
    """
    question_parser = PydanticOutputParser(pydantic_object=QuestionJson)
    format_instructions = question_parser.get_format_instructions()

    # Smallest valid question document; given to the model as context.
    json_skeleton = r'''{
      "orderNumber": 0,
      "displayFinalAnswer": true,
      "displayStructuredTutorial": true,
      "displayWorkedSolution": true,
      "displayChatbot": false,
      "masterContent": "Top level question here",
      "parts": [
        {
          "answerContent": "",
          "content": "Part text here",
          "orderNumber": 0,
          "responseAreas": [],
          "tutorial": [],
          "workedSolution": {
            "content": "Part worked solution here",
            "title": "",
            "children": []
          }
        }
      ],
      "publish": false,
      "title": "Question title here"
    }'''

    # The prompt text, including its indentation, is sent to the model verbatim.
    question_prompt = f'''
      JSON_TEMPLATE
      ```json
      {json_skeleton}
      ```

      IMPORTED_QUESTION
      ```markdown
      {question}
      ```

      IMPORTED_ANSWERS
      ```markdown
      {answers}
      ```

      Preserve the markdown math formatting to use $...$ for math expressions. Do not modify the original text of the question.

      From the worked solution content of each part of IMPORTED_ANSWERS, infer the 
      final answer and put it in the answerContent field of the part. The worked solution
      should be the full worked solution for the part, including all steps. The worked
      solution should be in the workedSolution.content field. 

      Carefully map IMPORTED_QUESTION and IMPORTED_ANSWERS into the JSON_TEMPLATE and return valid JSON.

      {format_instructions}
      '''

    llm_reply = llm.invoke(question_prompt)

    try:
        # Validate the reply against QuestionJson and hand it back as a dict.
        return question_parser.parse(llm_reply.content).model_dump()
    except Exception as parse_error:
        print("Error parsing JSON from LLM response:", parse_error)
        print("LLM response:", llm_reply.content)
        return None


In [None]:
def add_figure_references_to_questions(figures: dict, question_json: dict) -> dict:
    """
    Replace figure captions in a question with Markdown images that point at the
    figure's URL.

    Every occurrence of ``Figure <title> - <label>`` in ``masterContent`` and in
    each part's ``content`` becomes ``![Figure <title> - <label>](<url>)``
    followed by a newline. Figures flagged with ``answerFile`` and figures
    without a title (no caption was found for them) are skipped.

    Args:
        figures (dict): Figure name -> metadata with the keys "title", "label",
            "url" and "answerFile".
        question_json (dict): Question data. It must contain "masterContent";
            "parts" is optional. It is modified in place.

    Returns:
        dict: The same ``question_json`` object, updated.
    """
    for fig_name, fig_info in figures.items():
        figure_title = fig_info["title"]
        figure_label = fig_info["label"]
        figure_url = fig_info["url"]
        print(f"Figure: {fig_info}")
        if fig_info["answerFile"]:
            # Skip figures that are part of the answers
            print(f"Skipping figure {fig_name} as it is part of the answers.")
            continue
        if not figure_title:
            # With an empty title the pattern would degrade to "Figure - " and
            # could match unrelated text.
            print(f"Skipping figure {fig_name} as it has no caption.")
            continue
        print(f"Adding figure {figure_title} - {figure_label} to question JSON.")

        # Construct the markdown format for the figure
        markdown_figure = f"![Figure {figure_title} - {figure_label}]({figure_url})\n"

        # Define the specific pattern to match 'Figure Q1 - label'
        caption_pattern = re.compile(
            rf"Figure\s+{re.escape(figure_title)}\s*-\s*{re.escape(figure_label)}"
        )

        # A callable replacement is inserted literally. A replacement *string*
        # would have its backslashes processed by re.sub, which corrupts LaTeX
        # in captions ("\theta" -> tab + "heta") or raises on unknown escapes.
        def insert_figure(_match, replacement=markdown_figure):
            return replacement

        # Replace exact matches in masterContent
        question_json["masterContent"] = caption_pattern.sub(
            insert_figure, question_json["masterContent"]
        )

        # Replace exact matches in each part's content
        for part in question_json.get("parts", []):
            part["content"] = caption_pattern.sub(insert_figure, part["content"])

    return question_json

# NOTE: Should not depend on mathpix jpeg urls, and just use the local files
# TODO: but this errors when uploading to LambdaFeedback
def add_local_figures_to_questions(figures: dict, question_json: dict) -> dict:
    """
    Replace figure captions in a question with Markdown images that point at
    local files.

    Every occurrence of ``Figure <title> - <label>`` in ``masterContent`` and in
    each part's ``content`` becomes
    ``![Figure <title> - <label>](media/figure_<title>.png)`` followed by a
    newline. Figures flagged with ``answerFile`` and figures without a title (no
    caption was found for them) are skipped.

    Args:
        figures (dict): Figure name -> metadata with the keys "title", "label",
            "url" and "answerFile".
        question_json (dict): Question data. It must contain "masterContent";
            "parts" is optional. It is modified in place.

    Returns:
        dict: The same ``question_json`` object, updated.
    """
    for fig_name, fig_info in figures.items():
        figure_title = fig_info["title"]
        figure_label = fig_info["label"]
        print(f"Figure: {fig_info}")
        if fig_info["answerFile"]:
            # Skip figures that are part of the answers
            print(f"Skipping figure {fig_name} as it is part of the answers.")
            continue
        if not figure_title:
            # With an empty title the pattern would degrade to "Figure - " and
            # could match unrelated text.
            print(f"Skipping figure {fig_name} as it has no caption.")
            continue
        print(f"Adding figure {figure_title} - {figure_label} to question JSON.")

        # Construct the markdown format for the figure
        local_figure_path = f"media/figure_{figure_title}.png"
        markdown_figure = f"![Figure {figure_title} - {figure_label}]({local_figure_path})\n"

        # Define the specific pattern to match 'Figure Q1 - label'
        caption_pattern = re.compile(
            rf"Figure\s+{re.escape(figure_title)}\s*-\s*{re.escape(figure_label)}"
        )

        # A callable replacement is inserted literally. A replacement *string*
        # would have its backslashes processed by re.sub, which corrupts LaTeX
        # in captions ("\theta" -> tab + "heta") or raises on unknown escapes.
        def insert_figure(_match, replacement=markdown_figure):
            return replacement

        # Replace exact matches in masterContent
        question_json["masterContent"] = caption_pattern.sub(
            insert_figure, question_json["masterContent"]
        )

        # Replace exact matches in each part's content
        for part in question_json.get("parts", []):
            part["content"] = caption_pattern.sub(insert_figure, part["content"])

    return question_json

# NOTE: Figures in the answers are NOT handled by the functions above (they are skipped); they need to be added manually.

In [None]:
questions = imported_tutorial["exercises"]
question_answers = imported_tutorial_answers["exercises"]

# Questions and answers are paired by position. zip() stops at the shorter list,
# so report a mismatch instead of silently dropping the unpaired entries.
if len(questions) != len(question_answers):
    print(
        f"WARNING: {len(questions)} questions but {len(question_answers)} answer sets; "
        "unpaired entries are skipped."
    )

# Loop over all questions and question_answers, map each pair to JSON and save it
for idx, (question, question_ans) in enumerate(zip(questions, question_answers), start=1):
    print(f"**Question {idx}**:\n{question}\n")
    print(f"**Question Answers {idx}**:\n{question_ans}\n")

    print("INFO: Mapping question in markdown into JSON")
    question_json = create_question_json(question, question_ans)
    if question_json is None:
        # create_question_json returns None when the LLM reply cannot be parsed.
        print(f"ERROR: no JSON produced for question {idx}; skipping it.")
        continue
    question_json["orderNumber"] = idx-1
    print(f"INFO: JSON {idx}:\n{question_json}\n")

    print("INFO: Get figures")
    # updated_question_json = add_figure_references_to_questions(figures, question_json)
    # updated_question_json = add_local_figures_to_questions(figures, question_json)
    updated_question_json = question_json

    # The title comes from the LLM: drop characters that are not allowed in file
    # names (path separators in particular) before using it in the path.
    question_name = re.sub(r'[\\/:*?"<>|]', "", updated_question_json["title"].replace(" ", "_"))
    question_index = f"{(idx-1):03}"
    filename = f"{set_path}question_{question_index}_{question_name}.json"
    print(f"INFO: writing {filename}")
    # The with-block closes (and thereby flushes) the file deterministically.
    with open(filename, "w", encoding="utf-8") as question_file:
        question_file.write(json.dumps(updated_question_json, indent=2))

    # break # breaking here as just doing quick test
    
    # break # breaking here as just doing quick test

In [None]:
def create_tutorial_metadata(tutorial_title: str) -> tuple[dict, str]:
    """
    Build the metadata of a question set and a file-name-safe short name.

    The short name is derived from the title by replacing every run of
    whitespace with one underscore and then removing every character that is
    not an ASCII letter, a digit or an underscore. Case is preserved and
    hyphens are removed.

    Args:
        tutorial_title (str): The full tutorial title.

    Returns:
        tuple[dict, str]: ``(metadata, normalized_title)``. ``metadata`` holds
        the title under "name", an empty "description" and fixed visibility
        settings; ``normalized_title`` is the short name described above.
    """
    # Generate a short name for the tutorial (could name fancier using LLM).
    normalized_title = re.sub(r'\s+', '_', tutorial_title)  # whitespace runs -> "_"
    normalized_title = re.sub(r'[^a-zA-Z0-9_]', '', normalized_title)  # drop everything else

    # Build the metadata dictionary
    metadata = {
        "name": tutorial_title,
        "description": "",  # Optional description of the tutorial
        "manuallyHidden": True,  # Defaults to true
        "finalAnswerVisibility": "OPEN_WITH_WARNINGS",
        "workedSolutionVisibility": "OPEN_WITH_WARNINGS",
        "structuredTutorialVisibility": "OPEN",
        "chatbotVisibility": "HIDE"
    }

    return metadata, normalized_title

# Build the set metadata from the title inferred for the paper.
tutorial_title = imported_tutorial["name"] + " " + imported_tutorial["year"]
metadata, tutorial_normalized_title = create_tutorial_metadata(tutorial_title)

set_filename = f"set_{tutorial_normalized_title}.json"

print(f"Saving metadata to {set_path}{set_filename}...")
# The with-block closes (and thereby flushes) the file deterministically; the
# original passed an open() result to json.dump and never closed it.
with open(f"{set_path}{set_filename}", "w", encoding="utf-8") as metadata_file:
    json.dump(metadata, metadata_file, indent=4)
