# process description  

The program takes in a PDF.  
Mathpix is used to scan the PDF and turn it into markdown.  
The markdown is then processed to get the images.  

An LLM is used to extract the questions and solutions in **ONE** go.  
The final JSON is made using the in2lambda API.

In [None]:
import os
import re
import json
import time
import requests

from pathlib import Path

from dotenv import load_dotenv
from pydantic import BaseModel, Field, ValidationError

from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser

from in2lambda.api.module import Module
from in2lambda.api.question import Question
from in2lambda.api.part import Part

from PIL import Image

# Load environment variables from .env file.
load_dotenv()

# scanning/processing the initial pdf into markdown

In [None]:
# Mathpix credentials, read from the process environment (load_dotenv() is called earlier in this file).
# os.getenv returns None when a variable is not set.
MATHPIX_API_KEY = os.getenv("MATHPIX_API_KEY")
MATHPIX_APP_ID = os.getenv("MATHPIX_APP_ID")

def pdf_to_markdown(source_path: str, result_path: str) -> bool:
    """
    Convert the PDF at `source_path` to a Markdown file at `result_path` using the Mathpix API.

    The PDF is uploaded to the Mathpix `v3/pdf` endpoint, then the `.md` result for the
    returned `pdf_id` is polled until it is available or the retry budget is used up.

    Args:
        source_path: Path of the PDF to upload.
        result_path: Path the downloaded Markdown is written to.

    Returns:
        True if the Markdown was downloaded and written, False if it was still not
        available after all polling attempts (a message is printed in that case).

    Raises:
        RuntimeError: If the upload response is not JSON or does not contain a `pdf_id`.
    """
    headers = {
        "app_id": MATHPIX_APP_ID,
        "app_key": MATHPIX_API_KEY,
    }

    # Only the upload needs the file handle, so it is closed before polling starts.
    with open(source_path, "rb") as file:
        r = requests.post(
            "https://api.mathpix.com/v3/pdf",
            headers=headers,
            files={"file": file},
        )

    # Validate the upload reply instead of failing later with a bare KeyError.
    try:
        upload_info = r.json()
    except ValueError as err:
        raise RuntimeError(
            f"Mathpix upload returned a non-JSON response ({r.status_code}): {r.text}"
        ) from err

    pdf_id = upload_info.get("pdf_id") if isinstance(upload_info, dict) else None
    if not pdf_id:
        raise RuntimeError(f"Mathpix upload failed ({r.status_code}): {upload_info}")

    print("PDF ID:", pdf_id)
    print("Response:", upload_info)

    url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.md"

    max_retries = 10
    retry_delay = 5  # seconds
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Save the result if the request is successful
            with open(result_path, "w") as f:
                f.write(response.text)
            print("Downloaded MD successfully.")
            return True

        print(f"Attempt {attempt + 1}/{max_retries}: Processing not complete. Retrying in {retry_delay} seconds...")
        time.sleep(retry_delay)

    print("Failed to retrieve processed PDF after multiple attempts:", response.status_code, response.text)
    return False

# setting up the directories

In [None]:
folder_path = "conversion_content"
output_path = f"{folder_path}/mathpix_to_llm_to_in2lambda_to_JSON_out"
media_path = f"{output_path}/media"

# parents=True also creates output_path and folder_path when they are missing.
Path(media_path).mkdir(parents=True, exist_ok=True)

source_path = f"{folder_path}/example.pdf"
result_path = f"{output_path}/example.md"

# Only activate mathpix if the markdown has not been created yet.
# This avoids unnecessary reprocessing of the same PDF.
if not Path(result_path).exists():
    if Path(source_path).exists():
        pdf_to_markdown(source_path, result_path)
    else:
        print(f"Error: Source PDF file not found at {source_path}")
        # Raise SystemExit directly: the `exit` helper is provided by the `site` module /
        # the interactive shell and is not a dependable way to stop a program.
        raise SystemExit(1)

# The markdown may still be missing here if the conversion above did not produce a file.
try:
    with open(result_path, "r") as f:
        md_content = f.read()
except FileNotFoundError:
    print(f"Error: Markdown file not found at {result_path}")
    raise SystemExit(1)

# Print out a summary.
print("Markdown text: ")
print(f"  {result_path}: {len(md_content)} characters")
print("Markdown text: ")
print(f"  {result_path}: {md_content}")

# downloading extracted images from Mathpix

In [None]:
# Fetch the figures from the paper and answers.
def extract_figures_from_text(text):
    """
    Collect the remote images referenced in Markdown `text` and download them.

    Every `![alt](url)` reference is matched. If the reference is followed by a caption
    of the form `Figure Q<n> - <description>`, the `Q<n>` tag is stored as the title and
    the description as the label; otherwise both are empty strings.

    Args:
        text: Markdown content to scan.

    Returns:
        A dict keyed by the basename of each URL (query string included). Each value
        holds "image" (the loaded PIL image), "title", "label", "url" and an empty
        "local_path". References that do not start with "http", and images whose
        download or decoding fails, are left out.
    """
    # Local import so the block does not rely on a new file-level import.
    from io import BytesIO

    figures = {}
    # Regex to match figure references and capture the URL inside the parentheses
    pattern = r'!\[.*?\]\((.*?)\)'
    matches = re.findall(pattern, text)
    print(f"Matches found: {matches}")

    for match in matches:
        url = match.strip()
        figure_caption_pattern = rf'\({re.escape(url)}\)\s*-?\s*Figure\s+(Q\d+)\s*-\s*(.+?)\n'
        caption_match = re.search(figure_caption_pattern, text)

        if caption_match:
            title, description = caption_match.groups()
            print("Caption match found")
        else:
            title, description = "", ""

        if not url.startswith("http"):
            continue

        # Read the decoded body rather than the raw socket stream, and check the HTTP
        # status, so an error page is reported as a download failure.
        try:
            reply = requests.get(url)
            reply.raise_for_status()
            image = Image.open(BytesIO(reply.content))
            # Decode now so a corrupt image is detected here instead of when saving.
            image.load()
        except (requests.RequestException, OSError) as err:
            # One bad figure should not abort the whole extraction.
            print(f"Warning: could not download figure {url}: {err}")
            continue

        # Create a figure name based on the URL
        fig_name = os.path.basename(url)
        figures[fig_name] = {
            "image": image,
            "title": title.strip(),
            "label": description.strip(),
            "url": url,
            "local_path": "",
        }
    return figures

# Scan the markdown for image references and download the remote ones.
# `figures` is a dictionary storing the image and its metadata for each figure.
figures = extract_figures_from_text(md_content)

# saving the images locally

In [None]:
def save_figures_to_path(figures):
    """
    Save each figure's image under the media directory and record its file name.

    The file name is the figure's position in `figures`, an underscore, and the
    dictionary key with everything from the first "?" removed. The name is stored in
    the entry's "local_path" before saving is attempted; a failed save is only printed.

    Args:
        figures: Dict of figure entries with "image", "title", "label" and "url" keys.
    """
    for position, (key, entry) in enumerate(figures.items()):
        print(f"FIGURE Title='{entry['title']}', Label='{entry['label']}', URL='{entry['url']}'")
        # partition() yields the whole key as the head when there is no "?" in it.
        name_without_query = key.partition("?")[0]
        image_name = f"{position}_{name_without_query}"

        entry["local_path"] = image_name
        try:
            entry["image"].save(f"{media_path}/{entry['local_path']}")
            print(f"Saved image: {entry['local_path']}")
        except Exception as e:
            print(f"Error saving image {image_name}: {e}")

# Write the downloaded figures to media_path; each entry's "local_path" is filled in as a side effect.
save_figures_to_path(figures)

# replacing url for images with local path

In [None]:
def replace_figures_in_markdown(md_content, figures):
    """
    Point the image references in the markdown at the locally saved files.

    Empty alt texts (`![]`) become `![pictureTag]` so Lambda Feedback recognises the
    reference as a picture, and every figure URL is replaced by that figure's
    "local_path". The result is also written to `{output_path}/example.md`; a failure
    to write is printed and does not stop the function.

    Args:
        md_content: Markdown text containing the original image URLs.
        figures: Dict of figure entries providing "url" and "local_path".

    Returns:
        The modified markdown text.
    """
    # add pictureTag for Lambda Feedback to recognise it as a picture
    md_content = md_content.replace("![]", "![pictureTag]")
    for fig_info in figures.values():
        md_content = md_content.replace(fig_info["url"], fig_info["local_path"])
        print(f"Replaced {fig_info['url']} with {fig_info['local_path']} in markdown content.")
    # Save the modified markdown content to a file
    try:
        with open(f"{output_path}/example.md", "w") as f:
            f.write(md_content)
        print("Modified markdown saved successfully.")
    except Exception as e:
        print(f"Error saving modified markdown: {e}")
    return md_content

# Rebind md_content: strings are immutable, so without this the later cells would
# still work on the markdown that contains the remote URLs.
md_content = replace_figures_in_markdown(md_content, figures)

# Initialising llm

In [None]:
# Set up the LLM via LangChain.
# Both clients take their key from the OPENAI_API_KEY environment variable;
# os.environ[...] raises KeyError if it is not set.

# Uses gpt-4.1-nano:
#    - a faster model
#    - less intelligent
# Used in this file for the markdown clean-up and the per-part formatting check.

llm_nano = ChatOpenAI(
            model="gpt-4.1-nano",
            api_key=os.environ["OPENAI_API_KEY"],
        )

# Uses gpt-4o-mini:
#    - more intelligent
# Used in this file for extracting the questions, their parts and the part solutions.
llm_mini = ChatOpenAI(
            model="gpt-4o-mini",
            api_key=os.environ["OPENAI_API_KEY"],
        )

# Spelling and structure check

In [None]:
# Instruction block for the clean-up pass; it is interpolated into the prompt built by
# correct_mistakes_in_markdown. The text is sent to the LLM as written.
llm_task_correct_mistakes = """
The input is a markdown file that is converted from a pdf using Mathpix API.
The pdf contains questions and may contain the solutions too.
As the original pdf may contain hand written text, the markdown file may contain mistakes in spelling, grammar and structure.
Your task is to:
    1. Correct any spelling mistakes in the markdown file.
    2. Correct any grammar mistakes in the markdown file.
    3. Correct any structure mistakes in the markdown file, such that it follows the styles of the entire markdown file.
    4. Do not change the content of the markdown file, only correct the mistakes.
Output only a valid markdown file with the corrections applied, if any. Do not add any additional text or comments.
"""

def correct_mistakes_in_markdown(md_content: str) -> str:
    """
    Ask the nano model to fix spelling, grammar and structure in the markdown.

    Args:
        md_content: Markdown text to clean up.

    Returns:
        The content of the model's reply with surrounding whitespace removed.
    """
    request_text = f"""
        {llm_task_correct_mistakes}

        ```input
        {md_content}
        ```

        Return the markdown now.
    """

    reply_text = llm_nano.invoke(request_text).content
    return reply_text.strip()

# Extract Questions and Parts

In [None]:
# Initial question model: one whole question with its whole solution, before any split into parts.
# NOTE(review): the Field descriptions presumably reach the LLM through
# parser.get_format_instructions() - treat them as prompt text when editing.
class QuestionModel(BaseModel):
    # full question and full solution
    question_content: str = Field(..., description="The content of the question.")
    solution_content: str = Field(..., description="The content of the solution.")
    # image references that belong to this question
    images: list[str] = Field(..., description="A list of image URLs associated with the question.")

# Top-level schema parsed by extract_questions: set metadata plus every extracted question.
class AllQuestionsModel(BaseModel):
    name: str = Field(..., description="Title of the set")
    # the year is kept as a string
    year: str = Field(..., description="Year of the set")
    questions: list[QuestionModel] = Field(..., description="A list of questions.")

# Instruction block for the first extraction pass (used by extract_questions).
# This is a regular (non-raw) string, so a backslash meant for the LLM has to be doubled:
# "\\n" below reaches the model as the two characters backslash + n, and "\\\\" as two backslashes.
# NOTE(review): the lone "\\" in rule 6.3 reaches the model as a single backslash - confirm that is intended.
llm_task_seperate_questions = """
    Your task is to extract all the individual questions and their worked solutions from the markdown content.
    please follow these steps carefully:
        1. you can choose the name of "AllQuestionModel".
        2. Identify the year of the tutorial, if mentioned. Otherwise, use "0".
        3. Every character should match the original source exactly unless you're instructed to split content into fields, without adding escapes or modifications.
        4. Look through the entire markdown:
            - Do not neglect any images, figures, or other media mentioned in the question, do not alter or neglect the alt text and the image URL.
            - Leave the Image links and alt text within the question/solution, but also make a copy and place it into the `images` field.
            - Identify full Questions, place it into question_content, becareful to not Include the solution in the question.
            - Identify the full Worked Solution for each full Question.
            - If the Worked Solution is not found, try to find the Answers associated with it instead.
            - If Worked Solution or Answers are found, place it into the solution_content. Otherwise leave as empty string, "".
            - For each question and corresponding solution, extract all image references (like ![pictureTag](filename.jpg)) and place them into the `images` field. If no images, use empty array [].
        5. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately
            - NO markdown code blocks, NO extra text, NO explanations.
            - Use plain newlines (not escaped as `\\n`).
            - In JSON strings, backslashes must be escaped. Use \\\\ for LaTeX backslashes.
            - Always have each field in the JSON, even if it is empty.
            - Becareful that the last element of a list is not followed by a comma.
        6. The Text inside the JSON should be in Lexdown:
            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.
            2. Do not remove or collapse blank lines.
            3. Do not escape characters like `\\n` or `\\` except for JSON requirements.
    """

def extract_questions(doc_page_content: str) -> dict:
    """
    Ask the LLM to split the markdown into whole questions with their solutions.

    The reply is parsed against AllQuestionsModel; the call is repeated when the reply
    cannot be parsed.

    Args:
        doc_page_content: Markdown content of the question set.

    Returns:
        The parsed AllQuestionsModel as a dictionary (keys "name", "year", "questions").

    Raises:
        Exception: If no reply could be parsed after three attempts. The last parsing
            error is attached as the cause.
    """
    # Initialise the parser for the output.
    parser = PydanticOutputParser(pydantic_object=AllQuestionsModel)

    prompt = f"""
        Your task is to extract a JSON with the following structure exactly:
        {parser.get_format_instructions()}

        {llm_task_seperate_questions}

        Input markdown:
        ```
        {doc_page_content}
        ```
        Return the JSON now.
    """

    max_attempts = 3
    last_error = None

    # tries to call the LLM multiple times to ensure robustness.
    for attempt_idx in range(max_attempts):
        # Call the LLM
        response = llm_mini.invoke(prompt)

        try:
            # Parse the response using the output parser.
            parsed_output = parser.parse(response.content.strip())
        except Exception as e:
            last_error = e
            # Report why parsing failed instead of discarding the error.
            print(f"Error parsing LLM response as JSON: {e}")
            if attempt_idx + 1 < max_attempts:
                print("Retrying... Attempt No.", attempt_idx + 1)
                time.sleep(2)
        else:
            print("LLM response successfully parsed as JSON with questions:")
            print(response.content)
            # For Pydantic v2, use model_dump() to convert the model to a dictionary.
            return parsed_output.model_dump()

    print("Final raw LLM Response:")
    print(repr(response.content))
    raise Exception("Failed to parse LLM response as JSON after multiple attempts.") from last_error


In [None]:
# Define the schema for the tutorial output.
# Set_Question is what the LLM returns when one whole question is split into a stem and its parts.
class Set_Question(BaseModel):
    title: str = Field(..., description="Title of the question (only the text, no numbering)")
    # the stem of the question, without its parts
    content: str = Field(..., description="Content of the question (no exercise title, no subquestions)")
    parts: list[str] = Field(..., description="List of parts within the question (only the text, no numbering)")
    images: list[str] = Field(..., description="List of image URLs associated with the question (no alt text, only URLs)")

# Schema for the LLM reply that carries the worked solution of a single part.
class Set_Solution_Part(BaseModel):
    part_solution: str = Field(..., description="The worked solution for the part (no numbering or counting)")

class Set_Solution(BaseModel):
    parts_solutions: list[str] = Field(..., description="List of worked solutions for the question (no numbering or counting)")

    def __init__(self, parts_solutions: list[Set_Solution_Part]):
        """
        Build the solution from the per-part solution objects.

        Only the text of each object is kept, so the stored field is a list of strings.

        Args:
            parts_solutions (list[Set_Solution_Part]): The worked solutions for the parts.
        """
        solution_texts = []
        for part in parts_solutions:
            solution_texts.append(part.part_solution)
        super().__init__(parts_solutions=solution_texts)

class Set_Question_With_Solution(Set_Question):
    parts_solutions: list[str] = Field(..., description="The worked solution for the parts.")

    def __init__(self, question: Set_Question, solution: Set_Solution):
        """
        Combine a split question with the solutions of its parts.

        Args:
            question (Set_Question): Supplies every Set_Question field.
            solution (Set_Solution): Supplies the `parts_solutions` list.
        """
        question_fields = question.model_dump()
        super().__init__(**question_fields, parts_solutions=solution.parts_solutions)


# Final container returned (as a dict) by extract_parts_question: set metadata plus the split questions.
class Set(BaseModel):
    name: str = Field(..., description="Title of the set")
    year: str = Field(..., description="Year of the set")
    questions: list[Set_Question_With_Solution] = Field(..., description="List of questions in the set")


# TODO: make parts completely seperate from stem to question
# ensure no answer in question itself, only in parts_solutions.
# Instruction block for splitting one question into a stem and parts.
# This is a regular (non-raw) string, so a backslash meant for the LLM has to be doubled:
# "\\n" below reaches the model as the two characters backslash + n, and "\\\\" as two backslashes.
# NOTE(review): the lone "\\" in rule 6.3 reaches the model as a single backslash - confirm that is intended.
llm_task_seperate_parts_question = """
    Your task is to seperate the questions into individual parts.
    Please follow these steps carefully:
        1. Every character should match the original source exactly unless you're instructed to split content into fields, without adding escapes or modifications.
        2. Use the same name and year.
        3. Use the same list of images as in the input for each question.
        4. For each question in questions:
            - Title is the only field where you are allowed to name it whatever you seem fit for the question.
            - Do not neglect any images, figures, or other media mentioned in the question, do not alter or neglect the alt text and the image URL.
            - You may use the question_content and solution_content from the input, to help with knowing how to split the question into parts. Making sure that neither content nor parts contain any solution/answers.
            - Try to get as many parts as possible, but do not split the question into too many parts.
            - The parts may be obvious to find, like "a)...", "b)...", or, "i)...", "ii)...", etc, or they could be implied by the question itself. All question must have at least one part, if there is only one part.
                1. The stem should be placed into the "content" field. Text in this field should be valid in the Milkdown editor. 
                2. the parts of the question (subquestions) should be placed into the "parts" field. Text in this field should be valid under Lexdown.
        5. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately
            - NO markdown code blocks, NO extra text, NO explanations.
            - Use plain newlines (not escaped as `\\n`).
            - In JSON strings, backslashes must be escaped. Use \\\\ for LaTeX backslashes.
            - Always have each field in the JSON, even if it is empty.
            - Becareful that the last element of a list is not followed by a comma.
        6. The Text inside the JSON should be in Lexdown:
            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.
            2. Do not remove or collapse blank lines.
            3. Do not escape characters like `\\n` or `\\` except for JSON requirements.
    """

# Instruction block for extracting the solution that belongs to one part.
# This is a regular (non-raw) string, so a backslash meant for the LLM has to be doubled:
# "\\n" below reaches the model as the two characters backslash + n, and "\\\\" as two backslashes.
# NOTE(review): the lone "\\" in rule 5.3 reaches the model as a single backslash - confirm that is intended.
llm_task_seperate_parts_solution = """
    Your task is to extract the solution for each part of a question, there should be an equal number of solutions as there are parts in the question.
    Please follow these steps carefully:
        1. Every character should match the original source exactly unless you're instructed to split content into fields, without adding escapes or modifications.
        2. Use the same list of images as in the input for each question.
        3. For each parts of the question (subquestions):
            - Carefully try to find the solution for each part, and place it into the "part_solution" field. Otherwise, leave as empty string. Text in this field should be valid under Lexdown.
            - Make sure that the solution is only for the particular part.
        4. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately
            - NO markdown code blocks, NO extra text, NO explanations.
            - Use plain newlines (not escaped as `\\n`).
            - In JSON strings, backslashes must be escaped. Use \\\\ for LaTeX backslashes.
            - Always have each field in the JSON, even if it is empty.
            - Becareful that the last element of a list is not followed by a comma.
        5. The Text inside the JSON should be in Lexdown:
            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.
            2. Do not remove or collapse blank lines.
            3. Do not escape characters like `\\n` or `\\` except for JSON requirements.
    """

def _parse_llm_reply(prompt: str, parser, target: str, max_attempts: int = 3, retry_delay: float = 2):
    """Send `prompt` to llm_mini until `parser` accepts the reply; raise after `max_attempts` failures."""
    last_error = None
    for attempt_idx in range(max_attempts):
        # Call the LLM
        response = llm_mini.invoke(prompt)

        try:
            # Parse the response using the output parser.
            return parser.parse(response.content)
        except Exception as e:
            last_error = e
            # Show the reason and the offending reply so a failed run can be diagnosed.
            print(f"Error parsing LLM response as JSON for {target}: {e}")
            print(f"Outputted response:\n{response.content}")
            if attempt_idx + 1 < max_attempts:
                print(f"Retrying... Attempt No.{attempt_idx + 1}")
                time.sleep(retry_delay)

    print("Final LLM Response:")
    print(response.content)
    raise Exception(
        f"Failed to parse LLM response as JSON after multiple attempts for {target}."
    ) from last_error


def extract_parts_question(questions_dict: dict) -> dict:
    """
    Split every extracted question into a stem and parts, and find a solution per part.

    For each entry of `questions_dict["questions"]` the LLM is first asked for a
    Set_Question (title, stem, parts, images). Then, for each returned part, it is asked
    for the matching piece of the question's "solution_content".

    The returned dictionary has the following structure:

    {
        "name": "<name taken from the input>",
        "year": "<year taken from the input>",
        "questions": [
            {
                "title": "...",
                "content": "stem of the question",
                "parts": ["part text 1", "part text 2", ...],
                "images": ["...", ...],
                "parts_solutions": ["solution of part 1", "solution of part 2", ...]
            },
            ...
        ]
    }

    Args:
        questions_dict (dict): Dictionary with the keys "name", "year" and "questions";
            each question must provide "solution_content".

    Returns:
        dict: The assembled Set converted to a dictionary.

    Raises:
        Exception: If a reply for a question or for one of its parts cannot be parsed
            after three attempts.
    """
    # The parsers do not depend on the question, so they are created once.
    question_parser = PydanticOutputParser(pydantic_object=Set_Question)
    solution_parser = PydanticOutputParser(pydantic_object=Set_Solution_Part)

    questions_in_parts = []
    for question_idx, question in enumerate(questions_dict["questions"]):
        question_label = f"question {question_idx + 1}"

        # Serialise the question as real JSON: the block is labelled JSON, and a Python
        # dict repr (single quotes) is not valid JSON.
        question_prompt = f"""
                Your task is to extract a JSON with the following structure exactly:
                {question_parser.get_format_instructions()}

                {llm_task_seperate_parts_question}

                Input Dictionary:
                ```JSON
                {json.dumps(question, indent=2, ensure_ascii=False)}
                ```

                Return the JSON now.
                """

        parsed_output_parts = _parse_llm_reply(question_prompt, question_parser, question_label)
        print(f"LLM response successfully parsed question {question_idx + 1}.")
        print(parsed_output_parts.content)

        solutions_parts = []
        for part_idx, part in enumerate(parsed_output_parts.parts):
            solution_prompt = f"""
                    Your task is to extract a JSON with the following structure exactly:
                    {solution_parser.get_format_instructions()}

                    {llm_task_seperate_parts_solution}

                    full solution:
                    {question["solution_content"]}

                    question part:
                    {part}
                    """

            parsed_output_solution_part = _parse_llm_reply(
                solution_prompt,
                solution_parser,
                f"part {part_idx + 1} of {question_label}",
            )
            print(f"LLM response successfully parsed solution for part {part_idx + 1} of question {question_idx + 1}.")
            print(parsed_output_solution_part.part_solution)

            solutions_parts.append(parsed_output_solution_part)

        set_solution = Set_Solution(parts_solutions=solutions_parts)
        set_question_with_solution = Set_Question_With_Solution(
            question=parsed_output_parts,
            solution=set_solution
        )
        questions_in_parts.append(set_question_with_solution)

    # For Pydantic v2, use model_dump() to convert the model to a dictionary.
    return Set(
        name=questions_dict["name"],
        year=questions_dict["year"],
        questions=questions_in_parts
    ).model_dump()


# LLM evaluation of the content of JSON

In [None]:
# Define the PartModel for validation
# One part text together with its solution; content_texdown_check parses each LLM reply against it.
class PartModel(BaseModel):
    part_text: str = Field(..., description="The text of the part")
    part_solution: str = Field(..., description="The solution for the part")

# Instruction block for the per-part formatting check (used by content_texdown_check).
# Raw string (r prefix): every backslash below reaches the LLM exactly as typed.
llm_task_expression_check = r"""
    Look inside the structure, specifically the `part_text` and `part_solution` fields. Ensure that the JSON content follows these rules:
        1. JSON escaping: In JSON strings, backslashes must be escaped. Use \\\\ for LaTeX backslashes (e.g., "$A \\\\cup B$" not "$A \\cup B$").
        2. Math delimiters: All LaTeX math commands and math macros must be fully enclosed within math delimiters — use `$...$` for inline math, and `$$...$$` for display math.
        3. Balanced delimiters:
            - All `$$` and `$` must be properly opened and closed.
            - No unbalanced or partial math blocks.
        4. Display math formatting:
            - The opening `$$` must appear on a new line.
            - The closing `$$` must also be on its own new line.
            - The math content must appear immediately between them, with no extra blank lines unless they are part of the input.
        5. Inline math rules:
            - `$...$` should not span multiple lines.
            - Avoid using `$$` for short inline expressions.
        6. Preserve LaTeX syntax:
            - All LaTeX commands, braces (`{}`, `[]`), and special characters must be preserved exactly as in the original input.
            - Remember: in JSON, use \\\\ for each LaTeX backslash.
        7. Blank lines:
            - Preserve all blank lines inside math blocks.
            - Outside math, follow the structure of the original input.
        8. Alt text and image URLs:
            - Ensure that all image URLs and alt text are preserved as they appear in the original input.
            - The alt text must be `pictureTag`.
        9. Output format:
            - Output a single valid JSON string.
            - Do not include any extra characters, explanations, or escaped formatting outside the JSON structure.
            - No literal \\n sequences - use actual newlines in JSON strings.
    """


def content_texdown_check(validated_dict: dict) -> dict:
    """
    Run every part and its solution through the LLM formatting check, one part at a time.

    Parts and solutions are paired with zip(), so entries without a counterpart are
    dropped. Title, content and images are copied from the input without being checked.

    Args:
        validated_dict (dict): Dictionary with the keys "name", "year" and "questions".

    Returns:
        dict: A dictionary with the keys "name", "year" and "questions"; each question
              holds "title", "content", "parts", "parts_solutions" and "images".

    Raises:
        Exception: If the reply for a part cannot be parsed in three attempts.
    """
    part_parser = PydanticOutputParser(pydantic_object=PartModel)

    checked_questions = []
    for question_idx, question in enumerate(validated_dict["questions"]):
        fixed_texts = []
        fixed_solutions = []

        part_pairs = zip(question.get("parts", []), question.get("parts_solutions", []))
        for part_idx, (part_text, part_solution) in enumerate(part_pairs):
            # The request is the same for every attempt, so it is built once per part.
            part_data = {
                "part_text": part_text,
                "part_solution": part_solution
            }
            validation_prompt = f"""
                    Your task is to extract a JSON with the following structure exactly:
                    {part_parser.get_format_instructions()}

                    {llm_task_expression_check}

                    Input Part:
                    ```json
                    {json.dumps(part_data, indent=2)}
                    ```
                    return the JSON with the content fixed if needed.
                    """

            # up to three attempts per part
            for attempt_idx in range(3):
                response = llm_nano.invoke(validation_prompt)

                try:
                    checked_part = part_parser.parse(response.content)
                    print(f"LLM response successfully parsed as JSON with valid $$ for question {question_idx + 1}, part {part_idx + 1}.")
                    fixed_texts.append(checked_part.part_text)
                    fixed_solutions.append(checked_part.part_solution)
                    break
                except ValidationError as ve:
                    print(f"Validation error for question {question_idx + 1}, part {part_idx + 1}: {ve}")
                    print("Retrying... Attempt No.", attempt_idx + 1)
                    print(ve.errors())
                    time.sleep(2)
                except Exception as e:
                    print("LLM Response:")
                    print(response.content)
                    print(f"Error parsing textdown LLM response as JSON for question {question_idx + 1}, part {part_idx + 1}: {e}")
                    print("Retrying... Attempt No.", attempt_idx + 1)
                    time.sleep(2)
            else:
                # Reached only when no attempt ended with `break`.
                print("Final LLM Response:")
                print(response.content)
                raise Exception(f"Failed to parse LLM response as JSON after multiple attempts for question {question_idx + 1}, part {part_idx + 1}.")

        # Rebuild the question around the checked parts.
        checked_questions.append({
            "title": question.get("title", ""),
            "content": question.get("content", ""),
            "parts": fixed_texts,
            "parts_solutions": fixed_solutions,
            "images": question.get("images", [])
        })

    return {
        "name": validated_dict["name"],
        "year": validated_dict["year"],
        "questions": checked_questions
    }

In [None]:
def md_to_json(md_content: str) -> dict:
    """
    Run the whole LLM pipeline on the markdown of one question set.

    The stages are: clean up the markdown, extract whole questions, split them into
    parts with solutions, then check the formatting of every part.

    Args:
        md_content (str): The content of a set.

    Returns:
        dict: The dictionary returned by content_texdown_check
              (keys "name", "year" and "questions").
    """
    cleaned_markdown = correct_mistakes_in_markdown(md_content)
    print("Markdown content corrected for spelling, grammar, and structure.")

    whole_questions = extract_questions(cleaned_markdown)
    print("successfully extracted the questions from the markdown. Now extracting the parts...")

    split_questions = extract_parts_question(whole_questions)
    print("succesfully extracted the parts from the questions. Now validating the content...")

    checked_questions = content_texdown_check(split_questions)
    print("successfully validated the content.")
    print("successfully converted markdown to JSON.")

    return checked_questions

In [None]:
# Run the full markdown -> dictionary pipeline; this makes several LLM calls.
imported_tutorial = md_to_json(md_content)

In [None]:
# Build the display title from the name and the year of the set.
title = " ".join((imported_tutorial["name"], imported_tutorial["year"]))

# Show the title
print(f"Title: {title}\n")

# Pull out the questions and dump the raw list first
questions = imported_tutorial["questions"]

print(questions)

# Then print each question with its parts and worked solutions
for idx1, question in enumerate(questions, start=1):
    print(f"**Question {idx1}**:\n{question.get('title')}\n")
    print(f"Content: {question.get('content')}\n")
    for idx2, (part, part_answer) in enumerate(zip(question.get("parts", []), question.get("parts_solutions", [])), start=1):
        # One print call; sep="\n" puts each item on its own line.
        print(
            f"Question {idx1}:",
            f"- Subquestion {idx2}: {part}",
            f"- Worked Solution {idx2}: {part_answer}",
            "\n",
            sep="\n",
        )
    print("-" * 40)  # Separator for readability

# Form JSON Schemas

In [None]:
questions = imported_tutorial["questions"]

in2lambda_questions = []

# Captures the target of a markdown image reference such as ![pictureTag](0_figure.jpg).
markdown_image = re.compile(r'!\[.*?\]\((.*?)\)')

# Loop over all questions and question_answers and use in2lambda API to create a JSON.
for idx, question_dict in enumerate(questions, start=1):
    parts = [
        Part(text=part_question, worked_solution=part_solution)
        for part_question, part_solution in zip(
            question_dict.get("parts", []), question_dict.get("parts_solutions", [])
        )
    ]

    # Handle image paths - ensure they exist
    image_paths = []
    for img in question_dict.get("images", []):
        # The first extraction prompt asks for full references like
        # ![pictureTag](file.jpg), so accept both that form and a bare file name.
        wrapped = markdown_image.search(img)
        img = (wrapped.group(1) if wrapped else img).strip()
        if img.startswith("http"):
            # Skip URLs that weren't processed
            continue
        full_path = f"{media_path}/{img}"
        if Path(full_path).exists():
            image_paths.append(full_path)
        else:
            print(f"Warning: Image file not found: {full_path}")

    question = Question(
        # `or` also covers an empty title, not only a missing key.
        title=question_dict.get("title") or f"Question {idx}",
        main_text=question_dict.get("content", ""),
        parts=parts,
        images=image_paths
    )
    in2lambda_questions.append(question)

try:
    Module(in2lambda_questions).to_json(f"{output_path}/out")
    print("JSON output successfully created.")
except Exception as e:
    print(f"Error creating JSON output: {e}")