# process description  

The program takes in a PDF.  
Mathpix is used to scan the PDF and turn it into markdown.  
The markdown is then processed to extract the images.  

An LLM is used to extract the questions and solutions in **ONE** go.  
the final JSON is made using the in2lambda api.

In [None]:
import os
import re
import json
import time
import requests
import concurrent.futures

from pathlib import Path

from dotenv import load_dotenv
from pydantic import BaseModel, Field, ValidationError

from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser

from in2lambda.api.module import Module
from in2lambda.api.question import Question
from in2lambda.api.part import Part

from PIL import Image

# Load environment variables from .env file.
load_dotenv()

# scanning/processing the initial pdf into markdown

In [None]:
# Mathpix credentials, looked up in the process environment.
# Each one is None when the corresponding variable is not set.
MATHPIX_API_KEY = os.environ.get("MATHPIX_API_KEY")
MATHPIX_APP_ID = os.environ.get("MATHPIX_APP_ID")

def pdf_to_markdown(source_path: str, result_path: str):
    """
    Convert the PDF at `source_path` to a markdown file at `result_path` using the Mathpix API.

    The PDF is uploaded to the Mathpix `v3/pdf` endpoint. The `.md` result is
    then polled (up to 10 attempts, 5 seconds apart) and written to
    `result_path` once it is available. If it never becomes available, a
    failure message is printed and no file is written.

    Args:
        source_path: Path of the PDF to upload.
        result_path: Path the downloaded markdown is written to.

    Raises:
        RuntimeError: If the upload response does not contain a `pdf_id`.
        requests.RequestException: If a request fails or times out.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    request_timeout = 60  # seconds
    headers = {
        "app_id": MATHPIX_APP_ID,
        "app_key": MATHPIX_API_KEY,
    }

    # The file handle is only needed for the upload, so it is closed before polling starts.
    with open(source_path, "rb") as file:
        r = requests.post(
            "https://api.mathpix.com/v3/pdf",
            headers=headers,
            files={"file": file},
            timeout=request_timeout,
        )

    # Parse the response once and fail with a readable message when the upload was rejected.
    upload_info = r.json()
    if "pdf_id" not in upload_info:
        raise RuntimeError(f"Mathpix upload did not return a pdf_id: {upload_info}")
    pdf_id = upload_info["pdf_id"]
    print("PDF ID:", pdf_id)
    print("Response:", upload_info)

    url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.md"

    max_retries = 10
    retry_delay = 5  # seconds
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers, timeout=request_timeout)
        if response.status_code == 200:
            # Save the result if the request is successful
            with open(result_path, "w") as f:
                f.write(response.text)
            print("Downloaded MD successfully.")
            break
        else:
            print(f"Attempt {attempt + 1}/{max_retries}: Processing not complete. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
    else:
        # Reached only when no attempt returned HTTP 200.
        print("Failed to retrieve processed PDF after multiple attempts:", response.status_code, response.text)

# setting up the directories

In [None]:
# Folder layout: the input PDF lives in `folder_path`, generated files go to
# `output_path`, and saved figures go to `media_path`.
folder_path = "conversion_content"
output_path = f"{folder_path}/mathpix_to_llm_to_in2lambda_to_JSON_out"
media_path = f"{output_path}/media"

# Create the media folder (and, via parents=True, the output folder) if missing.
Path(media_path).mkdir(parents=True, exist_ok=True)

source_path = f"{folder_path}/example.pdf"
result_path = f"{output_path}/example.md"

# Only activate mathpix if the markdown has not been created yet.
# This avoids unnecessary reprocessing of the same PDF.
if not Path(result_path).exists():
    if Path(source_path).exists():
        pdf_to_markdown(source_path, result_path)
    else:
        print(f"Error: Source PDF file not found at {source_path}")
        exit(1)

# Load the markdown. pdf_to_markdown only prints a message when the download
# fails, so the file may still be missing here even though the PDF existed.
try:
    with open(result_path, "r") as f:
        md_content = f.read()
except FileNotFoundError:
    print(f"Error: Markdown file not found at {result_path}")
    exit(1)

# Print out a summary: first the length, then the full markdown text.
print("Markdown text: ")
print(f"  {result_path}: {len(md_content)} characters")
print("Markdown text: ")
print(f"  {result_path}: {md_content}")

# downloading extracted images from Mathpix

In [None]:
# Fetch the figures from the paper and answers.
def extract_figures_from_text(text): #, ans=False):
    """
    Find the markdown image references in `text` and download every remote image.

    Each `![alt](url)` reference whose URL starts with "http" is downloaded
    and opened with PIL. A caption of the form `Figure Q<n> - <description>`
    directly after the reference, when present, supplies the title and label.
    An image that cannot be downloaded or decoded is skipped with a warning,
    so one bad link does not abort the whole extraction.

    Args:
        text: Markdown content to scan.

    Returns:
        dict: Maps the basename of each URL to a dict with the keys "image",
        "title", "label", "url" and "local_path" (always "" here).
    """
    from io import BytesIO  # local import: only this function needs it

    figures = {}
    # Regex to match image references; the single group captures the URL.
    pattern = r'!\[.*?\]\((.*?)\)'
    matches = re.findall(pattern, text)
    print(f"Matches found: {matches}")

    for match in matches:
        url = match.strip()
        figure_caption_pattern = rf'\({re.escape(url)}\)\s*-?\s*Figure\s+(Q\d+)\s*-\s*(.+?)\n'
        caption_match = re.search(figure_caption_pattern, text)

        if caption_match:
            title, description = caption_match.groups()
            print("Caption match found")
        else:
            title, description = "", ""

        if not url.startswith("http"):
            continue

        try:
            response = requests.get(url, timeout=30)
            # Do not try to decode an HTML error page as an image.
            response.raise_for_status()
            # `response.content` is the fully decoded body, unlike the raw socket stream.
            image = Image.open(BytesIO(response.content))
            # Force decoding now so a corrupt image is reported here rather than at save time.
            image.load()
        except (requests.RequestException, OSError) as e:
            print(f"Warning: could not download image {url}: {e}")
            continue

        # Create a figure name based on the URL
        fig_name = os.path.basename(url)
        figures[fig_name] = {
            "image": image,
            "title": title.strip(),
            "label": description.strip(),
            "url": url,
            "local_path": "",
            # "answerFile": ans
        }
    return figures

# Download every remote figure referenced in the markdown. The resulting
# dictionary maps each figure's name to its image and metadata.
figures = extract_figures_from_text(md_content)

# saving the images locally

In [None]:
def save_figures_to_path(figures):
    """
    Write each figure's image into the media folder and record its file name.

    The file name is the figure's position in the dictionary followed by the
    figure name with everything from the first "?" onwards removed. The name
    is stored under "local_path" before the save is attempted. A failed save
    is reported and does not stop the remaining figures.
    """
    for position, entry in enumerate(figures.items()):
        name, info = entry
        print(f"FIGURE Title='{info['title']}', Label='{info['label']}', URL='{info['url']}'")

        # Drop the URL query string, then prefix the position to form the file name.
        base_name = name.partition("?")[0]
        image_name = f"{position}_{base_name}"
        info["local_path"] = image_name

        try:
            info["image"].save(f"{media_path}/{info['local_path']}")
            print(f"Saved image: {info['local_path']}")
        except Exception as e:
            print(f"Error saving image {image_name}: {e}")

# Save the downloaded figures into the media folder; this also fills in each figure's "local_path".
save_figures_to_path(figures)

# replacing url for images with local path

In [None]:
def replace_figures_in_markdown(md_content, figures):
    """
    Point the markdown's image references at the locally saved files.

    Empty alt texts (`![]`) become `![pictureTag]` so that Lambda Feedback
    recognises them as pictures, and every figure URL is replaced by that
    figure's "local_path". The result is written to the markdown file in the
    output folder and also returned.

    Args:
        md_content: Markdown text to rewrite.
        figures: Figure dictionary whose values hold "url" and "local_path".

    Returns:
        str: The rewritten markdown (returned even when writing the file fails).
    """
    # add pictureTag for Lambda Feedback to recognise it as a picture
    md_content = md_content.replace("![]", "![pictureTag]")
    for fig_info in figures.values():
        local_path = fig_info["local_path"]
        if not local_path:
            # No local file name was recorded: keep the remote URL rather than blanking the link.
            continue
        md_content = md_content.replace(fig_info["url"], local_path)
        print(f"Replaced {fig_info['url']} with {local_path} in markdown content.")
    # Save the modified markdown content to a file
    try:
        with open(f"{output_path}/example.md", "w") as f:
            f.write(md_content)
        print("Modified markdown saved successfully.")
    except Exception as e:
        print(f"Error saving modified markdown: {e}")
    return md_content


# Keep working with the rewritten markdown so that the later steps see the
# local image names instead of the remote URLs.
md_content = replace_figures_in_markdown(md_content, figures)

# Initialising llm

In [None]:
# Set up the LLM clients via LangChain.
# Both read the key with os.environ[...], so a missing OPENAI_API_KEY raises
# KeyError as soon as this cell runs.

# Uses gpt-4.1-nano:
#    - a faster model
#    - less intelligent
# Used by correct_mistakes_in_markdown.
llm_nano = ChatOpenAI(
            model="gpt-4.1-nano",
            api_key=os.environ["OPENAI_API_KEY"],
        )

# Uses gpt-4o-mini:
#    - more intelligent
# Used by the extraction and validation steps.
llm_mini = ChatOpenAI(
            model="gpt-4o-mini",
            api_key=os.environ["OPENAI_API_KEY"],
        )

# Spelling and structure check

In [None]:
# Instructions for the spelling/grammar/layout clean-up pass.
# Interpolated into the prompt built by correct_mistakes_in_markdown.
llm_task_correct_mistakes = """
The input is a markdown file that is converted from a pdf using Mathpix API.
The pdf contains questions and may contain the solutions too.
As the original pdf may contain hand written text, the markdown file may contain mistakes in spelling, grammar and structure.

Important things to remember:
    1. Leave all Math commands and LaTeX formatting the same. As they are completely valid. Do not change the LaTeX formatting and expressions.
    2. Only ever use LaTeX math delimiters for math expressions. I.e. use `$...$` for inline math, and `$$...$$` for display math.
    3. Leave references to images and figures the same. I.e. do not change the image links or alt text.

Your task is to:
    1. Correct any spelling mistakes in the markdown file.
    2. Correct any grammar mistakes in the markdown file.
    3. Correct any layout mistakes in the markdown file, such that it follows the styles of the entire markdown file.
    4. Do not change the content of the markdown file, only correct the mistakes.
Output only a valid markdown file with the corrections applied, if any. Do not add any additional text or comments.
"""

def correct_mistakes_in_markdown(md_content: str) -> str:
    """
    Ask the nano model to fix spelling, grammar and layout mistakes in the markdown.

    Args:
        md_content: Markdown text to clean up.

    Returns:
        str: The model's reply with surrounding whitespace stripped.
    """
    request_text = f"""
        {llm_task_correct_mistakes}

        ```input
        {md_content}
        ```

        Return the markdown now.
    """

    reply = llm_nano.invoke(request_text)
    print("Corrected markdown content:")
    corrected = reply.content.strip()
    print(corrected)

    return corrected

# Extract Questions

In [None]:
# Initial question model: one whole question with its whole solution, before
# either is split into parts. Items of AllQuestionsModel.questions.
class QuestionModel(BaseModel):
    # full question and full solution, each kept as one block of text
    question_content: str = Field(..., description="The content of the question.")
    solution_content: str = Field(..., description="The content of the solution.")
    # image references that belong to this question
    images: list[str] = Field(..., description="A list of image URLs associated with the question.")

# Top-level result of the first extraction pass: the set's name and year plus
# every question found. This is the model extract_questions parses into.
class AllQuestionsModel(BaseModel):
    name: str = Field(..., description="Title of the set")
    year: str = Field(..., description="Year of the set")
    questions: list[QuestionModel] = Field(..., description="A list of questions.")

# Instructions for the first extraction pass (whole questions with whole solutions).
# Interpolated into the prompt built by extract_questions. A raw string, so the
# backslashes in the examples reach the model unchanged.
llm_task_seperate_questions = r"""
    Your task is to extract all individual questions and their worked solutions from the provided markdown content.

    1.  **Content Extraction:**
        -   Identify a suitable `name` for the set of questions.
        -   Identify the `year` if mentioned; otherwise, use "0".
        -   For each question, carefully extract the full question text into `question_content` and the corresponding full solution/answer text into `solution_content`. They may not be in the same section.
        -   If no solution is found, leave `solution_content` as an empty string `""`.
        -   Preserve all image tags like `![pictureTag](filename.jpg)`, making sure they are placed with their respective "question_content" and "solution_content".
        -   For Each Question extract all image references (e.g., `filename.jpg`) found within the `question_content` and `solution_content` and place them in the `images` list.

    2.  **Output Format (Crucial):**
        -   You MUST output ONLY a single, raw, valid JSON string that matches the provided schema.
        -   Do NOT include any explanations, comments, or markdown code blocks (like ```json).

    3.  **JSON Formatting Rules:**
        -   **Escape Backslashes:** All LaTeX backslashes (`\`) MUST be escaped as double backslashes (`\\`). For example, `\cup` must become `\\cup`. This is the most important rule.
        -   **Newlines:** Use `\n` for newlines within the JSON string values.
        -   **Content Integrity:** Preserve all text, LaTeX (`$...$`, `$$...$$`), and image tags perfectly. Do not alter or summarize content.
        -   **Strict Schema:** Ensure the final JSON has no trailing commas and includes all fields, even if they are empty.
    """

def extract_questions(doc_page_content: str) -> dict:
    """
    Ask the LLM to split the markdown into whole questions with their solutions.

    The model is called up to three times; the first response that parses
    into an `AllQuestionsModel` is returned.

    Args:
        doc_page_content: Markdown content of the document.

    Returns:
        dict: The parsed model as a dictionary with the keys "name", "year"
        and "questions".

    Raises:
        Exception: If no response could be parsed after all attempts. The
            last parsing error is attached as the cause.
    """
    # Initialise the parser for the output.
    parser = PydanticOutputParser(pydantic_object=AllQuestionsModel)

    prompt = f"""
        Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:
        {parser.get_format_instructions()}

        {llm_task_seperate_questions}

        Input markdown:
        ```
        {doc_page_content}
        ```
        Return the JSON now.
    """

    max_attempts = 3
    last_error = None

    # tries to call the LLM multiple times to ensure robustness.
    for attempt_idx in range(max_attempts):
        # Call the LLM
        response = llm_mini.invoke(prompt)

        try:
            # Parse the response using the output parser.
            parsed_output = parser.parse(response.content.strip())
        except Exception as e:
            last_error = e
            # Show why parsing failed, not only that it failed.
            print(f"Error parsing LLM response as JSON: {e}")
            print("raw response:")
            print(response.content)
            # Only announce a retry and wait when another attempt will follow.
            if attempt_idx + 1 < max_attempts:
                print("Retrying... Attempt No.", attempt_idx + 1)
                time.sleep(2)
        else:
            print("LLM response successfully parsed as JSON with questions:")
            print(response.content)
            # For Pydantic v2, use model_dump() to convert the model to a dictionary.
            return parsed_output.model_dump()

    print("Final raw LLM Response:")
    print(response.content)
    raise Exception("Failed to parse LLM response as JSON after multiple attempts.") from last_error


# Backslash correction


In [None]:
def escape_latex_backslashes(text: str) -> str:
    r"""
    Double the single backslashes in `text` so LaTeX commands survive JSON parsing.

    Left untouched:
      * an already escaped backslash (`\\`),
      * a newline escape (`\n`),
      * an escaped double quote (`\"`), because doubling its backslash would
        end the surrounding JSON string early and corrupt the document.

    Every other single backslash (for example the one in `\cup`) becomes `\\`.
    LaTeX commands that start with "n" (such as `\nu`) cannot be told apart
    from a newline escape and are therefore not changed.

    Args:
        text: Raw text, typically an LLM response that should contain JSON.

    Returns:
        str: The text with the remaining single backslashes doubled.
    """
    # The first alternative consumes an escaped pair so its second backslash is not matched again.
    return re.sub(r'\\\\|\\(?![n"])', r'\\\\', text)

# Extract question parts and solutions

In [None]:
# Schema of one question after it has been split into a stem and its parts.
# process_single_question parses the LLM's question-splitting response into it.
class Set_Question(BaseModel):
    title: str = Field(..., description="Title of the question (only the text, no numbering)")
    content: str = Field(..., description="Content of the question (no exercise title, no subquestions)")
    parts: list[str] = Field(..., description="List of parts within the question (only the text, no numbering)")
    images: list[str] = Field(..., description="List of image URLs associated with the question (no alt text, only URLs)")

# Worked solution of a single question part.
# process_single_question parses one LLM response per part into this model.
class Set_Solution_Part(BaseModel):
    part_solution: str = Field(..., description="The worked solution for the part (no numbering or counting)")

# All part solutions of one question, stored as plain strings.
class Set_Solution(BaseModel):
    parts_solutions: list[str] = Field(..., description="List of worked solutions for the question (no numbering or counting)")

    def __init__(self, parts_solutions: list[Set_Solution_Part]):
        """
        Initialize the Set_Solution with a list of solutions for each part.

        The constructor takes `Set_Solution_Part` objects, but only the
        `part_solution` text of each one is stored in the field.

        Args:
            parts_solutions (list[Set_Solution_Part]): The worked solutions for the parts.
        """
        super().__init__(parts_solutions=[part.part_solution for part in parts_solutions])

# A split question (all Set_Question fields) together with one worked solution per part.
class Set_Question_With_Solution(Set_Question):
    parts_solutions: list[str] = Field(..., description="The worked solution for the parts.")

    def __init__(self, question: Set_Question, solution: Set_Solution):
        """
        Initialize the Set_Question_With_Solution with a question and its solution.

        The question's fields are copied via `model_dump()` and the
        solution's `parts_solutions` list is added alongside them.

        Args:
            question (Set_Question): The question object.
            solution (Set_Solution): The solution object.
        """
        super().__init__(
            **question.model_dump(),
            parts_solutions=solution.parts_solutions
        )


# The complete question set: name, year and every question with its part solutions.
# extract_parts_question builds it and returns its dictionary form.
class Set(BaseModel):
    name: str = Field(..., description="Title of the set")
    year: str = Field(..., description="Year of the set")
    questions: list[Set_Question_With_Solution] = Field(..., description="List of questions in the set")

# Instructions for splitting one question into a stem and its parts.
# Interpolated into the first prompt built by process_single_question.
llm_task_seperate_parts_question = r"""
    Your task is to separate a question's content into a main stem and distinct parts, then format it as a JSON object.
    Follow these rules precisely:

    1.  **Content Splitting:**
        -   From the input `question_content`, identify the main introductory text (the stem) and place it in the `content` field.
        -   Identify all sub-questions (e.g., "(a)", "(b)", "i.", "ii.") and place their text into the `parts` list.
        -   Parts may also be implied.
        -   All Question Must have at least one part.
        -   Ensure that images references are correctly placed with their respective parts.
        -   Preserve all content perfectly, including text, LaTeX, and image tags like `![pictureTag](filename.jpg)`.
        -   Ensure no solution content is included in the `content` or `parts` fields.
        -   The `title` should be a concise summary of the question.
        -   The `images` list should be copied exactly from the input.

    2.  **Output Format (Crucial):**
        -   You MUST output ONLY a single, raw, valid JSON string.
        -   Do NOT include any explanations, comments, or markdown code blocks (like ```json).

    3.  **JSON Formatting Rules:**
        -   **Escape Backslashes:** All LaTeX backslashes (`\`) MUST be escaped as double backslashes (`\\`). For example, `\cup` must become `\\cup`. This is the most important rule.
        -   **Newlines:** Use `\n` for newlines within the JSON string values.
        -   **Content Integrity:** Preserve all text, LaTeX (`$...$`, `$$...$$`), and image tags (`![pictureTag](...)`) perfectly. Do not alter or summarize content.
    """

# Instructions for picking the solution of one question part out of the full solution.
# Interpolated into the per-part prompt built inside process_single_question.
llm_task_seperate_parts_solution = r"""
    Your task is to extract the solution for a specific question part from the full solution provided.
    Please follow these rules carefully:

    1.  **Content Extraction:**
        -   From the `full solution`, find the worked solution that corresponds to the given `question part`.
        -   Make sure the solutions for all parts together include the entire full solution text, with no missing content.
        -   Place this exact text into the `part_solution` field.
        -   Ensure that images references are correctly placed with their respective parts.
        -   Preserve all content perfectly, including text, LaTeX, and image tags like `![pictureTag](filename.jpg)`.
        -   If no specific solution is found, use an empty string `""`.

    2.  **Output Format (Crucial):**
        -   You MUST output ONLY a single, raw, valid JSON string.
        -   Do NOT include any explanations, comments, or markdown code blocks (like ```json).

    3.  **JSON Formatting Rules:**
        -   **Escape Backslashes:** All LaTeX backslashes (`\`) MUST be escaped as double backslashes (`\\`). For example, `\cup` must become `\\cup` in the JSON string. This is the most important rule.
        -   **Newlines:** Use `\n` for newlines within the JSON string values.
        -   **Math Delimiters:** Ensure all math delimiters (`$...$` and `$$...$$`) are correctly balanced and preserved.
    """

def process_single_question(question_data):
    """
    Split one extracted question into parts and find the solution of each part.

    The question is first split into title, stem and parts by the LLM. Each
    part is then sent to the LLM with the question's full solution, in
    parallel, to obtain that part's worked solution. Every LLM call is tried
    up to three times.

    Args:
        question_data: `(question_idx, question)` pair. `question` is a dict
            that must contain "solution_content"; the whole dict is sent to
            the LLM as JSON.

    Returns:
        Set_Question_With_Solution: The split question with one solution per part.

    Raises:
        Exception: If a response could not be parsed after all attempts. The
            last parsing error is attached as the cause.
    """
    question_idx, question = question_data
    max_attempts = 3

    # Initialize the output parser with the Set_Question schema.
    question_parser = PydanticOutputParser(pydantic_object=Set_Question)

    # Process the question part
    last_error = None
    for attempt_idx in range(max_attempts):
        prompt = f"""
            Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:
            {question_parser.get_format_instructions()}

            {llm_task_seperate_parts_question}

            Input Dictionary:
            ```JSON
            {json.dumps(question)}
            ```

            Return the JSON now.
            """

        response = llm_mini.invoke(prompt)

        try:
            parsed_output_parts = question_parser.parse(response.content)
            print(f"LLM response successfully parsed question {question_idx + 1}.")
            break
        except Exception as e:
            last_error = e
            # Show why parsing failed, not only that it failed.
            print(f"Error parsing LLM response as JSON for question {question_idx + 1}: {e}")
            # Only announce a retry and wait when another attempt will follow.
            if attempt_idx + 1 < max_attempts:
                print(f"Retrying... Attempt No.{attempt_idx + 1}")
                time.sleep(2)
    else:
        print("Final LLM Response:")
        print(response.content)
        raise Exception(f"Failed to parse LLM response as JSON after multiple attempts for question {question_idx + 1}.") from last_error

    # Process solution parts in parallel
    def process_solution_part(part_data):
        """Return the parsed Set_Solution_Part for one `(part_idx, part)` pair."""
        part_idx, part = part_data
        solution_parser = PydanticOutputParser(pydantic_object=Set_Solution_Part)

        part_error = None
        for attempt_idx in range(max_attempts):
            prompt = f"""
                Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:
                {solution_parser.get_format_instructions()}

                {llm_task_seperate_parts_solution}

                full solution:
                {question["solution_content"]}

                question part:
                {part}
                """

            response = llm_mini.invoke(prompt)

            try:
                cleaned_response = escape_latex_backslashes(response.content.strip())
                parsed_output_solution_part = solution_parser.parse(cleaned_response)
                print(f"LLM response successfully parsed solution for part {part_idx + 1} of question {question_idx + 1}.")
                return parsed_output_solution_part
            except Exception as e:
                part_error = e
                print(f"Error parsing LLM response as JSON for part {part_idx + 1} of question {question_idx + 1}: {e}")
                if attempt_idx + 1 < max_attempts:
                    print(f"Retrying... Attempt No.{attempt_idx + 1}")
                    time.sleep(2)

        raise Exception(f"Failed to parse LLM response as JSON after multiple attempts for part {part_idx + 1} of question {question_idx + 1}.") from part_error

    if not question["solution_content"].strip():
        # There is no solution text to split: skip one LLM call per part and
        # use the empty solution the prompt asks for in this case.
        solutions_parts = [Set_Solution_Part(part_solution="") for _ in parsed_output_parts.parts]
    else:
        # Process all parts in parallel; executor.map keeps the order of the parts.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            part_data_list = list(enumerate(parsed_output_parts.parts))
            solutions_parts = list(executor.map(process_solution_part, part_data_list))

    set_solution = Set_Solution(parts_solutions=solutions_parts)
    return Set_Question_With_Solution(
        question=parsed_output_parts,
        solution=set_solution
    )

def extract_parts_question(questions_dict: dict) -> dict:
    """
    Split every extracted question into parts with matching solutions.

    The questions are handled concurrently by `process_single_question`;
    the results keep the order of the input list.

    Args:
        questions_dict: Dictionary with the keys "name", "year" and "questions".

    Returns:
        dict: The resulting `Set` converted to a dictionary.
    """
    indexed_questions = list(enumerate(questions_dict["questions"]))

    # Fan the questions out over a thread pool; map() yields results in input order.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        processed_questions = list(pool.map(process_single_question, indexed_questions))

    question_set = Set(
        name=questions_dict["name"],
        year=questions_dict["year"],
        questions=processed_questions,
    )
    return question_set.model_dump()

# LLM evaluation of the content of JSON

In [None]:
# Models for the validation pass.
# PartTextModel wraps the text of one question part; validate_part_text parses into it.
class PartTextModel(BaseModel):
    part_text: str = Field(..., description="The text of the part")

# Wraps the worked solution of one question part; validate_part_solution parses into it.
class PartSolutionModel(BaseModel):
    part_solution: str = Field(..., description="The solution for the part")

# Title and stem of one question; validate_question_content parses into it.
class QuestionContentModel(BaseModel):
    title: str = Field(..., description="The title of the question")
    content: str = Field(..., description="The main content of the question")

# Formatting rules shared by the three validation prompts
# (validate_part_text, validate_part_solution, validate_question_content).
# The wording names `part_text`, but the same text is reused for the other fields.
llm_task_text_check = r"""
    Your task is to validate and correct the content within the `part_text` field of the provided JSON input.
    You MUST return ONLY a single, raw, valid JSON string that strictly follows the original schema. Do NOT add any explanations, comments, or markdown code blocks.

    Apply these correction rules to the content inside the JSON fields:
    1.  **JSON Escaping:** All LaTeX backslashes (`\`) MUST be escaped as double backslashes (`\\`). For example, `\cup` must be written as `\\cup`. Never escape backslashes for newlines (`\n`), as they should remain as is.
    2.  **Math Delimiters:** All mathematical content must be enclosed in `$...$` for inline math or `$$...$$` for display math. Ensure all delimiters are correctly balanced and closed. '$' and '$$' should not be used for any other purpose. Move all `\n` outside the math delimiters.
    3.  **Display Math:** `$$` delimiters must be on their own separate lines.
    4.  **Image Tags:** Preserve image tags like `![pictureTag](filename.jpg)` exactly as they are.
    5.  **Content Integrity:** Do not change, paraphrase, or summarize any text, formulas, or image links. Only fix formatting errors according to these rules.
    6.  **Newlines:** Use `\n` for newlines within the JSON string values.
    """

def validate_part_text(part_text_data):
    """
    Have the LLM check and correct the formatting of one part's text.

    Args:
        part_text_data: `(question_idx, part_idx, part_text)` tuple.

    Returns:
        dict: The parsed `PartTextModel` as a dictionary (key "part_text").

    Raises:
        Exception: If no response could be parsed after three attempts.
    """
    question_idx, part_idx, part_text = part_text_data
    parser = PydanticOutputParser(pydantic_object=PartTextModel)

    # These values are identical on every attempt, so they are prepared once.
    schema_hint = parser.get_format_instructions()
    payload_json = json.dumps({"part_text": part_text}, indent=2)

    attempt_no = 0
    while attempt_no < 3:
        attempt_no += 1

        validation_prompt = f"""
            Your task is to extract a JSON with the following structure exactly:
            {schema_hint}

            Your task is to validate and correct the content within the `part_text` field of the provided JSON input.
            {llm_task_text_check}

            Input Part Text:
            ```json
            {payload_json}
            ```
            return the JSON with the content fixed if needed.
            """

        reply = llm_mini.invoke(validation_prompt)

        try:
            checked = parser.parse(reply.content)
            print(f"LLM response successfully parsed part text validation for question {question_idx + 1}, part {part_idx + 1}")
            return checked.model_dump()
        except ValidationError as ve:
            print(f"Part text validation error for question {question_idx + 1}, part {part_idx + 1}: {ve}")
        except Exception as e:
            print(f"Error parsing part text validation LLM response for question {question_idx + 1}, part {part_idx + 1}: {e}")

        # Reached only after a failed attempt: announce the retry and back off.
        print("Retrying... Attempt No.", attempt_no)
        time.sleep(2)

    raise Exception(f"Failed to parse part text validation LLM response after multiple attempts for question {question_idx + 1}, part {part_idx + 1}.")

def validate_part_solution(part_solution_data):
    """
    Have the LLM check and correct the formatting of one part's worked solution.

    Args:
        part_solution_data: `(question_idx, part_idx, part_solution)` tuple.

    Returns:
        dict: The parsed `PartSolutionModel` as a dictionary (key "part_solution").

    Raises:
        Exception: If no response could be parsed after three attempts.
    """
    question_idx, part_idx, part_solution = part_solution_data
    parser = PydanticOutputParser(pydantic_object=PartSolutionModel)

    # These values are identical on every attempt, so they are prepared once.
    schema_hint = parser.get_format_instructions()
    payload_json = json.dumps({"part_solution": part_solution}, indent=2)

    attempt_no = 0
    while attempt_no < 3:
        attempt_no += 1

        validation_prompt = f"""
            Your task is to extract a JSON with the following structure exactly:
            {schema_hint}

            Your task is to validate and correct the content within the `part_solution` field of the provided JSON input.
            {llm_task_text_check}

            Input Part Solution:
            ```json
            {payload_json}
            ```
            return the JSON with the content fixed if needed.
            """

        reply = llm_mini.invoke(validation_prompt)

        try:
            checked = parser.parse(reply.content)
            print(f"LLM response successfully parsed part solution validation for question {question_idx + 1}, part {part_idx + 1}")
            return checked.model_dump()
        except ValidationError as ve:
            print(f"Part solution validation error for question {question_idx + 1}, part {part_idx + 1}: {ve}")
        except Exception as e:
            print(f"Error parsing part solution validation LLM response for question {question_idx + 1}, part {part_idx + 1}: {e}")

        # Reached only after a failed attempt: announce the retry and back off.
        print("Retrying... Attempt No.", attempt_no)
        time.sleep(2)

    raise Exception(f"Failed to parse part solution validation LLM response after multiple attempts for question {question_idx + 1}, part {part_idx + 1}.")

def validate_question_content(question_data):
    """
    Have the LLM check and correct the formatting of a question's title and stem.

    Args:
        question_data: `(question_idx, title, content)` tuple.

    Returns:
        dict: The parsed `QuestionContentModel` as a dictionary (keys "title"
        and "content").

    Raises:
        Exception: If no response could be parsed after three attempts.
    """
    question_idx, title, content = question_data
    parser = PydanticOutputParser(pydantic_object=QuestionContentModel)

    # These values are identical on every attempt, so they are prepared once.
    schema_hint = parser.get_format_instructions()
    payload_json = json.dumps({"title": title, "content": content}, indent=2)

    attempt_no = 0
    while attempt_no < 3:
        attempt_no += 1

        validation_prompt = f"""
            Your task is to extract a JSON with the following structure exactly:
            {schema_hint}

            Your task is to validate and correct the content within the `title` and `content` fields of the provided JSON input.
            {llm_task_text_check}

            Input Question Content:
            ```json
            {payload_json}
            ```
            return the JSON with the content fixed if needed.
            """

        reply = llm_mini.invoke(validation_prompt)

        try:
            checked = parser.parse(reply.content)
            print(f"LLM response successfully parsed content validation for question {question_idx + 1}")
            return checked.model_dump()
        except ValidationError as ve:
            print(f"Content validation error for question {question_idx + 1}: {ve}")
        except Exception as e:
            print(f"Error parsing content validation LLM response for question {question_idx + 1}: {e}")

        # Reached only after a failed attempt: announce the retry and back off.
        print("Retrying... Attempt No.", attempt_no)
        time.sleep(2)

    raise Exception(f"Failed to parse content validation LLM response after multiple attempts for question {question_idx + 1}.")

def process_single_question_validation(question_data):
    """
    Validate the title, stem, parts and part solutions of one question.

    The title and stem are validated first. The part texts and part
    solutions are then validated concurrently, and their results are
    collected in the original order.

    Args:
        question_data: `(question_idx, question)` pair, where `question` is a
            dict read with the keys "title", "content", "parts",
            "parts_solutions" and "images".

    Returns:
        dict: The validated question with the same five keys. "images" is
        passed through unchanged.
    """
    question_idx, question = question_data

    # Title and stem are checked together in one synchronous call.
    checked_content = validate_question_content(
        (question_idx, question.get("title", ""), question.get("content", ""))
    )

    text_jobs_input = [
        (question_idx, position, text)
        for position, text in enumerate(question.get("parts", []))
    ]
    solution_jobs_input = [
        (question_idx, position, solution)
        for position, solution in enumerate(question.get("parts_solutions", []))
    ]

    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Submit everything before waiting so that all checks run side by side.
        text_jobs = [pool.submit(validate_part_text, item) for item in text_jobs_input]
        solution_jobs = [pool.submit(validate_part_solution, item) for item in solution_jobs_input]

        # Reading the futures in submission order keeps the parts in order.
        checked_texts = [job.result() for job in text_jobs]
        checked_solutions = [job.result() for job in solution_jobs]

    return {
        "title": checked_content["title"],
        "content": checked_content["content"],
        "parts": [item["part_text"] for item in checked_texts],
        "parts_solutions": [item["part_solution"] for item in checked_solutions],
        "images": question.get("images", []),
    }

def content_texdown_check(validated_dict: dict) -> dict:
    """
    Run the formatting validation over every question of a set.

    The questions are validated concurrently by
    `process_single_question_validation`; the results keep the input order.

    Args:
        validated_dict: Dictionary with the keys "name", "year" and "questions".

    Returns:
        dict: A new dictionary with the same "name" and "year" and the
        validated questions.
    """
    indexed_questions = list(enumerate(validated_dict["questions"]))

    # map() returns the results in the same order as the submitted questions.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        checked_questions = list(pool.map(process_single_question_validation, indexed_questions))

    result = {"name": validated_dict["name"], "year": validated_dict["year"]}
    result["questions"] = checked_questions
    return result

In [None]:
def md_to_json(md_content: str, *, correct_mistakes: bool = False, validate_content: bool = False) -> dict:
    """
    Convert the markdown of a question set into a structured dictionary.

    The questions are extracted first, then each question is split into
    parts with matching solutions. Two optional LLM passes are off by
    default, which matches the previous behaviour of this function.

    Args:
        md_content (str): The markdown content of a set.
        correct_mistakes (bool): When True, run the spelling/grammar/layout
            correction pass on the markdown before extracting the questions.
        validate_content (bool): When True, run the formatting validation
            pass on the extracted questions and return its result.

    Returns:
        dict: A dictionary with the keys "name", "year" and "questions".

    Raises:
        Exception: Propagated from the extraction steps when the LLM output
            cannot be parsed.
    """
    if correct_mistakes:
        md_content = correct_mistakes_in_markdown(md_content)
        print("Markdown content corrected for spelling, grammar, and structure.")

    questions_dict = extract_questions(md_content)
    print("successfully extracted the questions from the markdown. Now extracting the parts...")

    extracted_dict = extract_parts_question(questions_dict)
    print("successfully extracted the parts from the questions.")
    print(json.dumps(extracted_dict, indent=2))

    if not validate_content:
        return extracted_dict

    # Only announce the validation when it actually runs.
    print("Now validating the content...")
    content_validated_dict = content_texdown_check(extracted_dict)
    print("successfully validated the content.")
    print(json.dumps(content_validated_dict, indent=2))
    return content_validated_dict

In [None]:
# Run the whole extraction pipeline on the markdown; the resulting dictionary is used by the cells below.
imported_tutorial = md_to_json(md_content)

# Displaying questions

In [None]:
# Build the display title from the set's name and year.
title = imported_tutorial["name"] + " " + imported_tutorial["year"]

# Print the title
print(f"Title: {title}\n")

# Extract questions
questions = imported_tutorial["questions"]

# Raw dump of the question dictionaries, before the formatted listing below.
print(questions)

# Loop over and print each question
for idx1, question in enumerate(questions, start=1):
    print(f"**Question {idx1}**:\n{question.get('title')}\n")
    print(f"Content: {question.get('content')}\n")
    # zip() pairs each part with its solution and stops at the shorter of the two lists.
    for idx2, (part, part_answer) in enumerate(zip(question.get("parts", []), question.get("parts_solutions", [])), start=1):
        print(f"Question {idx1}:")
        print(f"- Subquestion {idx2}: {part}")
        print(f"- Worked Solution {idx2}: {part_answer}")
        print("\n")
    print("-" * 40)  # Separator for readability

# in2lambda to JSON

In [None]:
questions = imported_tutorial["questions"]

in2lambda_questions = []

# Loop over all questions and question_answers and use in2lambda API to create a JSON.
for idx, question_dict in enumerate(questions, start=1):
    # One Part per (part text, worked solution) pair; zip() stops at the shorter list.
    parts = []
    for part_question, part_solution in zip(question_dict.get("parts", []), question_dict.get("parts_solutions", [])):
        part_obj = Part(
            text=part_question,
            worked_solution=part_solution
        )
        parts.append(part_obj)

    # Handle image paths - ensure they exist
    image_paths = []
    for img in question_dict.get("images", []):
        if img.startswith("http"):
            # Skip URLs that weren't processed
            continue
        # Image names are resolved relative to the media folder.
        full_path = f"{media_path}/{img}"
        if Path(full_path).exists():
            image_paths.append(full_path)
        else:
            print(f"Warning: Image file not found: {full_path}")

    # The "Question <idx>" fallback title only applies when the "title" key is missing.
    question = Question(
        title=question_dict.get("title", f"Question {idx}"),
        main_text=question_dict.get("content", ""),
        parts=parts,
        images=image_paths
    )
    in2lambda_questions.append(question)

# Export all questions through the in2lambda Module; a failure is reported, not raised.
try:
    Module(in2lambda_questions).to_json(f"{output_path}/out")
    print("JSON output successfully created.")
except Exception as e:
    print(f"Error creating JSON output: {e}")