# process description  

The program takes in a PDF.  
Mathpix is used to scan the PDF and turn it into Markdown.  
The Markdown is then processed to extract the images.  

An LLM is used to extract the questions and solutions in **ONE** go.  
The final JSON is built using the in2lambda API.

In [None]:
import os
import re
import json
import time
import requests

from pathlib import Path

from dotenv import load_dotenv
from pydantic import BaseModel, Field, ValidationError

from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser

from in2lambda.api.module import Module
from in2lambda.api.question import Question
from in2lambda.api.part import Part

from PIL import Image

# Load environment variables from .env file.
load_dotenv()


# scanning/processing the initial pdf into markdown

In [None]:
# Mathpix credentials, read from the environment (populated by load_dotenv()).
MATHPIX_API_KEY = os.getenv("MATHPIX_API_KEY")
MATHPIX_APP_ID = os.getenv("MATHPIX_APP_ID")


def pdf_to_markdown(
    source_path: str,
    result_path: str,
    max_retries: int = 10,
    retry_delay: float = 5,
    request_timeout: float = 60,
) -> None:
    """Convert the PDF at `source_path` to a Markdown file at `result_path` via Mathpix.

    The PDF is uploaded to the Mathpix `v3/pdf` endpoint, then the `.md`
    rendition is polled until the server answers with HTTP 200 or the retry
    budget is used up.

    Args:
        source_path: Path of the PDF to upload.
        result_path: Path the downloaded Markdown is written to.
        max_retries: Maximum number of download attempts.
        retry_delay: Seconds to wait between two download attempts.
        request_timeout: Timeout in seconds applied to every HTTP request, so
            a stalled connection cannot hang the notebook forever.

    Raises:
        RuntimeError: If the upload response carries no ``pdf_id``.

    Note:
        When no attempt succeeds, the failure is printed and the function
        returns without writing `result_path`.
    """
    headers = {
        "app_id": MATHPIX_APP_ID,
        "app_key": MATHPIX_API_KEY,
    }

    # Keep the PDF open only for the upload itself, not for the polling loop.
    with open(source_path, "rb") as file:
        r = requests.post(
            "https://api.mathpix.com/v3/pdf",
            headers=headers,
            files={"file": file},
            timeout=request_timeout,
        )

    # Parse the body once and fail with the server's own message instead of a
    # bare KeyError when the upload was rejected.
    upload_info = r.json()
    pdf_id = upload_info.get("pdf_id")
    if not pdf_id:
        raise RuntimeError(
            f"Mathpix upload failed (HTTP {r.status_code}): {upload_info}"
        )
    print("PDF ID:", pdf_id)
    print("Response:", upload_info)

    url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.md"

    response = None
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers, timeout=request_timeout)
        if response.status_code == 200:
            # Save the result if the request is successful.
            with open(result_path, "w") as f:
                f.write(response.text)
            print("Downloaded MD successfully.")
            return
        # Only wait when another attempt will actually follow.
        if attempt + 1 < max_retries:
            print(f"Attempt {attempt + 1}/{max_retries}: Processing not complete. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    if response is None:
        print("Failed to retrieve processed PDF: no download attempt was made.")
    else:
        print("Failed to retrieve processed PDF after multiple attempts:", response.status_code, response.text)

# setting up the directories

In [None]:
# Locations used throughout the notebook: the input folder, the output folder
# for this pipeline, and the folder that receives the extracted images.
folder_path = "conversion_content"
output_path = folder_path + "/mathpix_to_llm_to_in2lambda_to_JSON_out"
media_path = output_path + "/media"

# Input PDF and the markdown file produced from it.
source_path = folder_path + "/example.pdf"
result_path = output_path + "/example.md"

# Creating the media folder also creates the output folder above it.
Path(media_path).mkdir(parents=True, exist_ok=True)

# Mathpix is only called when the markdown is not on disk yet, so re-running
# the notebook does not reprocess the same PDF.
markdown_file = Path(result_path)
if not markdown_file.exists():
    pdf_to_markdown(source_path, result_path)

md_content = markdown_file.read_text()

# Summary: first the size of the markdown, then its full text.
print("Markdown text: ")
print(f"  {result_path}: {len(md_content)} characters")
print("Markdown text: ")
print(f"  {result_path}: {md_content}")

# downloading extracted images from Mathpix

In [None]:
# Fetch the figures referenced by the markdown.
def extract_figures_from_text(text: str, request_timeout: float = 30) -> dict:
    """Download every remote image referenced in `text` and collect its metadata.

    Image references of the form ``![alt](target)`` are located with a regex.
    For each target, a caption of the form ``Figure Q<n> - <description>``
    directly after the reference is looked up. Only targets starting with
    ``http`` are downloaded; other targets (e.g. already-local paths) are
    ignored.

    Args:
        text: Markdown text to scan.
        request_timeout: Timeout in seconds for each image download.

    Returns:
        A dict keyed by the basename of the image URL. Each value is a dict
        with the keys ``image`` (PIL image), ``title``, ``label``, ``url`` and
        ``local_path`` (empty until the figure is saved).

    Raises:
        requests.HTTPError: If an image download answers with an error status.
    """
    # Function-scope import keeps this block self-contained.
    from io import BytesIO

    figures = {}
    # Regex to match markdown image references; the group captures the target.
    pattern = r'!\[.*?\]\((.*?)\)'
    matches = re.findall(pattern, text)
    print(f"Matches found: {matches}")

    seen_urls = set()
    for match in matches:
        url = match.strip()
        # The same image referenced twice only needs to be processed once.
        if url in seen_urls:
            continue
        seen_urls.add(url)

        figure_caption_pattern = rf'\({re.escape(url)}\)\s*-?\s*Figure\s+(Q\d+)\s*-\s*(.+?)\n'
        caption_match = re.search(figure_caption_pattern, text)

        if caption_match:
            title, description = caption_match.groups()
            print("Caption match found")
        else:
            title, description = "", ""

        if not url.startswith("http"):
            continue

        # Use the decoded body rather than the raw socket stream, and surface
        # HTTP errors here instead of as an unreadable-image error later.
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))

        # The figure name is the last path segment of the URL, including any
        # query string.
        fig_name = os.path.basename(url)
        figures[fig_name] = {
            "image": image,
            "title": title.strip(),
            "label": description.strip(),
            "url": url,
            "local_path": "",
        }
    return figures

# A dictionary keyed by figure file name (the URL basename); each entry holds
# the PIL image, caption title/label, source URL and a still-empty local path.
figures = extract_figures_from_text(md_content)

# saving the images locally

In [None]:
def save_figures_to_path(figures, media_dir=None):
    """Save every figure image to disk and record its file name.

    Each image is stored as ``<index>_<name>``, where ``<name>`` is the figure
    key with any query string removed. The file name is written back into the
    figure's ``local_path`` entry.

    Args:
        figures: Dict as produced by `extract_figures_from_text`; every value
            needs an ``image`` object with a ``save(path)`` method plus the
            ``title``, ``label`` and ``url`` keys.
        media_dir: Directory to save into. Defaults to the notebook-level
            `media_path`.
    """
    # Look up the notebook-level default only when no directory is passed in.
    target_dir = Path(media_path if media_dir is None else media_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for idx, (fig_name, fig_info) in enumerate(figures.items()):
        print(f"FIGURE Title='{fig_info['title']}', Label='{fig_info['label']}', URL='{fig_info['url']}'")
        # Drop everything from the first "?". partition() leaves names without
        # a query string untouched, where str.index() raised ValueError.
        image_name = f"{idx}_{fig_name.partition('?')[0]}"
        # NOTE(review): local_path is the bare file name, not "media/<name>" -
        # confirm what consumers of the rewritten markdown expect.
        fig_info["local_path"] = image_name
        # Join with a real path separator so the file lands inside the media
        # directory (plain concatenation produced ".../media0_name.png").
        fig_info["image"].save(str(target_dir / image_name))

# Persist every downloaded figure to disk; each figure's "local_path" entry is
# filled in with the file name it was saved under.
save_figures_to_path(figures)

# replacing url for images with local path

In [None]:
def replace_figures_in_markdown(md_content, figures):
    """Swap remote image URLs for local paths and save the resulting markdown.

    Every figure's ``url`` is replaced by its ``local_path`` in `md_content`.
    The result is written to ``{output_path}/example.md`` and also returned,
    so the caller can keep working with the rewritten text.

    Args:
        md_content: Markdown text containing the remote image URLs.
        figures: Dict of figure info dicts providing ``url`` and ``local_path``.

    Returns:
        The markdown text with all figure URLs replaced.
    """
    # Replace the image URLs in the markdown content with local paths.
    for fig_info in figures.values():
        md_content = md_content.replace(fig_info["url"], fig_info["local_path"])
        print(f"Replaced {fig_info['url']} with {fig_info['local_path']} in markdown content.")
    # Save the modified markdown content to a file.
    with open(f"{output_path}/example.md", "w") as f:
        f.write(md_content)
    return md_content


# Rebind md_content so the later LLM step sees the local paths. Without this,
# only the file on disk changed and a first run still sent the remote URLs.
md_content = replace_figures_in_markdown(md_content, figures)

# Initialising llm

In [None]:
# Chat model used for the extraction step, created through LangChain. The model
# name and API key come from the environment; a missing variable raises
# KeyError right here.
llm = ChatOpenAI(
    model=os.environ["OPENAI_MODEL"],
    api_key=os.environ["OPENAI_API_KEY"],
)

# Extract Questions and Parts

In [None]:
# Define the schema for the tutorial output.
# One extracted question. The Field descriptions below are runtime text: they
# reach the LLM through the parser's format instructions.
# NOTE(review): documented with `#` comments rather than a docstring, since a
# model docstring may end up in the generated JSON schema - confirm before
# converting.
class Set_Question(BaseModel):
    # Heading of the question, without numbering.
    title: str = Field(..., description="Title of the question (only the text, no numbering)")
    # Main body of the question, excluding its title and sub-questions.
    content: str = Field(..., description="Content of the question (no exercise title, no subquestions)")
    # Sub-question texts, in order.
    parts: list[str] = Field(..., description="List of parts within the question (only the text, no numbering)")
    # Worked solutions; later cells pair parts[i] with parts_solutions[i].
    parts_solutions: list[str] = Field(..., description="List of worked solutions for the question (no numbering or counting)")

# Top-level schema of the LLM output: a titled, dated collection of questions.
# This is the model handed to PydanticOutputParser in the extraction function.
class Set(BaseModel):
    # Short title for the whole input document.
    name: str = Field(..., description="Title of the set")
    # Year as a string; the extraction prompt asks for "0" when none is mentioned.
    year: str = Field(..., description="Year of the set")
    # All questions found in the document.
    questions: list[Set_Question] = Field(..., description="List of questions in the set")

def extract_tutorial_questions(doc_page_content: str, max_attempts: int = 3) -> dict | None:
    """Extract the title, year and individual questions from a tutorial sheet.

    The markdown of the sheet is embedded in a prompt that instructs the LLM
    to infer a title and year and to split the text into separate questions
    with their parts and worked solutions, keeping the original text. The
    reply must be a JSON string with the following structure:

    {
        "name": "<title of tutorial>",
        "year": "<year of tutorial>",
        "questions": [
            { title: "exercise text 1", content: "content text exercise 1", parts: ["subquestion text 1", "subquestion text 2", ...], parts_solutions: ["solution text 1", "solution text 2", ...] },
            { title: "exercise text 2", content: "content text exercise 2", parts: ["subquestion text 1", "subquestion text 2", ...], parts_solutions: ["solution text 1", "solution text 2", ...] },
            ...
        ]
    }

    The reply is parsed against the `Set` schema. If parsing fails, the LLM is
    called again, up to `max_attempts` times in total.

    Args:
        doc_page_content (str): The markdown content of a set.
        max_attempts (int): Maximum number of LLM calls before giving up.

    Returns:
        dict | None: A dictionary with the keys "name", "year" and
        "questions", or None if no attempt produced parseable output.
    """
    # Initialize the output parser with the Set schema.
    parser = PydanticOutputParser(pydantic_object=Set)

    # Construct the prompt, appending the parser's format instructions.
    # The backslash in step 6 is doubled so the prompt shows a literal
    # backslash-n; a single one is turned into a real line break by the f-string.
    prompt = f"""
        Input markdown:
        ```markdown
        {doc_page_content}
        ```

        Your task is to extract a JSON with the following structure exactly:
        {parser.get_format_instructions()}

        Please follow these steps carefully:
            1. Infer a very short and concise title describing the entire Input.
            2. Identify the year of the tutorial, if mentioned. Otherwise, use "0".
            3. Use the original markdown text exactly as it appears for content, question, parts, and parts_solutions, **preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input**, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.
            4. Identify the questions in the Input markdown and add them to the "questions" list.
            5. for each question:
                - Infer the title of the question (only the text, no numbering).
                - Identify the content of the question (no exercise title, no subquestions).
                - Identify the parts of the question (subquestions) and their worked solutions. If the worked solution is not given, leave the worked solution empty.
                - Add the parts of the question (subquestions) and their worked solutions to the "parts" and "parts_solutions" lists, respectively.
            6. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, with no extra text, comments, or explanations. Use plain newlines (not escaped as `\\n`).
            7. The Text inside the JSON should be in Lexdown, preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas. As it will be parsed by KaTex, it should be valid LaTeX.

        Return the JSON now.
        """

    # Call the LLM several times to be robust against malformed replies.
    for attempt in range(1, max_attempts + 1):
        response = llm.invoke(prompt)

        # Debug: print the raw LLM response.
        print("Raw LLM Response:")
        print(response)

        try:
            # Parse the response using the output parser.
            parsed_output = parser.parse(response.content)
        except ValidationError as ve:
            print("❌ Pydantic Validation Error:")
            for error in ve.errors():
                print(f" - {error['loc']}: {error['msg']}")
        except ValueError as parse_error:
            # The LangChain parser signals malformed JSON or schema mismatches
            # with OutputParserException, a ValueError subclass. Catching it
            # here is what lets the retry loop actually retry.
            print(f"❌ Could not parse LLM output (attempt {attempt}/{max_attempts}): {parse_error}")
        else:
            # For Pydantic v2, use model_dump() to convert the model to a dictionary.
            return parsed_output.model_dump()

        print("Raw LLM output:")
        print(response.content)

    # Every attempt failed; callers must handle the missing result.
    return None

In [None]:
# Run the LLM extraction on the markdown. The result is a dict with the keys
# "name", "year" and "questions", or None when no attempt could be parsed.
imported_tutorial = extract_tutorial_questions(md_content)

In [None]:
# The extraction returns None when every LLM attempt failed; stop with a clear
# message instead of an opaque TypeError on the first subscript below.
if imported_tutorial is None:
    raise RuntimeError("Question extraction failed: the LLM output could not be parsed.")

# Extract title
title = imported_tutorial["name"] + " " + imported_tutorial["year"]

# Print the title
print(f"Title: {title}\n")

# Extract questions
questions = imported_tutorial["questions"]

print(questions)

# Loop over and print each question.
# Single quotes are used inside the f-strings: reusing the outer double quotes
# is only valid syntax from Python 3.12 onwards.
for idx1, question in enumerate(questions, start=1):
    print(f"**Question {idx1}**:\n{question.get('title')}\n")
    print(f"Content: {question.get('content')}\n")
    # Parts and solutions are parallel lists; pair them by position.
    for idx2, (part, part_answer) in enumerate(zip(question.get("parts", []), question.get("parts_solutions", [])), start=1):
        print(f"Question {idx1}:")
        print(f"- Subquestion {idx2}: {part}")
        print(f"- Worked Solution {idx2}: {part_answer}")
        print("\n")
    print("-" * 40)  # Separator for readability

# Form JSON Schemas

In [None]:
questions = imported_tutorial["questions"]

in2lambda_questions = []

# Convert every extracted question into in2lambda objects and export them.
for idx, question_dict in enumerate(questions, start=1):
    part_texts = question_dict.get("parts", [])
    part_solutions = question_dict.get("parts_solutions", [])

    parts = []
    for position, part_question in enumerate(part_texts):
        # Iterate over the parts themselves rather than zip(): a part whose
        # solution is missing is kept with an empty worked solution instead of
        # being dropped silently.
        part_solution = part_solutions[position] if position < len(part_solutions) else ""
        parts.append(
            Part(
                text=part_question,
                worked_solution=part_solution,
            )
        )

    question = Question(
        # `or` makes the fallback reachable: the "title" key is always present
        # in the extracted dict, but its value may be empty.
        title=question_dict.get("title") or f"Question {idx}",
        main_text=question_dict.get("content", ""),
        parts=parts,
    )
    in2lambda_questions.append(question)

# Write the assembled module to "<output_path>/out" via in2lambda.
Module(in2lambda_questions).to_json(f"{output_path}/out")