In [None]:
import io
import json
import os
import re
import time
from pathlib import Path

import requests
from dotenv import load_dotenv
from langchain.output_parsers import PydanticOutputParser
from langchain_openai import ChatOpenAI
from PIL import Image
from pydantic import BaseModel, Field

# Load environment variables from .env file.
load_dotenv()


# scanning/processing the initial pdf into markdown

In [None]:
MATHPIX_API_KEY = os.getenv("MATHPIX_API_KEY")
MATHPIX_APP_ID = os.getenv("MATHPIX_APP_ID")

def pdf_to_markdown(source_path: str, result_path: str, max_retries: int = 10,
                    retry_delay: float = 5, timeout: float = 60):
    '''
    Converts the pdf at `source_path` to a markdown file at `result_path` using the Mathpix API.

    The PDF is uploaded first; the converted markdown is then polled for until it is
    available or `max_retries` attempts have been made.

    Args:
        source_path: Path of the PDF to upload.
        result_path: Path the downloaded markdown is written to.
        max_retries: Maximum number of download attempts.
        retry_delay: Seconds to wait between two download attempts.
        timeout: Per-request timeout in seconds, so a stalled connection cannot hang the cell.

    Returns:
        bool: True if the markdown was downloaded and written, False if it was still
        unavailable after the last attempt (in that case `result_path` is left untouched).

    Raises:
        RuntimeError: if the upload response does not contain a `pdf_id`.
        requests.RequestException: if the upload fails (error status, network error, timeout).
    '''
    headers = {
        "app_id": MATHPIX_APP_ID,
        "app_key": MATHPIX_API_KEY,
    }

    # The file handle is only needed for the upload itself, not while polling.
    with open(source_path, "rb") as file:
        r = requests.post(
            "https://api.mathpix.com/v3/pdf",
            headers=headers,
            files={"file": file},
            timeout=timeout,
        )
    # Fail with the real HTTP error instead of a KeyError on the missing "pdf_id".
    r.raise_for_status()
    upload_info = r.json()
    pdf_id = upload_info.get("pdf_id")
    if not pdf_id:
        raise RuntimeError(f"Mathpix upload did not return a pdf_id: {upload_info}")
    print("PDF ID:", pdf_id)
    print("Response:", upload_info)

    url = f"https://api.mathpix.com/v3/pdf/{pdf_id}.md"

    response = None
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            # Save the result if the request is successful
            with open(result_path, "w") as f:
                f.write(response.text)
            print("Downloaded MD successfully.")
            return True
        if attempt + 1 < max_retries:
            # Only wait when another attempt will actually follow.
            print(f"Attempt {attempt + 1}/{max_retries}: Processing not complete. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    if response is None:
        print("Failed to retrieve processed PDF after multiple attempts:", "no attempt was made")
    else:
        print("Failed to retrieve processed PDF after multiple attempts:", response.status_code, response.text)
    return False

# setting up the directories

In [None]:
# Working folders: converted questions are written to `out`, figures to `out/media`.
folder_path = "conversion_content"
output_path = folder_path + "/out"
media_path = output_path + "/media"

# Creating the deepest folder also creates its parents.
os.makedirs(media_path, exist_ok=True)

source_path = folder_path + "/example.pdf"
result_path = folder_path + "/example.md"

# Convert the PDF, then load the markdown that the conversion wrote to disk.
pdf_to_markdown(source_path, result_path)
md_content = Path(result_path).read_text()

# Print out a summary: first the size, then the full markdown text.
print("Markdown text: ")
print(f"  {result_path}: {len(md_content)} characters")
print("Markdown text: ")
print(f"  {result_path}: {md_content}")

# downloading extracted images from Mathpix

In [None]:
# Fetch the figures from the paper and answers.
def extract_figures_from_text(text):
    """
    Downloads every remote image referenced in the markdown `text`.

    Image links of the form ``![alt](url)`` are found with a regex. Links whose URL
    does not start with ``http`` are ignored. For each remote image, a caption of the
    form ``Figure Q<n> - <description>`` following the link is looked up as well; the
    caption may end at a newline or at the end of the text.

    Args:
        text (str): Markdown content to scan.

    Returns:
        dict: Maps the basename of each image URL (query string included) to a dict
        with the keys "image" (the opened PIL image), "title" (e.g. "Q3", or "" when
        no caption was found), "label" (the caption description, or ""), "url" and
        "local_path" (always "" here).

    Raises:
        requests.RequestException: if a download fails (error status, network error, timeout).
    """
    figures = {}
    # Regex to match markdown image links and capture their URL
    pattern = r'!\[.*?\]\((.*?)\)'
    matches = re.findall(pattern, text)
    print(f"Matches found: {matches}")

    seen_urls = set()
    for match in matches:
        url = match.strip()
        if not url.startswith("http"):
            continue  # only remote images are downloaded
        if url in seen_urls:
            continue  # the same image is linked more than once; fetch it only once
        seen_urls.add(url)

        # `(?:\n|$)` lets a caption on the very last line (no trailing newline) match too.
        figure_caption_pattern = (
            rf'\(\s*{re.escape(url)}\s*\)\s*-?\s*Figure\s+(Q\d+)\s*-\s*(.+?)(?:\n|$)'
        )
        caption_match = re.search(figure_caption_pattern, text)

        if caption_match:
            title, description = caption_match.groups()
            print("Caption match found")
        else:
            title, description = "", ""

        # Fetch the whole body first: an HTTP error is reported as such instead of as an
        # unreadable image, and the image does not depend on an open network stream.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content))

        # Figure name is the last path segment of the URL (query string included).
        fig_name = os.path.basename(url)
        figures[fig_name] = {
            "image": image,
            "title": title.strip(),
            "label": description.strip(),
            "url": url,
            "local_path": "",
        }
    return figures

# Download every remote image referenced in the markdown; `figures` maps each image's
# file name (taken from its URL) to a dict holding the image and its caption details.
figures = extract_figures_from_text(md_content)

# saving the images locally

In [None]:
def save_figures_to_path(figures):
    """
    Saves every figure image into the `media_path` folder.

    Each file is named ``<index>_<name>``, where ``<name>`` is the figure key with any
    URL query string (everything from the first "?") removed. The file name is stored
    back into the figure's "local_path" entry.

    Args:
        figures (dict): Figure name -> info dict with the keys "image" (an object with
            a ``save(path)`` method), "title", "label" and "url". Modified in place.
    """
    for idx, (fig_name, fig_info) in enumerate(figures.items()):
        print(f"FIGURE Title='{fig_info['title']}', Label='{fig_info['label']}', URL='{fig_info['url']}'")
        # Drop the query string if there is one; a name without "?" is used unchanged.
        base_name = fig_name.split("?", 1)[0]
        image_name = f"{idx}_{base_name}"
        fig_info["local_path"] = image_name
        # The separator matters: without it the file lands next to the media folder.
        fig_info["image"].save(f"{media_path}/{image_name}")

# Save each downloaded figure to disk and record its file name under "local_path" in `figures`.
save_figures_to_path(figures)

# replacing url for images with local path

In [None]:
def replace_figures_in_markdown(md_content, figures, output_file=None):
    """
    Replaces the remote image URLs in the markdown with the figures' local paths.

    Args:
        md_content (str): Markdown text containing the image URLs.
        figures (dict): Figure name -> info dict with the keys "url" and "local_path".
        output_file (str | None): Where to write the rewritten markdown. Defaults to
            ``f"{folder_path}/example.md"``.

    Returns:
        str: The rewritten markdown. It is also written to `output_file`.
    """
    for fig_name, fig_info in figures.items():
        if not fig_info["local_path"]:
            # Replacing the URL with "" would erase the link from the markdown.
            print(f"Skipped {fig_info['url']}: no local path has been recorded for it.")
            continue
        md_content = md_content.replace(fig_info["url"], fig_info["local_path"])
        print(f"Replaced {fig_info['url']} with {fig_info['local_path']} in markdown content.")

    # Save the modified markdown content to a file
    if output_file is None:
        output_file = f"{folder_path}/example.md"
    with open(output_file, "w") as f:
        f.write(md_content)
    return md_content

# Rebind `md_content` so that later cells work on the markdown with local image paths,
# not on the original text that still points at the remote URLs.
md_content = replace_figures_in_markdown(md_content, figures)

# Initialising llm

In [None]:
# Chat model used by both LLM steps below; the model name and the API key are read
# from the environment (a missing variable raises KeyError here).
llm = ChatOpenAI(model=os.environ["OPENAI_MODEL"], api_key=os.environ["OPENAI_API_KEY"])

# Extract Questions and Parts

In [None]:
# Define the schema for the tutorial output.
# One exercise of an imported set; used as the element type of `Set.exercise`.
# All three fields are required (declared with `...`).
class Set_Question(BaseModel):
    # Exercise heading, without its number.
    title: str = Field(..., description="Title of the exercise (only the text, no numbering)")
    # Main text of the exercise, excluding the title and the subquestions.
    content: str = Field(..., description="Content of the exercise (no exercise title, no subquestions)")
    # Subquestion texts in order, without their numbering.
    subquestions: list[str] = Field(..., description="List of subquestions within the exercise (only the text, no numbering)")
    
# Worked solutions for one exercise; used as the element type of `Set.workedSolution`.
class Set_Answer(BaseModel):
    # Exercise heading; the extraction prompt asks for the same title as the matching exercise.
    title: str = Field(..., description="Title of the exercise (only the text, no numbering)")
    # One solution text per subquestion; consumers pair these with
    # `Set_Question.subquestions` by position.
    workedSolutions: list[str] = Field(..., description="List of worked solution to subquestions within the exercise (no numbering or counting)")

# Top-level schema of an imported question set. `extract_tutorial_questions` hands this
# class to `PydanticOutputParser` to build the format instructions and to parse the reply.
class Set(BaseModel):
    # Name of the set.
    name: str = Field(..., description="Title of the set")
    # Year of the set, kept as a string.
    year: str = Field(..., description="Year of the set")
    # The exercises of the set.
    exercise: list[Set_Question] = Field(..., description="List of exercises in the set")
    # Worked solutions; consumers pair these with `exercise` by position.
    workedSolution: list[Set_Answer] = Field(..., description="List of worked solutions for the exercises in the set")

def extract_tutorial_questions(doc_page_content: str) -> dict:
    """
    Asks the LLM to split a question set into exercises and worked solutions.

    The markdown of the set is embedded in a prompt together with the format
    instructions generated for the `Set` schema. The prompt tells the model to infer
    the set's title, to keep the exercise text unmodified, to include referenced
    figures (by local path) and tables in the exercise content, and to provide a
    worked solution for every exercise. The reply is parsed against `Set`.

    Shape of the returned dictionary:

        {
            "name": "<title of the set>",
            "year": "<year of the set>",
            "exercise": [
                {"title": "...", "content": "...", "subquestions": ["...", ...]},
                ...
            ],
            "workedSolution": [
                {"title": "...", "workedSolutions": ["...", ...]},
                ...
            ]
        }

    Args:
        doc_page_content (str): The markdown content of a set.

    Returns:
        dict: The parsed set with the keys "name", "year", "exercise" and
              "workedSolution". If the reply cannot be parsed, the error is printed
              and None is returned.
    """
    # Parser bound to the Set schema; it supplies both the format instructions and the parsing.
    set_parser = PydanticOutputParser(pydantic_object=Set)
    format_instructions = set_parser.get_format_instructions()

    # Build the prompt; the format instructions go last.
    extraction_prompt = f"""
        IMPORTED_SET
        ```markdown
        {doc_page_content}
        ```

        IMPORTED_SET is a set of questions. It may or may not include reference solutions.
        Infer the title of the set from the content, if no suitable name found, just leave as Unnamed Set, and extract each individual question as a separate string.
        Do not modify the text of the exercises. 
        it is important to only use $...$ for math expressions.

        If the exercise mentions figures/diagrams, then find the diagram (the local path) that it is talking about,
        and include it in the content of the exercise.

        If the exercise mentions tables, then include the table in the content.

        Ensure that there is a workedSolution for each exercise, which should have the same title and a list of solutions that matches the subquestions.

        Return a valid JSON string with the following structure:
        {format_instructions}
        """

    # Query the model.
    llm_reply = llm.invoke(extraction_prompt)

    # Debug output: the raw reply object.
    print("Raw LLM Response:")
    print(llm_reply)

    try:
        # Validate the reply against the schema and convert it to a plain dict
        # (model_dump() is the Pydantic v2 way of doing that).
        return set_parser.parse(llm_reply.content).model_dump()
    except Exception as parse_error:
        print("Error parsing LLM response as JSON:", parse_error)
        return None

In [None]:
# Run the LLM extraction on the markdown. The result is a dict, or None when the
# model's reply could not be parsed.
imported_tutorial = extract_tutorial_questions(md_content)

In [None]:
# extract_tutorial_questions() returns None when the reply could not be parsed;
# stop here with a clear message instead of a TypeError on the lookups below.
if imported_tutorial is None:
    raise RuntimeError("No imported set to display: the LLM response could not be parsed.")

# Extract title
title = imported_tutorial["name"] + " " + imported_tutorial["year"]

# Print the title
print(f"Title: {title}\n")

# Extract questions
questions = imported_tutorial["exercise"]
solutions = imported_tutorial["workedSolution"]

# zip() stops at the shorter list, so make a mismatch visible.
if len(questions) != len(solutions):
    print(f"WARNING: {len(questions)} exercises but {len(solutions)} worked solutions; unmatched entries are not shown.")

# Loop over and print each question. Single quotes are used inside the f-strings
# because reusing the outer quote is a syntax error before Python 3.12.
for idx1, (question, solution) in enumerate(zip(questions, solutions), start=1):
    print(f"**Question {idx1}**:\n{question.get('title')}\n")
    print(f"Content: {question.get('content')}\n")
    subquestions = question.get("subquestions", [])
    subanswers = solution.get("workedSolutions", [])
    for idx2, (subquestion, subanswer) in enumerate(zip(subquestions, subanswers), start=1):
        print(f"Question {idx1}:")
        print(f"- Subquestion {idx2}: {subquestion}")
        print(f"- Worked Solution {idx2}: {subanswer}")
        print("\n")
    print("-" * 40)  # Separator for readability

# Form JSON Schemas

In [None]:
# Define the nested Pydantic models based on the JSON schema.
# Worked solution attached to a question part; used as the type of `Part.workedSolution`.
class WorkedSolution(BaseModel):
    # Text of the worked solution.
    content: str = Field(..., description="Worked solution content")
    # Title of the worked solution (the JSON template in create_question_json leaves it empty).
    title: str = Field(..., description="Worked solution title")
    # Untyped list; the only field here with a default (an empty list).
    children: list = []

# One part of a question; used as the element type of `QuestionJson.parts`.
# Every field is required.
class Part(BaseModel):
    # Final answer for this part.
    answerContent: str = Field(..., description="Part answer text")
    # Question text of this part.
    content: str = Field(..., description="Part content text")
    # Position of the part (the JSON template in create_question_json starts at 0).
    orderNumber: int = Field(..., description="The order number of this part")
    # Untyped list; the JSON template passes it as an empty list.
    responseAreas: list = Field(..., description="List of response areas")
    # Untyped list; the JSON template passes it as an empty list.
    tutorial: list = Field(..., description="List of tutorial items")
    # Nested worked solution of this part.
    workedSolution: WorkedSolution = Field(..., description="Worked solution details")

# Schema of one exported question. `create_question_json` hands this class to
# `PydanticOutputParser` to build the format instructions and to parse the LLM reply.
# NOTE(review): the JSON template in create_question_json also contains "displayChatbot",
# which has no field here - confirm whether it should be added.
class QuestionJson(BaseModel):
    # Position of the question; the export loop overwrites it with the loop index.
    orderNumber: int = Field(..., description="The order number of the question")
    # Display flags.
    displayFinalAnswer: bool = Field(..., description="Flag to display the final answer")
    displayStructuredTutorial: bool = Field(..., description="Flag to display the structured tutorial")
    displayWorkedSolution: bool = Field(..., description="Flag to display the worked solution")
    # Question text shared by all parts.
    masterContent: str = Field(..., description="Top level question content")
    # The individual parts of the question.
    parts: list[Part] = Field(..., description="List of question parts")
    # Publish flag.
    publish: bool = Field(..., description="Publish flag")
    # Question title; the export loop also uses it to build the output file name.
    title: str = Field(..., description="Question title")

def create_question_json(question: str, solution: str) -> dict:
    """
    Asks the LLM to map one question and its solution onto the `QuestionJson` schema.

    The prompt contains a minimal JSON template, the question, the solution and the
    format instructions generated for `QuestionJson`. Both `question` and `solution`
    are interpolated into the prompt as text.

    Args:
        question: The imported question.
        solution: The imported solution belonging to the question.

    Returns:
        dict: The parsed question as a dictionary. If the reply cannot be parsed, the
              error and the raw reply are printed and None is returned.
    """
    # Parser bound to the QuestionJson schema.
    question_parser = PydanticOutputParser(pydantic_object=QuestionJson)
    format_instructions = question_parser.get_format_instructions()

    # Smallest valid JSON document, given to the model as context.
    minimum_json_template = r'''{
      "orderNumber": 0,
      "displayFinalAnswer": true,
      "displayStructuredTutorial": true,
      "displayWorkedSolution": true,
      "displayChatbot": false,
      "masterContent": "Top level question here",
      "parts": [
        {
          "answerContent": "final answer here corresponding the part, is no answer found, leave empty",
          "content": "part question text here, if only one part, then leave empty",
          "orderNumber": 0,
          "responseAreas": [],
          "tutorial": [],
          "workedSolution": {
            "content": "Part worked solution here",
            "title": "",
            "children": []
          }
        }
      ],
      "publish": false,
      "title": "Question title here"
    }'''

    # Build the prompt; the format instructions go last.
    question_prompt = f'''
      JSON_TEMPLATE
      ```json
      {minimum_json_template}
      ```

      IMPORTED_QUESTION
      ```markdown
      {question}
      ```

      IMPORTED_SOLUTION
      ```markdown
      {solution}
      ```

      Preserve the markdown math formatting to use $...$ for math expressions. Do not modify the original text of the question.

      Infer the final answer and put it in the answerContent field of the part. 
      The worked solution should be in the workedSolution.content field.

      If you cannot find a suitable text for any of the sections, leave it empty.

      {format_instructions}
      '''

    # Query the model.
    llm_reply = llm.invoke(question_prompt)

    try:
        # Validate the reply against the schema and hand it back as a dictionary.
        return question_parser.parse(llm_reply.content).model_dump()
    except Exception as parse_error:
        print("Error parsing JSON from LLM response:", parse_error)
        print("LLM response:", llm_reply.content)
        return None



In [None]:
questions = imported_tutorial["exercise"]
solutions = imported_tutorial["workedSolution"]


# Convert every question/solution pair to JSON and write one file per question.
for idx, (question, solution) in enumerate(zip(questions, solutions), start=1):
    print(f"**Question {idx}**:\n{question}\n")

    print("INFO: Mapping question in markdown into JSON")
    question_json = create_question_json(question, solution)
    if question_json is None:
        # create_question_json() returns None when the reply could not be parsed;
        # skip this question instead of failing on the assignment below.
        print(f"WARNING: skipping question {idx}: the LLM response could not be parsed")
        continue
    question_json["orderNumber"] = idx-1
    print(f"INFO: JSON {idx}:\n{question_json}\n")

    # No figure post-processing is applied at this point; the JSON is written as returned.
    updated_question_json = question_json

    # The title comes from the LLM: replace every character that is not a letter, digit,
    # "_", "." or "-" so that it cannot break the path (e.g. a "/" in the title).
    question_name = re.sub(r"[^\w.-]", "_", updated_question_json["title"])
    question_index = f"{(idx-1):03}"
    filename = f"{output_path}/question_{question_index}_{question_name}.json"
    print(f"INFO: writing {filename}")
    with open(filename, "w") as f:
        f.write(json.dumps(updated_question_json, indent=2))