In [1]:
import sys
sys.path.append("..")
import os

from src.image_processing.image_utils import convert_pdf_to_png
from src.image_processing.ocr import process_png_images
from src.text_processing.text_utils import concatenate_text_files, count_tokens, read_file, save_to_file
from src.text_processing.reasoning import gpt4_completion, check_and_correct_latex
from src.pdf_generation.pdf_creator import convert_md_to_pdf
from src.utils.prompts import PROMPT_MD, PROMPT_CHECK_LATEX

# OpenAI API key taken from the environment; None when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY')

# Turn the PDF into separate PNGs (one per PDF page)

In [5]:
# Input PDF -- replace with the path to your own file.
pdf_path = '../data/notes/aula13_full.pdf'

# Location of the Poppler "bin" directory -- adjust for your installation.
poppler_path = r'../venv/poppler-23.11.0/Library/bin'

# Hand both paths to the project helper that produces the PNG pages.
convert_pdf_to_png(pdf_path, poppler_path)

# Extract text from the images

In [6]:
# Folder holding the page images to run OCR on.
png_folder_path = "../data/processed/aula13_full/pngs/"

# Folder the OCR text output is directed to.
ocr_text_folder = "../data/processed/aula13_full/ocr_text/"

# Run OCR over the PNG folder; the helper logs each file it processes.
process_png_images(png_folder_path, ocr_text_folder)

../data/processed/aula13_full/pngs/1.png
Processed and saved OCR text for: 1.png
../data/processed/aula13_full/pngs/2.png
Processed and saved OCR text for: 2.png
../data/processed/aula13_full/pngs/3.png
Processed and saved OCR text for: 3.png
../data/processed/aula13_full/pngs/4.png
Processed and saved OCR text for: 4.png
../data/processed/aula13_full/pngs/5.png
Processed and saved OCR text for: 5.png
../data/processed/aula13_full/pngs/6.png
Processed and saved OCR text for: 6.png
../data/processed/aula13_full/pngs/7.png
Processed and saved OCR text for: 7.png
../data/processed/aula13_full/pngs/8.png
Processed and saved OCR text for: 8.png
../data/processed/aula13_full/pngs/9.png
Processed and saved OCR text for: 9.png
../data/processed/aula13_full/pngs/10.png
Processed and saved OCR text for: 10.png
../data/processed/aula13_full/pngs/11.png
Processed and saved OCR text for: 11.png
../data/processed/aula13_full/pngs/12.png
Processed and saved OCR text for: 12.png


# Concatenate descriptions

In [3]:
# Folder containing the per-page OCR text files.
ocr_text_folder = "../data/processed/aula13_full/ocr_text"

# Single file that receives the concatenated text.
output_file_path = "../data/processed/aula13_full/combined_text.txt"

# Merge the text files from the OCR folder into the combined output file.
concatenate_text_files(ocr_text_folder, output_file_path)


All text files have been concatenated into '../data/processed/aula13_full/combined_text.txt'.


# Create TeX

In [2]:
# Largest input (in tokens) that is sent to the model in a single request.
token_threshold = 5500  # Adjust as needed

# File paths
input_file_path = "../data/processed/aula13_full/combined_text.txt"
output_file_path = "../data/processed/aula13_full/formatted_notes.tex"

file_content = read_file(input_file_path)
total_tokens = count_tokens(file_content)
print("TOTAL TOKENS:", total_tokens)

# Check size
if total_tokens <= token_threshold:
    # Process in one go
    response = gpt4_completion("", file_content, PROMPT_MD)

    # Save only when a response was actually produced. Doing this outside the
    # branch would raise NameError on a fresh kernel (or silently write a stale
    # `response` left over from an earlier run) whenever the file is too large.
    save_to_file(output_file_path, response)
    print(f"Formatted notes saved to {output_file_path}")
else:
    print("File is too large. Try with a smaller one.")

TOTAL TOKENS: 4793
Formatted notes saved to ../data/processed/aula13_full/formatted_notes.tex


# Clean LaTeX output

In [4]:
def clean_and_overwrite_latex_file(file_path):
    """Remove a wrapping Markdown code fence from a LaTeX file, in place.

    The file is read as UTF-8. If its content (ignoring surrounding
    whitespace) starts with an opening fence such as ```latex and/or ends
    with a closing ``` fence, the fences are removed, the remaining text is
    stripped of surrounding whitespace, and the file is overwritten with it.
    A file without any fence is left untouched.

    Args:
        file_path: Path of the LaTeX file to clean.

    Returns:
        The cleaned content if a fence was removed, otherwise the original
        file content unchanged.

    Raises:
        OSError: If the file cannot be read or written.
    """
    # Most specific marker first so "```latex" is not consumed as a bare "```".
    opening_fences = ("```latex", "```tex", "```")
    closing_fence = "```"

    # Read the content of the file
    with open(file_path, 'r', encoding='utf-8') as file:
        original = file.read()

    # Match fences on the trimmed text: a leading or trailing newline must not
    # hide a fence from startswith()/endswith().
    content = original.strip()
    fence_removed = False

    for fence in opening_fences:
        if content.startswith(fence):
            content = content[len(fence):].strip()
            fence_removed = True
            break

    if content.endswith(closing_fence):
        content = content[:-len(closing_fence)].strip()
        fence_removed = True

    # Nothing to clean: keep the file (and its exact whitespace) as it is.
    if not fence_removed:
        return original

    # Overwrite the file with the cleaned content
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

    return content

# LaTeX file produced by the previous step; it is cleaned in place.
output_file_path = "../data/processed/aula13_full/formatted_notes.tex"

# Strip the Markdown fence from the file and keep the cleaned text around.
cleaned_output = clean_and_overwrite_latex_file(output_file_path)

# Compile to PDF and/or check LaTeX

Use GPT-4 to check LaTeX syntax

In [2]:
# LaTeX source to compile and the PDF to produce from it.
input_md_file = "../data/processed/aula13_full/formatted_notes.tex"
output_pdf_file = "../data/processed/aula13_full/formatted_notes.pdf"

# First attempt: the helper returns a (success, error_message) pair.
success, error_message = convert_md_to_pdf(input_md_file, output_pdf_file)

if not success:
    print("Correcting LaTeX errors...")
    check_and_correct_latex(input_md_file, error_message)
    print("Retrying PDF generation...")
    # Capture the retry result too; otherwise a second failure goes unreported
    # and `success` / `error_message` still describe the first attempt.
    success, error_message = convert_md_to_pdf(input_md_file, output_pdf_file)
    if not success:
        print(f"PDF generation still failing after LaTeX correction: {error_message}")

pdflatex: major issue: So far, you have not checked for MiKTeX updates.
pdflatex: major issue: So far, you have not checked for MiKTeX updates.
Error producing PDF.
! Undefined control sequence.
l.89   Defining the matrix \[M(t) \coloneqq


Correcting LaTeX errors...
Retrying PDF generation...


pdflatex: major issue: So far, you have not checked for MiKTeX updates.
pdflatex: major issue: So far, you have not checked for MiKTeX updates.



PDF successfully created at ../data/processed/aula13_full/formatted_notes.pdf


In [7]:
# The checked LaTeX is written back over the same file it is read from.
input_file_path = "../data/processed/aula13_full/formatted_notes.tex"
output_file_path = input_file_path

# Load the current LaTeX source and send it to the model with the
# LaTeX-checking prompt.
file_content = read_file(input_file_path)
response = gpt4_completion("", file_content, PROMPT_CHECK_LATEX)

# Store the model's reply and confirm where it went.
save_to_file(output_file_path, response)
print(f"Formatted notes saved to {output_file_path}")

Formatted notes saved to ../data/processed/aula13_full/formatted_notes.tex
