# Turn the PDF into separate PNGs (one per PDF page)

In [9]:
from pdf2image import convert_from_path
import os

def convert_pdf_to_png(pdf_path, poppler_path, output_root=os.path.join('../data', 'processed')):
    """Render every page of a PDF to its own PNG file.

    The images are written to ``<output_root>/<PDF base name>/pngs/`` and are
    named ``1.png``, ``2.png``, ... following the order in which
    ``convert_from_path`` yields the pages.

    Args:
        pdf_path: Path of the PDF file to convert.
        poppler_path: Directory holding the Poppler binaries; forwarded
            unchanged to ``convert_from_path``.
        output_root: Directory under which the per-PDF folder is created.
            Defaults to ``../data/processed``, the location that used to be
            hard-coded.
    """
    # The PDF's base name (without extension) names the per-document folder.
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    png_output_folder = os.path.join(output_root, base_name, 'pngs')

    # A single call creates the whole chain of directories; exist_ok avoids
    # the check-then-create race of testing os.path.exists() first.
    os.makedirs(png_output_folder, exist_ok=True)

    # Convert the PDF to a list of page images.
    images = convert_from_path(pdf_path, poppler_path=poppler_path)

    # Save each page as a PNG, numbering pages from 1.
    for page_number, image in enumerate(images, start=1):
        image_path = os.path.join(png_output_folder, f'{page_number}.png')
        image.save(image_path, 'PNG')

# Usage
# Example run: split one PDF of lecture notes into per-page PNG files.
pdf_path = '../data/notes/aula13.pdf'  # Replace with your PDF file path
# Machine-specific directory; convert_pdf_to_png forwards it to pdf2image.
poppler_path = r'C:/Users/lealm/Documents/canguru/poppler-23.11.0/Library/bin'  # Specify your Poppler bin path
convert_pdf_to_png(pdf_path, poppler_path)

# Extract text from image

In [7]:
import os
import base64
import requests
import time

# OpenAI credential taken from the environment; os.getenv yields None when
# OPENAI_API_KEY is not set.
api_key = os.getenv('OPENAI_API_KEY')

def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string."""
    # Read the raw bytes first, then encode; the file is closed before encoding.
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def send_image_to_gpt4v(image_path, prompt, model="gpt-4-vision-preview", max_tokens=4000, timeout=300):
    """Send one image plus a text prompt to the OpenAI chat-completions endpoint.

    The image is embedded in the request as a base64 ``data:image/png`` URL and
    sent together with ``prompt`` in a single user message.

    Args:
        image_path: Path of the image file to upload.
        prompt: Text instruction placed before the image in the message.
        model: Model name put in the request payload.
        max_tokens: Value of the payload's ``max_tokens`` field.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without a timeout a stalled connection would block forever.

    Returns:
        The decoded JSON body of the HTTP response. The HTTP status is not
        checked here, so an API error payload is returned like any other body
        and it is up to the caller to inspect it.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    base64_image = encode_image_to_base64(image_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        "max_tokens": max_tokens,
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response.json()

def get_previous_description(ocr_text_folder, current_file_index):
    """Return the text saved for the page before ``current_file_index``.

    The previous page's text is looked up as ``<index - 1>.txt`` inside
    ``ocr_text_folder``. An empty string is returned when there is no earlier
    page (the previous index is not positive) or when that file is missing.
    """
    prior_index = current_file_index - 1
    if prior_index <= 0:
        return ""

    prior_path = os.path.join(ocr_text_folder, f"{prior_index}.txt")
    if not os.path.exists(prior_path):
        return ""

    with open(prior_path, 'r', encoding='utf-8') as handle:
        return handle.read()

def create_prompt_with_previous_description(previous_description):
    """Build the GPT-4V prompt for one page of handwritten notes.

    The prompt (written in Portuguese) opens with the previous page's
    description when one is given, or with a sentence saying this appears to
    be the first page otherwise. It then asks for two sections: general
    observations about the image and a full transcription with the
    mathematics in LaTeX.
    """
    # Opening clause: either quote the earlier notes or flag the first page.
    if not previous_description:
        opening = "Parece ser a primeira página de anotações manuscritas, sem conteúdo anterior para referência."
    else:
        opening = f"Baseado nas notas anteriores fornecidas, que são: '{previous_description}',"

    # Task statement shared by both variants.
    task_intro = (
        " você agora receberá uma imagem de anotações manuscritas de um curso de Matemática. "
        "Após o texto, seguirá uma imagem. Sua tarefa é processar esta imagem em duas partes:\n\n"
    )

    # First requested section: free-form observations.
    observations_section = (
        "##OBSERVAÇÕES##\n"
        "Forneça considerações gerais sobre o conteúdo na imagem. Descreva quaisquer gráficos, desenhos ou elementos que não estejam claramente distinguíveis. "
        "Inclua sua compreensão do contexto do conteúdo, especialmente se ele se relacionar ou contrastar com quaisquer notas anteriores. Mencione qualquer coisa que possa ser omitida durante a transcrição.\n\n"
    )

    # Second requested section: the transcription itself.
    transcription_section = (
        "##TRANSCRIÇÃO EFETIVA##\n"
        "Transcreva todo o conteúdo visível da imagem. Converta todas as expressões matemáticas e equações para o formato LaTeX adequado. "
        "Inclua cada pedaço de texto, garantindo que a transcrição seja o mais precisa e completa possível."
    )

    return "".join((opening, task_intro, observations_section, transcription_section))

def _extract_message_text(response):
    """Return the first choice's message content, or 'No text found' if absent."""
    # `or` also covers an explicit empty list / None, which a plain
    # .get(..., default) would index into and crash on.
    choices = response.get('choices') or [{}]
    message = choices[0].get('message') or {}
    content = message.get('content')
    return 'No text found' if content is None else content


def process_png_images(folder_path, output_folder, delay_seconds=15):
    """Transcribe every numbered PNG in a folder and save one text file per page.

    Files named ``<number>.png`` are processed in numeric order. For each one
    the previously saved description is read from ``output_folder``, a prompt
    is built from it, the image is sent with ``send_image_to_gpt4v`` and the
    returned text is written to ``<number>.txt`` in ``output_folder``.

    Args:
        folder_path: Folder containing the page images.
        output_folder: Folder that receives the ``.txt`` files. It is created
            if needed and is also where the previous page's description is
            read from.
        delay_seconds: Pause between consecutive requests. Defaults to 15,
            the value that used to be hard-coded.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Keep only "<number>.png" names: anything else cannot be ordered by page
    # number and would make the int() sort key raise ValueError.
    page_stems = sorted(
        (
            stem
            for stem, extension in map(os.path.splitext, os.listdir(folder_path))
            if extension == '.png' and stem.isdecimal()
        ),
        key=int,
    )

    last_position = len(page_stems)
    for file_index, stem in enumerate(page_stems, start=1):
        file = f"{stem}.png"
        image_path = os.path.join(folder_path, file)
        print(image_path)

        # Read the earlier page from the folder this function writes to (the
        # parameter), not from a module-level variable of a similar name.
        # NOTE(review): the lookup uses the position in the sorted list, so it
        # matches the saved file names only when pages are numbered 1..N
        # without gaps - confirm if other numbering is ever used.
        previous_description = get_previous_description(output_folder, file_index)
        prompt = create_prompt_with_previous_description(previous_description)
        response = send_image_to_gpt4v(image_path, prompt)

        if not response.get('choices'):
            # Make API failures visible instead of silently saving a placeholder.
            print(f"Warning: no transcription returned for {file}: {response.get('error', response)}")

        extracted_text = _extract_message_text(response)
        output_file_path = os.path.join(output_folder, f"{stem}.txt")
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(extracted_text)

        print(f"Processed and saved OCR text for: {file}")

        # Pause only between requests, and only after the result is on disk,
        # so an interruption during the wait cannot lose a finished response.
        if file_index < last_position:
            time.sleep(delay_seconds)

# Paths to the folders
# Input: the per-page PNG files. Output: one .txt transcription per page.
png_folder_path = "../data/processed/aula13/pngs/"
ocr_text_folder = "../data/processed/aula13/ocr_text/"

# Process the images
# Sends one API request per PNG found in png_folder_path.
process_png_images(png_folder_path, ocr_text_folder)


../data/processed/aula13/pngs/1.png
Processed and saved OCR text for: 1.png
../data/processed/aula13/pngs/2.png
Processed and saved OCR text for: 2.png
../data/processed/aula13/pngs/3.png
Processed and saved OCR text for: 3.png
../data/processed/aula13/pngs/4.png
Processed and saved OCR text for: 4.png
../data/processed/aula13/pngs/5.png
Processed and saved OCR text for: 5.png
../data/processed/aula13/pngs/6.png
Processed and saved OCR text for: 6.png
../data/processed/aula13/pngs/7.png
Processed and saved OCR text for: 7.png
../data/processed/aula13/pngs/8.png
Processed and saved OCR text for: 8.png
../data/processed/aula13/pngs/9.png
Processed and saved OCR text for: 9.png
../data/processed/aula13/pngs/10.png
Processed and saved OCR text for: 10.png
../data/processed/aula13/pngs/11.png
Processed and saved OCR text for: 11.png
../data/processed/aula13/pngs/12.png
Processed and saved OCR text for: 12.png


# Concatenate descriptions

In [8]:
import os

def _page_sort_key(file_name):
    """Sort key placing '<number>.txt' files in numeric order before other names."""
    stem = os.path.splitext(file_name)[0]
    if stem.isdecimal():
        return (0, int(stem), file_name)
    return (1, 0, file_name)


def concatenate_text_files(ocr_text_folder, output_file_path):
    """
    Reads all .txt files in the specified folder and concatenates their content into a single file.

    Files whose name is a number (``1.txt``, ``2.txt``, ..., ``10.txt``) are
    joined in numeric page order; a plain alphabetical sort would place
    ``10.txt`` before ``2.txt``. Any other ``.txt`` files follow in
    alphabetical order. A newline is appended after each file's content.

    If the folder does not exist, a message is printed and nothing is written.

    Parameters:
    ocr_text_folder (str): The folder containing the .txt files.
    output_file_path (str): The path of the output file where the concatenated text will be saved.
    """
    # Ensure the folder exists
    if not os.path.exists(ocr_text_folder):
        print(f"The specified folder '{ocr_text_folder}' does not exist.")
        return

    # Get all .txt files in the folder, in page order
    txt_files = sorted(
        (name for name in os.listdir(ocr_text_folder) if name.endswith('.txt')),
        key=_page_sort_key,
    )

    # Collect each file's content followed by a newline, then join once
    # instead of growing a string with repeated +=.
    pieces = []
    for file in txt_files:
        file_path = os.path.join(ocr_text_folder, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            pieces.append(f.read() + '\n')

    # Write the concatenated text to the output file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(''.join(pieces))

    print(f"All text files have been concatenated into '{output_file_path}'.")

# Example usage
# Merge the per-page transcriptions into one text file next to the ocr_text folder.
ocr_text_folder = "../data/processed/aula13/ocr_text"
output_file_path = "../data/processed/aula13/combined_text.txt"
concatenate_text_files(ocr_text_folder, output_file_path)


All text files have been concatenated into '../data/processed/aula13/combined_text.txt'.


# Create Markdown

In [15]:
from openai import OpenAI

def read_file(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

def gpt4_completion(file_content, prompt):
    """Send ``file_content`` followed by ``prompt`` to the GPT-4 chat model.

    The two texts are joined with a blank line into a single user message,
    preceded by a system message that sets the "math editor" role.

    Returns the message content of the first choice in the completion.
    """
    # The client is created with no arguments, i.e. with its default settings.
    client = OpenAI()

    # Document first, instruction last, separated by a blank line.
    user_message = "\n\n".join((file_content, prompt))
    conversation = [
        {"role": "system", "content": "You are a math editor."},
        {"role": "user", "content": user_message},
    ]

    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=conversation,
    )

    # Only the first choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content

def save_to_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text, replacing any existing file."""
    with open(file_path, mode='w', encoding='utf-8') as sink:
        sink.write(content)

# Example usage
# Input is the combined OCR text; output is the cleaned-up Markdown document.
input_file_path = "../data/processed/aula13/combined_text.txt"
output_file_path = "../data/processed/aula13/formatted_notes.md"

file_content = read_file(input_file_path)

# Instruction (in Portuguese) asking for a clean, faithful English version of
# the notes in Markdown with LaTeX, without the observations sections.
# The backslash line continuations sit inside the string literal, so the
# leading spaces of the continued lines are part of the prompt text.
prompt = "Dadas as anotações acima, crie uma versão limpa, bem elaborada e completamente fiel às anotações originais dessas notas de aula, mas faça isso em Inglês. \
    Seu output deve estar em linguagem Markdown, incluindo todo o código LaTeX quando necessário \
        e títulos e subtítulos apropriados. Cheque seu trabalho ao final e corrija qualquer erro de Latex. Lembre-se o output final é em Inglês. Verifique que não existe nenhum erro de Latex. Nao inclua as observações. Apenas um texto limpo em ingles em Markdown."

# gpt4_completion places the file content before the prompt in the request.
response = gpt4_completion(file_content, prompt)

# Save the response to a file
save_to_file(output_file_path, response)

print(f"Formatted notes saved to {output_file_path}")


Formatted notes saved to ../data/processed/aula13/formatted_notes.md


# Check LaTeX

Use GPT-4 to check LaTeX syntax

# Turn Markdown into PDF

In [16]:
import pypandoc

# This will download Pandoc to a location accessible by pypandoc
# NOTE(review): this call runs every time the cell is executed; presumably it
# is only needed once per machine - confirm before keeping it in the pipeline.
pypandoc.download_pandoc()

def convert_md_to_pdf(input_file_path, output_file_path):
    """Convert a Markdown file to PDF with pypandoc and report the outcome.

    A success or failure message is printed. Any exception raised while
    converting is caught and printed rather than propagated, so the function
    returns ``None`` in both cases.
    """
    try:
        # Convert Markdown to PDF, writing the result to output_file_path.
        pypandoc.convert_file(input_file_path, 'pdf', outputfile=output_file_path)
        success_message = f"PDF successfully created at {output_file_path}"
        print(success_message)
    except Exception as conversion_error:
        # Best-effort: report the problem and carry on.
        print(f"An error occurred: {conversion_error}")

# Example usage
# Source is the Markdown written by the previous step; the PDF is placed beside it.
input_md_file = "../data/processed/aula13/formatted_notes.md"
output_pdf_file = "../data/processed/aula13/formatted_notes.pdf"

# Failures are printed by convert_md_to_pdf rather than raised.
convert_md_to_pdf(input_md_file, output_pdf_file)


An error occurred: Pandoc died with exitcode "43" during conversion: pdflatex: major issue: So far, you have not checked for MiKTeX updates.
pdflatex: major issue: So far, you have not checked for MiKTeX updates.
Error producing PDF.
! Package amsmath Error: \dot allowed only in math mode.

See the amsmath package documentation for explanation.
Type  H <return>  for immediate help.
 ...                                              
                                                  
l.62 A System {[} \dot{x}


