In [1]:
import sys
sys.path.append("..")
import os
import base64
import requests
import time

from src.image_processing.image_utils import convert_pdf_to_png, encode_image_to_base64
from src.image_processing.ocr import send_image_to_gpt4v
from src.text_processing.text_utils import get_previous_description, concatenate_text_files
from src.utils.prompts import create_prompt_with_previous_description

# Read the OpenAI API key from the environment instead of hard-coding it.
# NOTE(review): `api_key` is not referenced again in the visible cells; presumably the
# helper modules / OpenAI client read OPENAI_API_KEY themselves — confirm.
api_key = os.getenv('OPENAI_API_KEY')

# Turn the PDF into separate PNGs (one per PDF page)

In [4]:
# Input PDF and the local Poppler binaries directory handed to the conversion helper.
pdf_path = '../data/notes/aula13.pdf'  # Replace with your PDF file path
poppler_path = r'../venv/poppler-23.11.0/Library/bin'  # Specify your Poppler bin path

# NOTE(review): presumably writes one PNG per page under ../data/processed/aula13/pngs/
# (the folder the OCR cell reads from) — confirm against convert_pdf_to_png.
convert_pdf_to_png(pdf_path, poppler_path)

# Extract text from image

In [None]:
def process_png_images(folder_path, output_folder):
    """Run OCR on every PNG in ``folder_path`` and save one ``.txt`` file per image.

    Images are processed in numeric order of the part of the file name before
    the first dot (``1.png``, ``2.png``, ...). For each page the previously
    saved description is looked up in ``output_folder`` and folded into the
    prompt, so earlier pages must already have been written there.

    Args:
        folder_path: Directory containing the page images.
        output_folder: Directory that receives the extracted text; it is
            created if missing and is also where earlier pages are read from.

    Raises:
        ValueError: If a PNG file name does not start with an integer.
    """
    png_files = [name for name in os.listdir(folder_path) if name.endswith('.png')]
    png_files_sorted = sorted(png_files, key=lambda name: int(name.split('.')[0]))

    os.makedirs(output_folder, exist_ok=True)

    for file_index, file in enumerate(png_files_sorted, start=1):
        image_path = os.path.join(folder_path, file)
        print(image_path)
        # Look up earlier pages in the folder this call writes to. The original
        # read the notebook-global `ocr_text_folder`, which silently ignored the
        # `output_folder` argument.
        previous_description = get_previous_description(output_folder, file_index)
        prompt = create_prompt_with_previous_description(previous_description)
        response = send_image_to_gpt4v(image_path, prompt)
        # Fixed pause after every request (spacing between API calls).
        time.sleep(15)

        # A response with a missing or empty `choices` list falls back to the
        # placeholder text instead of raising IndexError.
        choices = response.get('choices') or [{}]
        extracted_text = choices[0].get('message', {}).get('content', 'No text found')
        if extracted_text is None:
            # `write` rejects None; keep the same placeholder as a missing key.
            extracted_text = 'No text found'

        output_file_path = os.path.join(output_folder, file.replace('.png', '.txt'))
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(extracted_text)

        print(f"Processed and saved OCR text for: {file}")

# Folder holding the page PNGs, and folder that receives the per-page OCR text.
png_folder_path = "../data/processed/aula13/pngs/"
ocr_text_folder = "../data/processed/aula13/ocr_text/"

# Slow cell: process_png_images sleeps 15 seconds after every image it sends.
process_png_images(png_folder_path, ocr_text_folder)

# Concatenate descriptions

In [8]:
# Merge the per-page OCR text files into a single file for the formatting step.
# Note: this rebinds `ocr_text_folder` without the trailing slash used in the OCR cell.
ocr_text_folder = "../data/processed/aula13/ocr_text"
output_file_path = "../data/processed/aula13/combined_text.txt"
concatenate_text_files(ocr_text_folder, output_file_path)


All text files have been concatenated into '../data/processed/aula13/combined_text.txt'.


# Create Markdown

In [None]:
import os

import tiktoken
from openai import OpenAI

def read_file(file_path):
    """Return the entire contents of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def measure_token_count(text):
    """Return the number of tokens ``text`` occupies for the GPT-4 chat model.

    Uses the tiktoken encoding registered for ``gpt-4-1106-preview`` (the model
    this notebook calls) and falls back to ``cl100k_base`` if the installed
    tiktoken does not recognise that model name.

    Args:
        text: The string to measure.

    Returns:
        The token count as an int.
    """
    # tiktoken exposes no `Tokenizer` class; encodings are obtained through
    # `encoding_for_model` / `get_encoding`. Imported locally so the function
    # does not depend on the cell-level import line.
    import tiktoken

    try:
        encoding = tiktoken.encoding_for_model("gpt-4-1106-preview")
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    # disallowed_special=() makes text that happens to contain special-token
    # markers count as ordinary text instead of raising ValueError.
    return len(encoding.encode(text, disallowed_special=()))

def gpt4_completion(context, file_content, prompt):
    """Send ``context + file_content``, a blank line, then ``prompt`` to the chat model.

    The request uses model ``gpt-4-1106-preview`` with the system message
    "You are a math editor."; the text of the first returned choice is returned.
    """
    client = OpenAI()  # a new client is created on every call
    user_message = "".join((context, file_content, "\n\n", prompt))
    conversation = [
        {"role": "system", "content": "You are a math editor."},
        {"role": "user", "content": user_message},
    ]
    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def save_to_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8, replacing any existing file."""
    with open(file_path, mode='w', encoding='utf-8') as sink:
        sink.write(content)

def split_text(file_content, split_size):
    """Split ``file_content`` into pieces of at most ``split_size`` words.

    Words are whitespace-separated; each piece is its words re-joined with a
    single space, so the original whitespace layout is not preserved.
    """
    words = file_content.split()
    pieces = []
    for offset in range(0, len(words), split_size):
        window = words[offset:offset + split_size]
        pieces.append(' '.join(window))
    return pieces

# Configuration
# Inputs measuring at most this many tokens are sent in a single request;
# anything larger goes through the chunked branch below.
token_threshold = 3500  # Adjust as needed

# File paths
input_file_path = "../data/processed/aula13/combined_text.txt"
output_file_path = "../data/processed/aula13/formatted_notes.md"

# Read and measure the text
file_content = read_file(input_file_path)
total_tokens = measure_token_count(file_content)


# Portuguese instruction appended after the notes in every request: it asks for a
# clean, faithful English rewrite in Markdown with LaTeX and with LaTeX errors checked.
prompt = "Dadas as anotações acima, crie uma versão limpa, bem elaborada e completamente fiel às anotações originais dessas notas de aula, mas faça isso em Inglês. \
    Seu output deve estar em linguagem Markdown, incluindo todo o código LaTeX quando necessário \
        e títulos e subtítulos apropriados. Cheque seu trabalho ao final e corrija qualquer erro de Latex. Lembre-se o output final é em Inglês. Verifique que não existe nenhum erro de Latex. Nao inclua as observações. Apenas um texto limpo em ingles em Markdown."

# Determine processing strategy
if total_tokens <= token_threshold:
    # Process in one go
    response = gpt4_completion("", file_content, prompt)
else:
    # Split and process in chunks
    # NOTE(review): split_text counts whitespace-separated words, not tokens, so
    # `split_size` is a word count; halving the threshold is presumably a safety
    # margin for the words-to-tokens ratio — confirm.
    split_size = token_threshold // 2
    chunks = split_text(file_content, split_size)
    context = ""
    for chunk in chunks:
        # NOTE(review): `context` holds every response so far and is prepended to each
        # later request, so request size grows with every chunk — confirm this is intended.
        response_chunk = gpt4_completion(context, chunk, prompt)
        context += response_chunk  # Update context with the output for the next chunk

    response = context  # Final combined response

# Save the response
save_to_file(output_file_path, response)
print(f"Formatted notes saved to {output_file_path}")


In [None]:
from openai import OpenAI
import os

def read_file(file_path):
    """Read ``file_path`` as UTF-8 text and return everything in it."""
    with open(file_path, 'r', encoding='utf-8') as source:
        text = source.read()
    return text

def gpt4_completion(file_content, prompt):
    """Ask the chat model to apply ``prompt`` to ``file_content``.

    The user message is ``file_content``, a blank line, then ``prompt``; the
    system message is "You are a math editor." and the model is
    ``gpt-4-1106-preview``. Returns the text of the first choice.
    """
    client = OpenAI()  # a new client is created on every call
    request = {
        "model": "gpt-4-1106-preview",
        "messages": [
            {"role": "system", "content": "You are a math editor."},
            {"role": "user", "content": file_content + "\n\n" + prompt},
        ],
    }
    reply = client.chat.completions.create(**request)
    return reply.choices[0].message.content

def save_to_file(file_path, content):
    """Overwrite ``file_path`` with ``content``, encoded as UTF-8."""
    with open(file_path, 'w', encoding='utf-8') as destination:
        destination.write(content)

def chunk_text(file_content, chunk_size, overlap_size):
    """Split ``file_content`` into character chunks that overlap their successor.

    Chunk starts advance by ``chunk_size`` characters. Every chunk is extended
    by up to ``overlap_size`` characters of the text that follows it, so the
    tail of one chunk repeats at the head of the next. Chunking stops as soon
    as a chunk reaches the end of the text; no further chunk is emitted that
    would consist only of already-covered overlap.

    Args:
        file_content: The text to split.
        chunk_size: Number of new characters each chunk contributes (> 0).
        overlap_size: Extra characters appended from the following text (>= 0).

    Returns:
        A list of chunk strings; empty when ``file_content`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap_size`` is
            negative.
    """
    # A non-positive chunk_size never advances `start` (an endless loop in the
    # original), and a negative overlap would silently drop text between chunks.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap_size < 0:
        raise ValueError("overlap_size must not be negative")

    total = len(file_content)
    chunks = []
    for start in range(0, total, chunk_size):
        end = min(start + chunk_size + overlap_size, total)
        chunks.append(file_content[start:end])
        if end == total:
            # This chunk already covers the rest of the text.
            break
    return chunks

def process_and_merge_chunks(chunks, prompt, overlap=None):
    """Send each chunk to the model and concatenate the responses.

    Each chunk after the first begins with text that the previous chunk
    already covered, so the first ``overlap`` characters of its response are
    dropped before merging.

    Args:
        chunks: Text chunks in document order.
        prompt: Instruction appended to every chunk.
        overlap: Number of leading characters to drop from every response
            after the first. Defaults to the notebook-level ``overlap_size``.

    Returns:
        The merged response text.
    """
    # The original sliced off len(previous chunk) - overlap_size characters,
    # i.e. roughly a whole chunk_size, which discarded most of every later
    # response instead of just the overlapping lead-in.
    trim = overlap_size if overlap is None else overlap

    merged_parts = []
    for i, chunk in enumerate(chunks):
        response_chunk = gpt4_completion(chunk, prompt)
        if i > 0:
            # NOTE(review): the response is rewritten text, so a character
            # count taken from the input is only an approximation of where
            # the overlap ends in the output.
            response_chunk = response_chunk[trim:]
        merged_parts.append(response_chunk)
    return "".join(merged_parts)

# Configuration
# Both sizes are character counts: chunk_text slices the string by index.
chunk_size = 3000  # Adjust as needed
overlap_size = 500  # Adjust as needed

# File paths
input_file_path = "../data/processed/aula13/combined_text.txt"
output_file_path = "../data/processed/aula13/formatted_notes.md"

# Read, chunk, and process the text
file_content = read_file(input_file_path)
chunks = chunk_text(file_content, chunk_size, overlap_size)
# NOTE(review): `prompt` is not assigned anywhere in this cell; it must already exist
# in the kernel from another cell, so this fails on a fresh Restart & Run All of this cell alone.
processed_content = process_and_merge_chunks(chunks, prompt)

# Save the final merged content
save_to_file(output_file_path, processed_content)
print(f"Formatted notes saved to {output_file_path}")


In [15]:
from openai import OpenAI

def read_file(file_path):
    """Load the UTF-8 text stored at ``file_path`` and return it as one string."""
    with open(file_path, 'r', encoding='utf-8') as stream:
        loaded = stream.read()
    return loaded

def gpt4_completion(file_content, prompt):
    """Return the model's reply to ``file_content`` followed by ``prompt``.

    The two strings are joined with a blank line and sent as the user message
    to ``gpt-4-1106-preview`` under the system message "You are a math editor.";
    the content of the first choice is returned.
    """
    client = OpenAI()  # a new client is created on every call

    system_turn = {"role": "system", "content": "You are a math editor."}
    user_turn = {"role": "user", "content": file_content + "\n\n" + prompt}

    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[system_turn, user_turn],
    )
    return completion.choices[0].message.content

def save_to_file(file_path, content):
    """Store ``content`` at ``file_path`` as UTF-8 text, truncating any previous file."""
    with open(file_path, 'w', encoding='utf-8') as out:
        out.write(content)

# Example usage
# Single-request variant: the whole combined text is sent in one call, with no chunking.
input_file_path = "../data/processed/aula13/combined_text.txt"
output_file_path = "../data/processed/aula13/formatted_notes.md"

file_content = read_file(input_file_path)

# Portuguese instruction appended after the notes: it asks for a clean, faithful
# English rewrite in Markdown with LaTeX and with LaTeX errors checked.
prompt = "Dadas as anotações acima, crie uma versão limpa, bem elaborada e completamente fiel às anotações originais dessas notas de aula, mas faça isso em Inglês. \
    Seu output deve estar em linguagem Markdown, incluindo todo o código LaTeX quando necessário \
        e títulos e subtítulos apropriados. Cheque seu trabalho ao final e corrija qualquer erro de Latex. Lembre-se o output final é em Inglês. Verifique que não existe nenhum erro de Latex. Nao inclua as observações. Apenas um texto limpo em ingles em Markdown."

response = gpt4_completion(file_content, prompt)

# Save the response to a file
save_to_file(output_file_path, response)

print(f"Formatted notes saved to {output_file_path}")


Formatted notes saved to ../data/processed/aula13/formatted_notes.md


# Check LaTeX

Use GPT-4 to check LaTeX syntax

# Turn Markdown into PDF

In [16]:
import pypandoc

# This will download Pandoc to a location accessible by pypandoc
# NOTE(review): this call runs every time the cell is executed; presumably only
# needed once per environment — confirm and consider guarding it.
pypandoc.download_pandoc()

def convert_md_to_pdf(input_file_path, output_file_path):
    """Convert the Markdown file at ``input_file_path`` to a PDF at ``output_file_path``.

    Best-effort: any exception raised by the conversion is caught and reported
    on stdout rather than propagated, so the caller only sees a printed message.
    """
    try:
        # pypandoc writes the PDF directly to `outputfile`.
        pypandoc.convert_file(input_file_path, 'pdf', outputfile=output_file_path)
    except Exception as error:
        print(f"An error occurred: {error}")
    else:
        print(f"PDF successfully created at {output_file_path}")

# Example usage
input_md_file = "../data/processed/aula13/formatted_notes.md"
output_pdf_file = "../data/processed/aula13/formatted_notes.pdf"

# The recorded run of this cell failed inside pdflatex with
# "\dot allowed only in math mode", i.e. the generated Markdown contained LaTeX
# outside math delimiters; the input file needs fixing before a PDF can be produced.
convert_md_to_pdf(input_md_file, output_pdf_file)


An error occurred: Pandoc died with exitcode "43" during conversion: pdflatex: major issue: So far, you have not checked for MiKTeX updates.
pdflatex: major issue: So far, you have not checked for MiKTeX updates.
Error producing PDF.
! Package amsmath Error: \dot allowed only in math mode.

See the amsmath package documentation for explanation.
Type  H <return>  for immediate help.
 ...                                              
                                                  
l.62 A System {[} \dot{x}


