In [1]:
import fitz  # PyMuPDF
import os

def convert_pdf_to_images(pdf_path, output_folder='./output/pages', dpi=300):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF to rasterize.
        output_folder: Directory that receives the PNGs; created if missing.
        dpi: Rendering resolution passed to ``page.get_pixmap``.

    Returns:
        List of the written image paths, in page order. Each file is named
        ``<pdf stem>_page_<zero-based page index>.png``.
    """
    os.makedirs(output_folder, exist_ok=True)
    # splitext removes only the trailing extension (whatever its case), so a
    # name such as "scan.pdf_v2.PDF" keeps its inner ".pdf" and loses ".PDF".
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    image_paths = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as doc:
        for i in range(len(doc)):
            pix = doc[i].get_pixmap(dpi=dpi)
            img_path = os.path.join(output_folder, f"{stem}_page_{i}.png")
            pix.save(img_path)
            image_paths.append(img_path)
    return image_paths

# Rasterize all five operation PDFs; each variable holds that PDF's list of page-image paths
integral_page = convert_pdf_to_images(pdf_path="./data/PDFS/integration.pdf", output_folder="./output/pages")
addition_page = convert_pdf_to_images(pdf_path="./data/PDFS/addition.pdf", output_folder="./output/pages")
subtraction_page = convert_pdf_to_images(pdf_path="./data/PDFS/subtraction.pdf", output_folder="./output/pages")
multiplication_page = convert_pdf_to_images(pdf_path="./data/PDFS/multiplication.pdf", output_folder="./output/pages")
division_page = convert_pdf_to_images(pdf_path="./data/PDFS/division.pdf", output_folder="./output/pages")

In [2]:
import cv2
import numpy as np

def extract_symbols_from_page(image_path, output_folder="./output/symbols", min_area=100):
    """Cut individual symbol crops out of a page image and save them as 28x28 PNGs.

    The page is loaded as grayscale, binarized with an inverted Otsu
    threshold, cleaned with a 2x2 morphological opening, and split into
    external contours. Every contour whose bounding-box area exceeds
    ``min_area`` is cropped from the cleaned binary image, resized to 28x28
    and written to ``output_folder`` as ``<page stem>_sym<contour index>.png``.

    Args:
        image_path: Path of the page image to segment.
        output_folder: Directory that receives the crops; created if missing.
        min_area: Bounding-box area (w * h, in pixels) a contour must exceed
            to be kept.

    Raises:
        FileNotFoundError: If ``image_path`` is missing or cannot be decoded.
    """
    os.makedirs(output_folder, exist_ok=True)
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than deep inside cv2.threshold.
    if img is None:
        raise FileNotFoundError(f"Image is missing or unreadable: {image_path}")
    _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Clean small noise
    kernel = np.ones((2, 2), np.uint8)
    denoised = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    contours, _ = cv2.findContours(denoised, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # splitext strips only the trailing extension, whatever it is, instead of
    # deleting every ".png" substring found in the file name.
    stem = os.path.splitext(os.path.basename(image_path))[0]

    for idx, cnt in enumerate(contours):
        x, y, w, h = cv2.boundingRect(cnt)
        if w * h > min_area:
            roi = denoised[y:y+h, x:x+w]
            resized = cv2.resize(roi, (28, 28))
            # idx is the contour's position in the full list, so numbering
            # has gaps wherever a contour was filtered out by min_area.
            fname = f"{stem}_sym{idx}.png"
            cv2.imwrite(os.path.join(output_folder, fname), resized)

# Segment symbols from every rendered page, walking the per-PDF lists in conversion order
for page in [*integral_page, *addition_page, *subtraction_page, *multiplication_page, *division_page]:
    extract_symbols_from_page(page)

In [3]:
import os
import pandas as pd

def create_labels_csv(image_folder='./output/symbols', output_file='labels.csv'):
    """Write a two-column CSV (``filename``, ``label``) listing the symbol images.

    Every ``.png`` file in ``image_folder`` gets one row, sorted by name. If
    ``output_file`` already exists and has ``filename`` and ``label`` columns,
    labels recorded there are carried over for files that are still present,
    so re-running the cell does not wipe labels that were already filled in.
    Files without a recorded label get an empty label.

    Args:
        image_folder: Directory scanned for ``.png`` files.
        output_file: Path of the CSV to write.
    """
    image_files = sorted(f for f in os.listdir(image_folder) if f.endswith('.png'))

    known_labels = {}
    if os.path.exists(output_file):
        try:
            # dtype=str / keep_default_na=False keep labels exactly as typed
            # (e.g. "1" stays a string, "" and "NA" are not turned into NaN).
            previous = pd.read_csv(output_file, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            previous = pd.DataFrame()
        if {'filename', 'label'} <= set(previous.columns):
            known_labels = dict(zip(previous['filename'], previous['label']))

    df = pd.DataFrame({
        'filename': image_files,
        'label': [known_labels.get(f, '') for f in image_files],
    })
    df.to_csv(output_file, index=False)
    print(f"{len(image_files)} entries written to {output_file}")

# Write labels.csv with one row per symbol image found in ./output/symbols
create_labels_csv(image_folder='./output/symbols', output_file='labels.csv')

27 entries written to labels.csv
