## Step 1

In [1]:
import fitz  # PyMuPDF
import os

def convert_pdf_to_images(pdf_path, output_folder='./output/pages'):
    os.makedirs(output_folder, exist_ok=True)
    doc = fitz.open(pdf_path)
    image_paths = []
    for i in range(len(doc)):
        page = doc[i]
        pix = page.get_pixmap(dpi=300)
        img_path = os.path.join(output_folder, f"{os.path.basename(pdf_path).replace('.pdf','')}_page_{i}.png")
        pix.save(img_path)
        image_paths.append(img_path)
    return image_paths

# Run this for both PDFs
integrals_pages = convert_pdf_to_images("./data/integrals.pdf", "./output/pages")
equations_pages = convert_pdf_to_images("./data/equations.pdf", "./output/pages")


In [2]:
import cv2
import numpy as np

def extract_symbols_from_page(image_path, output_folder="./output/symbols", min_area=100):
    os.makedirs(output_folder, exist_ok=True)
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Clean small noise
    kernel = np.ones((2,2), np.uint8)
    denoised = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    contours, _ = cv2.findContours(denoised, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for idx, cnt in enumerate(contours):
        x, y, w, h = cv2.boundingRect(cnt)
        if w * h > min_area:
            roi = denoised[y:y+h, x:x+w]
            resized = cv2.resize(roi, (28, 28))
            fname = f"{os.path.basename(image_path).replace('.png', '')}_sym{idx}.png"
            cv2.imwrite(os.path.join(output_folder, fname), resized)


In [3]:
for page in integrals_pages + equations_pages:
    extract_symbols_from_page(page)


## Step 2 

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os

class SymbolDataset(Dataset):
    def __init__(self, image_folder, label_file, transform=None):
        self.image_folder = image_folder
        self.transform = transform or transforms.ToTensor()

        # Load image paths and labels
        with open(label_file, 'r') as f:
            lines = f.read().strip().split('\n')
        self.samples = [line.split(',') for line in lines]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, label = self.samples[idx]
        image = Image.open(os.path.join(self.image_folder, img_path)).convert('L')
        if self.transform:
            image = self.transform(image)
        return image, label


In [5]:
import torch.nn as nn
import torch.nn.functional as F

class SymbolCNN(nn.Module):
    def __init__(self, num_classes):
        super(SymbolCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.25)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 28x28 -> 14x14
        x = self.pool(F.relu(self.conv2(x)))  # 14x14 -> 7x7
        x = x.view(-1, 64 * 7 * 7)
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        return self.fc2(x)


In [6]:
from sklearn.preprocessing import LabelEncoder

def train_model(dataset, num_classes, epochs=10, batch_size=64, lr=0.001):
    le = LabelEncoder()
    dataset.samples = [(img, le.fit_transform([label])[0]) for img, label in dataset.samples]
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = SymbolCNN(num_classes)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {running_loss:.4f}")

    return model, le


In [7]:
import os
import pandas as pd

def create_labels_csv(image_folder='./output/symbols', output_file='labels.csv'):
    image_files = sorted([f for f in os.listdir(image_folder) if f.endswith('.png')])
    df = pd.DataFrame({'filename': image_files, 'label': [''] * len(image_files)})
    df.to_csv(output_file, index=False)
    print(f"{len(image_files)} entries written to {output_file}")

# Run this
create_labels_csv()


83 entries written to labels.csv


In [8]:
def evaluate_model(model, dataset):
    model.eval()
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    correct, total = 0, 0

    with torch.no_grad():
        for image, label in loader:
            image = image.to(next(model.parameters()).device)
            output = model(image)
            pred = torch.argmax(output, dim=1).item()
            correct += int(pred == int(label))
            total += 1

    print(f"Accuracy: {correct / total:.4f}")


In [10]:
import pandas as pd

# Load labels.csv to count unique labels
df_labels = pd.read_csv('labels.csv')
unique_labels = df_labels['label'].dropna().unique()
num_classes = len(unique_labels)

# Setup transform and dataset
transform = transforms.Compose([transforms.ToTensor()])
dataset = SymbolDataset('./output/symbols', 'labels.csv', transform)

# Train model
model, encoder = train_model(dataset, num_classes=num_classes)



FileNotFoundError: [Errno 2] No such file or directory: '/Users/janbierowiec/Desktop/ML_Project/output/symbols/filename'