In [None]:
import pandas as pd
import numpy as np

# Each sample is a flattened 28x28 image stored in columns pixel0 .. pixel783.
pixel_cols = ['pixel' + str(idx) for idx in range(28 * 28)]


def _prepare_symbol_frame(frame, source_name):
    """Tag, normalise and stringify one raw symbol table in place.

    Adds a 'source' column holding ``source_name``, divides every pixel
    column by 255.0, and casts 'label' to ``str``. The same (mutated)
    frame is returned so the call can be used inline.
    """
    frame['source'] = source_name
    frame[pixel_cols] = frame[pixel_cols] / 255.0
    frame['label'] = frame['label'].astype(str)
    return frame


# Greek letters (the inverted CSV is the main dataset).
greek_df = _prepare_symbol_frame(pd.read_csv("greek_letters_inverted.csv"), 'greek')

# Math operators (the inverted CSV is the main dataset).
math_df = _prepare_symbol_frame(pd.read_csv("math_operations_inverted.csv"), 'math')

# EMNIST ships as header-less train/test CSVs: first column is the label,
# the remaining columns are the pixels.
emnist_train = pd.read_csv("./data/EMNIST/emnist-byclass-train.csv", header=None)
emnist_test = pd.read_csv("./data/EMNIST/emnist-byclass-test.csv", header=None)
emnist_df = pd.concat([emnist_train, emnist_test], ignore_index=True)
emnist_df.columns = ['label', *pixel_cols]

# Optional: map EMNIST numeric labels to characters before the string cast
# (skip if already in correct form):
# emnist_df['label'] = emnist_df['label'].map(mapping_dict)
emnist_df = _prepare_symbol_frame(emnist_df, 'emnist')

# Stack all three sources into one table.
combined_df = pd.concat([emnist_df, greek_df, math_df], ignore_index=True)

print("✅ Combined dataset shape:", combined_df.shape)
print("🧪 Unique labels:", sorted(combined_df['label'].unique()))
print("🔢 Sources:", combined_df['source'].value_counts())


In [None]:
import fitz  # PyMuPDF
import os

def convert_pdf_to_images(pdf_path, output_folder='./output/pages'):
    """Render every page of a PDF to a PNG file at 300 dpi.

    Args:
        pdf_path: Path of the PDF to rasterise.
        output_folder: Directory that receives the PNGs; created if missing.

    Returns:
        List of written image paths in page order. Files are named
        ``<pdf stem>_page_<zero-based index>.png``.
    """
    os.makedirs(output_folder, exist_ok=True)

    # splitext strips only the real extension (whatever its case) instead of
    # deleting every ".pdf" substring from the file name.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]

    image_paths = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as doc:
        for page_index, page in enumerate(doc):
            pix = page.get_pixmap(dpi=300)
            img_path = os.path.join(output_folder, f"{stem}_page_{page_index}.png")
            pix.save(img_path)
            image_paths.append(img_path)
    return image_paths

# Rasterise both source PDFs (integrals first, then equations) into the
# shared pages folder and keep the resulting image paths per document.
integrals_pages, equations_pages = (
    convert_pdf_to_images(pdf_file, "./output/pages")
    for pdf_file in ("./data/integrals.pdf", "./data/equations.pdf")
)


In [None]:
import cv2
import numpy as np

def extract_symbols_from_page(image_path, output_folder="./output/symbols", min_area=100):
    """Cut a page image into 28x28 symbol crops and save them as PNGs.

    The page is Otsu-thresholded with inversion, cleaned with a 2x2
    morphological opening, and split by external contours. Each contour
    whose bounding box area exceeds ``min_area`` is cropped from the cleaned
    binary image, resized to 28x28 and written to
    ``<output_folder>/<page stem>_sym<contour index>.png``.

    Args:
        image_path: Path of the page image to segment.
        output_folder: Directory that receives the crops; created if missing.
        min_area: Bounding boxes with ``w * h`` at or below this are skipped.

    Returns:
        List of the crop paths that were written.

    Raises:
        FileNotFoundError: ``image_path`` does not exist.
        ValueError: the file exists but OpenCV could not decode it.
    """
    os.makedirs(output_folder, exist_ok=True)

    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # imread reports failure by returning None; fail here with a clear
        # message instead of an opaque error inside cv2.threshold.
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Page image not found: {image_path}")
        raise ValueError(f"Could not decode page image: {image_path}")

    _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Clean small noise
    kernel = np.ones((2, 2), np.uint8)
    denoised = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    contours, _ = cv2.findContours(denoised, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Strip only the real extension rather than every ".png" substring.
    page_stem = os.path.splitext(os.path.basename(image_path))[0]

    saved_paths = []
    for idx, cnt in enumerate(contours):
        x, y, w, h = cv2.boundingRect(cnt)
        if w * h > min_area:
            roi = denoised[y:y+h, x:x+w]
            resized = cv2.resize(roi, (28, 28))
            out_path = os.path.join(output_folder, f"{page_stem}_sym{idx}.png")
            cv2.imwrite(out_path, resized)
            saved_paths.append(out_path)
    return saved_paths


In [None]:
# Segment every rendered page: all integrals pages first, then the equations.
for page in [*integrals_pages, *equations_pages]:
    extract_symbols_from_page(page)

In [None]:
import os
import pandas as pd

def create_labels_csv(image_folder='./output/symbols', output_file='labels.csv', keep_existing=True):
    """Write a ``filename,label`` CSV with one row per PNG in ``image_folder``.

    Rows are sorted by file name. Because this cell is re-executed on every
    full notebook run, labels already present in ``output_file`` are carried
    over by file name instead of being reset to blank.

    Args:
        image_folder: Directory scanned (non-recursively) for ``*.png`` files.
        output_file: CSV path to create or refresh.
        keep_existing: When True (default) and ``output_file`` already exists
            with 'filename' and 'label' columns, reuse its labels. Pass False
            to start from blank labels.
    """
    image_files = sorted(f for f in os.listdir(image_folder) if f.endswith('.png'))

    known_labels = {}
    if keep_existing and os.path.exists(output_file):
        try:
            # dtype=str / keep_default_na=False keep labels such as "1" or ""
            # exactly as written instead of turning them into numbers or NaN.
            previous = pd.read_csv(output_file, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            previous = pd.DataFrame()
        if {'filename', 'label'} <= set(previous.columns):
            known_labels = dict(zip(previous['filename'], previous['label']))

    df = pd.DataFrame({
        'filename': image_files,
        'label': [known_labels.get(name, '') for name in image_files],
    })
    df.to_csv(output_file, index=False)
    print(f"{len(image_files)} entries written to {output_file}")

# Build the labelling sheet for the extracted symbol crops.
create_labels_csv(image_folder='./output/symbols', output_file='labels.csv')

## Step 2: Build the dataset, train the symbol classifier, and parse PDFs

In [None]:
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import pandas as pd
import os

class SymbolDataset(Dataset):
    """Dataset of symbol images listed in a ``filename,label`` CSV.

    Args:
        image_dir: Directory containing the image files named in the CSV.
        label_csv: CSV with 'filename' and 'label' columns.
        transform: Callable applied to each grayscale PIL image; defaults to
            ``transforms.ToTensor()``.

    Each item is ``(transformed_image, label)`` where ``label`` is always a
    ``str`` (an empty string for rows that have not been labelled yet).
    """

    def __init__(self, image_dir, label_csv, transform=None):
        self.image_dir = image_dir
        # Read both columns as plain text, consistent with the string labels
        # used for the other datasets in this notebook: without this, pandas
        # turns an all-digit label column into numbers and blank labels into
        # NaN floats.
        self.labels_df = pd.read_csv(
            label_csv,
            dtype={'filename': str, 'label': str},
            keep_default_na=False,
        )
        self.transform = transform if transform is not None else transforms.ToTensor()

    def __len__(self):
        return len(self.labels_df)

    def __getitem__(self, idx):
        row = self.labels_df.iloc[idx]
        img_path = os.path.join(self.image_dir, row['filename'])

        # convert() returns an independent image, so the file handle can be
        # released as soon as the grayscale copy exists.
        with Image.open(img_path) as raw:
            image = raw.convert('L')  # grayscale
        image = self.transform(image)

        return image, row['label']


In [None]:
from collections import Counter
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Extract labels and image tensors.
all_labels = combined_df['label'].tolist()

# Select the pixel columns by name: a positional slice such as row[2:] would
# drop pixel0 and pull in the trailing string 'source' column. Converting the
# whole block at once also avoids a Python-level loop over every row.
# copy=True guarantees an owned, writable array for torch.from_numpy.
pixel_matrix = combined_df[pixel_cols].to_numpy(dtype='float32', copy=True)

# Shape (N, 1, 28, 28): the explicit channel axis is what nn.Conv2d expects.
images = torch.from_numpy(pixel_matrix).reshape(-1, 1, 28, 28)

# Count label frequencies
label_counts = Counter(all_labels)

# Filter to labels with at least 2 samples (a stratified split needs at
# least two members per class).
valid_indices = [i for i, label in enumerate(all_labels) if label_counts[label] >= 2]
filtered_images = images[torch.tensor(valid_indices, dtype=torch.long)]
filtered_labels = [all_labels[i] for i in valid_indices]

# Encode labels
le = LabelEncoder()
encoded_labels = le.fit_transform(filtered_labels)

# Images are already a single stacked tensor.
images_tensor = filtered_images

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    images_tensor, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels
)

print(f"✅ Dataset filtered to {len(filtered_labels)} samples across {len(le.classes_)} classes")



In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Bar chart of how many samples carry each label.
counts = Counter(all_labels)
plt.bar(list(counts.keys()), list(counts.values()))
plt.title("Label Frequency")
plt.xticks(rotation=90)
plt.show()


In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

class SymbolCNN(nn.Module):
    """Two-block convolutional classifier for 28x28 single-channel symbols.

    Args:
        num_classes: Size of the final linear layer (number of logits).

    ``forward`` accepts batches shaped (N, 1, 28, 28) or channel-less batches
    shaped (N, 28, 28) and returns logits shaped (N, num_classes).
    """

    def __init__(self, num_classes):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),  # output: 32x28x28
            nn.ReLU(),
            nn.MaxPool2d(2),                             # output: 32x14x14
            nn.Conv2d(32, 64, kernel_size=3, padding=1), # output: 64x14x14
            nn.ReLU(),
            nn.MaxPool2d(2),                             # output: 64x7x7
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        # A 3-D input is a batch of channel-less images (N, H, W). Without an
        # explicit channel axis Conv2d would read N as the channel count and
        # fail, so insert the single grayscale channel here.
        if x.dim() == 3:
            x = x.unsqueeze(1)
        return self.model(x)

# Pick the compute device once; later cells read this global.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Wrap the split tensors so they can be batched. Only the training loader
# shuffles; the test loader keeps a fixed order.
train_ds = TensorDataset(X_train, torch.tensor(y_train))
test_ds = TensorDataset(X_test, torch.tensor(y_test))
train_loader = DataLoader(train_ds, shuffle=True, batch_size=64)
test_loader = DataLoader(test_ds, batch_size=64)

# One output logit per class known to the label encoder.
model = SymbolCNN(num_classes=len(le.classes_))
model = model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
def train(model, loader, criterion, optimizer, epochs=10):
    """Run a plain supervised training loop and print the loss per epoch.

    The model is put in training mode once, then every batch from ``loader``
    is moved to the module-level ``device`` and used for one optimizer step.
    The figure printed after each epoch is the sum of the per-batch loss
    values, not their mean. Nothing is returned.
    """
    model.train()

    def run_batch(batch_inputs, batch_targets):
        # One optimisation step; returns the scalar loss of this batch.
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for batch_inputs, batch_targets in loader:
            running_loss += run_batch(batch_inputs.to(device), batch_targets.to(device))

        print(f"Epoch {epoch_no}/{epochs}, Loss: {running_loss:.4f}")

# Fit the classifier on the training split for ten epochs.
train(model, train_loader, criterion, optimizer, epochs=10)


In [None]:
def evaluate(model, loader):
    """Measure top-1 accuracy of ``model`` over every batch in ``loader``.

    The model is switched to eval mode and gradients are disabled. Batches
    are moved to the module-level ``device`` before the forward pass.

    Returns:
        The fraction of correctly classified samples, which is also printed.
        If the loader yields no samples, a notice is printed and
        ``float('nan')`` is returned instead of dividing by zero.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = torch.argmax(model(images), dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        print("⚠️ Accuracy: n/a (loader produced no samples)")
        return float('nan')

    accuracy = correct / total
    print(f"✅ Accuracy: {accuracy:.4f}")
    return accuracy

# Report accuracy on the held-out split.
evaluate(model, loader=test_loader)


In [None]:
from pdf2image import convert_from_path
import cv2
from PIL import Image
import torch

def extract_symbols_with_positions(pdf_path, model, label_encoder, transform, output_expr=None):
    """Detect, classify and locate every symbol on each page of a PDF.

    Pages are rendered at 300 dpi, converted to grayscale and binarised with
    a fixed inverse threshold of 200. Each external contour at least 5 px
    wide and 5 px tall is cropped, resized to 28x28, passed through
    ``transform`` and classified by ``model`` on the module-level ``device``.

    Args:
        pdf_path: PDF file to parse.
        model: Classifier; it is switched to eval mode here.
        label_encoder: Object whose ``inverse_transform`` maps class indices
            back to labels.
        transform: Callable turning a PIL image into a tensor for ``model``.
        output_expr: Optional list to append the per-page results to. A fresh
            list is created when omitted.

    Returns:
        ``output_expr`` with one entry appended per page. Each entry is a list
        of dicts with keys 'label', 'x', 'y', 'w', 'h', sorted by ``(y, x)``.
    """
    # A literal [] default would be created once and shared by every call,
    # so results of earlier PDFs would leak into later ones.
    if output_expr is None:
        output_expr = []

    pages = convert_from_path(pdf_path, dpi=300)
    model.eval()

    for page_index, page in enumerate(pages):
        image = np.array(page.convert('L'))
        _, binary = cv2.threshold(image, 200, 255, cv2.THRESH_BINARY_INV)

        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        symbols = []
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            if w < 5 or h < 5:
                continue

            roi = binary[y:y+h, x:x+w]
            resized = cv2.resize(roi, (28, 28), interpolation=cv2.INTER_AREA)
            tensor_img = transform(Image.fromarray(resized)).unsqueeze(0)

            with torch.no_grad():
                pred = model(tensor_img.to(device))
                class_idx = torch.argmax(pred, dim=1).item()
                label = label_encoder.inverse_transform([class_idx])[0]

            symbols.append({'label': label, 'x': x, 'y': y, 'w': w, 'h': h})

        # Sort by reading order: top to bottom, left to right
        symbols = sorted(symbols, key=lambda b: (b['y'], b['x']))

        output_expr.append(symbols)
        print(f"✅ Parsed {len(symbols)} symbols from page {page_index+1}")

    return output_expr



In [None]:
def spatial_parse(symbols):
    """Turn positioned symbols into an expression string with crude '^' marks.

    Args:
        symbols: Iterable of dicts with 'label', 'x' and 'y' (top-left corner
            of the bounding box). An optional 'h' (box height) is used to
            decide which symbols share a text line.

    Returns:
        The labels concatenated line by line (top to bottom) and left to right
        within a line. A '^' is inserted after a symbol when the next symbol
        on the same line starts more than 10 px to its right and more than
        10 px above it.

    Symbols are grouped into a line when their vertical extents overlap, so a
    raised exponent stays on the line of its base. Sorting everything by
    ``(y, x)`` alone would put the exponent before its base and would never
    find a "next symbol that is higher", which left the superscript check
    unreachable. Without 'h', only symbols with equal 'y' share a line, which
    reproduces plain ``(y, x)`` ordering.
    """
    by_top = sorted(symbols, key=lambda s: (s['y'], s['x']))

    # Sweep from the top, merging symbols whose top edge falls inside the
    # vertical range already covered by the current line.
    lines = []
    line_bottom = None
    for sym in by_top:
        top = sym['y']
        bottom = top + sym.get('h', 0)
        if lines and top <= line_bottom:
            lines[-1].append(sym)
            line_bottom = max(line_bottom, bottom)
        else:
            lines.append([sym])
            line_bottom = bottom

    parts = []
    for line in lines:
        ordered = sorted(line, key=lambda s: s['x'])
        for pos, current in enumerate(ordered):
            parts.append(current['label'])

            # Check for next symbol (basic superscript detection)
            if pos + 1 < len(ordered):
                following = ordered[pos + 1]
                dx = following['x'] - current['x']
                dy = current['y'] - following['y']
                if dx > 10 and dy > 10:
                    parts.append('^')  # crude superscript indicator
    return "".join(parts)


In [None]:
import torch
from torchvision import transforms
from PIL import Image

# Put the in-memory model into inference mode and define the preprocessing
# applied to every symbol image: force one grayscale channel, then convert
# the image to a tensor.
model.eval()
transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.ToTensor(),
])

def classify_symbol(image_path, label_encoder=None):
    """Classify a single symbol image file and return its decoded label.

    Args:
        image_path: Path of the image to classify. It is converted to
            grayscale and passed through the module-level ``transform``.
        label_encoder: Object whose ``inverse_transform`` maps a class index
            to a label. Defaults to the notebook's fitted encoder ``le``;
            the previous body referenced an undefined global named
            ``label_encoder`` and raised NameError.

    Returns:
        The label predicted by the module-level ``model`` (argmax over the
        logits), decoded by the encoder.
    """
    encoder = le if label_encoder is None else label_encoder

    # convert() yields an independent image, so the file can be closed at once.
    with Image.open(image_path) as raw:
        gray = raw.convert("L")

    batch = transform(gray).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(batch)
        pred_idx = output.argmax(dim=1).item()
    return encoder.inverse_transform([pred_idx])[0]
