### Download data from Kaggle

In [None]:
# Install necessary libraries
!pip install torch torchvision torchaudio opencv-python matplotlib numpy


In [None]:
!mkdir dataset


In [None]:
import os

from google.colab import files

# Keep the return value in a variable: a bare `files.upload()` as the last
# expression makes the notebook echo the uploaded bytes (here, the Kaggle API
# token) into the cell output.
uploaded = files.upload()

# The Kaggle CLI looks for its credentials in ~/.kaggle/kaggle.json, so move
# the uploaded token there and restrict its permissions to the current user.
if "kaggle.json" in uploaded:
    kaggle_dir = os.path.expanduser("~/.kaggle")
    os.makedirs(kaggle_dir, exist_ok=True)
    kaggle_token_path = os.path.join(kaggle_dir, "kaggle.json")
    with open(kaggle_token_path, "wb") as token_file:
        token_file.write(uploaded["kaggle.json"])
    os.chmod(kaggle_token_path, 0o600)
    print(f"Kaggle credentials installed at: {kaggle_token_path}")
else:
    print("No file named 'kaggle.json' was uploaded; the Kaggle download step needs it.")

In [None]:
from google.colab import drive

# Attach Google Drive to the runtime under /content/gdrive.
drive.mount(mountpoint='/content/gdrive')

In [None]:
# Fetch the xainano/handwrittenmathsymbols dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured
# beforehand and writes the zip into the current working directory — confirm.
!kaggle datasets download -d xainano/handwrittenmathsymbols


In [None]:
import zipfile

# Location of the archive downloaded from Kaggle
dataset_path = "/content/handwrittenmathsymbols.zip"

# Unpack every member of the archive into the working folder
with zipfile.ZipFile(dataset_path, mode="r") as archive:
    archive.extractall(path="/content/handwritten_math_symbols")

print("Dataset extracted successfully!")


In [None]:
import os

# Show what the archive unpacked to
dataset_dir = "/content/handwritten_math_symbols"
extracted_entries = os.listdir(dataset_dir)
print(extracted_entries)


In [None]:
!apt-get install unrar


In [None]:
!unrar x "/content/handwritten_math_symbols/data.rar" "/content/handwritten_math_symbols_extracted/"



### Filter needed classes

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.utils.data as data

import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
from PIL import Image
from google.colab import drive

In [None]:
import os
import shutil
from torchvision import datasets, transforms

# Paths
original_dataset_path = "/content/handwritten_math_symbols_extracted/extracted_images"
filtered_dataset_path = "/content/filtered_dataset"  # New dataset path

# Selected classes (numbers, characters, basic arithmetic symbols).
# Entries without a matching class folder in the original dataset are skipped below.
selected_classes = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '(', ')', '+', '-', '=', 'forward_slash', 'times', ',', '.'
]

# Rebuild the filtered dataset from scratch so class folders left behind by an
# earlier run cannot linger next to the selected ones.
shutil.rmtree(filtered_dataset_path, ignore_errors=True)
os.makedirs(filtered_dataset_path, exist_ok=True)

# Copy only selected class folders to the new dataset path
for cls in selected_classes:
    # '.' and '..' are path components, not folder names: joining '.' onto the
    # dataset root yields the root itself, which always exists and would copy
    # EVERY class into the filtered dataset.
    if cls in (os.curdir, os.pardir):
        continue

    src_folder = os.path.join(original_dataset_path, cls)
    dest_folder = os.path.join(filtered_dataset_path, cls)

    if os.path.isdir(src_folder):  # Only copy if the class folder exists in the original dataset
        shutil.copytree(src_folder, dest_folder, dirs_exist_ok=True)  # Copy images


print(f"Filtered dataset saved at: {filtered_dataset_path}")


### Data Loader

In [None]:
# Define transformations
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),  # Convert to grayscale
    transforms.Resize((28, 28)),  # Resize to 28x28
    transforms.ToTensor(),  # Convert to tensor
    transforms.Normalize((0.5,), (0.5,))  # Normalize pixel values [-1,1]
])

# Load the NEW dataset (one class per sub-folder of filtered_dataset_path)
filtered_dataset = datasets.ImageFolder(root=filtered_dataset_path, transform=transform)

# 80/20 train/validation split.
# A seeded generator keeps the split identical across runs, so the validation
# accuracy of different runs is measured on the same held-out images.
train_size = int(0.8 * len(filtered_dataset))
val_size = len(filtered_dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    filtered_dataset, [train_size, val_size], generator=split_generator
)

# Create DataLoaders (only the training batches are shuffled)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

# Print class labels
classes = filtered_dataset.classes
print(f"Final Selected Classes: {classes}")


In [None]:
def imshow(img, label):
    """Display a single image tensor in grayscale, titled with its class name.

    Args:
        img: image tensor; converted to NumPy and squeezed before plotting.
        label: index into ``filtered_dataset.classes`` used for the title.
    """
    pixels = img.numpy().squeeze()  # drop singleton dims so matplotlib gets a 2-D array
    plt.imshow(pixels, cmap='gray')
    plt.title(f"Label: {filtered_dataset.classes[label]}")
    plt.show()

# Pull a single batch from the training loader
dataiter = iter(train_loader)
images, labels = next(dataiter)

# Preview the first 5 samples of that batch
for idx in range(5):
    imshow(images[idx], labels[idx])



In [None]:
# CNN Model
class MathSymbolCNN(nn.Module):
    """Convolutional classifier for single-channel 28x28 symbol images.

    Five 3x3 convolutions (two of them with stride 2, each halving the spatial
    size: 28 -> 14 -> 7) feed a three-layer fully connected head that emits
    one logit per class.
    """

    def __init__(self, num_classes):
        """Build the network.

        Args:
            num_classes: number of output logits (one per symbol class).
        """
        super().__init__()

        # (in_channels, out_channels, stride) for each 3x3 convolution, in order.
        conv_specs = [
            (1, 32, 1),   # Conv1
            (32, 32, 2),  # Conv2 (stride=2 downsamples)
            (32, 64, 1),  # Conv3
            (64, 64, 2),  # Conv4 (stride=2 downsamples)
            (64, 32, 1),  # Conv5
        ]
        conv_stack = []
        for in_channels, out_channels, stride in conv_specs:
            conv_stack.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
            )
            conv_stack.append(nn.ReLU())
        self.conv_layers = nn.Sequential(*conv_stack)

        flattened_features = 32 * 7 * 7  # channels * height * width after the conv stack
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_features, 768),  # Fully connected layer 1
            nn.ReLU(),
            nn.Linear(768, 128),  # Fully connected layer 2
            nn.ReLU(),
            nn.Linear(128, num_classes),  # Output layer (num_classes = number of symbols)
        )

    def forward(self, x):
        """Return class logits for a batch of images."""
        features = self.conv_layers(x)
        return self.fc_layers(features)


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Initialize model with one output logit per entry in `classes`
num_classes = len(classes)
model = MathSymbolCNN(num_classes)
model = model.to(device)

# Loss function & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
num_epochs = 10

# Train for a fixed number of epochs, reporting mean loss and accuracy per epoch.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct, total = 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_count = labels.size(0)
        # criterion yields the batch-mean loss; weight it by the batch size so the
        # epoch figure is a true per-sample mean even when the last batch is smaller.
        running_loss += loss.item() * batch_count
        _, predicted = torch.max(outputs, 1)
        total += batch_count
        correct += (predicted == labels).sum().item()

    # Report the mean loss per sample rather than a sum over batches, so the
    # number does not scale with the dataset size or the batch size.
    epoch_loss = running_loss / total
    train_acc = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {train_acc:.2f}%")


In [None]:
# Score the trained network on the validation split.
model.eval()
correct = 0
total = 0

with torch.no_grad():  # inference only, no gradients needed
    for images, labels in val_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        predicted = outputs.argmax(dim=1)  # index of the highest logit per sample
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

val_acc = 100 * correct / total
print(f"Validation Accuracy: {val_acc:.2f}%")


In [None]:
# Define the path in Google Drive.
# Drive is mounted at /content/gdrive earlier in this notebook, so the
# checkpoint path must live under that mount point (not /content/drive).
model_path = "/content/gdrive/My Drive/math_cnn_improve.pth"

# Save the trained model's weights (state_dict only, not the full module)
torch.save(model.state_dict(), model_path)
print(f"Model saved at: {model_path}")
