In [15]:
import os
from pathlib import Path

import lightning as L
from PIL import Image
import torch
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor

In [70]:
class CharDataset(Dataset):
    """
    General dataset for single character data loading.

    Assumes that in the directory given, the subdirectories are the classes.
    These directory names should be the single character class. The label of
    a sample is the lower-cased stem of its class directory name, so e.g. the
    directories ``A`` and ``a`` would both yield the label ``"a"``.

    Only ``*.jpg`` and ``*.png`` files located directly inside a class
    directory are indexed. Class directories and the files within them are
    visited in sorted order so that the index -> sample mapping is
    reproducible across runs and filesystems.

    Args:
        img_dir: Path (``str`` or ``Path``) of the directory holding one
            subdirectory per class.
        transform: Optional callable applied to the grayscale PIL image
            before it is returned from ``__getitem__``.

    Raises:
        FileNotFoundError: If ``img_dir`` does not exist.
    """
    def __init__(self, img_dir, transform=None):
        self.img_dir = Path(img_dir)
        if not self.img_dir.exists():
            raise FileNotFoundError(f"{img_dir} does not exist")
        self.imgpaths = []
        self.imglabels = []
        # Path.glob yields entries in arbitrary (filesystem dependent) order;
        # sorting keeps sample indices stable between runs and machines.
        class_dirs = sorted(p for p in self.img_dir.glob("*") if p.is_dir())
        for class_dir in class_dirs:
            chclass = class_dir.stem.lower()
            class_imgs = sorted(
                [*class_dir.glob("*.jpg"), *class_dir.glob("*.png")]
            )
            self.imgpaths.extend(class_imgs)
            self.imglabels.extend([chclass] * len(class_imgs))
        self.transform = transform

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.imgpaths)

    def __getitem__(self, idx):
        """
        Load one sample.

        Returns:
            A ``(img, label)`` tuple. ``img`` is the image converted to
            8-bit grayscale (PIL mode ``"L"``), passed through ``transform``
            when one was given; ``label`` is the lower-cased class string.
        """
        # The context manager releases the underlying file handle as soon as
        # the pixel data has been converted into an independent image.
        with Image.open(self.imgpaths[idx]) as src:
            img = src.convert("L")
        label = self.imglabels[idx]
        # Compare against None explicitly: a callable that happens to be
        # falsy (e.g. an empty container module) must still be applied.
        if self.transform is not None:
            img = self.transform(img)
        return img, label

In [89]:
# Handwritten isolated English characters, read from separate train/test
# directories. No transform is supplied, so CharDataset items come back as
# grayscale PIL images.
handiso_ds_train = CharDataset("data/handwritten-isolated-english/train")
handiso_ds_test = CharDataset("data/handwritten-isolated-english/test")

# notMNIST (small variant), loaded from a single directory.
nomnist_ds = CharDataset("data/notMNIST_small")

# Standard OCR dataset: the "data" and "data2" trees are joined with `+`
# (Dataset concatenation) for each split.
stdocr_ds_train = (
    CharDataset("data/standard_ocr_dataset/data/training_data")
    + CharDataset("data/standard_ocr_dataset/data2/training_data")
)
stdocr_ds_test = (
    CharDataset("data/standard_ocr_dataset/data/testing_data")
    + CharDataset("data/standard_ocr_dataset/data2/testing_data")
)

# MNIST, rooted at <current working directory>/data.
mnist_root = Path(os.getcwd(), "data")
mnist_ds_train = MNIST(mnist_root, train=True)
mnist_ds_test = MNIST(mnist_root, train=False)