In [None]:
import os
from pathlib import Path

import lightning as L
import numpy as np
from PIL import Image, UnidentifiedImageError
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import torch
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset, random_split
from torchvision import datasets
from torchvision.transforms import v2

In [None]:
class CharDataset(Dataset):
    """
    General dataset for single character data loading.

    Assumes that in the directory given, the subdirectories are the classes.
    These directory names should be the single character class; the label
    stored for every image is the lower-cased stem of its directory name.

    Only ``*.jpg`` and ``*.png`` files directly inside a class directory are
    considered. Files that PIL cannot identify as an image are reported on
    stdout and skipped at construction time.

    Args:
        img_dir: Root directory containing one subdirectory per class.
        transform: Optional callable applied to the grayscale PIL image
            before ``__getitem__`` returns it.

    Raises:
        FileNotFoundError: If ``img_dir`` does not exist.
    """
    def __init__(self, img_dir, transform=None):
        self.img_dir = Path(img_dir)
        if not self.img_dir.exists():
            raise FileNotFoundError(f"{img_dir} does not exist")
        # glob() yields entries in a filesystem-dependent order; sorting keeps
        # sample indices (and any index-based split) stable between runs.
        class_dirs = sorted(p for p in self.img_dir.glob("*") if p.is_dir())
        self.imgpaths = []
        self.imglabels = []
        for class_dir in class_dirs:
            chclass = class_dir.stem.lower()
            candidates = sorted(class_dir.glob("*.jpg")) + sorted(class_dir.glob("*.png"))
            valid = [p for p in candidates if self._is_readable_image(p)]
            self.imgpaths.extend(valid)
            self.imglabels.extend([chclass] * len(valid))
        self.transform = transform

    @staticmethod
    def _is_readable_image(path):
        """Return True if PIL can identify ``path``; report and return False otherwise."""
        try:
            # The context manager closes the file handle immediately instead
            # of leaving it to the garbage collector.
            with Image.open(path):
                pass
        except UnidentifiedImageError:
            print(f"Image {path} is not a valid image, skipping")
            return False
        return True

    def __len__(self):
        """Number of valid images found under ``img_dir``."""
        return len(self.imglabels)

    def __getitem__(self, idx):
        """
        Return ``(image, label)`` for the sample at ``idx``.

        The image is opened from disk, converted to 8-bit grayscale (mode
        ``"L"``) and passed through ``transform`` when one was given.
        """
        imgpath = self.imgpaths[idx]
        # convert() loads the pixel data and returns a new image, so the
        # source file can be closed as soon as the conversion is done.
        with Image.open(imgpath) as src:
            img = src.convert("L")
        label = self.imglabels[idx]
        if self.transform:
            img = self.transform(img)
        return img, label
    

def ds_to_vectors(ds, size=28):
    """
    Flatten every image of a dataset into a row vector.

    Args:
        ds: Iterable of ``(image, label)`` samples, where ``image`` offers a
            PIL-style ``resize((width, height))`` method.
        size: Edge length in pixels that each image is resized to before it
            is flattened.

    Returns:
        Tuple ``(imgs, labs)`` of numpy arrays: one flattened image per row
        and the labels in the same order.
    """
    vectors = []
    labels = []
    # Single pass over the dataset so each sample is fetched (and its image
    # decoded) only once.
    for sample in ds:
        vectors.append(np.array(sample[0].resize((size, size))).reshape(-1))
        labels.append(sample[1])
    return np.array(vectors), np.array(labels)


def create_knn(ds, k=5, size=28):
    """
    Fit a k-nearest-neighbours classifier on a dataset of images.

    Args:
        ds: Dataset of ``(image, label)`` samples, vectorised through
            ``ds_to_vectors``.
        k: Number of neighbours used by the classifier.
        size: Forwarded to ``ds_to_vectors``.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    features, targets = ds_to_vectors(ds, size=size)
    # fit() returns the estimator itself, so it can be returned directly.
    return KNeighborsClassifier(n_neighbors=k).fit(features, targets)


def run_test_knn(ds, knn, size=28):
    """
    Predict labels for every sample of a dataset with a fitted classifier.

    Args:
        ds: Dataset of ``(image, label)`` samples, vectorised through
            ``ds_to_vectors``.
        knn: Fitted estimator exposing ``predict``.
        size: Forwarded to ``ds_to_vectors``.

    Returns:
        Tuple ``(labs, preds)``: the true labels and the predicted labels.
    """
    vectors, truth = ds_to_vectors(ds, size=size)
    return truth, knn.predict(vectors)

In [None]:
# Augmentation pipeline: PIL image -> tensor -> random rotation -> PIL image.
# A single number given to RandomRotation is treated as a symmetric range, so
# the angle is drawn from [-179, 179] degrees.
# NOTE(review): not referenced by any other cell visible here — confirm it is
# still needed.
randomrot_T = v2.Compose([
    v2.PILToTensor(),
    v2.RandomRotation(179),
    v2.ToPILImage()
])

In [None]:
# Fixed seed for the notMNIST split so that every run trains and evaluates on
# the same index partition and the reported scores are comparable.
SPLIT_SEED = 42

# Handwritten isolated English characters: separate train/ and test/ folders.
handiso_ds_train = CharDataset("data/handwritten-isolated-english/train")
handiso_ds_test = CharDataset("data/handwritten-isolated-english/test")

# notMNIST is loaded from a single folder here, so hold out 20% for testing.
notmnist_ds = CharDataset("data/notMNIST_small")
notmnist_ds_train, notmnist_ds_test = random_split(
    notmnist_ds,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Standard OCR dataset: two source folders per split; `+` on torch datasets
# concatenates them.
stdocr_ds_train = CharDataset("data/standard_ocr_dataset/data/training_data") + CharDataset("data/standard_ocr_dataset/data2/training_data")
stdocr_ds_test = CharDataset("data/standard_ocr_dataset/data/testing_data") + CharDataset("data/standard_ocr_dataset/data2/testing_data")

# MNIST via torchvision (no download requested; the files must already be
# present under ./data).
mnist_ds_train = MNIST(Path(os.getcwd(), "data"), train=True)
mnist_ds_test = MNIST(Path(os.getcwd(), "data"), train=False)

In [None]:
# Report how many samples each train/test split holds.
print(f"handwritten-isolated-english sizes: [train:{len(handiso_ds_train)}, test:{len(handiso_ds_test)}]")
print(f"not MNIST sizes: [train:{len(notmnist_ds_train)}, test:{len(notmnist_ds_test)}]")
print(f"standard OCR ds sizes: [train:{len(stdocr_ds_train)}, test:{len(stdocr_ds_test)}]")
print(f"MNIST sizes: [train:{len(mnist_ds_train)}, test:{len(mnist_ds_test)}]")

In [None]:
# Fit the 5-nearest-neighbour baseline on the handwritten-isolated-english training split.
handwrknn = create_knn(handiso_ds_train, k=5, size=28)

In [None]:
# Vectorise the test split, predict once, and derive the accuracy from those
# stored predictions. knn.score() would give the same number but has to run a
# second full prediction pass to get it.
handwr_test_data, handwr_test_lab = ds_to_vectors(handiso_ds_test, size=28)
handwr_test_labpred = handwrknn.predict(handwr_test_data)
metrics.accuracy_score(handwr_test_lab, handwr_test_labpred)

In [None]:
# Fit the 5-nearest-neighbour baseline on the notMNIST training split.
notmnistknn = create_knn(notmnist_ds_train, k=5, size=28)

In [None]:
# Vectorise the test split, predict once, and derive the accuracy from those
# stored predictions. knn.score() would give the same number but has to run a
# second full prediction pass to get it.
notmnist_test_data, notmnist_test_lab = ds_to_vectors(notmnist_ds_test, size=28)
notmnist_test_labpred = notmnistknn.predict(notmnist_test_data)
metrics.accuracy_score(notmnist_test_lab, notmnist_test_labpred)

In [None]:
# Per-class breakdown of the notMNIST test predictions.
print(metrics.classification_report(notmnist_test_lab, notmnist_test_labpred))

In [None]:
# Fit the 5-nearest-neighbour baseline on the standard OCR training split.
stdocrknn = create_knn(stdocr_ds_train, k=5, size=28)

In [None]:
# Vectorise the test split, predict once, and derive the accuracy from those
# stored predictions. knn.score() would give the same number but has to run a
# second full prediction pass to get it.
stdocr_test_data, stdocr_test_lab = ds_to_vectors(stdocr_ds_test, size=28)
stdocr_test_labpred = stdocrknn.predict(stdocr_test_data)
metrics.accuracy_score(stdocr_test_lab, stdocr_test_labpred)

In [None]:
# Per-class breakdown of the standard OCR test predictions.
print(metrics.classification_report(stdocr_test_lab, stdocr_test_labpred))

In [None]:
# Fit the 5-nearest-neighbour baseline on the MNIST training split.
mnistknn = create_knn(mnist_ds_train, k=5, size=28)

In [None]:
# Vectorise the test split, predict once, and derive the accuracy from those
# stored predictions. knn.score() would give the same number but has to run a
# second full prediction pass to get it.
mnist_test_data, mnist_test_lab = ds_to_vectors(mnist_ds_test, size=28)
mnist_test_labpred = mnistknn.predict(mnist_test_data)
metrics.accuracy_score(mnist_test_lab, mnist_test_labpred)