# Synthetic data generation
Run this notebook to generate a synthetic dataset of lowercase and/or uppercase letter images, drawn at random sizes, thicknesses and positions, for training and testing.

Parameters can easily be modified to change the font, size, thickness, or colour, and to increase or reduce the randomness of the dataset.

In [2]:
import random
import string
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
from pathlib import Path

In [17]:
class DatasetGenerator:
    def __init__(self, output_dir: Path=Path('letters_dataset'), images_per_class: int=10, image_size: int=128, letter_case: str="upper"):
        self.output_dir = output_dir
        self.images_per_class = images_per_class
        self.image_size = image_size
        self.letter_case = letter_case

    def __get_letters(self) -> str:
        """Get string of letters to generate images for."""
        match self.letter_case:
            case "upper":
                return string.ascii_uppercase
            case "lower":
                return string.ascii_lowercase
            case "all":
                return string.ascii_letters
            case _:
                raise ValueError("Invalid letter case")

    def get_letter_image(self, letter: str) -> np.ndarray:
        """Generates a grayscale letter image.

        Args:
            letter (str): chosen letter

        Returns:
            np.ndarray: image of chosen letter
        """
        image = np.zeros((self.image_size, self.image_size), dtype=np.uint8)    

        # Text parameters
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_size = random.randint(1, 4)
        font_thickness = random.randint(1, 4)
        font_colour = (255)
        letter_width, letter_height = cv2.getTextSize(letter, font, font_size, font_thickness)[0]
        x = random.randint(0, self.image_size - letter_width)
        y = random.randint(0, self.image_size - letter_height)

        # Text origin is at the bottom left corner
        cv2.putText(image, letter, (x, self.image_size - y), font, font_size, font_colour, font_thickness)

        return image

    def generate_letter_dataset(self) -> None:
        """Generate dataset of images of letters."""
        self.output_dir.mkdir(exist_ok=True, parents=True)  # create dataset directory

        # Generate n=images_per_class images for each letter
        for letter in self.__get_letters():        
            for i in range(self.images_per_class):
                image = self.get_letter_image(letter)
                filename = self.output_dir / f"{letter}_{i:04}.png"
                cv2.imwrite(filename, image)

            print(f"Generated {self.images_per_class} images for letter {letter}")

In [None]:
# Preview one randomly rendered "A" from a generator with default settings
generator = DatasetGenerator()
image = generator.get_letter_image("A")
plt.imshow(image, cmap='gray')  # single-channel image, so use a grayscale colormap

In [None]:

# Generate training and testing datasets
# Train split: 10 images per letter, written to the hard-coded train directory
generator = DatasetGenerator(output_dir=Path("/home/ubuntu/data/letters_dataset/train"), images_per_class=10)
generator.generate_letter_dataset()

# Test split: 2 images per letter, written to the sibling test directory
generator = DatasetGenerator(output_dir=Path("/home/ubuntu/data/letters_dataset/test"), images_per_class=2)
generator.generate_letter_dataset()

In [None]:
def sample_image_paths(output_dir: Path, n_samples: int) -> list[Path]:
    """Pick up to ``n_samples`` random files from ``output_dir``.

    Args:
        output_dir: Directory to sample files from (not searched recursively).
        n_samples: Maximum number of files to return.

    Returns:
        Randomly chosen file paths; fewer than ``n_samples`` when the
        directory does not hold that many files.

    Raises:
        ValueError: If the directory contains no files, or ``n_samples`` is
            negative.
    """
    # Skip sub-directories, and sort so the choice depends only on the random
    # state rather than on the filesystem's listing order.
    candidates = sorted(path for path in output_dir.glob("*") if path.is_file())
    if not candidates:
        raise ValueError(f"No image files found in {output_dir}")
    return random.sample(candidates, min(n_samples, len(candidates)))


def display_images(output_dir: Path, n_samples: int) -> None:
    """Show a row of randomly chosen images from a dataset directory.

    The figure title is the upper-cased directory name followed by ``SET``.

    Args:
        output_dir: Directory containing the image files.
        n_samples: Number of images to show; capped at the number of files
            available.

    Raises:
        ValueError: If the directory contains no files, or ``n_samples`` is
            negative.
    """
    images = [mpimg.imread(path) for path in sample_image_paths(output_dir, n_samples)]

    data_set = output_dir.stem.upper()
    plt.figure(figsize=(10, 2))
    plt.suptitle(f'{data_set} SET', fontsize=20)
    rows = 1
    columns = len(images)  # may be smaller than n_samples after capping
    for i, image in enumerate(images):
        plt.subplot(rows, columns, i + 1)
        plt.imshow(image, cmap='gray')

# Directories populated by the dataset generation cell
train_data = Path("/home/ubuntu/data/letters_dataset/train")
test_data = Path("/home/ubuntu/data/letters_dataset/test")
n_samples = 5  # number of images requested per split

# Show a random sample from each split
display_images(train_data, n_samples)
display_images(test_data, n_samples)