In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the EMNIST "byclass" training split. The file has no header row, so
# pandas numbers the columns itself. Values are read as uint8 instead of the
# default int64, which keeps this very large frame roughly 8x smaller.
# NOTE(review): assumes every value in the file lies in 0..255 - confirm.
df = pd.read_csv("./emnist-byclass-train.csv", header=None, dtype=np.uint8)

In [None]:
# Display the (rows, columns) shape of the training frame.
df.shape

In [None]:
# Load the EMNIST "byclass" test split (no header row). Values are read as
# uint8 instead of the default int64 to keep the frame roughly 8x smaller.
# NOTE(review): assumes every value in the file lies in 0..255 - confirm.
test_df = pd.read_csv("./emnist-byclass-test.csv", header=None, dtype=np.uint8)

In [5]:
# Stack both splits row-wise into a single frame. The original row labels are
# kept, so the resulting index contains repeated values.
train_df = pd.concat(objs=[df, test_df], axis="index")

In [6]:
# The rows now live in train_df; release the per-split frames.
del df, test_df

In [7]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each.
train_df = train_df.drop_duplicates(keep="first")

In [None]:
# Display the (rows, columns) shape after merging the splits and dropping duplicates.
train_df.shape

In [None]:
# Show the column labels (auto-numbered, because the CSVs were read with header=None).
train_df.columns

In [None]:
# Preview the first rows of the combined frame.
train_df.head()

In [11]:
# Split the frame into NumPy arrays: column 0 becomes the label vector and all
# remaining columns become the per-sample pixel rows. The frame itself is
# dropped afterwards to free memory.
labels = train_df.iloc[:, 0].to_numpy()
images = train_df.iloc[:, 1:].to_numpy()
del train_df

In [12]:
def plot_image(image):
    """Show a single sample as a 28x28 grayscale picture with the axes hidden.

    Parameters
    ----------
    image : array exposing ``reshape``
        Pixel data containing exactly 28 * 28 values; it is reshaped to
        ``(28, 28)`` before drawing.
    """
    pixels = image.reshape(28, 28)
    plt.imshow(pixels, cmap="gray")
    plt.axis("off")
    plt.show()

In [None]:
# Visual sanity check: draw the first sample exactly as it was read from the CSV.
plot_image(images[0])

In [None]:
# Label value of the first sample, for comparison with the picture above.
labels[0]

In [15]:
# Build label_map from the mapping file: each non-empty line holds two
# whitespace-separated integers, stored as {first: second}.
label_map = {}
with open("/kaggle/input/emnist/emnist-byclass-mapping.txt") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines instead of failing on tuple unpacking.
            continue
        key, val = fields
        label_map[int(key)] = int(val)

In [10]:
# Shallow copy of the mapping; keys and values are left untouched.
label_map = dict(label_map)

In [17]:
# Convert the stored code points into characters (key -> character).
# Values that are already strings are kept unchanged, so re-running this cell
# no longer raises TypeError from calling chr() on a str.
label_map = {k: (v if isinstance(v, str) else chr(v)) for k, v in label_map.items()}

In [None]:
# Merged 47-class table: class index -> code point of the character it stands
# for. The indices follow the order of the characters below: the ten digits,
# the 26 upper-case letters, then the lower-case letters a, b, d, e, f, g, h,
# n, q, r and t.
label_map2 = {
    class_index: ord(symbol)
    for class_index, symbol in enumerate(
        "0123456789"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abdefghnqrt"
    )
}

In [None]:
# Second name for the same label array (an alias, not a copy).
labels2 = labels

In [None]:
# Translate every label into the character it stands for.
# label_map holds integer code points right after it is loaded, but holds
# characters once the chr() conversion cell has run; calling chr() on a
# character raises TypeError, so both forms are accepted here.
labels3 = []
for raw_label in labels2:
    symbol = label_map[raw_label]
    labels3.append(symbol if isinstance(symbol, str) else chr(symbol))

In [None]:
# Lower-case letters whose samples are re-labelled as their upper-case form.
l_l = list("cijklmopsuvwxyz")

In [None]:
# Fold the listed lower-case letters into their upper-case class, in place.
for position, symbol in enumerate(labels3):
    if symbol in l_l:
        labels3[position] = symbol.upper()

In [None]:
# Turn every character back into its integer code point, in place.
for position, symbol in enumerate(labels3):
    labels3[position] = ord(symbol)

In [None]:
def get_key_by_value(d, value):
    """Reverse lookup: return the first key of ``d`` whose value equals ``value``.

    Keys are examined in the dictionary's iteration order. ``None`` is
    returned when no value matches.
    """
    matching_keys = (key for key, val in d.items() if val == value)
    return next(matching_keys, None)

In [None]:
# Look up the merged class index for every code point in labels3.
# A code point that is missing from label_map2 produces None.
mapped_array = [get_key_by_value(label_map2, code_point) for code_point in labels3]

In [None]:
# Replace the original labels with the merged class indices.
# Stored as a NumPy array rather than a Python list: later cells index
# `labels` with an array of positions and read `labels.shape`, neither of
# which works on a list.
labels = np.asarray(mapped_array)

In [18]:
# Transpose every sample: view the data as (n, 28, 28) and swap the row and
# column axes. This mirrors each image about its main diagonal; it is not a
# 90-degree rotation. Done as one vectorized operation instead of a Python
# loop over every image, then copied into a fresh contiguous array.
# NOTE: running this cell a second time transposes the images back.
images = np.ascontiguousarray(images.reshape(-1, 28, 28).transpose(0, 2, 1))

In [None]:
# Draw the first sample again, now after the transpose.
plot_image(images[0])

In [None]:
# Class distribution: number of samples per class, ordered by class index.
class_distribution = pd.Series(labels).value_counts().sort_index()

# Name the bars with label_map2, the table the merged class indices in
# `labels` were derived from. (label_map describes the original, unmerged
# classes, so it assigns the wrong characters to the merged indices.)
class_distribution = class_distribution.rename(
    {class_index: chr(code_point) for class_index, code_point in label_map2.items()}
)

ax = class_distribution.plot(kind="bar", figsize=(15, 10))
ax.set(title="Samples per class", xlabel="Class", ylabel="Number of samples");

In [21]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by random under-sampling; the fixed seed makes the
# selection repeatable.
rus = RandomUnderSampler(random_state=42)

# Work with a NumPy label array throughout: `labels` may be a plain Python
# list at this point, and a list cannot be indexed with an index array.
labels_array = np.asarray(labels)

# The sampler expects one flat feature row per sample.
images_reshaped = images.reshape(images.shape[0], 28 * 28)

sampled_images_reshaped, sampled_labels = rus.fit_resample(
    images_reshaped, labels_array
)

sampled_images = sampled_images_reshaped.reshape(
    sampled_images_reshaped.shape[0], 28, 28
)

# Positions (in the original arrays) of the samples the sampler kept.
sampled_indices = rus.sample_indices_

# Everything the sampler discarded becomes the held-out set.
# NOTE(review): this held-out set only contains samples from classes that had
# surplus samples, so its class balance differs from the sampled set - confirm
# this is intended.
excluded_indices = np.setdiff1d(np.arange(len(images)), sampled_indices)

excluded_images = images[excluded_indices]
excluded_labels = labels_array[excluded_indices]

In [22]:
# The balanced sample becomes the working set; the discarded samples become
# the test set.
images, labels = sampled_images, sampled_labels
test_images, test_labels = excluded_images, excluded_labels

# Drop the intermediate names. `images_reshaped` is included as well:
# it still references the complete pre-sampling pixel data and would
# otherwise keep that large array alive for the rest of the session.
del (
    images_reshaped,
    sampled_images_reshaped,
    sampled_labels,
    sampled_images,
    rus,
    sampled_indices,
    excluded_indices,
    excluded_images,
    excluded_labels,
)

In [None]:
# Number of labels left after under-sampling.
labels.shape

In [None]:
# Shape of the image array left after under-sampling.
images.shape

# Preprocessing


In [25]:
# Normalization
from sklearn.preprocessing import StandardScaler

# Learn the per-pixel mean and standard deviation from the training images
# only, then apply that same transformation to both sets. Re-fitting the
# scaler on the test images would scale the two sets with different
# statistics, so they would no longer be comparable.
scaler = StandardScaler()
images = scaler.fit_transform(images.reshape(images.shape[0], 28 * 28))

test_images = scaler.transform(test_images.reshape(test_images.shape[0], 28 * 28))

In [None]:
# Draw the first training sample after scaling.
plot_image(images[0])

In [None]:
# Draw the first test sample after scaling.
plot_image(test_images[0])

# Augmentation


In [28]:
# Augmentation
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Random rotations of up to 10 degrees, zoom of up to 10 %, and horizontal /
# vertical shifts of up to 10 % of the image size.
datagen = ImageDataGenerator(
    rotation_range=10, zoom_range=0.10, width_shift_range=0.1, height_shift_range=0.1
)

# No datagen.fit(...) call: fitting is only required when featurewise
# centering, featurewise std-normalization or ZCA whitening is enabled. None
# of them is set here, so fitting would only copy the whole data set.

In [None]:
num_original_images = images.shape[0]

# Set the desired number of augmented images (3 times the original)
num_augmented_images = num_original_images * 3

augmented_images = []
augmented_labels = []

# Count the images actually produced. The generator's batches are not
# guaranteed to be full, so "number of batches * 32" can overstate the total
# and stop the loop before the target is reached.
num_collected = 0

# Generate augmented images and labels. The seed makes the shuffling and the
# random transformations repeatable between runs.
for batch_images, batch_labels in datagen.flow(
    images.reshape(num_original_images, 28, 28, 1),
    labels,
    batch_size=32,
    shuffle=True,
    seed=42,
):
    augmented_images.append(batch_images)
    augmented_labels.append(batch_labels)
    num_collected += len(batch_images)

    # Stop once we've reached the desired number of augmented images
    if num_collected >= num_augmented_images:
        break


# Concatenate the batches into single numpy arrays and trim the overshoot of
# the final batch so that exactly the requested number is kept.
augmented_images = np.concatenate(augmented_images, axis=0)[:num_augmented_images]
augmented_labels = np.concatenate(augmented_labels, axis=0)[:num_augmented_images]

# Check the final number of augmented images and labels
print(f"Total number of augmented images: {augmented_images.shape[0]}")
print(f"Total number of augmented labels: {augmented_labels.shape[0]}")

# Save final data


In [30]:
# Write every array produced above to the Kaggle working directory, in the
# order listed. np.save appends the ".npy" extension to each path.
for target_path, array in (
    ("/kaggle/working/aug_img", augmented_images),
    ("/kaggle/working/aug_lab", augmented_labels),
    ("/kaggle/working/images", images),
    ("/kaggle/working/test_images", test_images),
    ("/kaggle/working/labels", labels),
    ("/kaggle/working/test_labels", test_labels),
):
    np.save(target_path, array)