# Exploratory Data Analysis - EMNIST ByClass

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide plotting defaults: seaborn's dark grid theme, plus a default
# matplotlib figure size of 14x8 for figures created after this cell.
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

In [4]:
# Load the EMNIST ByClass splits. The CSVs are read without a header row;
# column 0 is used as the class label and the remaining columns as the
# flattened pixel values of one image per row.
train = pd.read_csv('data/emnist-byclass-train.csv', header=None)
test = pd.read_csv('data/emnist-byclass-test.csv', header=None)

X_train = train.iloc[:, 1:].values.astype(np.int32)
X_test = test.iloc[:, 1:].values.astype(np.int32)
# Selecting a single column already yields a 1-D array, so no flatten is needed.
y_train = train.iloc[:, 0].values.astype(np.int32)
y_test = test.iloc[:, 0].values.astype(np.int32)

# Reshape from (N, 784) to (N, 28, 28)
X_train = X_train.reshape(-1, 28, 28)
X_test = X_test.reshape(-1, 28, 28)

# Rotate images to correct orientation.
# The intended correction (rotate by 270 degrees, then mirror left-to-right)
# is exactly a transpose of each 28x28 image, so swap the two image axes
# directly. np.fliplr must not be used on the batched array: it always
# reverses axis 1, which for an (N, 28, 28) array is the row axis of every
# image, and combined with the batched rot90 that leaves each image rotated
# by 180 degrees relative to the intended result.
X_train = X_train.transpose(0, 2, 1)
X_test = X_test.transpose(0, 2, 1)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Test labels shape: {y_test.shape}")
print(f"Pixel value range: {X_train.min()} to {X_train.max()}")
print(f"Label value range: {y_train.min()} to {y_train.max()}")

Training set shape: (697932, 28, 28)
Test set shape: (116323, 28, 28)
Training labels shape: (697932,)
Test labels shape: (116323,)
Pixel value range: 0 to 255
Label value range: 0 to 61


In [5]:
# Map each integer class id to its character. Ids are handed out in order:
# the ten digits first, then uppercase A-Z, then lowercase a-z.
class_mapping = {}
for first_char, count in (('0', 10), ('A', 26), ('a', 26)):
    for offset in range(count):
        # The next free id is simply the number of entries added so far.
        class_mapping[len(class_mapping)] = chr(ord(first_char) + offset)

print(f"Total classes: {len(class_mapping)}")
print(f"Class mapping: {class_mapping}")

Total classes: 62
Class mapping: {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: 'A', 11: 'B', 12: 'C', 13: 'D', 14: 'E', 15: 'F', 16: 'G', 17: 'H', 18: 'I', 19: 'J', 20: 'K', 21: 'L', 22: 'M', 23: 'N', 24: 'O', 25: 'P', 26: 'Q', 27: 'R', 28: 'S', 29: 'T', 30: 'U', 31: 'V', 32: 'W', 33: 'X', 34: 'Y', 35: 'Z', 36: 'a', 37: 'b', 38: 'c', 39: 'd', 40: 'e', 41: 'f', 42: 'g', 43: 'h', 44: 'i', 45: 'j', 46: 'k', 47: 'l', 48: 'm', 49: 'n', 50: 'o', 51: 'p', 52: 'q', 53: 'r', 54: 's', 55: 't', 56: 'u', 57: 'v', 58: 'w', 59: 'x', 60: 'y', 61: 'z'}
