# Loading the data using Pytorch

In [1]:
import torch
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from torchvision import datasets, transforms

# ✅ Load EMNIST Dataset (ByClass) using PyTorch
transform = transforms.Compose([transforms.ToTensor()])
#Updated split='byclass' to load all 62 character classes (digits + uppercase/lowercase letters).
train_dataset = datasets.EMNIST(root='./data', split='byclass', train=True, download=True, transform=transform)
test_dataset = datasets.EMNIST(root='./data', split='byclass', train=False, download=True, transform=transform)

# Print dataset info
print(f"Training Samples: {len(train_dataset)}")
print(f"Testing Samples: {len(test_dataset)}")
print(f"Number of Classes: {len(train_dataset.classes)}")

100%|██████████| 562M/562M [05:56<00:00, 1.57MB/s]   


Training Samples: 697932
Testing Samples: 116323
Number of Classes: 62


#save training and testing data in csv file

In [2]:
import pandas as pd

# ✅ Convert PyTorch Tensors to NumPy arrays
x_train = train_dataset.data.numpy().reshape(-1, 28*28)  # Flatten images (28x28 → 784 pixels)
y_train = train_dataset.targets.numpy()

x_test = test_dataset.data.numpy().reshape(-1, 28*28)
y_test = test_dataset.targets.numpy()

# ✅ Create Pandas DataFrames
df_train = pd.DataFrame(x_train)
df_train.insert(0, "label", y_train)  # Add label column

df_test = pd.DataFrame(x_test)
df_test.insert(0, "label", y_test)

# ✅ Save as CSV files
df_train.to_csv("emnist_train.csv", index=False)
df_test.to_csv("emnist_test.csv", index=False)

print("EMNIST dataset saved as CSV files: emnist_train.csv & emnist_test.csv")

EMNIST dataset saved as CSV files: emnist_train.csv & emnist_test.csv


#checking for missing or null values in full dataset

In [3]:
import torch
from torchvision import datasets, transforms

# Define transformation
transform = transforms.Compose([transforms.ToTensor()])

# Load the train and test datasets
train_dataset = datasets.EMNIST(root='./data', split='byclass', train=True, download=True, transform=transform)
test_dataset = datasets.EMNIST(root='./data', split='byclass', train=False, download=True, transform=transform)

# Combine datasets
full_dataset = train_dataset + test_dataset

# ✅ Find number of rows and columns
num_rows = len(full_dataset)  # Total number of samples
sample_data, _ = full_dataset[0]  # Take one sample to find shape
num_columns = sample_data.numel()  # Flattened number of pixels (channels × height × width)

print(f"Number of rows (samples): {num_rows}")
print(f"Number of columns (features per sample): {num_columns}")

# Check for missing or invalid values
for i, (data, label) in enumerate(full_dataset):
    if torch.isnan(data).any() or torch.isinf(data).any() or label is None:
        print(f"Missing or invalid value found at index {i}")
        break
else:
    print("No missing or invalid values found.")


Number of rows (samples): 814255
Number of columns (features per sample): 784
No missing or invalid values found.


In [6]:
# ✅ Find number of rows and columns in training dataset
num_rows = len(train_dataset)  # Total number of samples
sample_data, _ = train_dataset[0]  # Take one sample to find shape
num_columns = sample_data.numel()  # Flattened number of pixels (channels × height × width)

print(f"Number of rows in training dataset: {num_rows}")
print(f"Number of columns in training dataset: {num_columns}")

# ✅ Find number of rows and columns in testing dataset
num_rows = len(test_dataset)  # Total number of samples
sample_data, _ = test_dataset[0]  # Take one sample to find shape
num_columns = sample_data.numel()  # Flattened number of pixels (channels × height × width)

print(f"Number of rows in testing dataset: {num_rows}")
print(f"Number of columns in testing dataset: {num_columns}")

Number of rows in training dataset: 697932
Number of columns in training dataset: 784
Number of rows in testing dataset: 116323
Number of columns in testing dataset: 784


## Reduce size: Select only first 90,000 samples for training, 10,000 for testing

In [9]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import Subset
small_train_dataset = Subset(train_dataset, range(90000))
small_test_dataset = Subset(test_dataset, range(10000))

# Print reduced dataset sizes
print(f"Reduced training dataset size: {len(small_train_dataset)} samples")
print(f"Reduced testing dataset size: {len(small_test_dataset)} samples")

# Check number of features (columns)
sample_data, _ = small_train_dataset[0]
print(f"Each sample shape: {sample_data.shape} => Number of columns (flattened): {sample_data.numel()}")

Reduced training dataset size: 90000 samples
Reduced testing dataset size: 10000 samples
Each sample shape: torch.Size([1, 28, 28]) => Number of columns (flattened): 784


#remove duplicates values , if any

In [11]:
import pandas as pd

# Extract data and labels from the original EMNIST dataset using the indices from small_train_dataset and small_test_dataset
train_data = train_dataset.data[small_train_dataset.indices]
train_labels = train_dataset.targets[small_train_dataset.indices]

test_data = test_dataset.data[small_test_dataset.indices]
test_labels = test_dataset.targets[small_test_dataset.indices]

# Convert the small datasets into DataFrames for easy manipulation
df_train = pd.DataFrame(train_data.numpy().reshape(-1, 784))  # Reshape the data into 784 columns (flattened images)
df_train['label'] = train_labels.numpy()  # Add the labels

df_test = pd.DataFrame(test_data.numpy().reshape(-1, 784))  # Reshape the data into 784 columns (flattened images)
df_test['label'] = test_labels.numpy()  # Add the labels

# Remove duplicates
df_train.drop_duplicates(inplace=True)
df_test.drop_duplicates(inplace=True)

# Print confirmation
print("✅ Removed duplicate rows (if any).")


✅ Removed duplicate rows (if any).


In [13]:
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((64, 64)),  # Resize images to 64x64
    transforms.ToTensor()  # Convert images to tensors
])

train_dataset = datasets.EMNIST(root='./data', split='byclass', train=True, download=True, transform=transform)
test_dataset = datasets.EMNIST(root='./data', split='byclass', train=False, download=True, transform=transform)


## normaliza data

In [14]:
# ✅ Normalize pixel values (0 to 1 range)
df_train.iloc[:, 1:] = df_train.iloc[:, 1:] / 255.0  # Normalize all pixel columns
df_test.iloc[:, 1:] = df_test.iloc[:, 1:] / 255.0
# ✅ Print only the first 5 rows to verify
print("Sample Training Data (First 5 Rows):\n", df_train.head())
print("✅ Data Normalized: Pixel values are now between 0 and 1.")


Sample Training Data (First 5 Rows):
    0    1    2    3    4    5    6    7    8    9  ...  775  776  777  778  \
0  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   779  780  781  782  783     label  
0  0.0  0.0  0.0  0.0  0.0  0.000538  
1  0.0  0.0  0.0  0.0  0.0  0.000554  
2  0.0  0.0  0.0  0.0  0.0  0.000092  
3  0.0  0.0  0.0  0.0  0.0  0.000046  
4  0.0  0.0  0.0  0.0  0.0  0.000338  

[5 rows x 785 columns]
✅ Data Normalized: Pixel values are now between 0 and 1.


# Step 1: Prepare Data for CNN Model
CNNs need image data in shape (28, 28, 1) (not flattened). So, let’s reshape it:

In [16]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# Access the original EMNIST dataset inside the Subset object
x_train = small_train_dataset.dataset.data[small_train_dataset.indices].numpy().reshape(-1, 28, 28, 1)  # Reshape the data into 28x28 images with 1 channel
y_train = small_train_dataset.dataset.targets[small_train_dataset.indices].numpy()  # Get the labels

x_test = small_test_dataset.dataset.data[small_test_dataset.indices].numpy().reshape(-1, 28, 28, 1)  # Reshape the data into 28x28 images with 1 channel
y_test = small_test_dataset.dataset.targets[small_test_dataset.indices].numpy()  # Get the labels

# One-hot encode the labels
y_train = to_categorical(y_train, num_classes=62)
y_test = to_categorical(y_test, num_classes=62)

print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")


x_train shape: (90000, 28, 28, 1)
y_train shape: (90000, 62)
x_test shape: (10000, 28, 28, 1)
y_test shape: (10000, 62)


# define a simple cnn model

In [18]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Define the input layer explicitly
input_layer = Input(shape=(28, 28, 1))

# Define the rest of the model
x = Conv2D(32, (3, 3), activation='relu')(input_layer)
x = MaxPooling2D((2, 2))(x)

x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)

x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
output_layer = Dense(62, activation='softmax')(x)  # 62 classes for EMNIST ByClass

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the summary of the model
model.summary()


# train the model

In [20]:
# Train
history = model.fit(x_train, y_train, epochs=10, batch_size=64, validation_split=0.1)

Epoch 1/10
[1m1266/1266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 20ms/step - accuracy: 0.7195 - loss: 0.9080 - val_accuracy: 0.8047 - val_loss: 0.5844
Epoch 2/10
[1m1266/1266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 20ms/step - accuracy: 0.7741 - loss: 0.6875 - val_accuracy: 0.8180 - val_loss: 0.5293
Epoch 3/10
[1m1266/1266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 20ms/step - accuracy: 0.8013 - loss: 0.5945 - val_accuracy: 0.8243 - val_loss: 0.5132
Epoch 4/10
[1m1266/1266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 20ms/step - accuracy: 0.8128 - loss: 0.5531 - val_accuracy: 0.8287 - val_loss: 0.4998
Epoch 5/10
[1m1266/1266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 20ms/step - accuracy: 0.8189 - loss: 0.5253 - val_accuracy: 0.8301 - val_loss: 0.5027
Epoch 6/10
[1m1266/1266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 20ms/step - accuracy: 0.8268 - loss: 0.4958 - val_accuracy: 0.8357 - val_loss: 0.4863
Epoc