## CONVOLUTIONAL NEURAL NETWORKS APPLIED TO THE FASHION-MNIST DATASET || Mini-ResNet Model

This project comprises the complete workflow for the supervised classification of apparel images. The dataset used is Fashion-MNIST (https://github.com/zalandoresearch/fashion-mnist), which consists of 28×28 grayscale images of 10 different clothing categories.\
As an alternative to the standard CNN model also presented in this repo, this notebook showcases a ResNet-like model which introduces residual connections to improve learning stability and convergence in a deeper network.

In [None]:
# libraries and dependencies

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import load_model
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, BatchNormalization, Activation, Add, MaxPooling2D, Dropout, Dense, GlobalAveragePooling2D
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import os
import random

%matplotlib inline

## DATA LOADING AND PRE-PROCESSING

The Fashion-MNIST dataset is loaded using TensorFlow's built-in API.\
As is best practice for this type of model, pixel values are normalized to [0, 1]. Then, the training set is split into training and validation subsets.

In [None]:
# Fetch Fashion-MNIST (already split into train/test) through the TF.Keras API

(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

# Preview 10 random images from the training set and 10 from the test set

n_preview = 10

# Draw the indices without replacement so no image is shown twice
train_indices = np.random.choice(len(train_images), size=n_preview, replace=False)
test_indices = np.random.choice(len(test_images), size=n_preview, replace=False)

# One row per split, one column per sampled image
fig, axs = plt.subplots(2, n_preview, figsize=(20, 5))

# (images, sampled indices, colormap, title prefix) for each row of the grid
preview_rows = (
    (train_images, train_indices, 'magma_r', 'Train'),
    (test_images, test_indices, 'Blues', 'Test'),
)

for row, (images, indices, cmap, prefix) in enumerate(preview_rows):
    for col, idx in enumerate(indices):
        panel = axs[row, col]
        panel.imshow(images[idx], cmap=cmap)
        panel.axis('off')
        panel.set_title(f"{prefix} {col + 1}")

plt.tight_layout()
plt.show()

In [None]:
# Rescale pixel intensities from the raw [0, 255] range to [0, 1]

max_pixel_value = 255.0

train_images = train_images.astype('float32') / max_pixel_value
test_images = test_images.astype('float32') / max_pixel_value

# Sanity check on the first training image: shape and value range after scaling
sample_image = train_images[0]
print(f"Dimensions: {sample_image.shape}")
print(f"normalized value range: {sample_image.min()} - {sample_image.max()}")

plt.imshow(sample_image, cmap='gray')
plt.axis('off')
plt.show()

In [None]:
# Prepare data for training

# Add a trailing channel axis: (N, 28, 28) -> (N, 28, 28, 1)
train_images = train_images.reshape(-1, 28, 28, 1)

# Hold out 20% of the training images for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    train_images,
    train_labels,
    test_size=0.2,
    random_state=42,
)

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

In [None]:
# Compare how many samples of each class ended up in the training and validation splits

unique_classes = sorted(np.unique(np.concatenate((y_train, y_val))))

# Per-class sample counts, in the same order as unique_classes
train_counts = [np.sum(y_train == cls) for cls in unique_classes]
val_counts = [np.sum(y_val == cls) for cls in unique_classes]

# Grouped bar chart: two bars per class, centred on the class tick
x = np.arange(len(unique_classes))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots(figsize=(10, 5))
for offset, counts, split_name in (
    (-half_width, train_counts, 'Training'),
    (half_width, val_counts, 'Validation'),
):
    ax.bar(x + offset, counts, width, label=split_name)

ax.set_xticks(x)
ax.set_xticklabels(unique_classes)
ax.set_xlabel("Classes")
ax.set_ylabel("Sample size")
ax.set_title("Class distribution in training and validation datasets")
ax.legend()
plt.show()

## DEEP RESIDUAL CNN ARCHITECTURE (ResNet-inspired)

This model builds upon the previous CNN by introducing residual blocks, a core component of the ResNet family.\
This allows the model to have a deeper architecture than the standard CNN, and it is intended to better capture complex hierarchical features in the Fashion-MNIST dataset, while avoiding the vanishing gradient problem.\
You are therefore encouraged to keep adding layers and explore the outcomes.

In [None]:
# Define ResNet

def residual_block(x, filters, l2_value=0.001):
    """Apply a two-convolution residual block to a Keras tensor.

    The main path is Conv(3x3) -> BatchNorm -> ReLU -> Conv(3x3) -> BatchNorm.
    Its output is added to the shortcut and passed through a final ReLU.
    When the input's channel count differs from ``filters``, the shortcut is
    projected with a 1x1 convolution followed by batch normalization so both
    branches can be added.

    Args:
        x: Input tensor; its last dimension is treated as the channel count.
        filters: Number of output channels of every convolution in the block.
        l2_value: L2 regularization factor applied to each convolution kernel.

    Returns:
        The output tensor of the block, with ``filters`` channels.
    """
    def make_conv(kernel_size):
        # Every convolution in the block shares padding and regularization settings
        return Conv2D(filters, kernel_size=kernel_size, padding='same',
                      kernel_regularizer=regularizers.l2(l2_value))

    # Shortcut branch: identity unless the channel count has to change
    skip = x
    if x.shape[-1] != filters:
        skip = make_conv((1, 1))(skip)
        skip = BatchNormalization()(skip)

    # Main branch, first convolution
    out = make_conv((3, 3))(x)
    out = BatchNormalization()(out)
    out = Activation('relu')(out)

    # Main branch, second convolution (activation is applied after the merge)
    out = make_conv((3, 3))(out)
    out = BatchNormalization()(out)

    merged = Add()([out, skip])
    return Activation('relu')(merged)

# Define model
input_layer = keras.Input(shape=(28, 28, 1))

# Stem: a single convolution before the residual stages
x = Conv2D(32, kernel_size=(3, 3), padding='same')(input_layer)
x = BatchNormalization()(x)
x = Activation('relu')(x)

# Residual stages: (filters, apply 2x2 max-pooling afterwards, dropout rate).
# Each stage stacks two residual blocks; only the first two stages downsample.
residual_stages = (
    (32, True, 0.1),
    (64, True, 0.2),
    (128, False, 0.3),
    (256, False, 0.4),
    (512, False, 0.5),
)

for stage_filters, downsample, drop_rate in residual_stages:
    x = residual_block(x, stage_filters)
    x = residual_block(x, stage_filters)
    if downsample:
        x = MaxPooling2D(pool_size=(2, 2))(x)
    x = Dropout(drop_rate)(x)

# Classification head: global pooling followed by two dense blocks
x = GlobalAveragePooling2D()(x)

for units, drop_rate in ((1024, 0.5), (512, 0.4)):
    x = Dense(units)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(drop_rate)(x)

# One softmax output per clothing class
output_layer = layers.Dense(10, activation='softmax')(x)

model = keras.Model(inputs=input_layer, outputs=output_layer)

model.summary()

In [None]:
# Training hyperparameters

learning_rate = 0.001
epochs = 50
batch_size = 128

# Compile the model: Adam optimizer, sparse categorical cross-entropy on
# integer labels, accuracy tracked as the metric

optimizer = Adam(learning_rate=learning_rate)
loss_fn = SparseCategoricalCrossentropy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
# Learning rate scheduler: halve the learning rate when validation loss has
# not improved for 3 epochs, never going below 1e-6
lr_schedule = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1
)

# Optional: save best model

# Single source of truth for the checkpoint location
model_save_path = 'models/ResNet.keras'  # Adjust path as needed

# Keep only the weights of the epoch with the highest validation accuracy
checkpoint = ModelCheckpoint(
    model_save_path,
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

callbacks = [lr_schedule]  # Add checkpoint if desired: callbacks = [lr_schedule, checkpoint]

## TRAINING, VALIDATION AND TESTING

In [None]:
# Training

# `callbacks` is already a list of callback instances, so it is passed as-is
# (wrapping it in another list would hand Keras a nested list).
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    shuffle=True,
)

In [None]:
# Plot the per-epoch loss and accuracy curves for training and validation

# Per-epoch metric values recorded by model.fit
history_dict = history.history

x_step = 5 # Spacing between major ticks on the epoch axis

plt.figure(figsize=(12, 5))

# (subplot position, history key, panel title, y-axis label)
panels = (
    (1, 'loss', 'Training and validation loss', 'Loss'),
    (2, 'accuracy', 'Training and validation accuracy', 'Accuracy'),
)

for position, metric, title, y_label in panels:
    panel = plt.subplot(1, 2, position)
    panel.plot(history_dict[metric], label='training')
    panel.plot(history_dict[f'val_{metric}'], label='validation')
    panel.set_title(title)
    panel.set_xlabel('Epoch')
    panel.xaxis.set_major_locator(plt.MultipleLocator(x_step))
    panel.set_ylabel(y_label)
    panel.legend()

plt.show()

### MODEL EVALUATION

The trained model is evaluated on the validation set to assess its performance using a classification report, a confusion matrix and sample predictions.

Note: the pre-trained model is not included in the repo due to GitHub's file size limitations. You can download it using this link:\
[Download MiniResNet.keras](https://drive.google.com/file/d/1yqQC_CTU_vCvAYyxe1EkbuVJHBP8IMxQ/view?usp=drive_link)

Once downloaded, place the file inside the `models/` directory in your local project folder.

In [None]:
# OPTIONAL: Load pre-trained model

# model = load_model('models/MiniResNet.keras')

In [None]:
# Predict over the validation set
val_probabilities = model.predict(X_val)
y_val_pred = val_probabilities.argmax(axis=1)  # Turn class probabilities into integer class ids

# Per-class precision/recall/F1, returned as a dict and shown as a table
report = classification_report(y_val, y_val_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(report_df)

# Confusion matrix between true and predicted labels
conf_matrix = confusion_matrix(y_val, y_val_pred)

# Draw the matrix as an annotated heatmap
# NOTE(review): the plot title and axis labels below are in Spanish, unlike the rest of the notebook
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Matriz de confusión - Conjunto de validación')
plt.xlabel('Predicción')
plt.ylabel('Etiquetas reales')
plt.show()

### GENERATE SUBMISSION FILE (Optional)

If you want to participate in a competition or hackathon (like [this one](https://www.analyticsvidhya.com/datahack/contest/practice-problem-identify-the-apparels/)), use the following code to generate a CSV with predictions on the test set.

In [None]:
# Predict over test dataset (if you intend to submit your results to the hackathon)

# Use the test split loaded from Keras (normalized in the pre-processing cell)
X_test = test_images

# Ensure all 4 dimensions exist (if not, add the trailing channel dimension)
if X_test.ndim == 3:
    X_test = X_test.reshape((X_test.shape[0], 28, 28, 1))

# Make predictions
y_test_pred = model.predict(X_test)
y_test_labels = y_test_pred.argmax(axis=1)

# Generate DataFrame with id and label columns.
# NOTE: this test split is a plain image array with no id column, so the row
# position is used as the id. If you predict on a competition test file
# instead, load it yourself and use that file's own id column here.
sub_df = pd.DataFrame({
    'id': np.arange(len(X_test)),
    'label': y_test_labels
})

# Save to CSV file
sub_df.to_csv('submission_cnn.csv', index=False)  # Change path if needed

In [None]:
# Visual spot-check of the model's predictions on the test set

# Class id -> human-readable class name
labels = {
    0: "T-shirt/top",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle Boot",
}

# Pick 20 distinct random positions in the test dataset
random_indices = random.sample(range(len(X_test)), 20)

# Show each sampled image in a 4x5 grid, titled with its predicted class
plt.figure(figsize=(15, 8))
for slot, sample_idx in enumerate(random_indices, start=1):
    plt.subplot(4, 5, slot)
    plt.imshow(X_test[sample_idx].reshape(28, 28), cmap='gray')
    plt.axis('off')
    pred_label = labels[y_test_labels[sample_idx]]
    plt.title(f"Predicted: {pred_label}")
plt.show()