## Oxford AI Summit: Kaggle dataset training notebook

In [None]:
%pip install kaggle

In [None]:
import os

import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split

from kaggle.api.kaggle_api_extended import KaggleApi

In [None]:
# Pick the tf.distribute strategy used later to build the model.
# The TPU path must run in this order: resolve the cluster, connect to it,
# initialise the TPU system, then create the strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    print('Running on TPU')
except ValueError:
    # A ValueError raised by any of the TPU setup calls above is treated as
    # "no TPU available"; fall back to TensorFlow's current default strategy.
    strategy = tf.distribute.get_strategy()  # Default strategy for CPU and GPU
    print('Running on GPU or CPU')

In [None]:
# Create a Kaggle API client. A directly constructed KaggleApi instance is not
# authenticated, so authenticate() must be called before any request; it reads
# the credentials from ~/.kaggle/kaggle.json or from the KAGGLE_USERNAME /
# KAGGLE_KEY environment variables.
api = KaggleApi()
api.authenticate()

# Dataset slug (owner/name) on Kaggle and the local folder to extract it into.
dataset = 'paramaggarwal/fashion-product-images-dataset'
destination_folder = 'fashion_product_images'

# Download the dataset archive and unzip it into destination_folder.
api.dataset_download_files(dataset, path=destination_folder, unzip=True)

In [None]:
# Load the product metadata; rows pandas cannot parse are skipped.
metadata_path = 'fashion_product_images/fashion-dataset/styles.csv'
metadata = pd.read_csv(metadata_path, on_bad_lines='skip')

# Display the first few rows and the column names of the metadata
print(metadata.head())
print(metadata.columns)

# Build each image path as <image_folder>/<id>.jpg from the 'id' column.
# Iterating the column directly avoids a slow row-wise DataFrame.apply and
# keeps each id formatted exactly as stored in that column.
image_folder = 'fashion_product_images/fashion-dataset/images'
metadata['image_path'] = [
    os.path.join(image_folder, f'{image_id}.jpg') for image_id in metadata['id']
]
# Drop metadata rows whose image file is missing on disk.
metadata = metadata[metadata['image_path'].map(os.path.exists)]

# Select relevant columns and encode labels.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (avoids pandas' SettingWithCopyWarning).
metadata = metadata[['image_path', 'articleType']].copy()
metadata['articleType'] = metadata['articleType'].astype('category')
metadata['label'] = metadata['articleType'].cat.codes

# Ensure each class has at least 2 samples (needed for the stratified split)
min_samples_per_class = 2
class_counts = metadata['label'].value_counts()
valid_classes = class_counts[class_counts >= min_samples_per_class].index
metadata = metadata[metadata['label'].isin(valid_classes)]

# Split into training and validation sets, stratified by label
train_df, val_df = train_test_split(metadata, test_size=0.2, stratify=metadata['label'], random_state=5)

# Work on independent copies so the label conversion below is a plain
# column assignment rather than a write into a view of `metadata`.
train_df = train_df.copy()
val_df = val_df.copy()

# Convert the labels to strings
train_df['label'] = train_df['label'].astype(str)
val_df['label'] = val_df['label'].astype(str)

# Find the classes present in both splits and report any that only appear
# in the training split.
train_classes = set(train_df['label'].unique())
val_classes = set(val_df['label'].unique())
print(f"Len: train({len(train_classes)}), val({len(val_classes)})")
print(train_classes - val_classes)
common_classes = train_classes.intersection(val_classes)

# Filter dataframes to only include common classes, so both splits share
# exactly the same label set.
train_df = train_df[train_df['label'].isin(common_classes)]
val_df = val_df[val_df['label'].isin(common_classes)]

# Print the number of unique labels and the size of each split
num_classes = len(common_classes)
print(f'Number of unique labels: {num_classes}')
print(f'Training set size: {len(train_df)}')
print(f'Validation set size: {len(val_df)}')

In [None]:
# Random augmentations applied to training images only.
_augmentation = dict(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Both generators rescale pixel values by 1/255; only the training one augments.
train_datagen = ImageDataGenerator(rescale=1./255, **_augmentation)
val_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by both dataframe iterators: file paths come from the
# 'image_path' column and class labels from the 'label' column.
_flow_options = dict(
    x_col='image_path',
    y_col='label',
    target_size=(128, 128),
    batch_size=32,
    class_mode='categorical',
)

# Training batches are shuffled; validation batches keep dataframe order.
train_generator = train_datagen.flow_from_dataframe(
    train_df,
    shuffle=True,
    **_flow_options,
)
val_generator = val_datagen.flow_from_dataframe(
    val_df,
    shuffle=False,
    **_flow_options,
)



In [None]:
# Build and compile the CNN inside the distribution strategy's scope.
with strategy.scope():
    model = Sequential()

    # Three convolution + max-pooling stages on 128x128 RGB input
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))

    # Classifier head: one softmax output unit per class
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.summary()

In [None]:
# Fit the model for 2 epochs, validating on the validation generator.
# The checkpoint callback writes to 'fashion_mnist_model.keras' with
# save_best_only=True; no monitored quantity is given, so the callback's
# default is used.
checkpoint = ModelCheckpoint(
    filepath='fashion_mnist_model.keras',
    save_best_only=True,
)
history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    epochs=2,
    callbacks=[checkpoint],
)

In [None]:
# Evaluate on the validation generator and report the two returned values
# (unpacked as loss and accuracy).
val_loss, val_acc = model.evaluate(x=val_generator)
print(f'Validation loss: {val_loss}')
print(f'Validation accuracy: {val_acc}')

In [None]:
# Save the model in its state after the last training epoch. This is a
# separate file from the 'fashion_mnist_model.keras' checkpoint written
# during training.
model.save('fashion_mnist_model_final.keras')