<a href="https://colab.research.google.com/github/kirillturok/ML_3/blob/main/lab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare data

Get notMNIST data

In [1]:
import tensorflow as tf
import pathlib

# Remote location of the notMNIST "large" archive.
dataset_url = "https://commondatastorage.googleapis.com/books1000/notMNIST_large.tar.gz"

# Download (and extract) the archive through the Keras helper, then drop the
# last suffix of the path it returns. Later cells treat the result as the
# folder that holds one sub-folder per class.
dataset_dir = pathlib.Path(
    tf.keras.utils.get_file(
        'notMNIST_large.tar',
        origin=dataset_url,
        extract=True,
    )
).with_suffix('')


Downloading data from https://commondatastorage.googleapis.com/books1000/notMNIST_large.tar.gz


Create dataframe

In [2]:
import os
import cv2
import pandas as pd

# The ten class names; each one is also the name of a sub-folder of the dataset.
CLASSES = list('ABCDEFGHIJ')

# Column names used in the working DataFrame (the hashed column is temporary).
DATA_COLUMN, LABELS_COLUMN, HASHED_DATA_COLUMN = 'data', 'labels', 'hashed'

def get_class_data(folder_path):
    """Load every readable image found directly inside `folder_path`.

    Each directory entry is handed to `cv2.imread`; entries for which it
    returns None are skipped.

    Args:
        folder_path: Directory to scan.

    Returns:
        list: The decoded images, in the order `os.listdir` yields the entries.
    """
    decoded = (
        cv2.imread(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )
    return [image for image in decoded if image is not None]

def create_data_frame():
    """Collect the images of every class into one DataFrame.

    For each name in `CLASSES` the folder `dataset_dir/<name>` is read with
    `get_class_data`; every image is labelled with the position of its class
    name in `CLASSES`.

    Returns:
        pandas.DataFrame: One row per image, with the image in `DATA_COLUMN`
        and its integer label in `LABELS_COLUMN`.
    """
    images = []
    targets = []
    for class_index, class_name in enumerate(CLASSES):
        class_images = get_class_data(os.path.join(dataset_dir, class_name))
        images += class_images
        # One identical label per image that was actually loaded.
        targets += [class_index] * len(class_images)

    return pd.DataFrame({DATA_COLUMN: images, LABELS_COLUMN: targets})

# Read every class folder into a single DataFrame (one row per image).
data_frame = create_data_frame()


Preprocess data

In [3]:
def remove_duplicates(data):
    """Return the rows of `data` whose image has not appeared in an earlier row.

    Two rows are duplicates when the raw bytes of their images
    (`ndarray.tobytes()`) are identical. The first occurrence is kept and the
    original row order is preserved. The input frame is not modified, so the
    calling cell can be re-run and always starts from the same data.

    Args:
        data: DataFrame whose `DATA_COLUMN` holds NumPy arrays.

    Returns:
        pandas.DataFrame: A new frame with the same columns as `data`,
        without the duplicate images.
    """
    # bytes objects are hashable, so pandas can detect repeats directly; no
    # helper column or sort on the caller's frame is required.
    image_bytes = pd.Series(
        [item.tobytes() for item in data[DATA_COLUMN]],
        dtype=object,
    )
    # Positional (NumPy) mask: selection does not rely on index alignment, so
    # a non-unique index on `data` cannot break it.
    keep_mask = ~image_bytes.duplicated(keep='first').to_numpy()

    return data.loc[keep_mask].copy()

# Fixed seed so that class balancing and shuffling select the same rows on
# every Restart & Run All (same value the train/test split uses).
SAMPLING_SEED = 10

df_no_duplicates = remove_duplicates(data_frame)

# Down-sample every class to the size of the rarest class.
min_class_count = df_no_duplicates[LABELS_COLUMN].value_counts().min()
balanced_df = pd.concat([
    df_no_duplicates[df_no_duplicates[LABELS_COLUMN] == label].sample(
        min_class_count, random_state=SAMPLING_SEED)
    for label in df_no_duplicates[LABELS_COLUMN].unique()
])

# Shuffle the rows (the concat above leaves them grouped by class) and
# renumber the index from zero.
df = balanced_df.sample(frac=1, random_state=SAMPLING_SEED).reset_index(drop=True)


Divide into subsamples

In [8]:
import numpy as np
from skimage.color import rgb2gray
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Number of samples per batch when the train/test/validation datasets are batched.
BATCH_SIZE = 128

def divide_into_subsamples(data_frame, train_size=0.2, random_state=10):
    """Turn the image frame into train / test / validation `tf.data` datasets.

    Every image is converted to a single-channel 28x28 float32 array with
    pixel values in [0, 255]; labels are one-hot encoded over `CLASSES`.

    Args:
        data_frame: DataFrame with images in `DATA_COLUMN` and integer labels
            in `LABELS_COLUMN`.
        train_size: Fraction of the rows used for training. The remainder is
            split evenly between test and validation.
            NOTE(review): the default keeps the original behaviour, i.e. only
            20 % of the data is trained on and 40 % / 40 % go to test and
            validation. If an 80/10/10 split was intended, pass
            `train_size=0.8`.
        random_state: Seed forwarded to both `train_test_split` calls.

    Returns:
        tuple: `(dataset_train, dataset_test, dataset_val)`, each an unbatched
        `tf.data.Dataset` of `(image, one_hot_label)` pairs.
    """
    data = np.array(data_frame[DATA_COLUMN].values)
    labels = np.array(data_frame[LABELS_COLUMN].values)

    # rgb2gray returns luminance as floats in [0, 1] for the 8-bit images
    # loaded by cv2.imread, while every model in this notebook starts with
    # Rescaling(1. / 255). Scale back to [0, 255] so the pixels are normalised
    # exactly once instead of ending up in [0, 1/255].
    data_gray = np.array([rgb2gray(img) for img in data]) * 255.0
    data_gray = data_gray.reshape(-1, 28, 28, 1)
    data_gray = data_gray.astype('float32')

    x_train, x_other, y_train, y_other = train_test_split(
        data_gray, labels, train_size=train_size, random_state=random_state)
    # Halve what is left: first half -> test, second half -> validation.
    x_test, x_val, y_test, y_val = train_test_split(
        x_other, y_other, train_size=0.5, random_state=random_state)

    def to_dataset(images, targets):
        """Pair the images with one-hot labels in a tf.data.Dataset."""
        one_hot = to_categorical(targets, num_classes=len(CLASSES))
        return tf.data.Dataset.from_tensor_slices((images, one_hot))

    dataset_train = to_dataset(x_train, y_train)
    dataset_test = to_dataset(x_test, y_test)
    dataset_val = to_dataset(x_val, y_val)

    return dataset_train, dataset_test, dataset_val

# Split once, then group each subset into batches of BATCH_SIZE samples.
dataset_train, dataset_test, dataset_val = (
    subset.batch(BATCH_SIZE) for subset in divide_into_subsamples(df)
)


# Models

Model declarations

In [9]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Shape of one sample: 28x28 pixels plus a single grayscale channel. The
# channel axis is required by the Conv2D and pooling layers.
input_shape = (28, 28, 1)
num_classes = len(CLASSES)


def _compile_classifier(model):
    """Compile `model` for one-hot multi-class classification and return it."""
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


# Every model starts with an explicit Input so that it is built immediately
# with the right shape. Passing `input_shape` to a layer that is not the first
# one in a Sequential model has no effect.

# convolutional model
conv_model = _compile_classifier(models.Sequential([
    tf.keras.Input(shape=input_shape),
    layers.Rescaling(1. / 255),

    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.Dropout(0.2),
    layers.Conv2D(64, (3, 3), activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    layers.Dropout(0.4),

    layers.Dense(num_classes, activation='softmax')
]))

# Pooling model
# NOTE(review): this model contains no convolutional layer at all, only two
# max-pooling steps in front of the dense head. Confirm that this is intended.
pooling_model = _compile_classifier(models.Sequential([
    tf.keras.Input(shape=input_shape),
    layers.Rescaling(1. / 255),

    layers.MaxPooling2D((2, 2)),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    layers.Dropout(0.4),

    layers.Dense(num_classes, activation='softmax')
]))

# LeNet-5
lenet_model = _compile_classifier(models.Sequential([
    tf.keras.Input(shape=input_shape),
    layers.Rescaling(1. / 255),

    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.AveragePooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.AveragePooling2D((2, 2)),

    # Flatten before the fully connected layers. A Dense layer placed in
    # front of Flatten would act on the channel axis of every spatial
    # position separately instead of on the whole feature map.
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),

    layers.Dense(num_classes, activation='softmax')
]))

Parameters initialization

In [6]:
# Number of training epochs passed to every model.fit call below.
EPOCHS = 50

Convolutional model processing

In [None]:
# Train, monitoring the validation split after every epoch.
conv_model.fit(
    dataset_train,
    epochs = EPOCHS,
    validation_data = dataset_val)

# Report on the held-out test split. The validation split was already used
# during fit, so scoring it again would not be a test result.
test_loss, test_acc = conv_model.evaluate(dataset_test)
print(f'\nConvolutional Model\n\tTest Accuracy: {test_acc}\n\tTest Loss: {test_loss}')


Pooling model processing

In [None]:
# Train, monitoring the validation split after every epoch.
pooling_model.fit(
    dataset_train,
    epochs = EPOCHS,
    validation_data = dataset_val)

# Report on the held-out test split. The validation split was already used
# during fit, so scoring it again would not be a test result.
test_loss, test_acc = pooling_model.evaluate(dataset_test)
print(f'\nPooling Model\n\tTest Accuracy: {test_acc}\n\tTest Loss: {test_loss}')

LeNet-5 model processing

In [None]:
# Train, monitoring the validation split after every epoch.
lenet_model.fit(
    dataset_train,
    epochs = EPOCHS,
    validation_data = dataset_val)

# Report on the held-out test split. The validation split was already used
# during fit, so scoring it again would not be a test result.
test_loss, test_acc = lenet_model.evaluate(dataset_test)
print(f'\nLeNet-5 Model\n\tTest Accuracy: {test_acc}\n\tTest Loss: {test_loss}')