<a href="https://colab.research.google.com/github/kirillturok/ML_lab2/blob/main/lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare data

Get notMNIST data

In [1]:
import tensorflow as tf
import pathlib

dataset_url = "https://commondatastorage.googleapis.com/books1000/notMNIST_large.tar.gz"
dataset_dir = tf.keras.utils.get_file('notMNIST_large.tar', origin=dataset_url, extract=True)
dataset_dir = pathlib.Path(dataset_dir).with_suffix('')

Downloading data from https://commondatastorage.googleapis.com/books1000/notMNIST_large.tar.gz


Create dataframe

In [2]:
import os
import cv2
import pandas as pd

CLASSES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
DATA_COLUMN = 'data'
LABELS_COLUMN = 'labels'
HASHED_DATA_COLUMN = 'hashed'

def get_class_data(folder_path):
    result_data = list()
    files = os.listdir(folder_path)
    for file in files:
        image_path = os.path.join(folder_path, file)
        img = cv2.imread(image_path)
        if img is not None:
            result_data.append(img)

    return result_data

def create_data_frame():
    data = list()
    labels = list()
    for class_item in CLASSES:
        class_folder_path = os.path.join(dataset_dir, class_item)
        class_data = get_class_data(class_folder_path)

        data.extend(class_data)
        labels.extend([CLASSES.index(class_item) for _ in range(len(class_data))])

    data_frame = pd.DataFrame({DATA_COLUMN: data, LABELS_COLUMN: labels})

    return data_frame

data_frame = create_data_frame()


Preprocess data

In [3]:
def remove_duplicates(data):
    data_bytes = [item.tobytes() for item in data[DATA_COLUMN]]
    data[HASHED_DATA_COLUMN] = data_bytes
    data.sort_values(HASHED_DATA_COLUMN, inplace=True)
    data.drop_duplicates(subset=HASHED_DATA_COLUMN, keep='first', inplace=True)
    data.pop(HASHED_DATA_COLUMN)

    return data

df_no_duplicates = remove_duplicates(data_frame)

min_class_count = df_no_duplicates[LABELS_COLUMN].value_counts().min()
balanced_df = pd.concat([df_no_duplicates[df_no_duplicates[LABELS_COLUMN] == label].sample(min_class_count) for label in df_no_duplicates[LABELS_COLUMN].unique()])

df = balanced_df.sample(frac=1).reset_index(drop=True)


Divide into subsamples

In [4]:
import numpy as np
from skimage.color import rgb2gray
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

BATCH_SIZE = 32

def divide_into_subsamples(data_frame):
    data = np.array(data_frame[DATA_COLUMN].values)
    labels = np.array(data_frame[LABELS_COLUMN].values)

    data_gray = np.array([rgb2gray(img) for img in data])
    data_gray = data_gray.reshape(-1, 28*28)
    data_gray = data_gray.astype('float32')

    x_train, x_other, y_train, y_other = train_test_split(data_gray, labels, train_size=0.2, random_state = 10)
    x_test, x_val, y_test, y_val = train_test_split(x_other, y_other, train_size = 0.5, random_state = 10)

    dataset_train = tf.data.Dataset.from_tensor_slices((x_train, to_categorical(y_train, num_classes=10)))
    dataset_test = tf.data.Dataset.from_tensor_slices((x_test, to_categorical(y_test, num_classes = 10)))
    dataset_val = tf.data.Dataset.from_tensor_slices((x_val, to_categorical(y_val, num_classes = 10)))

    return dataset_train, dataset_test, dataset_val

dataset_train, dataset_test, dataset_val = divide_into_subsamples(df)

dataset_train = dataset_train.batch(BATCH_SIZE)
dataset_test = dataset_test.batch(BATCH_SIZE)
dataset_val = dataset_val.batch(BATCH_SIZE)


# Models

Models declaration

In [5]:
import tensorflow as tf
from tensorflow.keras import layers, models

input_shape = (28, 28)
num_classes = len(CLASSES)

# Simple model
model = tf.keras.Sequential([
    tf.keras.layers.Rescaling(1. / 255),
    tf.keras.layers.Flatten(input_shape=input_shape),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer=tf.keras.optimizers.SGD(),
    loss='categorical_crossentropy',
    metrics=['accuracy'])

# Regularized model
regularized_model = tf.keras.Sequential([
    tf.keras.layers.Rescaling(1. / 255),
    tf.keras.layers.Flatten(input_shape=input_shape),
    tf.keras.layers.Dense(
        100,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

regularized_model.compile(
    optimizer=tf.keras.optimizers.SGD(),
    loss='categorical_crossentropy',
    metrics=['accuracy'])


Parameters initialization

In [6]:
EPOCHS = 50

Simple model processing

In [7]:
model.fit(
    dataset_train,
    epochs = EPOCHS,
    validation_data = dataset_val)

test_loss, test_acc = model.evaluate(dataset_val)
print(f'\nSimple Model\n\tTest Accuracy: {test_acc}\n\tTest Loss: {test_loss}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Simple Model
	Test Accuracy: 0.666634202003479
	Test Loss: 1.3875747919082642


Regularized model processing

In [8]:
regularized_model.fit(
    dataset_train,
    epochs = EPOCHS,
    validation_data = dataset_val)

test_loss, test_acc = regularized_model.evaluate(dataset_val)
print(f'\nRegularized Model\n\tTest Accuracy: {test_acc}\n\tTest Loss: {test_loss}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Regularized Model
	Test Accuracy: 0.5750839710235596
	Test Loss: 2.002985954284668


Dynamic model processing

In [9]:

INITIAL_LEARNING_RATE = 0.01
MIN_LEARNING_RATE = 1e-6
DECAY_STEPS = 12000
DECAY_RATE = 0.8

learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate = INITIAL_LEARNING_RATE,
    decay_steps = DECAY_STEPS,
    decay_rate = DECAY_RATE,
    staircase = True)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.1,
    patience = 6,
    verbose = 1,
    min_lr = MIN_LEARNING_RATE)

regularized_model.fit(
    dataset_train,
    validation_data = dataset_val,
    epochs = EPOCHS,
    callbacks = reduce_lr,
    verbose = 1)

test_loss, test_acc = regularized_model.evaluate(dataset_val)
print(f'\nDynamic Model\n\tTest Accuracy: {test_acc}\n\tTest Loss: {test_loss}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

Dynamic Model
	Test Accuracy: 0.6701270341873169
	Test Loss: 1.6124968528747559
