# Exploratory Data Analysis

This notebook explores published datasets, which use random image augmentation to supplement the original images. The purpose of this notebook is to improve developer understanding of the dataset, and to identify ways in which the dataset could be improved.

## Change working directory to project root

In [None]:
import os
# Entries that must all be present in the current directory for it to be
# treated as the project root.
ROOT_DIRECTORIES = {'imagegen', 'tests'}
# If any expected entry is missing, step up one level to the parent
# directory (presumably the notebook was launched from a subdirectory).
if not ROOT_DIRECTORIES.issubset(os.listdir('.')):
    os.chdir('..')

## Imports and constants

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from mlops.errors import PublicationPathAlreadyExistsError
from mlops.dataset.versioned_dataset import VersionedDataset
from imagegen.publish_dataset import publish_dataset, \
    DATASET_PUBLICATION_PATH_LOCAL, DATASET_VERSION
from imagegen.gan import GAN

## Publish versioned dataset

In [None]:
# Publish the dataset under the local publication path and remember where
# it ended up.
try:
    dataset_path = publish_dataset(DATASET_PUBLICATION_PATH_LOCAL)
except PublicationPathAlreadyExistsError:
    # Publication was refused because the path already exists; fall back to
    # <publication path>/<version>, presumably the location of the earlier
    # publication -- TODO confirm against publish_dataset.
    dataset_path = os.path.join(
        DATASET_PUBLICATION_PATH_LOCAL, DATASET_VERSION)

## Retrieve versioned dataset

In [None]:
# Load the versioned dataset from the path resolved in the publication step.
dataset = VersionedDataset(dataset_path)

## Explore versioned dataset

In [None]:
# Show the shape of X_train; as the last expression in the cell, its value is
# echoed by the notebook.
dataset.X_train.shape

In [None]:
# Pick a random contiguous run of images from X_train and combine them into
# one image via GAN.concatenate_images (presumably laid out as a
# num_image_rows x num_image_cols grid -- TODO confirm).
num_image_rows = 4
num_image_cols = 4
num_images = num_image_rows * num_image_cols
# np.random.randint excludes its upper bound, so add 1 to make the final full
# window (start index len - num_images) selectable. This also keeps the call
# valid when X_train holds exactly num_images entries.
start_idx = np.random.randint(0, len(dataset.X_train) - num_images + 1)
image_grid = GAN.concatenate_images(
    dataset.X_train[start_idx:start_idx + num_images],
    num_image_rows,
    num_image_cols)

In [None]:
# Draw image_grid in a new 8x8 inch figure and display it.
plt.figure(figsize=(8, 8))
plt.imshow(image_grid)
plt.show()