In [None]:
# Autoreload re-imports edited python modules before each cell runs, so imported code stays up-to-date:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

# Move back to the root directory, because the data paths used in this notebook are relative to it.
# NOTE: every execution of this cell moves up one more level, so run it only once per kernel session.
%cd ..

# Data Exploration

This notebook shows an example of data exploration.

Data exploration comes at the beginning of any Machine Learning project, and helps you familiarise yourself with the problem at hand. You can generate insights like:
* What does a typical data sample look like?
* What does the bulk of our dataset look like? --> Helps to tackle the "low-hanging fruit"
* Are there outliers in our dataset and what do they look like? --> Important for "sensitive applications" where outliers are as important as an average sample
* What are the data's properties (i.e. dimensions, type of data, ...)?

In [2]:
import re
import csv
import json
import matplotlib.pyplot as plt

from tqdm import tqdm
import random
from random import randint, choice
from PIL import Image, ImageOps
from pathlib import Path
from collections import Counter

## Quick data analysis - FastDup

Luckily, there exist some good libraries out there that do much of the heavy lifting for you and allow you to very quickly get insights in your dataset in a few lines of code. One of these is [FastDup](https://github.com/visual-layer/fastdup). Full documentation [here](https://visual-layer.github.io/fastdup/).

Fastdup is not included in pyproject.toml, because its dependencies are too restrictive, not allowing poetry to solve the version constraints.
As a workaround:

    1. Pip install in the notebook
    2. This changes/breaks the environment: restart the kernel
    3. After using the notebook, run `poetry sync` to restore the versions of all packages

In [None]:
# Installs fastdup into the running kernel's environment (outside of poetry); restart the kernel afterwards
%pip install fastdup

In [None]:
import fastdup

from shutil import rmtree

In [None]:
# Start from a clean slate: delete the output folder of a previous run, if any
run_dir = Path("fastdup/run")
if run_dir.is_dir():
    rmtree(run_dir)

# Point fastdup at the minifigure images and run its analysis
fd = fastdup.create(work_dir="fastdup/run", input_dir="data/data/minifigures")
fd.run(nearest_neighbors_k=5, ccthreshold=0.96)

In [None]:
# Build one gallery per kind of finding, each limited to 15 entries.

# Gallery of duplicate images
fd.vis.duplicates_gallery(num_images=15)

# Gallery of anomalies (outliers)
fd.vis.outliers_gallery(num_images=15)

# Gallery of clusters (components)
fd.vis.component_gallery(num_images=15)

# Gallery ranked on the "blur" statistic
fd.vis.stats_gallery(metric="blur", num_images=15)

# Gallery of similar images
fd.vis.similarity_gallery(num_images=15)


# View the galleries by opening the generated HTML files in the browser (in fastdup/run/galleries)

## Manual exploration

In a second iteration, we'll go over the data manually to generate some deeper insights.

### 1. Visualise the data

Visualise a random subset of images from our dataset.

In [None]:
# Collect every PNG in the minifigures folder, ordered by path
image_dir = Path.cwd() / 'data/data/minifigures'
files = sorted(image_dir.glob('*.png'))
n_files = len(files)
n_files

In [None]:
# Drop the samples with a faulty name, i.e. file names that start with a dot
files = [path for path in files if not path.name.startswith(".")]
n_files = len(files)
n_files

In [None]:
# Visualise up to 100 distinct images, picked at random.
# Sampling without replacement keeps the same image from showing up twice and
# copes with datasets that hold fewer images than the grid has tiles.
n_rows, n_cols = 10, 10
sample = random.sample(files, k=min(n_rows * n_cols, n_files))

_, axs = plt.subplots(n_rows, n_cols, figsize=(10, 10))
for ax, path in zip(axs.ravel(), sample):
    # Release each file handle as soon as the image has been handed to matplotlib
    with Image.open(path) as img:
        ax.imshow(img)
# Hide the axes on every tile, including tiles left empty by a small dataset
for ax in axs.ravel():
    ax.set_axis_off()
plt.tight_layout()
plt.show()

### 2. Dataset distribution

Next, when looking at the images' names, we notice that some of them have a prefix indicating which dataset they come from. Let's extract this prefix and see which types of minifigures we can expect the most.

In [None]:
# Peek at a slice of the paths: for most samples, the file name starts with a dataset prefix
files[200:220]

In [None]:
# Create a function to extract the prefix from a path
def extract_prefix(path:Path) -> str|None:
    """Extract the prefix from the Path."""
    name = path.with_suffix('').name
    prefix = re.search('^[a-z]+', name)
    if prefix:
        return prefix.group(0)
    return None
    
# Try the helper on one arbitrary sample (index 123 requires at least 124 files)
extract_prefix(files[123])

In [None]:
# Extract the prefix of every file (None when a name has no prefix)
prefixes = [extract_prefix(p) for p in tqdm(files)]

# Count the occurrences per prefix, leaving out the files without one
counts = Counter(prefixes)
counts.pop(None, None)

# Rank the prefixes from most to least frequent and show the top N
top_n = 10
prefixes_ = counts.most_common()
prefixes_[:top_n]

In [None]:
# Visualise the dataset distribution: number of images for the most frequent prefixes
top_n = 20
# The slice may hold fewer than top_n entries, so size the x positions from the
# slice itself; otherwise bar() receives mismatching lengths and raises.
top_prefixes = prefixes_[:top_n]
positions = range(len(top_prefixes))

plt.figure(figsize=(10,5))
plt.bar(positions, [v for _,v in top_prefixes], zorder=2)
plt.xticks(positions, [k for k,_ in top_prefixes])
plt.xlabel('dataset prefix'); plt.ylabel('number of images')
plt.tight_layout()
plt.grid(axis='y')
plt.show()

In [None]:
# Show one row of random example images for each of the top_n most frequent prefixes
top_n, n_col = 5, 5
title_col = n_col // 2  # the prefix is printed above the middle column

_, axs = plt.subplots(top_n, n_col, figsize=(n_col, top_n))
for row in range(top_n):
    p, _count = prefixes_[row]
    # All files that carry this row's prefix
    selection = [path for path, prefix in zip(files, prefixes) if prefix == p]
    for col in range(n_col):
        ax = axs[row, col]
        ax.imshow(Image.open(choice(selection)))
        if col == title_col:
            ax.set_title(p)
for ax in axs.ravel():
    ax.set_axis_off()
plt.tight_layout()
plt.show()

### 3. Sizes

Images have different sizes, so let's investigate the size distribution of our dataset. This is interesting to keep in mind for later, when we start building a Machine Learning model.

In [None]:
# Get the (width, height) of every image.
# Each file is opened in a `with` block so its handle is closed right after the size is read.
shapes = []
for path in tqdm(files):
    with Image.open(path) as img:
        shapes.append(img.size)

# Split into widths (x) and heights (y); an empty dataset yields two empty tuples
x, y = zip(*shapes) if shapes else ((), ())

In [None]:
# Scatter plot of image width against image height
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(x, y, alpha=0.2, zorder=2)
ax.set_xlabel('width')
ax.set_ylabel('height')
ax.set_ylim(0, 550)
ax.set_xlim(0, 550)
ax.grid()
plt.show()

In [None]:
# Who's the wide one? (`x` holds the widths: show the first image wider than 500)
idx = [v>500 for v in x]
Image.open(files[idx.index(True)])

In [None]:
# Who's the tall one? (`y` holds the heights: show the first image taller than 420)
idx = [v>420 for v in y]
Image.open(files[idx.index(True)])

### 4. Labels

Next to the images themselves, we also have a `dataset.json` that contains the target labels of a few samples in our dataset. Let's have a look at what this looks like!

This will show us that we're working with a rather imbalanced dataset, but that there are also already several samples for each individual label.

In [None]:
# Load the labelled samples, then report how many there are and which unique labels occur
with open('data/data/dataset.json', 'r') as fh:
    dataset = json.load(fh)

# Gather the labels of all samples into one set before sorting them
unique_labels = set()
for sample_labels in dataset.values():
    unique_labels.update(sample_labels)
labels = sorted(unique_labels)
len(dataset), labels

In [None]:
# Count how often each label occurs across all samples
counts = Counter()
for sample_labels in dataset.values():
    counts.update(sample_labels)

# Ratio per label: its count divided by the number of samples
n_samples = len(dataset)
ratios = [counts[label] / n_samples for label in labels]
tenths = range(11)  # y-axis ticks at every 10%

plt.figure()
plt.bar(labels, ratios, zorder=2)
plt.xticks(labels, rotation=45)
plt.yticks([i/10 for i in tenths], [f"{10*i:3d}%" for i in tenths])
plt.grid(axis='y')
plt.show()