# Explore the dataset


In this notebook, we will perform an EDA (Exploratory Data Analysis) on the processed Waymo dataset (data in the `processed` folder). In the first part, you will create a function to display an image together with its ground-truth bounding boxes.

In [None]:
from utils import get_dataset
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from PIL.ImageStat import Stat
%matplotlib inline

In [None]:
# Glob pattern selecting the TFRecord files under the train folder.
TRAIN_RECORDS_GLOB = "/home/workspace/data/train/*.tfrecord"
dataset = get_dataset(TRAIN_RECORDS_GLOB)

## Write a function to display an image and the bounding boxes

Implement the `display_instances` function below. This function takes a batch as input and displays each image with its corresponding bounding boxes. The only requirement is that the classes should be color coded (e.g., vehicles in red, pedestrians in blue, cyclists in green).

In [None]:
def display_instances(batch, batch_size):
    """
    Display every image of a batch together with its ground-truth boxes.

    One subplot row is created per image. Each box is outlined in a colour
    chosen by its class id (1 -> red, 2 -> blue, 4 -> green, any other id ->
    yellow).

    Args:
        batch: iterable of dict-like elements with the keys "image",
            "groundtruth_boxes" and "groundtruth_classes". Boxes are read as
            (y0, x0, y1, x1) and multiplied by the image size, so they are
            assumed to be normalised to [0, 1] -- TODO confirm against
            `get_dataset`. Boxes and classes must expose `.numpy()`.
        batch_size: number of subplot rows to create. Elements of `batch`
            beyond this count are not drawn.
    """
    colour_map = {1: 'r', 2: 'b', 4: 'g'}
    fallback_colour = 'y'  # used for class ids missing from colour_map
    # squeeze=False keeps `axes` a 2-D array even for a single row, so a
    # batch_size of 1 can be indexed the same way as larger batches.
    fig, axes = plt.subplots(batch_size, 1, figsize=(10, 10*batch_size), squeeze=False)
    column = axes[:, 0]
    # Hide the frame of every row up front so rows left without an image
    # (batch shorter than batch_size) stay blank.
    for ax in column:
        ax.axis("off")
    # zip stops at the shorter input, so a batch longer than batch_size does
    # not index past the last subplot.
    for ax, X in zip(column, batch):
        img = X["image"]
        # shape[0] counts rows (height), shape[1] counts columns (width);
        # scaling each coordinate by its own extent keeps boxes correct on
        # non-square images.
        img_height, img_width = img.shape[0], img.shape[1]
        ax.imshow(img)
        boxes = X["groundtruth_boxes"].numpy()
        classes = X["groundtruth_classes"].numpy()
        for box, class_id in zip(boxes, classes):
            y0, x0 = box[0]*img_height, box[1]*img_width
            y1, x1 = box[2]*img_height, box[3]*img_width
            rect = patches.Rectangle((x0, y0), x1-x0, y1-y0, linewidth=1,
                                     edgecolor=colour_map.get(int(class_id), fallback_colour),
                                     facecolor='none')
            ax.add_patch(rect)
    plt.show()

## Display 10 images 

Using the dataset created in the second cell and the function you just coded, display 10 random images with the associated bounding boxes. You can use the methods `take` and `shuffle` on the dataset.

In [None]:
# Shuffle the dataset, keep 10 examples and draw them with their boxes.
NUM_EXAMPLES = 10
random_examples = dataset.shuffle(buffer_size=1024).take(NUM_EXAMPLES)
display_instances(random_examples, NUM_EXAMPLES)

## Additional EDA

In this last part, you are free to perform any additional analysis of the dataset. What else would you like to know about the data?
For example, think about data distribution. So far, you have only looked at a single file...

In [None]:
# Estimate the relative frequency of each class from a random sample of images
BS = 1000
sample_dataset = dataset.shuffle(buffer_size=1024).take(BS)

# Class ids read from "groundtruth_classes", paired with their bar labels
class_ids = np.array([1, 2, 4])
vals = ["cars", "pedestrians", "cyclists"]

# counts[i] accumulates how many boxes carry class id i
counts = np.zeros(5)
for X in sample_dataset:
    classes = X["groundtruth_classes"]
    counts = np.add(counts, np.bincount(classes.numpy(), minlength=5))

# Guard the normalisation so a sample without any labelled object plots
# zeros instead of NaN.
total_count = np.sum(counts)
normalized_counts = counts / total_count if total_count else counts

fig, ax = plt.subplots(figsize=(10,10))
# The y axis is labelled in percent, so scale the fractions to match it.
ax.bar(vals, 100.0 * normalized_counts[class_ids], width=0.8, align='center')
ax.set_title(f"Frequency of classes in random sample of {BS} images")
ax.set_ylabel("% of total count")
plt.show()

In [None]:
# Get the pixel value distribution of each colour channel from a random sample of images
BS = 1000
sample_dataset = dataset.shuffle(buffer_size=1024).take(BS)

# (name used in the title, matplotlib colour) for channel index 0, 1, 2
channels = (("red", "r"), ("green", "g"), ("blue", "b"))

# Row c holds the 256-bin histogram of channel c, summed over all sampled images
channel_counts = np.zeros((len(channels), 256))
for X in sample_dataset:
    image = X["image"].numpy()
    for c in range(len(channels)):
        channel_counts[c] += np.bincount(image[..., c].flatten(), minlength=256)

# Keep the per-channel names available for later cells
red_count, green_count, blue_count = channel_counts

# The y axis is labelled in percent, so convert counts to percentages.
# np.maximum(..., 1) avoids 0/0 when the sample is empty (all counts are 0 then).
totals = channel_counts.sum(axis=1, keepdims=True)
channel_percent = 100.0 * channel_counts / np.maximum(totals, 1)

pixel_values = np.arange(256)
fig, axes = plt.subplots(3,1,figsize=(10,10))
for ax, (name, colour), percent in zip(axes, channels, channel_percent):
    ax.bar(pixel_values, percent, width=0.8, align='center', color=colour)
    ax.set_title(f"Distribution of {name} pixel values across random sample of {BS} images")
    ax.set_xlabel("pixel value")
    ax.set_ylabel("% of total counts")
fig.tight_layout()
plt.show()

In [None]:
# Scatter the greyscale mean of each sampled image against its standard deviation
BS = 1000
sample_dataset = dataset.shuffle(buffer_size=1024).take(BS)

# One PIL statistics object per image, computed on its single-band ('L') conversion
grey_stats = [
    Stat(Image.fromarray(X["image"].numpy()).convert('L'))
    for X in sample_dataset
]
# Index 0 selects the only band of the converted image
means = [stat.mean[0] for stat in grey_stats]
stddevs = [stat.stddev[0] for stat in grey_stats]

fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(stddevs, means)
ax.set(
    title="Mean vs standard deviation for pixel values of 1000 randomly selected images",
    xlabel="std dev",
    ylabel="mean",
)
plt.show()

In [None]:
# Create histograms of bounding box areas from a random sample of images
BS = 1000
# Areas above this value (in square pixels) go to a second histogram so the
# few very large boxes do not flatten the distribution of the small ones.
AREA_THRESHOLD = 10000.0
sample_dataset = dataset.shuffle(buffer_size=1024).take(BS)
box_sizes = []
outliers = []
for X in sample_dataset:
    boxes = X["groundtruth_boxes"].numpy()
    img = X["image"]
    # shape[0] counts rows (height), shape[1] counts columns (width); each
    # box side is scaled by its own extent so areas stay correct on
    # non-square images.
    img_height, img_width = img.shape[0], img.shape[1]
    # Boxes are read as (y0, x0, y1, x1) and scaled by the image size, i.e.
    # assumed normalised to [0, 1] -- TODO confirm against get_dataset.
    for y0, x0, y1, x1 in boxes:
        box_size = float((y1-y0) * img_height * (x1-x0) * img_width)
        if box_size > AREA_THRESHOLD:
            outliers.append(box_size)
        else:
            box_sizes.append(box_size)

box_sizes_arr = np.array(box_sizes)
outliers_arr = np.array(outliers)
fig, axes = plt.subplots(2,1, figsize=(10,10))
axes[0].hist(box_sizes_arr)
axes[1].hist(outliers_arr)
# The plotted quantity is an area, so label it in square pixels; "<=" matches
# the else branch above, which keeps areas equal to the threshold.
axes[0].set_title(f"Distribution of bounding box areas from {BS} randomly selected images (area <= {AREA_THRESHOLD:.0f} px²)")
axes[0].set_xlabel("Bounding box area [px²]")
axes[0].set_ylabel("Counts")
axes[1].set_title(f"Distribution of bounding box areas from {BS} randomly selected images (area > {AREA_THRESHOLD:.0f} px²)")
axes[1].set_xlabel("Bounding box area [px²]")
axes[1].set_ylabel("Counts")
fig.tight_layout()
plt.show()