# The Toy Shape Dataset

 - A toy shapes dataset: small-resolution images composed of squares, circles and triangles.
 - Strongly inspired by [this implementation](https://github.com/matterport/Mask_RCNN/blob/cbff80f3e3f653a9eeee43d0d383a0385aba546b/samples/shapes/shapes.py).
 - The [config.yaml file](./config.yaml) should allow training on this dataset of 50 images in less than 10 minutes on a regular GPU, reaching a **mAP@[0.5:0.95] of ~70%**.
 
This dataset is ideal for debugging and educational purposes.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from pathlib import Path
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import tqdm

import sys; sys.path.append("../../../")
import maskflow

import utils

# Root folder holding everything related to this dataset (machine-specific absolute path).
root_dir = Path("/home/hadim/.data/Neural_Network/Maskflow/Shapes")

# Folder that receives the generated files; missing parents are created and an
# already existing folder is accepted.
data_dir = root_dir.joinpath("Data")
data_dir.mkdir(parents=True, exist_ok=True)

# Read the configuration from the working directory...
config = maskflow.load_config("config.yaml")

# ...and store a copy of it under the dataset root, next to the data folder.
config_copy_path = root_dir.joinpath("config.yaml")
maskflow.save_config(config, config_copy_path)

## Generate the dataset

In [None]:
# Images are square: both sides come from the same configuration entry.
dataset_config = config["DATASET"]
height = width = dataset_config["IMAGE_SIZE"]
class_names = dataset_config["CLASS_NAMES"]

# Number of images to generate and how many shapes each one may contain.
count = 50
min_n_per_image = 1
max_n_per_image = 4
training_size = 0.9  # From 0 to 1

# Formats forwarded to the feature builder for the image and for the masks.
image_format = "png"
masks_format = "png"

# Randomly pick the image ids that go to the training split; every other id
# ends up in the test split.
train_ids, _ = train_test_split(
    np.arange(count),
    train_size=training_size,
)

# Examples are collected per split before being written to disk.
train_features_example, test_features_example = [], []

train_file_path = data_dir / "train.tfrecords"
test_file_path = data_dir / "test.tfrecords"

# `x in ndarray` scans the whole array on every test; a set gives the same
# answer with a constant-time lookup.
train_id_set = set(np.asarray(train_ids).tolist())

for image_id in tqdm.trange(count):
    # Generate image specification
    bg_color, shapes = utils.random_image(height, width, min_n_per_image, max_n_per_image, class_names)

    # Generate the image
    image = utils.generate_image(bg_color, height, width, shapes)

    # Generate the masks
    masks, label_ids = utils.generate_mask(bg_color, height, width, shapes, class_names)

    # Get a list of class from the object label ids.
    label_names = [class_names[class_id].encode("utf8") for class_id in label_ids]

    # Get bounding boxes from masks.
    bboxes = maskflow.bbox.from_masks(masks)

    # Derive the extension from `image_format` so the stored filename cannot
    # drift away from the declared format.
    filename = f"toy_{image_id:04d}.{image_format}"

    features_dict = maskflow.dataset.build_features_dict(
        image=image,
        image_id=image_id,
        filename=filename,
        image_format=image_format,
        bboxes=bboxes,
        masks=masks,
        label_ids=label_ids,
        label_names=label_names,
        masks_format=masks_format,
    )

    example = tf.train.Example(features=tf.train.Features(feature=features_dict))

    # Route the example to the split its id was assigned to.
    if image_id in train_id_set:
        train_features_example.append(example)
    else:
        test_features_example.append(example)
        
# Serialize each split to its own TFRecord file: training first, then test.
for records_path, split_examples in (
    (train_file_path, train_features_example),
    (test_file_path, test_features_example),
):
    with tf.io.TFRecordWriter(str(records_path)) as writer:
        for example in split_examples:
            writer.write(example.SerializeToString())

## Visualize some images with their masks

### TODO

- allow images without bboxes
- convert to bboxes
- pad datum to be able to batch
- make plot

In [307]:
# Parse the training TFRecord file written above.
train_dataset = maskflow.dataset.parse(train_file_path)
# Iterate over the whole dataset once; the loop body is empty on purpose, it only
# leaves `datum` bound to the last element so later cells can inspect it.
for datum in train_dataset:
    pass

In [308]:
def _fn(datum):
    """Return a reduced copy of `datum` holding only the label entries.

    Only 'label_ids' and 'label_names' are forwarded, in that order; every
    other entry of `datum` is dropped.

    NOTE: 'bboxes' and 'masks' are deliberately not forwarded yet.
    """
    return {
        'label_ids': datum['label_ids'],
        'label_names': datum['label_names'],
    }
    
# Target shape of each component inside a batch; -1 marks a dimension whose
# size is decided per batch by padding.
padded_shapes = {
    # 'bboxes': [-1, None, None],
    # 'masks': [-1, None, None],
    'label_ids': [-1],
    'label_names': [-1],
}

# Filler written into the padded positions of each component.
padding_values = {
    # 'bboxes': np.nan,
    # 'masks': np.nan,
    'label_ids': tf.constant(-1, dtype=tf.int64),
    'label_names': "",
}

# Keep only the label entries, then group the elements 7 at a time with padding.
test = train_dataset.map(_fn).padded_batch(
    7,
    padded_shapes=padded_shapes,
    padding_values=padding_values,
)

# Run through every batch; `datum` ends up bound to the last one.
for datum in test:
    pass

datum

{'label_ids': <tf.Tensor: id=31265, shape=(3, 4), dtype=int64, numpy=
 array([[ 2,  2,  1, -1],
        [ 1,  1, -1, -1],
        [ 0,  2,  1,  1]])>,
 'label_names': <tf.Tensor: id=31266, shape=(3, 4), dtype=string, numpy=
 array([[b'square', b'square', b'triangle', b''],
        [b'triangle', b'triangle', b'', b''],
        [b'circle', b'square', b'triangle', b'triangle']], dtype=object)>}

In [283]:
# Echo whatever `datum` currently holds (left bound by a previously executed loop).
datum

{'label_ids': <tf.Tensor: id=26964, shape=(5, 4), dtype=int64, numpy=
 array([[1, 0, 0, 0],
        [2, 2, 2, 0],
        [1, 0, 0, 2],
        [1, 1, 1, 0],
        [2, 0, 0, 0]])>}

In [246]:
# Plain `batch` (no padding) on the first 10 parsed examples. The recorded
# output below is an InvalidArgumentError: a component has a different shape
# from one element to the next, so the elements cannot be stacked as they are.
for datum in train_dataset.take(10).batch(2):
    pass

InvalidArgumentError: Cannot batch tensors with different shapes in component 0. First element had shape [4,4] and element 1 had shape [1,4]. [Op:IteratorGetNextSync]

In [None]:
import matplotlib.pyplot as plt
# Collapse the masks along axis 0 with a max and display the result as a single image.
# NOTE(review): needs `datum` to carry a 'masks' entry, i.e. an element of the parsed
# dataset rather than a batch produced after `_fn` — confirm the cell execution order.
plt.imshow(datum['masks'].numpy().max(axis=0))

In [None]:
# Display the 'image' entry of `datum`.
# NOTE(review): needs `datum` to carry an 'image' entry and `plt` to be imported by
# an earlier cell — confirm the cell execution order.
plt.imshow(datum['image'].numpy())

In [None]:
import itertools

# Number of batches to load
n = 1

# Load some data
data_loader = maskflow.dataset.get_data_loader(config, data_dir, is_train=True)
# Take the first `n` batches from a single pass over the loader. The previous
# form built a fresh iterator for every batch (so it never advanced past the
# start of the loader) and relied on a `.next()` method, which is not part of
# the Python 3 iterator protocol (`next(it)` / `__next__`).
some_data = list(itertools.islice(data_loader, n))

# Retrieve category's names
# NOTE(review): `categories` is not used below — confirm whether the display
# call is supposed to receive it.
categories = data_loader.dataset.coco.cats

for batch_image, batch_target, batch_idx in some_data:
    # NOTE(review): the unpacked batch is not passed to the display call, which
    # receives `train_dataset` on every iteration — confirm this is intended.
    maskflow.viz.batch_display_top_masks(train_dataset, basesize=14, limit=3, cmap="PuBu_r")