## Review Dataset and Build Image loading function

In [1]:
import zipfile

def unzip_data(filename: str, data_dir: str = "data") -> None:
    """
    Extracts every member of a zip archive into a target directory.

    Args:
        filename (str): path to the zip archive to extract.
        data_dir (str): directory the archive contents are extracted into.
            A relative path (such as the default "data") is resolved
            against the current working directory.
    """
    # The context manager closes the archive even if extraction raises.
    with zipfile.ZipFile(filename, "r") as zip_ref:
        zip_ref.extractall(data_dir)

In [2]:
# Extract the dataset archive into the default "data" directory
# (relative to the current working directory).
unzip_data(filename="/content/zipped_data.zip")

In [3]:
from pathlib import Path
# Display the working directory: the relative "data" directory that
# unzip_data extracts into by default is resolved against it.
Path.cwd()

PosixPath('/content')

### Import TF and Deps

In [4]:
import tensorflow as tf
import json
import numpy as np
import matplotlib.pyplot as plt

### Load Images into TF data pipeline

In [5]:
# Dataset of file paths matching the glob; shuffle=False keeps the listing
# order deterministic.
image_glob = '/content/data/data/images/*.jpg'
images = tf.data.Dataset.list_files(image_glob, shuffle=False)

In [6]:
def load_img(x):
  """
  Reads a JPEG file and decodes it into an image tensor.

  Args:
      x: path of the image file (a string, or a scalar string tensor when
         used as a `tf.data.Dataset.map` function).

  Returns:
      The decoded image as a uint8 tensor of rank 3 (height, width, channels).
  """
  byte_img = tf.io.read_file(x)
  # The pipeline only lists *.jpg files, so use the JPEG decoder directly.
  # Unlike the format-generic tf.io.decode_image, it always yields a rank-3
  # tensor with a known static rank, which shape-dependent ops applied later
  # inside a tf.data pipeline (resize, batch, ...) rely on.
  img = tf.io.decode_jpeg(byte_img)
  return img

In [8]:
# Replace each file path with its decoded image.
# NOTE: this rebinds `images`, so re-running this cell without re-running the
# listing cell would feed decoded images (not paths) into load_img.
images = images.map(map_func=load_img)

In [9]:
# Peek at the first decoded image as a NumPy array.
next(images.as_numpy_iterator())

array([[[52, 50, 53],
        [49, 47, 50],
        [47, 45, 48],
        ...,
        [21, 22, 26],
        [19, 23, 26],
        [18, 22, 25]],

       [[51, 49, 52],
        [49, 47, 50],
        [47, 45, 48],
        ...,
        [24, 25, 29],
        [24, 25, 29],
        [22, 26, 29]],

       [[51, 49, 52],
        [49, 47, 50],
        [47, 45, 48],
        ...,
        [24, 25, 29],
        [23, 24, 28],
        [22, 23, 27]],

       ...,

       [[31, 26, 32],
        [38, 33, 39],
        [42, 37, 43],
        ...,
        [21, 17, 16],
        [22, 18, 17],
        [22, 18, 17]],

       [[29, 19, 27],
        [37, 27, 35],
        [42, 32, 40],
        ...,
        [18, 18, 18],
        [19, 19, 19],
        [19, 19, 19]],

       [[26, 15, 23],
        [35, 24, 32],
        [41, 30, 38],
        ...,
        [17, 19, 18],
        [18, 20, 19],
        [18, 20, 19]]], dtype=uint8)

### View images using `matplotlib`

In [13]:
# Group the images four at a time and expose the batches as NumPy arrays.
batched_images = images.batch(4)
image_generator = batched_images.as_numpy_iterator()

In [16]:
# Pull the next batch of four images from the iterator.
to_plot_images = next(image_generator)

In [18]:
# Uncomment the code below to preview the batch of four images side by side

# fig, ax = plt.subplots(ncols=4, figsize=(20, 20))

# for idx, image in enumerate(to_plot_images):
#   ax[idx].imshow(image)

# plt.show()

## Partition data and move files

In [19]:
!mkdir /content/data/train
!mkdir /content/data/train/images
!mkdir /content/data/train/labels

!mkdir /content/data/val
!mkdir /content/data/val/images
!mkdir /content/data/val/labels

!mkdir /content/data/test
!mkdir /content/data/test/images
!mkdir /content/data/test/labels

In [26]:
import random
import shutil
import os
from pathlib import Path
random.seed(42)

N_VAL = 13
N_TEST = 14

# Source folders of the extracted dataset and the root that holds the
# train/val/test folders. Absolute paths are used (as in the other cells)
# so the split does not depend on the current working directory.
base_path = '/content/data/data/images'
labels_path = '/content/data/data/labels'
split_root = Path('/content/data')

# os.listdir returns entries in arbitrary order; sorting makes the seeded
# shuffle below reproducible.
image_fileset = sorted(os.listdir(base_path))
labels_fileset = sorted(os.listdir(labels_path))

if len(image_fileset) < N_VAL + N_TEST:
  raise ValueError(
      f"Need at least {N_VAL + N_TEST} images in {base_path} to build the "
      f"val/test splits, found {len(image_fileset)} "
      "(were the files already moved by a previous run?)"
  )

# Shuffle the images once and slice the result, so the three splits are
# disjoint by construction.
shuffled_imgs = random.sample(image_fileset, len(image_fileset))
val_imgs = shuffled_imgs[:N_VAL]
test_imgs = shuffled_imgs[N_VAL:N_VAL + N_TEST]
train_imgs = shuffled_imgs[N_VAL + N_TEST:]

available_labels = set(labels_fileset)


def _labels_for(image_names):
  """Return the existing label files that belong to the given images.

  Labels are matched to images by file stem instead of by list position, so
  every label lands in the same split as its image.
  NOTE(review): assumes labels are named '<image stem>.json' - confirm
  against the dataset. Images without a label file get no entry.
  """
  candidates = (Path(name).with_suffix('.json').name for name in image_names)
  return [name for name in candidates if name in available_labels]


def _move_files(file_names, kind, split):
  """Move files from data/data/<kind> into data/<split>/<kind>."""
  for file_name in file_names:
    prev_file_path = split_root / 'data' / kind / file_name
    new_file_path = split_root / split / kind / file_name
    shutil.move(str(prev_file_path), str(new_file_path))


train_labels = _labels_for(train_imgs)
val_labels = _labels_for(val_imgs)
test_labels = _labels_for(test_imgs)

for split_name, split_imgs, split_labels in (
    ('train', train_imgs, train_labels),
    ('val', val_imgs, val_labels),
    ('test', test_imgs, test_labels),
):
  _move_files(split_imgs, 'images', split_name)
  _move_files(split_labels, 'labels', split_name)

## Apply data augmentation on images and labels using `albumentations`

In [27]:
import albumentations as alb

In [None]:
# Augmentation pipeline applied jointly to an image and its bounding boxes.
# Boxes are passed in the 'albumentations' format and their class ids travel
# in the `class_labels` field.
augmentor = alb.Compose([
    alb.RandomCrop(width=450, height=450),
    alb.HorizontalFlip(p=0.5),
    # Fixed transform name (was misspelled "RandomBrightnessConstrast",
    # which raises AttributeError).
    alb.RandomBrightnessContrast(p=0.2),
    alb.RandomGamma(p=0.2),
    alb.RGBShift(p=0.2),
    # Pass the probability by keyword: depending on the albumentations
    # version the first positional parameter may not be `p`.
    alb.VerticalFlip(p=0.5),
], bbox_params=alb.BboxParams(format='albumentations', label_fields=['class_labels']))

### Testing the augmentation pipeline

In [30]:
import cv2
img_path = '/content/data/train/images/2c0b4709-8133-11ed-a4ff-6018953ccf15.jpg'
img = cv2.imread(img_path)
# cv2.imread does not raise on a missing or unreadable file; it returns None,
# which would only surface later as an obscure error. Fail here instead.
if img is None:
  raise FileNotFoundError(f"Could not read image: {img_path}")

In [33]:
import json
# Load the annotation that belongs to the image read in the previous cell
# (2c0b4709-8133-11ed-a4ff-6018953ccf15.jpg). The label file must share the
# image's stem, otherwise the boxes describe a different picture.
with open('/content/data/train/labels/2c0b4709-8133-11ed-a4ff-6018953ccf15.json', 'r') as f:
  label = json.load(f)

AttributeError: ignored