<a href="https://colab.research.google.com/github/ldeluigi/supermarket-2077-product-vision/blob/master/StoreProduct_DatasetGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminary Operations

In [None]:
# Remove the `sample_data` folder, then download the dataset archive from
# Google Drive (by file id) to all.zip and unpack it into the working
# directory (-o: overwrite existing files, -q: quiet).
!rm -rf sample_data
!gdown --id 1fDr4g4wbnSRkuCYyS3wpuJS7Ax22bVB_ -O all.zip
!unzip -oq all.zip

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Install a pinned OpenCV (contrib build) version so the cv2 API used below is fixed.
!pip install opencv-contrib-python==4.4.0.44

In [None]:
import scipy.io
import os
from pathlib import Path
import re
import cv2
import matplotlib.pyplot as plt
import numpy as np
import math
import itertools
import shutil
from tqdm.notebook import tqdm
from keras.preprocessing.image import ImageDataGenerator

# Data Visualization Utilities

In [None]:
def show_image(img):
  """Draw `img` through pyplot with the axes hidden."""
  plt.axis('off')
  plt.imshow(img)

def show_grayscale_image(img):
  """Display a single-channel image by stacking it into three identical channels."""
  three_channel = cv2.merge([img] * 3)
  show_image(three_channel)

def plot_grid(images, columns, show_axis=False, labels=None):
  """Plot `images` in a grid of `columns` columns, one titled subplot each.

  Args:
    images: sequence of numpy image arrays. Nothing is drawn when it is empty.
    columns: number of grid columns. Nothing is drawn when it is <= 0.
    show_axis: when False (default) the axes of every subplot are hidden.
    labels: optional sequence of subplot titles, cycled when shorter than
      `images`. When None or empty, each subplot is titled with its 1-based
      position in `images`.
  """
  if len(images) == 0 or columns <= 0:
    return
  rows = math.ceil(len(images) / columns)
  height = 1 + rows * 2
  width = columns * 4
  # The figure resolution follows the first image's size; keep it at least 1
  # so very small images do not produce an invalid dpi of 0.
  dpi = max(1, max(images[0].shape[0], images[0].shape[1]) // 2)
  fig = plt.figure(figsize=(width, height), dpi=dpi)
  fig.subplots_adjust(hspace=0.4)
  has_labels = labels is not None and len(labels) > 0
  for index, img in enumerate(images, start=1):
    # `dtype.str` is a type code such as '<f4' and never contains 'float', so
    # check the dtype kind instead. Float images are treated as being in
    # [0, 1]; clipping avoids wrap-around when converting to uint8.
    if np.issubdtype(img.dtype, np.floating):
      img = (np.clip(img, 0, 1) * 255).astype('uint8')
    sp = fig.add_subplot(rows, columns, index)
    if not show_axis:
      sp.axis('off')
    sp.imshow(img)
    if has_labels:
      sp.set_title(labels[(index - 1) % len(labels)], fontsize=10)
    else:
      sp.set_title(index, fontsize=10)

def dataset_plot_grid(indexes, columns, dataset, draw_item):
  """Lay out selected dataset rows on a fixed-size figure grid.

  For every entry of `indexes` a subplot is created (the grid has `columns`
  columns) and `draw_item(dataset[entry], subplot)` is called to render it.
  """
  figure = plt.figure(figsize=(12, 6), dpi=120)
  for position, dataset_index in enumerate(indexes, start=1):
    row_count = math.ceil(len(indexes) / columns)
    subplot = figure.add_subplot(row_count, columns, position)
    draw_item(dataset[dataset_index], subplot)

# Raw image loading

## Utilities to read raw data from disk

In [None]:
# Directory (relative to the working directory) that holds
# TrainingClassesIndex.mat and the per-class training image folders.
training_dirname = 'Training'

def create_class_label(class_index, class_name):
  """Return the label for a class; currently just the class name itself.

  `class_index` is accepted but not used.
  """
  return class_name

def read_classes():
  """Read the class index stored in `TrainingClassesIndex.mat`.

  Returns:
    A pair of dicts keyed by the 1-based class index: the first maps to the
    label built by `create_class_label`, the second to the raw class name
    as stored in the file.
  """
  index_path = os.path.join(training_dirname, 'TrainingClassesIndex.mat')
  class_cells = scipy.io.loadmat(index_path)['classes'][0]
  label_by_index = {}
  raw_by_index = {}
  for class_index, cell in enumerate(class_cells, start=1):
    raw_name = cell[0]
    raw_by_index[class_index] = raw_name
    label_by_index[class_index] = create_class_label(class_index, raw_name)
  return label_by_index, raw_by_index

def read_training_data(classes):
  """Load every training image of every class as an RGB array.

  Args:
    classes: mapping from class index to the class directory path, relative
      to `training_dirname`.

  Returns:
    (images, class_indices): two parallel lists, one entry per file found in
    the class directories, in directory-listing order.
  """
  images, class_indices = [], []
  for class_index, class_name in classes.items():
    class_dir = os.path.join(training_dirname, class_name)
    for entry in os.listdir(os.fsencode(class_dir)):
      image_path = os.path.join(class_dir, os.fsdecode(entry))
      # The image is converted from OpenCV's BGR order to RGB.
      bgr = cv2.imread(image_path)
      images.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
      class_indices.append(class_index)
  return images, class_indices

def read_store_data():
  """Load the annotated shelf images of the five store directories.

  For each of `store1` ... `store5`, every `*.mat` file in the `annotation`
  sub-directory is read together with the image `<number>.jpg` from the
  `images` sub-directory, where `<number>` is taken from the annotation
  file name.

  Returns:
    (images, class_indices, bounding_boxes, labels): four parallel lists
    with one entry per annotated image:
      images: RGB image, smoothed with a 3x3 Gaussian blur.
      class_indices: the most frequent value among the image's annotated
        classes.
      bounding_boxes: list of boxes, every coordinate clamped to [0, 1].
      labels: list of product labels converted to strings.

  Raises:
    ValueError: if an annotation file name does not match the expected
      `anno.<number>.mat` pattern.
    FileNotFoundError: if the image belonging to an annotation cannot be read.
  """
  # `import scipy.io` alone does not bring `mode` into scope.
  from scipy.stats import mode

  def clamp(x):
    return max(0, min(1, x))

  def clamp_box(box):
    return [clamp(coordinate) for coordinate in box]

  images = []
  class_indices = []
  bounding_boxes = []
  labels = []
  for i in range(5):
    storename = 'store' + str(i + 1)
    dirname_anno = os.path.join(storename, 'annotation')
    dirname_images = os.path.join(storename, 'images')

    for file in os.listdir(os.fsencode(dirname_anno)):
      filename = os.fsdecode(file)
      if not filename.endswith(".mat"):
        continue

      match = re.search(r'^anno.(\d+).mat$', filename)
      if match is None:
        raise ValueError(
            f'Unexpected annotation file name: {os.path.join(dirname_anno, filename)}')
      image_path = os.path.join(dirname_images, match.group(1) + '.jpg')
      img = cv2.imread(image_path)
      if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f'Could not read store image: {image_path}')
      img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

      mat = scipy.io.loadmat(os.path.join(dirname_anno, filename))
      img_annotation = mat['annotation'][0, 0]
      bounding_boxes.append([clamp_box(entry[0]) for entry in img_annotation[0][0]])
      labels.append([str(entry[0][0][0]) for entry in img_annotation[1][0]])

      classes_in_image = img_annotation[2][0]
      # Depending on the SciPy version the mode is returned as an array or as
      # a scalar; ravel() makes both indexable the same way.
      most_frequent_class = np.ravel(mode(classes_in_image)[0])[0]

      img_rgb = cv2.GaussianBlur(img_rgb, (3, 3), 0)
      images.append(img_rgb)
      class_indices.append(most_frequent_class)
  return images, class_indices, bounding_boxes, labels

## Prepare products class dictionary

In [None]:
# Class dictionaries keyed by 1-based class index:
# `classes` holds the labels, `raw_classes` the raw class names from the index file.
classes, raw_classes = read_classes()

def class_name(class_index):
  """Look up the label of `class_index` in `classes`; negative indices give None."""
  if class_index >= 0:
    return classes[class_index]
  return None

## Load training raw images

In [None]:
# Load all training product images with their class index (parallel lists).
# Positions in these lists are used as product indices by the export step below.
products, products_classes = read_training_data(raw_classes)

## Products visualization

In [None]:
def show_products_with_class(indexes, columns, dataset):
  """Plot the selected `(image, class_index)` rows of `dataset` in a grid.

  Each subplot shows the image without axes, titled with its class name.
  """
  def draw_product(row, sp):
    image, product_class = row
    plt.axis('off')
    plt.imshow(image)
    sp.set_title(class_name(product_class), fontsize=10)

  dataset_plot_grid(indexes, columns, dataset, draw_product)

# Preview six randomly picked training products together with their class name.
random_indexes = np.random.randint(0, len(products), 6)
products_with_classes = list(zip(products, products_classes))
show_products_with_class(random_indexes, 3, products_with_classes)

# Raw image preprocessing

## Background removal

In [None]:
# code taken from https://www.kaggle.com/vadbeg/opencv-background-removal and modified

def remove_background(img, threshold, use_mask=False):
  """Crop an RGB image to its largest region that is darker than `threshold`.

  Pixels whose gray level is above `threshold` are treated as background.
  The remaining pixels are morphologically closed, the external contour with
  the largest area is selected, and the image is cropped to that contour's
  bounding rectangle.

  Args:
    img: RGB image (3 channels).
    threshold: gray level above which a pixel counts as background.
    use_mask: when True, pixels outside the mask are set to 0 and the mask is
      appended as a fourth (alpha) channel.

  Returns:
    The cropped RGB image, or the cropped RGBA image when `use_mask` is True.
    If no contour is found (nothing is darker than `threshold`), the image is
    returned uncropped, with a fully opaque alpha channel when `use_mask` is
    True.
  """
  gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
  _, threshed = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY_INV)

  # The closing kernel is about 2% of the longest side; keep it at least 1 so
  # small images still get a valid structuring element.
  kernel_size = max(1, round(max(img.shape[0], img.shape[1]) * 0.02))
  kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
  morphed = cv2.morphologyEx(threshed, cv2.MORPH_CLOSE, kernel)

  # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x and
  # (image, contours, hierarchy) in 3.x; [-2] selects the contours in all.
  cnts = cv2.findContours(morphed,
                          cv2.RETR_EXTERNAL,
                          cv2.CHAIN_APPROX_SIMPLE)[-2]

  if len(cnts) == 0:
    # Nothing to crop to: hand back the whole image.
    if not use_mask:
      return img
    opaque = np.full(img.shape[:2], 255, dtype=img.dtype)
    return cv2.merge(list(cv2.split(img)) + [opaque])

  largest = sorted(cnts, key=cv2.contourArea)[-1]
  x, y, w, h = cv2.boundingRect(largest)

  if not use_mask:
    return img[y: y + h, x: x + w]

  # The contour is filled on top of the thresholded image, so the mask is the
  # thresholded foreground plus the filled interior of the largest contour.
  mask = cv2.drawContours(threshed, [largest], 0, [255], cv2.FILLED)
  masked_data = cv2.bitwise_and(img, img, mask=mask)
  r, g, b = cv2.split(masked_data[y: y + h, x: x + w])
  alpha = mask[y: y + h, x: x + w]
  return cv2.merge([r, g, b, alpha])

# Spot-check the background removal on one training product: original next to the crop.
n = 778
print(f'Index: {n}')
print(f'Class: {class_name(products_classes[n])}')
before_and_after = [products[n], remove_background(products[n], 250)]
plot_grid(before_and_after, 2, show_axis=True)

## Dataset preparation



### Image cleaning

In [None]:
def clean_image(img):
  """Clean a raw product image by cropping away its background.

  Delegates to `remove_background` with a fixed threshold of 250.
  """
  background_threshold = 250
  return remove_background(img, background_threshold)

### Prepare dataset

In [None]:
# Run the cleaning step over every training product, preserving list order so
# that positions still line up with `products_classes`.
all_products_images = [
  clean_image(image) for image, _ in zip(products, products_classes)
]

print(len(all_products_images))

# Dataset reduction

In [None]:
# Load the store shelf images with, per image, the dominant class index,
# the bounding boxes and the product labels (four parallel lists).
store_images_raw, store_classes_raw, store_bounding_boxes, store_product_labels = read_store_data()

In [None]:
import csv, json

def class_should_be_kept(class_index):
  """Tell whether a class belongs to the reduced dataset.

  A class is kept when its name (as returned by `class_name`) starts with one
  of the whitelisted category prefixes.

  Args:
    class_index: index of the class to check.

  Returns:
    True if the class is kept; False otherwise, including when `class_name`
    returns None for the index.
  """
  valid_prefixes = (
    'Background',
    'Food/Bakery',
    'Food/Biscuits',
    'Food/Cereals',
    'Food/Coffee',
    'Food/Drinks',
    'Food/Jars-Cans',
    'Food/Pasta',
    'Food/Tea',
  )
  name = class_name(class_index)
  # class_name() yields None for negative indices; such classes are never kept.
  return name is not None and name.startswith(valid_prefixes)

# Export the reduced dataset:
#   out/Training/<class name>/<n>.jpg  cleaned training images of the kept classes
#   out/Store/<n>.jpg                  store images whose products are all of kept classes
#   out/Store/store.csv                one row per annotated product in those store images
output_dir = 'out'
training_dir = os.path.join(output_dir, 'Training')
store_dir = os.path.join(output_dir, 'Store')

# Start from an empty output tree so a rerun never mixes in stale files.
shutil.rmtree(output_dir, ignore_errors=True)
os.makedirs(training_dir, exist_ok=True)
os.makedirs(store_dir, exist_ok=True)

image_index = 0
new_image_classes = []
# Maps a position in `all_products_images` to the file index of its exported image.
product_index_map = {}
for index, (image, class_index) in enumerate(zip(all_products_images, products_classes)):
  if class_should_be_kept(class_index):
    name = class_name(class_index)
    class_path = os.path.join(training_dir, name)
    os.makedirs(class_path, exist_ok=True)
    image_name = os.path.join(class_path, f'{image_index}.jpg')
    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(image_name, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)):
      raise IOError(f'Could not write {image_name}')
    product_index_map[index] = image_index
    image_index += 1
    new_image_classes.append(class_index)

image_index = 0

# 'w' rather than append mode: the file is always written from scratch, so the
# header can never be duplicated.
with open(os.path.join(store_dir, 'store.csv'), 'w', newline='') as csvfile:
  fieldnames = ['image_index', 'bounding_box', 'product_label']
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  writer.writeheader()
  for image, bboxes, labels in zip(store_images_raw, store_bounding_boxes, store_product_labels):
    # Store labels are positions in the training product list; keep an image
    # only if every product it shows belongs to a kept class.
    if all(class_should_be_kept(products_classes[int(label)]) for label in labels):
      image_name = os.path.join(store_dir, f'{image_index}.jpg')
      if not cv2.imwrite(image_name, cv2.cvtColor(image, cv2.COLOR_RGB2BGR)):
        raise IOError(f'Could not write {image_name}')
      for bbox, label in zip(bboxes, labels):
        # Re-map the product position to the index of its exported training image.
        new_index = product_index_map[int(label)]
        writer.writerow({'image_index' : image_index, 'bounding_box': json.dumps(bbox), 'product_label' : new_index})
      image_index += 1

In [None]:
# Delete any previous archive, then zip the exported Training and Store folders.
!rm -f products_classifier_data.zip && zip -r products_classifier_data.zip out/Training out/Store

In [None]:
# Colab-specific: offer the generated archive as a download.
from google.colab import files
files.download('products_classifier_data.zip') 