In [1]:
# Install OpenCV from PyPI for this notebook (it provides the `cv2` module imported
# in the next cell), then restart the Python process so the package can be imported.
# NOTE(review): no version is pinned here, so a rerun may pick up a newer release.
dbutils.library.installPyPI('opencv-python')
dbutils.library.restartPython()

In [2]:
import csv
import json
import sys
import binascii
import numpy as np
import datetime
import pandas as pd
from collections import Counter
import os, errno
from os import path
import cv2

In [3]:
# Path to the tab-delimited file of images + tags. Each row is read below as
# three columns: a record id, a JSON tag list and a base64-encoded image.
SOURCE_DATA = '/mnt/RAW/FILES/SYNAPSE/POC/MS-TAGGED-DATA/MarsData-1-new2.tsv'
# Data-lake folder for the extracted images. The Python cells visible in this
# notebook do not read it; the shell cell that copies the ZIP hard-codes the
# same folder under /dbfs.
IMAGE_OUTPUT_DIR_LAKE = '/mnt/RAW/FILES/SYNAPSE/POC/UDACITY/PRODUCT_IMAGES_CLASSIFIER/'
# Local-disk working folder the cropped images are written to, one sub-folder
# per item number.
IMAGE_OUTPUT_DIR = '/local_disk0/tmp/PRODUCT_IMAGES_CLASSIFIER/'


In [4]:
%sh
# Remove the archive left behind by a previous run. -f keeps this step quiet and
# successful on a fresh cluster where the archive does not exist yet.
rm -f /local_disk0/tmp/PRODUCT_IMAGES_CLASSIFIER_DATA.ZIP
# List the scratch folder (newest first) to confirm what is currently there.
ls -lth /local_disk0/tmp/

In [5]:
#Remove Existing Output Content
#For Datalake:
#dbutils.fs.rm(IMAGE_OUTPUT_DIR, True)
#dbutils.fs.mkdirs(IMAGE_OUTPUT_DIR)

#For Local Storage:
import shutil

# Start from an empty output folder so images from an earlier run are not mixed
# into this run's results.
if os.path.exists(IMAGE_OUTPUT_DIR):
    shutil.rmtree(IMAGE_OUTPUT_DIR)
# makedirs (rather than mkdir) also creates any missing parent folders, so the
# cell works even when the scratch parent directory has not been created yet.
os.makedirs(IMAGE_OUTPUT_DIR)

In [6]:
# Read the header-less, tab-delimited source file through Spark (no schema
# inference) and collect the result to the driver as a pandas DataFrame.
df_tagged_data = (
    spark.read.format('csv')
    .option('header', 'false')
    .option('inferSchema', 'false')
    .option('delimiter', '\t')
    .load(SOURCE_DATA)
    .toPandas()
)

In [7]:
#Create a Small Slice of the First 100 Rows for Testing
# NOTE(review): the `_10` suffix in the name is misleading -- the slice below keeps 100 rows, not 10.
df_tagged_data_10 = df_tagged_data[:100]

In [8]:
# Tallies used by the extraction loop further down (which also resets them).
tag_counter = {}    # occurrences per full tag class name
item_counter = {}   # occurrences per extracted item number
image_counter = {}  # declared alongside the others; not updated in the cells shown here

In [9]:
#ACTION: NEED TO MAKE PRETTY
def ExtractBBoxFromImg(source_image, BBOX_LEFT, BBOX_TOP, BBOX_WIDTH, BBOX_HEIGHT):
    """Crop a bounding box, given in fractional coordinates, out of an image.

    Args:
        source_image: Array indexed as ``[row, column, ...]``. Only the first
            two entries of ``.shape`` (height, width) are used, so colour
            ``(H, W, C)`` and single-channel ``(H, W)`` arrays both work.
        BBOX_LEFT: Left edge of the box as a fraction of the image width.
        BBOX_TOP: Top edge of the box as a fraction of the image height.
        BBOX_WIDTH: Box width as a fraction of the image width.
        BBOX_HEIGHT: Box height as a fraction of the image height.

    Returns:
        The slice ``source_image[top:bottom, left:right]``. It is empty when
        the box has no overlap with the image.
    """
    height, width = source_image.shape[:2]

    # Convert the fractional box into pixel coordinates.
    left = int(width * BBOX_LEFT)
    top = int(height * BBOX_TOP)
    right = left + int(width * BBOX_WIDTH)
    bottom = top + int(height * BBOX_HEIGHT)

    # Clamp to the image bounds. Without this, a negative coordinate would be
    # treated as a "count from the end" index and silently select the wrong
    # (usually empty) region.
    left = max(0, min(left, width))
    right = max(0, min(right, width))
    top = max(0, min(top, height))
    bottom = max(0, min(bottom, height))

    return source_image[top:bottom, left:right]

def image_resize(image, width = None, height = None, inter = cv2.INTER_AREA):
    """Resize an image to a target width or height, keeping its aspect ratio.

    Args:
        image: Image array; the first two entries of ``image.shape`` are read
            as (height, width).
        width: Target width in pixels. When given, it decides the scale and
            ``height`` is ignored.
        height: Target height in pixels; only used when ``width`` is None.
        inter: Interpolation flag forwarded to ``cv2.resize``.

    Returns:
        The original ``image`` object when neither dimension is requested,
        otherwise the result of ``cv2.resize`` at the computed size.
    """
    src_h, src_w = image.shape[:2]

    # Nothing requested: hand the input back untouched.
    if width is None and height is None:
        return image

    if width is not None:
        # Scale by the requested width and derive the matching height.
        scale = width / float(src_w)
        target_size = (width, int(src_h * scale))
    else:
        # Only a height was requested: derive the matching width.
        scale = height / float(src_h)
        target_size = (int(src_w * scale), height)

    return cv2.resize(image, target_size, interpolation = inter)
  

In [10]:
# Preview the first rows. The loop below reads _c0 as the record id, _c1 as the
# JSON tag list and _c2 as the base64-encoded image.
df_tagged_data.head()

Unnamed: 0,_c0,_c1,_c2
0,1,[],/9j/4RzHRXhpZgAASUkqAAgAAAARABABAgAgAAAA2gAAAC...
1,2,[],/9j/4AAQSkZJRgABAQAASABIAAD/4QBMRXhpZgAATU0AKg...
2,3,[],/9j/4AAQSkZJRgABAQAASABIAAD/4QBYRXhpZgAATU0AKg...
3,4,[],/9j/4AAQSkZJRgABAQAASABIAAD/4QBMRXhpZgAATU0AKg...
4,5,[],/9j/4AAQSkZJRgABAQAASABIAAD/4QBMRXhpZgAATU0AKg...


In [11]:
# Reset the tallies so that re-running this cell starts from a clean slate.
tag_counter = dict()
item_counter = dict()
image_counter = dict()
output_dir = ''
exception_counter = 0

# Tag classes (compared lower-cased) whose regions are not exported as item images.
IGNORED_TAG_CLASSES = ('product', 'skip', 'gap')

for index, row in df_tagged_data.iterrows():
    # Fall back to the DataFrame index so the error handler below always has
    # something to report, even when reading the id column itself fails.
    record_id = index

    try:
        # Parse JSON Tag Data
        record_id = row['_c0']
        tag_data = json.loads(row['_c1'])

        # Decode the base64 image column into an OpenCV image.
        image_byte_data = binascii.a2b_base64(row['_c2'])
        # np.frombuffer replaces the deprecated binary mode of np.fromstring.
        nparr_image_byte_data = np.frombuffer(image_byte_data, np.uint8)
        cv2_image = cv2.imdecode(nparr_image_byte_data, cv2.IMREAD_COLOR)
        if cv2_image is None:
            # imdecode reports undecodable data by returning None, not by raising.
            raise ValueError('Image data could not be decoded')

        for tag in tag_data:
            tag_class = tag["class"]
            tag_rect = tag["rect"]

            if tag_class.lower() in IGNORED_TAG_CLASSES:
                continue

            # Extract Item # from Existing Tag Name: the text between the last
            # '-' and the first '_'.
            item_start = tag_class.rfind('-') + 1
            item_end = tag_class.find('_')
            if item_end == -1:
                # No '_' in the name: -1 used as a slice end would silently
                # drop the last character, so take the rest of the string.
                # NOTE(review): assumes the tail is the item number -- confirm
                # against the tag naming convention.
                item_end = len(tag_class)
            item_number = tag_class[item_start:item_end]

            # Crop the tagged region before touching any counter, so the tallies
            # and file numbering only reflect regions that can be saved.
            facing = ExtractBBoxFromImg(cv2_image, tag_rect[0], tag_rect[1], tag_rect[2], tag_rect[3])
            if facing.size == 0:
                # cv2.imwrite rejects an empty image; letting it raise would
                # abort the remaining tags of this record.
                print('[ERROR] Empty Region for Tag {} on Record: {}'.format(tag_class, record_id))
                continue

            tag_counter[tag_class] = tag_counter.get(tag_class, 0) + 1
            item_counter[item_number] = item_counter.get(item_number, 0) + 1

            #Image Save Path for this Item
            output_dir = os.path.join(IMAGE_OUTPUT_DIR, item_number)

            # Create folder for this item if it's the first time we see it
            if item_counter[item_number] == 1:
                print('Creating: {}'.format(output_dir))
                try:
                    # dbutils.fs.mkdirs(output_dir)
                    os.mkdir(output_dir)
                except OSError as exc:
                    # An already-existing folder is fine; anything else is
                    # reported by the outer handler.
                    if exc.errno != errno.EEXIST:
                        raise

            # Write contents of RECT, numbered by how many images this item has so far.
            image_path = os.path.join(output_dir, str(item_counter[item_number]) + '.jpg')
            if not cv2.imwrite(image_path, facing):
                print('[ERROR] Error Occurred Saving Image: {}'.format(record_id))
                print('[ERROR] Output Directory: {}'.format(output_dir))

    except Exception as ex:
        # Best effort: report the failing record and carry on with the next one.
        print('[ERROR] Exception Occurred on Record: {}'.format(record_id))
        print(ex)
        exception_counter += 1

print('Records with exceptions: {}'.format(exception_counter))

In [12]:
%sh
###COMPRESS ALL FOLDERS & COPY TO DATA LAKE
# Stop at the first failing step. Without this the script's exit status comes
# from its last command only, so a failed zip or cp would go unnoticed.
set -e
# Each %sh cell runs in its own shell process, so a plain cd is enough here.
cd /local_disk0/tmp/PRODUCT_IMAGES_CLASSIFIER/
# Drop any archive from an earlier run so zip builds a fresh file instead of
# updating a stale one in place.
rm -f /local_disk0/tmp/PRODUCT_IMAGES_CLASSIFIER_DATA.ZIP
zip -r /local_disk0/tmp/PRODUCT_IMAGES_CLASSIFIER_DATA.ZIP .
cp /local_disk0/tmp/PRODUCT_IMAGES_CLASSIFIER_DATA.ZIP /dbfs/mnt/RAW/FILES/SYNAPSE/POC/UDACITY/PRODUCT_IMAGES_CLASSIFIER/PRODUCT_IMAGES_CLASSIFIER_DATA.ZIP

In [13]:
# Print every item number with its image count, highest count first.
for item_number, image_count in sorted(
    item_counter.items(),
    key=lambda entry: entry[1],
    reverse=True,
):
    print('{}: {}'.format(item_number, image_count))