The purpose of this notebook is to create annotation files for each image in your training set. It creates a JSON file for each image and uploads it to your S3 bucket for future processing.

In [None]:
import sagemaker
from sagemaker import get_execution_role
import os
import urllib.request
import pandas as pd 
from PIL import Image

from sagemaker.amazon.amazon_estimator import get_image_uri

# Execution role reported by the SageMaker SDK for this notebook, echoed for reference.
role = get_execution_role()
role_message = 'Role: ' + role
print(role_message)

# Session object; its region is used below to pick the algorithm image.
sess = sagemaker.Session()

# Resolve the container image for the built-in 'object-detection' algorithm in the session's region.
# NOTE(review): get_image_uri looks like the SageMaker SDK v1 API; newer SDKs expose
# sagemaker.image_uris.retrieve instead — confirm which SDK version this notebook runs on.
training_image = get_image_uri(
    sess.boto_region_name,
    'object-detection',
    repo_version="latest",
)
image_message = 'Training image: ' + training_image
print(image_message)

Create some functions to help us interact with S3.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import boto3
import io

def get_S3_object(object_name, bucket_name='mcwhirter-airbus-ship-detection-data'):
    """Fetch an object from S3 and return the ``'Body'`` entry of the ``get()`` response.

    Parameters
    ----------
    object_name : str
        Key of the object inside the bucket.
    bucket_name : str, optional
        Bucket to read from. Defaults to the bucket this notebook was written against,
        so existing calls behave exactly as before.

    Returns
    -------
    The value stored under ``'Body'`` in the response of ``Object.get()``.
    """
    # Named s3_object (not `object`) so the builtin is not shadowed.
    s3_object = boto3.resource('s3').Bucket(bucket_name).Object(object_name)
    return s3_object.get()['Body']

def get_S3_image(image_name):
    """Return the S3 body of an image stored under the ``training/`` prefix.

    ``image_name`` is appended to ``'training/'`` to form the object key, which is
    then fetched through ``get_S3_object``.
    """
    image_key = 'training/' + image_name
    return get_S3_object(image_key)

def put_S3_file(file_name, file, bucket_name='mcwhirter-airbus-ship-detection-data'):
    """Upload ``file`` to S3 under the ``trainimages_annotation/`` prefix.

    Parameters
    ----------
    file_name : str
        Name of the object; it is appended to ``'trainimages_annotation/'`` to form the key.
    file
        Content passed as ``Body`` to ``Object.put`` (this notebook passes a JSON string).
    bucket_name : str, optional
        Bucket to write to. Defaults to the bucket this notebook was written against,
        so existing calls behave exactly as before.

    Returns
    -------
    The S3 ``Object`` resource that was written.
    """
    # Named s3_object (not `object`) so the builtin is not shadowed.
    s3_object = boto3.resource('s3').Object(bucket_name, 'trainimages_annotation/' + file_name)
    s3_object.put(Body=file)
    return s3_object

In [None]:
# Read the bounding-box table from S3 and give its two columns fixed names.
mapping_csv = pd.read_csv(get_S3_object('bbox_dictionary.csv'))
mapping_csv.columns = ['file', 'bbox']

# Map each file name to its (still unparsed) bounding-box cell; later rows win on duplicate names.
bbox_dict = dict(zip(mapping_csv['file'], mapping_csv['bbox']))

# Quick sanity check: show the first key that was loaded.
print('First image: ' + list(bbox_dict)[0])

The bounding boxes read from the CSV are stored as strings, so they need some cleaning. The cell below parses them into lists of integers and then builds and uploads the JSON annotation files.

In [None]:
import ast
import json


def _parse_bboxes(raw):
    """Turn one CSV bounding-box cell into a list of ``[a, b, c - a, d - b]`` lists.

    ``raw`` is expected to be the text form of a list of 4-tuples of integers, e.g.
    ``"[(1, 2, 3, 4), (5, 6, 7, 8)]"``. Each ``(a, b, c, d)`` tuple becomes
    ``[a, b, c - a, d - b]``: the first two values are kept and the last two are
    replaced by their distance from the first two.
    (Presumably the tuples are two opposite corners of a box — confirm the axis
    order against whatever produced the CSV.)

    Returns an empty list when the cell is not a string (e.g. a missing value),
    is blank, or holds an empty list. Raises ValueError when the text cannot be
    parsed into a list/tuple.
    """
    if not isinstance(raw, str) or not raw.strip():
        return []
    try:
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError) as exc:
        raise ValueError('Cannot parse bounding boxes: {!r}'.format(raw)) from exc
    if not isinstance(parsed, (list, tuple)):
        raise ValueError('Cannot parse bounding boxes: {!r}'.format(raw))
    # A single box written without the enclosing list parses as a flat tuple of numbers.
    if parsed and not isinstance(parsed[0], (list, tuple)):
        parsed = [parsed]
    boxes = []
    for box in parsed:
        coords = [int(value) for value in box]
        boxes.append([coords[0], coords[1], coords[2] - coords[0], coords[3] - coords[1]])
    return boxes


# Parse every CSV cell. Entries that yield no boxes are left out, so no annotation
# is produced for them further down (the previous string-splitting code raised on
# such entries instead).
bbox_dict_clean = dict()
for key, raw_boxes in bbox_dict.items():
    boxes = _parse_bboxes(raw_boxes)
    if boxes:
        bbox_dict_clean[key] = boxes

# Build one annotation document per image in bbox_dict_clean and upload it as JSON
# through put_S3_file.

# Size written into every annotation file (the same fixed values for all images).
IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_DEPTH = 768, 768, 3
# NOTE(review): SageMaker's built-in object detection is believed to expect class
# indices that start at 0 — confirm that class_id 1 is intended for this single class.
SHIP_CLASS_ID = 1
# Set to a small integer (e.g. 5) to upload only the first few annotation files while
# debugging; None processes every image, which is what this notebook is meant to do.
DEBUG_LIMIT = None

for file in list(bbox_dict_clean)[:DEBUG_LIMIT]:
    annotation_doc = {
        "file": file,
        "image_size": [{"width": IMAGE_WIDTH, "height": IMAGE_HEIGHT, "depth": IMAGE_DEPTH}],
        # Each cleaned box is a 4-element list written out as top, left, height, width
        # (in that positional order).
        "annotations": [
            {"class_id": SHIP_CLASS_ID, "top": box[0], "left": box[1], "width": box[3], "height": box[2]}
            for box in bbox_dict_clean[file]
        ],
        "categories": [{"class_id": SHIP_CLASS_ID, "name": "ship"}],
    }
    # The annotation is named after the image: everything before '.jpg' plus '.json'.
    jsonFile = file.split('.jpg')[0] + '.json'
    put_S3_file(jsonFile, json.dumps(annotation_doc))