# Data notebook
Preparing data for training model. Initial ideas for working with data in other parts of project.

## Imports, drive connection

In [1]:
from google.colab import drive

drive.mount('content/')

Mounted at content/


In [2]:
import json
import os
import zipfile
import pandas as pd
import skimage.io as io
import matplotlib.pyplot as plt
import numpy as np
from skimage.transform import resize
import cv2
import shutil

In [3]:
DATA_PATH = os.path.join('content', 'MyDrive', 'coco2017-annotations')

In [4]:
if not os.path.isdir(os.path.join(DATA_PATH, 'annotations')):
  with zipfile.ZipFile(os.path.join(DATA_PATH, 'annotations_trainval2017.zip'), 'r') as data:
    data.extractall(DATA_PATH)

## Upload annotations
Meaning of content in annottaions: https://www.immersivelimit.com/tutorials/create-coco-annotations-from-scratch/#coco-dataset-format

In [5]:
anno_train_path = os.path.join(DATA_PATH, 'annotations', 'instances_train2017.json')

with open(anno_train_path, 'r') as f:
  data = f.read()

data = json.loads(data)

## Data presentation

In [6]:
data.keys()

dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])

In [7]:
len(data['images'])

118287

In [8]:
data['info']

{'contributor': 'COCO Consortium',
 'date_created': '2017/09/01',
 'description': 'COCO 2017 Dataset',
 'url': 'http://cocodataset.org',
 'version': '1.0',
 'year': 2017}

In [9]:
data['licenses']

[{'id': 1,
  'name': 'Attribution-NonCommercial-ShareAlike License',
  'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/'},
 {'id': 2,
  'name': 'Attribution-NonCommercial License',
  'url': 'http://creativecommons.org/licenses/by-nc/2.0/'},
 {'id': 3,
  'name': 'Attribution-NonCommercial-NoDerivs License',
  'url': 'http://creativecommons.org/licenses/by-nc-nd/2.0/'},
 {'id': 4,
  'name': 'Attribution License',
  'url': 'http://creativecommons.org/licenses/by/2.0/'},
 {'id': 5,
  'name': 'Attribution-ShareAlike License',
  'url': 'http://creativecommons.org/licenses/by-sa/2.0/'},
 {'id': 6,
  'name': 'Attribution-NoDerivs License',
  'url': 'http://creativecommons.org/licenses/by-nd/2.0/'},
 {'id': 7,
  'name': 'No known copyright restrictions',
  'url': 'http://flickr.com/commons/usage/'},
 {'id': 8,
  'name': 'United States Government Work',
  'url': 'http://www.usa.gov/copyright.shtml'}]

In [10]:
data['images'][0]

{'coco_url': 'http://images.cocodataset.org/train2017/000000391895.jpg',
 'date_captured': '2013-11-14 11:18:45',
 'file_name': '000000391895.jpg',
 'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg',
 'height': 360,
 'id': 391895,
 'license': 3,
 'width': 640}

In [11]:
data['annotations'][0]

{'area': 2765.1486500000005,
 'bbox': [199.84, 200.46, 77.71, 70.88],
 'category_id': 58,
 'id': 156,
 'image_id': 558840,
 'iscrowd': 0,
 'segmentation': [[239.97,
   260.24,
   222.04,
   270.49,
   199.84,
   253.41,
   213.5,
   227.79,
   259.62,
   200.46,
   274.13,
   202.17,
   277.55,
   210.71,
   249.37,
   253.41,
   237.41,
   264.51,
   242.54,
   261.95,
   228.87,
   271.34]]}

In [12]:
data['categories']

[{'id': 1, 'name': 'person', 'supercategory': 'person'},
 {'id': 2, 'name': 'bicycle', 'supercategory': 'vehicle'},
 {'id': 3, 'name': 'car', 'supercategory': 'vehicle'},
 {'id': 4, 'name': 'motorcycle', 'supercategory': 'vehicle'},
 {'id': 5, 'name': 'airplane', 'supercategory': 'vehicle'},
 {'id': 6, 'name': 'bus', 'supercategory': 'vehicle'},
 {'id': 7, 'name': 'train', 'supercategory': 'vehicle'},
 {'id': 8, 'name': 'truck', 'supercategory': 'vehicle'},
 {'id': 9, 'name': 'boat', 'supercategory': 'vehicle'},
 {'id': 10, 'name': 'traffic light', 'supercategory': 'outdoor'},
 {'id': 11, 'name': 'fire hydrant', 'supercategory': 'outdoor'},
 {'id': 13, 'name': 'stop sign', 'supercategory': 'outdoor'},
 {'id': 14, 'name': 'parking meter', 'supercategory': 'outdoor'},
 {'id': 15, 'name': 'bench', 'supercategory': 'outdoor'},
 {'id': 16, 'name': 'bird', 'supercategory': 'animal'},
 {'id': 17, 'name': 'cat', 'supercategory': 'animal'},
 {'id': 18, 'name': 'dog', 'supercategory': 'animal'},

### Separete only categories for vehicles
 - and save only usefull data from each item

In [13]:
new_annotations = {
    'image_id': [],
    'category_id': [],
    'bbox': []
}

for item in data['annotations']:
  if item['category_id'] in [2,3,4,6,7,8]:

    new_annotations['image_id'].append(item['image_id'])
    new_annotations['category_id'].append(item['category_id'])
    new_annotations['bbox'].append(item['bbox'])

annotations_df = pd.DataFrame(new_annotations)

### Drop unused images

In [14]:
# Get ids of images from annotations_df
image_ids = list(set(annotations_df['image_id'].tolist()))

new_images = {
    'id': [],
    'coco_url': []
}

for item in data['images']:
  if item['id'] in image_ids:
    new_images['id'].append(item['id'])
    new_images['coco_url'].append(item['coco_url'])

images_df = pd.DataFrame(new_images)

In [34]:
class CocoVehicle():
  def __init__(self, anno_df, img_df, train=True, tranform=None):
    self.anno_df = anno_df  # categories - ['bicycle', 'car', 'motorcycle', 'bus', 'train', 'truck']
    self.img_df = img_df

  def __len__(self):
    return len(self.img_df)

  def __getitem__(self, index):
    
    item = self.img_df.iloc[index]
    img_path = item['coco_url']
    img_id = item['id']

    try: 
      img = io.imread(img_path)
      img_height, img_width, _ = img.shape

      item = self.anno_df[self.anno_df['image_id'] == img_id]

      # item['category_id] is a series -> values -> list -> ind 0 
      cat_ids = item['category_id'].values

      # replace cat id from [2,3,4,6,7,8] to [0,1,2,3,4,5]
      cat_ids = self._normalize_cat_id(cat_ids)

      bboxes = item['bbox'].values

      ####
      # adjust params to yolo format 
      # center_x, center_y, width, height -> [0, 1] where top left is (0, 0)
      bboxes = self._bbox_yolo_format(bboxes, img.shape)

      ####
      # Show img with bbox
      # only for development
      # c_x, c_y, w, h = bboxes[2]
      # new = cv2.rectangle(img, (int((c_x - w / 2) * img_width), int((c_y - h / 2) * img_height)), (int((c_x + w / 2) * img_width), int((c_y + h / 2) * img_height)), color=(255,0,0), thickness=2)
      # plt.imshow(new)
      # plt.show()

      #### for dev
      # img = io.imread(img_path)
      ####
      
      # Resize
      resized_img = resize(img, (480, 640), anti_aliasing=True)

      # check if bbox are correctly drown after resize
      # new = cv2.rectangle(resized_img, (int((c_x - w / 2) * 640), int((c_y - h / 2) * 480)), (int((c_x + w / 2) * 640), int((c_y + h / 2) * 480)), color=(255,0,0), thickness=2)
      # plt.imshow(new)
      # plt.show()
      
      anno_yolo = []
      for i in range(len(cat_ids)):
        anno_yolo.append([cat_ids[i], *bboxes[i]])

      return resized_img, anno_yolo, img_id
    except ValueError:
      print('No image: ID -', img_id, ' url - ', {img_path})
      return False

  def _normalize_cat_id(self, cat_ids: list):
    bef = [2,3,4,6,7,8]
    new_ids = []

    for id in cat_ids:
      for i in range(6):
        if id == bef[i]:
          new_ids.append(i)
          break 

    return new_ids

  def _bbox_yolo_format(self, bboxes: list, shape: tuple):
    img_height, img_width, _ = shape
    new_bboxes = []
    for bbox in bboxes:
      x, y, w, h = bbox
      center_x, center_y = x + w / 2, y + h / 2
      center_x, center_y, width, height = center_x / img_width, center_y / img_height, w / img_width, h / img_height

      new_bboxes.append([center_x, center_y, width, height])

    return new_bboxes

  def _write_to_file(self, path):
    if os.path.exists(path):
      shutil.rmtree(path)

    os.mkdir(path)
    os.mkdir(os.path.join(path, 'images'))
    os.mkdir(os.path.join(path, 'labels'))

    for item in range(len(self)):
      if self[item]:  
        img, annos, id = self[item]

        # define paths to save data
        path_img = os.path.join(path, 'images', str(id) + '.jpg')
        path_anno = os.path.join(path, 'labels', str(id) + '.txt')

        # write jpeg file with img 
        plt.imsave(path_img, img)

        # write txt file with annotations
        output_anno = ''
        
        for i, anno in enumerate(annos):
          for item in anno:
            output_anno += f'{item} '
          if len(annos) - i > 1:
            output_anno += '\n'

        with open(path_anno, 'w') as f:
          f.write(output_anno)

      # if image doesn't exist
      else:
        continue
    print('Data has been saved.')
dataset = CocoVehicle(anno_df=annotations_df, img_df=images_df)
dataset._write_to_file(os.path.join('content', 'MyDrive', 'data_proj-wdp'))

No image: ID - 559665  url -  {'http://images.cocodataset.org/train2017/000000559665.jpg'}
Data has been saved.
