# Data notebook
Prepares the data for training the model and sketches initial ideas for working with the data in other parts of the project. **Prepared in Colab.**

URL to notebook: https://colab.research.google.com/drive/1mabI64REX7Vxtui5R8nuqT6hxGe1Uwv9 

## Imports, drive connection

In [45]:
from google.colab import drive

# Mount Google Drive. The mount point 'content/' is a relative path, so it is
# resolved against the current working directory; the later paths built with
# os.path.join('content', 'MyDrive', ...) rely on that same working directory.
drive.mount('content/')

Drive already mounted at content/; to attempt to forcibly remount, call drive.mount("content/", force_remount=True).


In [46]:
import json
import uuid
import os
import zipfile
import pandas as pd
import skimage.io as io
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from skimage.transform import resize
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import cv2
import shutil

In [47]:
# Drive folder holding the COCO 2017 annotation archive and its extracted contents.
DATA_PATH = os.path.join('content', 'MyDrive', 'coco2017-annotations')

In [48]:
# Unpack the annotation archive into DATA_PATH; skipped when an
# `annotations` directory is already present there.
annotations_dir = os.path.join(DATA_PATH, 'annotations')
archive_path = os.path.join(DATA_PATH, 'annotations_trainval2017.zip')

if not os.path.isdir(annotations_dir):
  with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(DATA_PATH)

## Load annotations
Meaning of the content in the annotations: https://www.immersivelimit.com/tutorials/create-coco-annotations-from-scratch/#coco-dataset-format

In [49]:
# Parse the train2017 instance annotations directly from the open file handle.
anno_train_path = os.path.join(DATA_PATH, 'annotations', 'instances_train2017.json')

with open(anno_train_path, 'r') as f:
  data = json.load(f)

## Data presentation

In [50]:
# Top-level sections of the parsed annotation file.
data.keys()

dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])

In [51]:
# Number of image records in the annotation file.
len(data['images'])

118287

In [52]:
# Dataset-level metadata (description, version, year, ...).
data['info']

{'contributor': 'COCO Consortium',
 'date_created': '2017/09/01',
 'description': 'COCO 2017 Dataset',
 'url': 'http://cocodataset.org',
 'version': '1.0',
 'year': 2017}

In [53]:
# License records; each image record points to one of these through its 'license' field.
data['licenses']

[{'id': 1,
  'name': 'Attribution-NonCommercial-ShareAlike License',
  'url': 'http://creativecommons.org/licenses/by-nc-sa/2.0/'},
 {'id': 2,
  'name': 'Attribution-NonCommercial License',
  'url': 'http://creativecommons.org/licenses/by-nc/2.0/'},
 {'id': 3,
  'name': 'Attribution-NonCommercial-NoDerivs License',
  'url': 'http://creativecommons.org/licenses/by-nc-nd/2.0/'},
 {'id': 4,
  'name': 'Attribution License',
  'url': 'http://creativecommons.org/licenses/by/2.0/'},
 {'id': 5,
  'name': 'Attribution-ShareAlike License',
  'url': 'http://creativecommons.org/licenses/by-sa/2.0/'},
 {'id': 6,
  'name': 'Attribution-NoDerivs License',
  'url': 'http://creativecommons.org/licenses/by-nd/2.0/'},
 {'id': 7,
  'name': 'No known copyright restrictions',
  'url': 'http://flickr.com/commons/usage/'},
 {'id': 8,
  'name': 'United States Government Work',
  'url': 'http://www.usa.gov/copyright.shtml'}]

In [54]:
# One image record: file name, download URLs, size and id.
data['images'][0]

{'coco_url': 'http://images.cocodataset.org/train2017/000000391895.jpg',
 'date_captured': '2013-11-14 11:18:45',
 'file_name': '000000391895.jpg',
 'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg',
 'height': 360,
 'id': 391895,
 'license': 3,
 'width': 640}

In [55]:
# One annotation record: bbox, category_id, image_id and segmentation polygon.
data['annotations'][0]

{'area': 2765.1486500000005,
 'bbox': [199.84, 200.46, 77.71, 70.88],
 'category_id': 58,
 'id': 156,
 'image_id': 558840,
 'iscrowd': 0,
 'segmentation': [[239.97,
   260.24,
   222.04,
   270.49,
   199.84,
   253.41,
   213.5,
   227.79,
   259.62,
   200.46,
   274.13,
   202.17,
   277.55,
   210.71,
   249.37,
   253.41,
   237.41,
   264.51,
   242.54,
   261.95,
   228.87,
   271.34]]}

In [56]:
# All category records (id, name, supercategory).
data['categories']

[{'id': 1, 'name': 'person', 'supercategory': 'person'},
 {'id': 2, 'name': 'bicycle', 'supercategory': 'vehicle'},
 {'id': 3, 'name': 'car', 'supercategory': 'vehicle'},
 {'id': 4, 'name': 'motorcycle', 'supercategory': 'vehicle'},
 {'id': 5, 'name': 'airplane', 'supercategory': 'vehicle'},
 {'id': 6, 'name': 'bus', 'supercategory': 'vehicle'},
 {'id': 7, 'name': 'train', 'supercategory': 'vehicle'},
 {'id': 8, 'name': 'truck', 'supercategory': 'vehicle'},
 {'id': 9, 'name': 'boat', 'supercategory': 'vehicle'},
 {'id': 10, 'name': 'traffic light', 'supercategory': 'outdoor'},
 {'id': 11, 'name': 'fire hydrant', 'supercategory': 'outdoor'},
 {'id': 13, 'name': 'stop sign', 'supercategory': 'outdoor'},
 {'id': 14, 'name': 'parking meter', 'supercategory': 'outdoor'},
 {'id': 15, 'name': 'bench', 'supercategory': 'outdoor'},
 {'id': 16, 'name': 'bird', 'supercategory': 'animal'},
 {'id': 17, 'name': 'cat', 'supercategory': 'animal'},
 {'id': 18, 'name': 'dog', 'supercategory': 'animal'},

### Select only the vehicle categories
 - and keep only the useful fields from each item

In [57]:
# Keep only annotations whose category is one of the selected vehicle ids:
# 2 bicycle, 3 car, 4 motorcycle, 6 bus, 7 train, 8 truck
# (airplane = 5 and boat = 9 are not selected).
# Only the first 3000 annotation records are scanned.
vehicle_items = [
    anno for anno in data['annotations'][:3000]
    if anno['category_id'] in [2,3,4,6,7,8]
]

# Column-oriented dict holding just the fields needed downstream.
new_annotations = {
    key: [anno[key] for anno in vehicle_items]
    for key in ('image_id', 'category_id', 'bbox')
}

annotations_df = pd.DataFrame(new_annotations)

### Drop unused images

In [58]:
# Get ids of images from annotations_df
image_ids = list(set(annotations_df['image_id'].tolist()))

new_images = {
    'id': [],
    'coco_url': []
}

for item in data['images']:
  if item['id'] in image_ids:
    new_images['id'].append(item['id'])
    new_images['coco_url'].append(item['coco_url'])

images_df = pd.DataFrame(new_images)

In [59]:
class CocoVehicle():
  """Indexable collection of COCO bounding boxes, one item per annotation row.

  Each item pairs the downloaded, resized image with a label of the form
  ``[center_x, center_y, width, height, category_id]`` where the first four
  values are fractions of the original image size (origin at the top left).

  NOTE(review): the label keeps the class id last and uses the raw COCO
  category id; the common YOLO text format puts a zero-based class index
  first. Left unchanged because other code may already parse this layout.
  """

  # (height, width) every image is resized to.
  IMG_SIZE = (480, 640)

  def __init__(self, anno_df, img_df, train=True, tranform=None):
    """Store the annotation and image tables.

    Args:
      anno_df: DataFrame with 'image_id', 'category_id' and 'bbox' columns;
        'bbox' holds [x, y, width, height] in pixels.
      img_df: DataFrame with 'id' and 'coco_url' columns.
      train: accepted for interface compatibility; currently unused.
      tranform: accepted for interface compatibility; currently unused.
    """
    self.anno_df = anno_df
    self.img_df = img_df

  def __len__(self):
    """Return the number of annotation rows."""
    return len(self.anno_df)

  def __getitem__(self, index):
    """Download the image for annotation `index` and build its label.

    Returns:
      ``(resized_img, [center_x, center_y, width, height, cat_id])`` on
      success, where `resized_img` is the output of `resize` (not yet
      converted for saving), or ``False`` when a ValueError is raised while
      processing the image.
    """
    row = self.anno_df.iloc[index]
    cat_id = row['category_id']

    img_url = self.img_df[self.img_df['id'] == row['image_id']]['coco_url'].values[0]
    img = io.imread(img_url)

    # Keep the box as floats: truncating to int before normalising would
    # throw away sub-pixel precision in the stored label.
    x, y, width, height = (float(value) for value in row['bbox'])

    try:
      # Unpacking three values raises ValueError for arrays that are not
      # 3-dimensional (presumably grayscale downloads); such items are skipped.
      img_height, img_width, _ = img.shape

      # Corner + size in pixels -> centre + size as fractions of the image.
      center_x = (x + width / 2) / img_width
      center_y = (y + height / 2) / img_height
      rel_width = width / img_width
      rel_height = height / img_height

      resized_img = resize(img, self.IMG_SIZE, anti_aliasing=True)

      return resized_img, [center_x, center_y, rel_width, rel_height, cat_id]
    except ValueError:
      # Print the URL itself (the previous `{img_path}` printed a set literal).
      print('No image: ID -', row['image_id'], ' url - ', img_url)
      return False

  @staticmethod
  def _to_bgr_uint8(img):
    """Convert an image array into what cv2.imwrite expects for a JPEG.

    Floating-point input is assumed to be scaled to [0, 1] (what `resize`
    produces by default) and is mapped to uint8 [0, 255]. Three-channel
    input is assumed to be RGB and is reordered to BGR.
    """
    arr = np.asarray(img)
    if np.issubdtype(arr.dtype, np.floating):
      arr = np.clip(arr * 255.0, 0, 255).round().astype(np.uint8)
    if arr.ndim == 3 and arr.shape[2] == 3:
      arr = np.ascontiguousarray(arr[:, :, ::-1])
    return arr

  def _write_to_file(self, path):
    """Export every loadable item to `path`/images/*.jpg and `path`/labels/*.txt.

    Any existing directory at `path` is deleted first. Each item gets a fresh
    UUID as its file stem; items for which indexing returns False are skipped.
    """
    if os.path.exists(path):
      shutil.rmtree(path)

    images_dir = os.path.join(path, 'images')
    labels_dir = os.path.join(path, 'labels')
    # makedirs also creates `path` itself (and missing parents).
    os.makedirs(images_dir)
    os.makedirs(labels_dir)

    for index in range(len(self)):
      # Index once: every lookup downloads and resizes the image.
      sample = self[index]
      if not sample:
        continue

      img, anno = sample
      file_id = str(uuid.uuid1())

      # Scale to 8-bit and swap to BGR so the saved JPEG has the right
      # brightness and colours.
      cv2.imwrite(os.path.join(images_dir, file_id + '.jpg'), self._to_bgr_uint8(img))

      # One line per file; every value is followed by a single space.
      with open(os.path.join(labels_dir, file_id + '.txt'), 'w') as f:
        f.write(''.join(str(value) + ' ' for value in anno))

# Export the filtered annotations and their images to Drive.
# _write_to_file deletes any existing directory at this path before writing.
output_dir = os.path.join('content', 'MyDrive', 'data_proj-wdp')

dataset = CocoVehicle(anno_df=annotations_df, img_df=images_df)
dataset._write_to_file(output_dir)

No image: ID - 559665  url -  {'http://images.cocodataset.org/train2017/000000559665.jpg'}
No image: ID - 559665  url -  {'http://images.cocodataset.org/train2017/000000559665.jpg'}
