## Parsing PASCAL VOC Annotations

## 匯入相關函式庫 Import all the necessary libraries

In [1]:
import os
import xml.etree.ElementTree as ET

import cv2
import numpy as np

## 設定相關設定與參數 Setting parameters

In [2]:
# Root directory holding the training / validation data.
DATA_PATH = "../data"

# Dataset directory (PASCAL VOC 2012 here; another dataset such as
# "kangaroo" can be selected by pointing this at its sub-directory).
DATA_SET_PATH = os.path.join(DATA_PATH, "VOCdevkit/VOC2012")

# Directory containing the XML annotation files of the dataset.
ANNOTATIONS_PATH = os.path.join(DATA_SET_PATH, "Annotations")

# Directory containing the image files of the dataset.
IMAGES_PATH = os.path.join(DATA_SET_PATH, "JPEGImages")

# Echo the resolved paths, one per line, as a quick sanity check.
print(DATA_PATH, DATA_SET_PATH, ANNOTATIONS_PATH, IMAGES_PATH, sep="\n")

../data
../data/VOCdevkit/VOC2012
../data/VOCdevkit/VOC2012/Annotations
../data/VOCdevkit/VOC2012/JPEGImages


In [3]:
def read_classes(classes_path):
    """Read class names from a text file that lists one name per line.

    Args:
        classes_path: Path of the text file to read.

    Returns:
        A list of class names in file order, with surrounding whitespace
        removed. Blank lines are skipped so that they never turn into
        empty-string class names.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(classes_path, encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [name for name in stripped if name]

In [4]:
# Classes that you want to detect, loaded from a text file that lists one
# class name per line. Object names found in the annotations are checked
# against this list during parsing.
CLASSES = read_classes("../model_data/voc2012_classes.txt")
# Bare expression: in a notebook cell this displays the loaded list.
CLASSES

['person',
 'bird',
 'cat',
 'cow',
 'dog',
 'horse',
 'sheep',
 'aeroplane',
 'bicycle',
 'boat',
 'bus',
 'car',
 'motorbike',
 'train',
 'bottle',
 'chair',
 'diningtable',
 'pottedplant',
 'sofa',
 'tvmonitor']

In [5]:
# List with one dict per image; each dict carries the information needed
# for training. An entry is shaped like:
#
# {
#     'filename': '/path/kangaroo.jpg', 'width': 128, 'height': 128, 'depth': 3,
#     'object': [
#         {'name': 'person', 'xmin': 0, 'ymin': 0, 'xmax': 28, 'ymax': 28},
#         {'name': 'kangaroo', 'xmin': 45, 'ymin': 45, 'xmax': 60, 'ymax': 60}
#     ]
# }
all_imgs = []

# Dict of {class label: number of times that label has been seen}.
seen_labels = {}

## Parse Annotation

In [6]:
# Walk every annotation file and collect, per image, its file path, its size
# and the bounding boxes of all objects whose class is listed in CLASSES.
# seen_labels counts how many kept objects were found for each class.
all_imgs = []
seen_labels = {}
for ann in sorted(os.listdir(ANNOTATIONS_PATH)):
    # Skip stray entries (hidden files, checkpoints, ...) that are not XML
    # annotations and would make ET.parse raise.
    if not ann.lower().endswith('.xml'):
        continue

    # 'object' is inserted first so the displayed key order stays unchanged.
    img = {'object': []}

    root = ET.parse(os.path.join(ANNOTATIONS_PATH, ann)).getroot()

    img['filename'] = os.path.join(IMAGES_PATH, root.find('filename').text)

    for size in root.findall('size'):
        img['width'] = int(size.find('width').text)
        img['height'] = int(size.find('height').text)
        img['depth'] = int(size.find('depth').text)

    for boxes in root.iter('object'):
        obj = {'name': boxes.find('name').text}

        # Ignore objects whose class is not among the classes to detect.
        # This must be `continue`, not `break`: a single unwanted object
        # must not discard the remaining objects of the same image.
        if CLASSES and obj['name'] not in CLASSES:
            continue

        seen_labels[obj['name']] = seen_labels.get(obj['name'], 0) + 1

        for box in boxes.findall('bndbox'):
            obj['xmin'] = float(box.find("xmin").text)
            obj['ymin'] = float(box.find("ymin").text)
            obj['xmax'] = float(box.find("xmax").text)
            obj['ymax'] = float(box.find("ymax").text)

        img['object'].append(obj)

    all_imgs.append(img)

print(seen_labels, len(all_imgs))
# Bare expression: in a notebook cell this displays the second parsed image.
all_imgs[1]

{'person': 17401, 'aeroplane': 1002, 'tvmonitor': 893, 'train': 704, 'boat': 1059, 'dog': 1598, 'chair': 3056, 'bird': 1271, 'bicycle': 837, 'bottle': 1561, 'sheep': 1084, 'diningtable': 800, 'horse': 803, 'motorbike': 801, 'sofa': 841, 'cow': 771, 'car': 2492, 'cat': 1277, 'bus': 685, 'pottedplant': 1202} 17125


{'object': [{'name': 'aeroplane',
   'xmin': 104.0,
   'ymin': 78.0,
   'xmax': 375.0,
   'ymax': 183.0},
  {'name': 'aeroplane',
   'xmin': 133.0,
   'ymin': 88.0,
   'xmax': 197.0,
   'ymax': 123.0},
  {'name': 'person',
   'xmin': 195.0,
   'ymin': 180.0,
   'xmax': 213.0,
   'ymax': 229.0},
  {'name': 'person',
   'xmin': 26.0,
   'ymin': 189.0,
   'xmax': 44.0,
   'ymax': 238.0}],
 'filename': '../data/VOCdevkit/VOC2012/JPEGImages/2007_000032.jpg',
 'width': 500,
 'height': 281,
 'depth': 3}