In [1]:
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import os
import glob
import matplotlib
from matplotlib import pyplot as plt
import xml.etree.ElementTree as ET

In [2]:
# Dataset locations: the images and their XML files are read from the same directory.
images_dir = xmls_dir = './data/test'

### Number of images must equal the number of XMLs

In [3]:
# Collect every .jpg / .xml file below the dataset directories.
# os.path.join inserts the path separator between the directory and '**'
# (plain concatenation yields './data/test**/*.jpg', which also matches sibling
# directories such as './data/test_old'), and recursive=True is what makes '**'
# descend into subdirectories -- without it '**' behaves like a single '*'.
# Sorting keeps the processing order deterministic across runs.
image_paths = sorted(glob.glob(os.path.join(images_dir, '**', '*.jpg'), recursive=True))
xml_paths = sorted(glob.glob(os.path.join(xmls_dir, '**', '*.xml'), recursive=True))

n_images = len(image_paths)
n_xmls = len(xml_paths)
print(f"Number of images = {n_images}, Number of xmls = {n_xmls}")

Number of images = 3, Number of xmls = 3


### No .txt file

In [4]:
# List the base names of any .txt files below the annotation directory.
# os.path.join + recursive=True make the '**' pattern search xmls_dir and its
# subdirectories only; string concatenation without a separator also matched
# sibling directories whose names merely start with the same prefix.
txt_pattern = os.path.join(xmls_dir, '**', '*.txt')
txt_files = [os.path.basename(f) for f in glob.glob(txt_pattern, recursive=True)]

print(f"Text files : {txt_files}")

Text files : []


### Validate labels

In [5]:
# labels must be unique
def is_valid_char_label(char_label):
    """Return True when ``char_label`` is one ASCII digit or one Thai letter.

    Accepted characters are '0'-'9' and the Thai range 'ก'-'ฮ' (both ends
    inclusive).

    Args:
        char_label: The label to check.

    Returns:
        bool: True for a single accepted character. False for anything else,
        including values that are not a one-character string (for example the
        multi-character label 'plate', an empty string, or None), for which
        ``ord`` would otherwise raise TypeError.
    """
    if not isinstance(char_label, str) or len(char_label) != 1:
        return False
    # Single-character strings compare by code point, same as comparing ord() values.
    return '0' <= char_label <= '9' or 'ก' <= char_label <= 'ฮ'

# Re-scan the annotation directory so this cell does not rely on an earlier one.
# os.path.join supplies the separator before '**', and recursive=True lets '**'
# match nested subdirectories (otherwise it behaves like a single '*').
xml_paths = sorted(glob.glob(os.path.join(xmls_dir, '**', '*.xml'), recursive=True))

# Distinct <name> text of every <object> element across all annotation files.
char_label_set = {
    obj.find('name').text
    for xml_path in xml_paths
    for obj in ET.parse(xml_path).getroot().findall('object')
}

print("All possible labels")
print(sorted(char_label_set))
print(f"{len(char_label_set)}")

All possible labels
['1', '2', '3', '4', '5', '7', '8', 'plate', 'ก', 'จ', 'ณ', 'น', 'ผ']
13


### List duplicate images and XMLs

In [6]:
from collections import Counter

# Files are compared by base name without extension. os.path.splitext removes
# only the final extension, so 'plate.v2.jpg' stays 'plate.v2' instead of being
# cut down to 'plate' (which would create false duplicates).
image_names = [os.path.splitext(os.path.basename(image_path))[0] for image_path in image_paths]
# Keep the names that occur more than once (not their counts) so the offending
# files can be located.
duplicated_image = [name for name, count in Counter(image_names).items() if count > 1]
print("Duplicated Images = ", duplicated_image)

xml_names = [os.path.splitext(os.path.basename(xml_path))[0] for xml_path in xml_paths]
duplicated_xml = [name for name, count in Counter(xml_names).items() if count > 1]
print("Duplicated Xmls = ", duplicated_xml)

Duplicated Images =  []
Duplicated Xmls =  []


### Check for images missing an XML and XMLs missing an image

In [7]:
# A name present on only one side means an image or an XML file has no counterpart.
image_name_set, xml_name_set = set(image_names), set(xml_names)
images_without_xml = image_name_set.difference(xml_name_set)
xmls_without_image = xml_name_set.difference(image_name_set)
print("Image which missing xml ", images_without_xml)
print("Xml which missing image ", xmls_without_image)

Image which missing xml  set()
Xml which missing image  set()


### Plotting Bounding Box

In [8]:
output_dir = './tmp'
# Despite the name, this is the path of the TrueType font *file* used for labels.
font_dir = './fonts/angsa.ttf'
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without a separate existence check that could race.
os.makedirs(output_dir, exist_ok=True)

In [9]:
def plot_bbox(image_path, font):
    """Draw every annotated bounding box and its label onto an image.

    The annotation file is looked up as ``<xmls_dir>/<image stem>.xml``, where
    ``xmls_dir`` is the notebook-level setting. For each ``<object>`` element
    the ``<bndbox>`` corners are outlined in red and the ``<name>`` text is
    drawn in red, starting at the horizontal centre of the box's bottom edge.

    Args:
        image_path: Path of the image file to annotate.
        font: Font object passed to ``ImageDraw.text`` for the labels.

    Returns:
        The image in RGB mode with all boxes and labels drawn on it.

    Raises:
        Any exception raised by ``Image.open`` or ``ET.parse`` when the image
        or its XML file cannot be opened or parsed.
    """
    # splitext strips only the final extension, so a stem that itself contains
    # dots ('plate.v2.jpg') still resolves to 'plate.v2.xml'.
    image_name = os.path.splitext(os.path.basename(image_path))[0]

    # Prepare image (drawing uses RGB colour tuples, so normalise the mode first)
    image = Image.open(image_path)
    if image.mode != 'RGB':
        image = image.convert('RGB')
    drawer = ImageDraw.Draw(image)

    # Prepare ground truth
    xml_path = os.path.join(xmls_dir, image_name + '.xml')
    root = ET.parse(xml_path).getroot()

    red = (255, 0, 0)
    for obj in root.findall('object'):
        label = obj.find('name').text

        # Look the <bndbox> element up once and read its four corners from it.
        bndbox = obj.find('bndbox')
        xmin = float(bndbox.find('xmin').text)
        ymin = float(bndbox.find('ymin').text)
        xmax = float(bndbox.find('xmax').text)
        ymax = float(bndbox.find('ymax').text)

        # Closed polyline through the four corners, ending back at the start.
        drawer.line([(xmin, ymin),
                     (xmax, ymin),
                     (xmax, ymax),
                     (xmin, ymax),
                     (xmin, ymin)], fill=red, width=2)

        drawer.text((xmin + (xmax - xmin) / 2, ymax), label, font=font, fill=red)
    return image

# The label font is identical for every image, so load it once up front
# instead of re-reading the font file on each iteration.
font = ImageFont.truetype(font_dir, 20)

for image_path in image_paths:
    image = plot_bbox(image_path, font)
    # Save under the same file name inside output_dir.
    image.save(os.path.join(output_dir, os.path.basename(image_path)))