In [7]:
import glob
import os
from PIL import Image
import xml.etree.ElementTree as ET

from tqdm.auto import tqdm
import srsly

In [2]:
IMAGES_DIR = "../data/SVT/img"
TRAIN_PATH = "../data/SVT/train.xml"
TEST_PATH = "../data/SVT/test.xml"

DEST_DIR = "../data/SVT/crop"

In [3]:
def parse_xml_to_data(xml_filepath):
    tree = ET.parse(xml_filepath)
    root = tree.getroot()

    data = {} # name to label
    for image in tqdm(root.findall("image")):
        image_name = image.find("imageName").text.split("/")[-1].replace(".jpg", "")
        tagged_rectangles = image.find("taggedRectangles").findall("taggedRectangle")

        # read image with PIL
        pil_img = Image.open(os.path.join(IMAGES_DIR, image_name + ".jpg"))

        for rectangle in tagged_rectangles:
            x = int(rectangle.get("x"))
            y = int(rectangle.get("y"))
            width = int(rectangle.get("width"))
            height = int(rectangle.get("height"))
            text = rectangle.find("tag").text

            # crop image
            cropped_img = pil_img.crop((x, y, x + width, y + height))

            # save image
            cropped_img.save(os.path.join(DEST_DIR, image_name + "_" + text + ".png"))

            # add to train data
            data[image_name + "_" + text] = text
    return data

In [4]:
train_data = parse_xml_to_data(TRAIN_PATH)
test_data = parse_xml_to_data(TEST_PATH)

100%|██████████| 100/100 [00:02<00:00, 46.18it/s]
100%|██████████| 249/249 [00:04<00:00, 53.99it/s]


In [8]:
srsly.write_json("../data/SVT/train.json", train_data)
srsly.write_json("../data/SVT/test.json", test_data)

In [5]:
len(train_data), len(test_data)

(239, 569)

In [6]:
train_data

{'14_03_LIVING': 'LIVING',
 '14_03_ROOM': 'ROOM',
 '14_03_THEATERS': 'THEATERS',
 '14_04_INSURANCE': 'INSURANCE',
 '14_04_STANDARD': 'STANDARD',
 '14_04_CENTER': 'CENTER',
 '06_08_ANTIQUE': 'ANTIQUE',
 '06_08_LULA': 'LULA',
 '06_08_MALL': 'MALL',
 '00_12_SUBWAY': 'SUBWAY',
 '15_15_CHOCOLATE': 'CHOCOLATE',
 '15_15_GHIRARDELLI': 'GHIRARDELLI',
 '04_04_MASTER': 'MASTER',
 '04_04_NAILS': 'NAILS',
 '04_04_GLENOAK': 'GLENOAK',
 '12_07_WINDSOR': 'WINDSOR',
 '12_07_THE': 'THE',
 '06_11_BANGKOK': 'BANGKOK',
 '06_11_INN': 'INN',
 '08_16_ALDEN': 'ALDEN',
 '05_12_MAX': 'MAX',
 '05_12_TOWING': 'TOWING',
 '00_08_MARBLE': 'MARBLE',
 '00_08_YARD': 'YARD',
 '00_08_ORION': 'ORION',
 '00_08_TILE': 'TILE',
 '00_01_ASTORIA': 'ASTORIA',
 '00_01_BEST': 'BEST',
 '00_01_INN': 'INN',
 '00_01_SUITES': 'SUITES',
 '00_01_VALUE': 'VALUE',
 '01_10_PAYLESS': 'PAYLESS',
 '01_10_SHOE': 'SHOE',
 '01_10_SOURCE': 'SOURCE',
 '03_19_MOUNTS': 'MOUNTS',
 '03_19_ROCKY': 'ROCKY',
 '00_09_GOODWILL': 'GOODWILL',
 '16_19_AMOEBA': 