In [1]:
import os
import xml.etree.ElementTree as ET
import cv2
import numpy as np
import shutil

### Before you start:
1) Download the dataset from http://tc11.cvc.uab.es/datasets/Tobacco800_1 (login required)

2) Extract the downloaded archive, then extract the two archives inside it

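3) Move the folders that contain the files out of *Tobacco800_SinglePage* and *Tobacc800_Groundtruth_v2.0* into the *data/Tobacco/Signatures* folder and rename them to *Images* and *XMLs* respectively (the code below reads from `data/Tobacco/Signatures/Images` and `data/Tobacco/Signatures/XMLs`)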

In [2]:
# Root working folder; every input and output folder used below is addressed as
# path + '<subfolder>', so the trailing slash is required.
path = "data/Tobacco/Signatures/"

### Signature extraction (class 1)

In [6]:
# Output folder for the cropped signature images (class 1).
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the folder already exists.
os.makedirs(path + '1', exist_ok=True)

In [7]:
# Crop every annotated signature out of its page image and save it as
# <path>/1/<running index>.png.
i = 0  # number of signatures written so far; doubles as the output file name
# sorted(): os.listdir returns names in arbitrary order, which would make the
# numbering of the crops differ from run to run
for filename in sorted(os.listdir(path + 'XMLs')):
    if not filename.lower().endswith('.xml'):
        continue  # ignore stray non-annotation files instead of failing in ET.parse
    root = ET.parse(path + 'XMLs/' + filename)
    # signature position (top left corner coordinates, height and width) is under DL_ZONE tag
    zones = root.findall('.//{http://lamp.cfar.umd.edu/GEDI}DL_ZONE')
    # .get() tolerates zones that carry no gedi_type attribute at all
    signature_zones = [zone.attrib for zone in zones
                       if zone.attrib.get('gedi_type') == 'DLSignature']
    if not signature_zones:
        continue  # nothing to crop, so the page image is never loaded

    # Read the page once per document rather than once per signature.
    # splitext handles an extension of any length, unlike filename[:-4].
    image_path = path + 'Images/' + os.path.splitext(filename)[0] + '.tif'
    page = cv2.imread(image_path)
    if page is None:
        # cv2.imread reports a missing or undecodable file by returning None
        print(f'Skipping {filename}: cannot read {image_path}')
        continue

    for elem in signature_zones:
        # row/col are the top-left corner of the zone, height/width its extent
        t, l, h, w = (int(elem[key]) for key in ('row', 'col', 'height', 'width'))
        img = page[t:t+h, l:l+w]
        cv2.imwrite(path + f'1/{i}.png', img)
        i += 1

print(f'Extracted {i} signatures')

Extracted 910 signatures


### Random document fragment sampling (class 0)

In [9]:
# Output folder for the random document fragments (class 0).
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the folder already exists.
os.makedirs(path + '0', exist_ok=True)

In [11]:
# File names of the extracted signature crops and of the full document pages.
# os.listdir returns names in arbitrary order; sorting keeps index-based access
# (e.g. imgs[n]) stable between runs and machines.
signs = sorted(os.listdir(path + '1'))
imgs = sorted(os.listdir(path + 'Images'))

In [12]:
# Save the (height, width) of every cropped signature to find the max and min values.
# These values are then used to pick the size of a random fragment within that range.
shapes = []
for filename in signs:
    img = cv2.imread(path + '1/' + filename)
    if img is None:
        continue  # cv2.imread returns None for files it cannot decode; leave them out
    shapes.append(img.shape[:2])
# Build the array once: np.concatenate inside the loop re-copied the whole array on
# every iteration. reshape(-1, 2) keeps the (N, 2) layout even if nothing was read.
sizes = np.array(shapes, dtype=int).reshape(-1, 2)

In [17]:
# Extremes of the signature crop sizes: column 0 of `sizes` holds the heights,
# column 1 the widths.
(max_h, max_w), (min_h, min_w) = sizes.max(axis=0), sizes.min(axis=0)
for caption, bound in (('Max height:', max_h), ('Min height:', min_h),
                       ('Max width:', max_w), ('Min width:', min_w)):
    print(caption, bound)

Max height: 579
Min height: 33
Max width: 1142
Min width: 69


In [18]:
# Number of random fragments to cut for the negative class.
# Deliberately a bit above the number of signatures (910): some fragments will
# contain signatures or handwritten text and have to be removed by hand.
# After that manual clean-up 950 fragments were left.
samples = 1100

In [29]:
# Cut `samples` random rectangles out of randomly chosen document pages and save
# them as <path>/0/<i>.png. Fragment height and width are drawn from the range of
# the signature crop sizes (min_h..max_h, min_w..max_w), capped by the page size.
sample_rng = np.random.RandomState(42)  # fixed seed: re-running gives the same fragments
for i in range(samples):
    # randint's upper bound is exclusive, so it must be len(imgs), not len(imgs)-1;
    # otherwise the last page can never be picked
    n = sample_rng.randint(len(imgs))
    img = cv2.imread(path + 'Images/' + imgs[n])
    if img is None:
        # cv2.imread returns None for files it cannot decode
        print(f'Skipping sample {i}: cannot read {imgs[n]}')
        continue
    page_h, page_w = img.shape[:2]
    # exclusive upper bounds for the fragment size, capped by the page size
    limit_h, limit_w = min(max_h, page_h), min(max_w, page_w)
    if limit_h <= min_h or limit_w <= min_w:
        # randint(low, high) needs low < high; such a page cannot hold a fragment
        print(f'Skipping sample {i}: {imgs[n]} is too small')
        continue
    h = sample_rng.randint(min_h, limit_h)
    w = sample_rng.randint(min_w, limit_w)
    # h < page_h and w < page_w hold here, so both offsets have a non-empty range
    t, l = sample_rng.randint(page_h - h), sample_rng.randint(page_w - w)
    img = img[t:t+h, l:l+w]
    cv2.imwrite(path + f'0/{i}.png', img)

### Train/validation/test split

In [30]:
# Probabilities with which an image is assigned to a subset;
# they are passed as `p` alongside ['train/', 'val/', 'test/'] in the split below.
split_rates = [0.7, 0.2, 0.1]  # train, val, test

In [37]:
# Same value as assigned near the top of the notebook; repeated here, presumably
# so the split section can be run without the earlier cells.
path = "data/Tobacco/Signatures/"

In [38]:
# Distribute the images of both classes over the train/val/test subsets: each file
# is moved to <path>/<subset>/<label>/, the subset being drawn at random with the
# probabilities in split_rates.
subsets = ['train/', 'val/', 'test/']
split_rng = np.random.RandomState(42)  # fixed seed: the same split on every fresh run
for label in ['0/', '1/']:
    # Create every target folder up front. exist_ok replaces the per-file
    # exists()/makedirs() check, and a subset that receives no file still gets
    # its folder, so listing it afterwards cannot fail.
    for subset in subsets:
        os.makedirs(path + subset + label, exist_ok=True)
    # sorted(): os.listdir order is arbitrary and would defeat the fixed seed
    for filename in sorted(os.listdir(path + label)):
        # one draw per file; str() turns numpy's str_ into a plain str
        new_folder = str(split_rng.choice(subsets, p=split_rates))
        new_path = path + new_folder + label
        shutil.move(path + label + filename, new_path + filename)

In [43]:
# Report how many images (both classes together) ended up in each subset.
for title, subset in (('Train', 'train'), ('Validation', 'val'), ('Test', 'test')):
    total = sum(len(os.listdir(path + f'{subset}/{cls}')) for cls in ('0', '1'))
    print(f'{title} size:', total)

Train size: 1295
Validation size: 387
Test size: 177
