In [None]:
%pylab notebook
import os

# Generate Data for Training

Before running this notebook, grab the LabelMe data from [https://vision.csi.miamioh.edu/labelme.zip](https://vision.csi.miamioh.edu/labelme.zip) and extract it somewhere. 
> *NOTE:* I prefer to put my data on a large external drive and then soft-link it to a local 'data' folder. For example 
```bash
ln -s /media/${USER}/external-drive/data ./data
```

The next cell **will fail** on your system, replace the paths to the labelme INPUT and the data OUTPUT with the paths to folders on your own system (e.g. a large drive)

Each image has a corresponding XML file. 
My script to produce training data takes in a list of XML files, so let's generate a comprehensive list...

In [None]:
from glob import glob
import os

I often want to plot without seeing the _x_ and _y_ axis ticks, since I know all of my images' dimensions. 

In [None]:
def noticks():
    """Clear the x and y ticks by handing an empty list to pylab's xticks and yticks."""
    for set_ticks in (xticks, yticks):
        set_ticks([])

Okay, so when I produced the data for the labelers I highlighted the part of the image I wanted them to label. As a result, the labelme tool may have recorded the path to the highlighted image instead of the original, so we will want to fix that. 

In [None]:
# One example annotation; its image exists as three variants that differ only in the file-name suffix.
xml = 'facades-2017-07-21/honolulu_hawaii-002943-000004-8HfFc2j4u0BaBAaaYeNy1w-facade-01-highlighted.xml'
hl_jpg = os.path.join('labelme/Images', xml.replace('.xml', '.jpg'))
nohl_jpg = hl_jpg.replace('highlighted', 'original')
mask_jpg = hl_jpg.replace('highlighted', 'mask')

# figsize() only sets the default size for figures created afterwards,
# so it has to run before figure() to apply to this one.
figsize(10, 5)
figure()

# Same 2x2 grid as before (fourth slot left empty), one panel per image variant.
for panel_position, panel_path, panel_caption in [(221, nohl_jpg, "No highlights"),
                                                  (222, hl_jpg, "Highlighted"),
                                                  (223, mask_jpg, "Mask")]:
    subplot(panel_position)
    imshow(imread(panel_path))
    noticks()
    title(panel_caption, fontsize=10)

# Fix the spacing first, then display the finished figure.
tight_layout()
show()

It may be difficult to tell, but the images above are the result of the following process:
1. I asked the labelers to outline the dominant, camera-facing facades (within 15 deg). 
2. I automatically calculated the homography using the approach of [Affara et al](TBD).
3. I warped the image and rendered out the (supposedly rectified) images shown in the figure above. 
4. The labelers continued labeling features in the (supposedly rectified) images.

> NOTE: I ran code that produces '-original.xml' files based on '-highlighted.xml' files.  If 'original.xml' exists, I want to remove 'highlighted.xml' from the list of files.

## Replace 'highlighted' by 'original' XML's

In [None]:
xmls = glob('./labelme/Annotations/*/*.xml')
print "Found", len(xmls), "xml files"
[os.path.basename(f) for f in xmls[-10:]]

**ALERT:** I have **already backed up** my data so I feel comfortable modifying this in-place. You should consider doing the same. 

In [None]:
highlighted_xmls = [f for f in xmls if '-highlighted.xml' in f]
print len(highlighted_xmls)

---
> *NOTE 1* I have already fixed the XML files, it does not need to be done again, so the next couple of cells are disabled. 

---
  
> *NOTE 2* At the time I wrote this I was unaware of the `force_list` and `unparse` functions of `xmltodict`, which is why I use the less elegant `str.replace` method in the code below. 

---

*The code below is what I used to replace the images in the XML files*

---
```python
num_modifications = 0
for i, xml in enumerate(highlighted_xmls):
    contents = open(xml).read()
    data = xmltodict.parse(contents)
    folder = data['annotation']['folder']
    filename = data['annotation']['filename']
    new_filename = filename
    if 'highlighted' not in filename:
        continue
    elif os.path.isfile(os.path.join('labelme', 'Images', folder,
                                     filename.replace('-highlighted.jpg', '.jpg'))):
        new_filename = filename.replace('-highlighted.jpg', '.jpg')
    elif os.path.isfile(os.path.join('labelme', 'Images', folder, 
                                     filename.replace('-highlighted.jpg', '-original.jpg'))):
        new_filename = filename.replace('-highlighted.jpg', '-original.jpg')
    elif os.path.isfile(os.path.join('labelme', 'Images', folder, 
                                     filename.replace('-hilighted.jpg', '.jpg'))):
        new_filename = filename.replace('-hilighted.jpg', '.jpg')
    else:
        print "Failed to find any variant of", os.path.join('labelme', 'Images', folder, filename)
        
    contents = contents.replace(filename, new_filename)
    original_xml = xml.replace('highlighted.xml', 'original.xml')
    num_modifications += 1

    with open(original_xml, 'w') as f:
        f.write(contents)
    print '\r{: 3} of {}, modified {} files'.format(i+1, len(highlighted_xmls), num_modifications),
```
---

In [None]:
# Point every '...highlighted.xml' entry at its '...original.xml' counterpart, then drop duplicates.
# NOTE(review): `unique` is not defined in this notebook -- presumably numpy.unique via %pylab
# (which would also sort the paths and return an array rather than a list); confirm.
xmls = unique([f.replace('highlighted.xml', 'original.xml') for f in xmls])
print len(xmls)

In [None]:
# Every path in the list must exist on disk; name the offending file if one does not.
for xml in xmls:
    assert os.path.isfile(xml), "Missing annotation file: {}".format(xml)

Let's also make sure the images for each XML actually exist
> **OOPS:** It looks like I was inconsistent with the naming. Sometimes I used an '-original.jpg' suffix and sometimes I did not...

In [None]:
import xmltodict
for i, xml in enumerate(xmls):
    contents = open(xml).read().decode('utf8')
    data = xmltodict.parse(contents)
    folder = data['annotation']['folder']
    filename = data['annotation']['filename']
    if not os.path.isfile(os.path.join('labelme', 'Images', folder, filename)):
        print "Missing {}/{}".format(folder, filename)
    print '\rProcessed {} of {}, '.format(i, len(xmls)),

Oops, there is a missing file in the list. It looks like some kind of a test XML. I will remove it with this code:

---

```ipython
In [ ]: len(xmls)
In [ ]: xmls[2275]
Out[ ]: './labelme/Annotations/example_folder/img1.xml'

In [ ]: !rm {xmls[2275]}
In [ ]: os.path.isfile(xmls[2275])
Out[ ]: False

In [ ]: xmls = np.delete(xmls, 2275)
In [ ]: len(xmls)
Out[ ]: 4201
```

---

Now if I run the cell above, I expect no missing files (hopefully you never even saw one).

While we are at it, let's correct the annotators' typos

In [None]:
#%%file aliases.py
ALIASES = {u'0': 'unlabeled',
 u'None': 'unlabeled',
 u'asdf': 'unlabeled',
 u'blacony': 'balcony',
 u'c': 'unlabeled',
 u'c_2': 'unlabeled',
 u'corice': 'cornice',
 u'cornice\xe7': 'cornice',
 u'cornise': 'cornice',
 u'dasdas': 'unlabeled',
 u'do': 'unlabeled',
 u'doot': 'door',
 u'egative': 'negative',
 u'f': 'unlabeled',
 u'facode': 'facade',
 u'fdsa': 'unlabeled',
 u'fgb': 'unlabeled',
 u'https_//vision.csi.miamioh.edu/_collection_labelme_mode_f_folder_batch2_image_ny_many-0289.jpg_username_michelle_objects_facade,sky,tree,obstruction,window,door,molding,balcony,sill,cornice,shop': 'unlabeled',
 u'ledg': 'ledge',
 u'legde': 'ledge',
 u'license plate': 'unlabeled',
 u'modeling': 'molding',
 u'molging': 'molding',
 u'neative': 'negative',
 u'neattive': 'negative',
 u'negaitve': 'negative',
 u'negarive': 'negative',
 u'negattive': 'negative',
 u'negtive': 'negative',
 u'ngeative': 'negative',
 u'o': 'unlabeled',
 u'o bstruction': 'obstruction',
 u'ob': 'obstruction',
 u'obatruction': 'obstruction',
 u'obstrucion': 'obstruction',
 u'obstruction\xe7': 'obstruction',
 u'obstuction': 'obstruction',
 u'occluded': 'obstruction',
 u'occluision': 'obstruction',
 u'occlusion': 'obstruction',
 u'quitame': 'unlabeled',
 u'siil': 'sill',
 u'sil': 'sill',
 u'sing': 'sign',
 u'test': 'unlabeled',
 u'tre': 'tree',
 u'unknonw': 'unlabeled',
 u'unknow': 'unlabeled',
 u'unknown': 'unlabeled',
 u'unknwon': 'unlabeled',
 u'unlabale': 'unlabeled',
 u'unlabed': 'unlabeled',
 u'unlabel': 'unlabeled',
 u'unlabele': 'unlabeled',
 u'unlabeled_': 'unlabeled',
 u'unlabled': 'unlabeled',
 u'w': 'unlabeled',
 u'wi': 'unlabeled',
 u'windows': 'window',
 u'air-conditioner': 'air_conditioner',
 u'fire-escape-balcony': 'fire_escape_balcony',
 u'fire-escape-ladder': 'fire_escape_ladder'}

When I first put together this list of aliases (based on counting how many unique object names I found) it accidentally had a cycle of aliases.  The following code was used to identify a cycle in my original list of ALIASES

```python
reduced_aliases = {}
for k in ALIASES:
    v = ALIASES[k]
    while v in ALIASES:
        v = ALIASES[v]
    reduced_aliases[k] = v

ALIASES = reduced_aliases
```

Once I had settled on an array of aliases, I renamed all of the misspelled objects in the XML files using this code:

```python
import xmltodict
num_replacements = 0

for i, xml in enumerate(xmls):
    contents = open(xml).read().decode('utf8')
    initial_replacements = num_replacements
    for key, value in ALIASES.iteritems():
        pattern = u"<name>{}</name>".format(key)
        replacement = u"<name>{}</name>".format(value)
        if pattern in contents:
            contents = contents.replace(pattern, replacement)
            num_replacements += 1
    
    if num_replacements > initial_replacements:
        with open(xml, 'w') as f:
            f.write(contents)
    print '\r {} of {}, '.format(i, len(xmls)), num_replacements, "substitutions",
```

As part of understanding the labels, I wanted to get a count of how often each occurred. The following code produced a dictionary of names:


```python
In [ ]: import xmltodict
        names = {}
        for i, xml in enumerate(xmls):
            contents = open(xml).read().decode('utf8')
            data = xmltodict.parse(contents)
            objects = data['annotation']['object']
            if not isinstance(objects, list):
                objects = [objects]
            for o in objects:
                names[o['name']] = names.get(o['name'], 0) + 1
            print '\r {} of {}, '.format(i, len(xmls)),
        print 
```


```python
In [ ]: names
Out[ ]: {u'air_conditioner': 939,
         u'awning': 606,
         u'balcony': 4901,
         u'bay': 191,
         u'cornice': 19517,
         u'deco': 2564,
         u'door': 3765,
         u'facade': 4842,
         u'fire escape': 7,
         u'fire_escape_balcony': 777,
         u'fire_escape_ladder': 684,
         u'flag': 94,
         u'ledge': 1763,
         u'molding': 5841,
         u'negative': 2209,
         u'obstruction': 3954,
         u'pillar': 1875,
         u'roof': 14,
         u'shop': 3013,
         u'sign': 2041,
         u'sill': 15603,
         u'sky': 821,
         u'tree': 1465,
         u'unlabeled': 743,
         u'window': 51621}
```

Based on the actual labels we have (except 'fire escape', which should be deleted), I have these features
> *NOTE* The fire escape label was almost immediately replaced with 'fire_escape_balcony' and 'fire_escape_ladder' which, together, form the fire escape structure as a whole. I wanted the labelers to focus on the smaller parts

> *NOTE* The 'roof' label is not very prevalent -- I decided that if I had them label 'facade' and 'sky' then I could infer, to some extent, the roof. Also there was debate between the co-authors about what 'roof' meant. 

In [None]:
# %load new_feature_names_ordered.py
FEATURES=[
 u'unlabeled', # s/negative/unlabeled for all above
 u'sky',
 u'facade',
 u'bay',
 u'deco',
 u'door',
 #u'fire escape',
 u'roof',
 u'sign',
 u'window',
 u'air_conditioner',
 u'shop',
 u'pillar',
 u'cornice',
 u'sill',
 u'molding',
 u'ledge',
 u'balcony',
 u'fire_escape_balcony',
 u'fire_escape_ladder',
 u'flag',
 u'awning',
 u'obstruction',  # --> s/negative/unlabeled for all below
 u'tree',
 u'negative',     # --> s/*/negative for all below
]

In [None]:
%run -i new_feature_names_ordered.py

In [None]:
# NOTE(review): this is not the cell's last expression, so the length is presumably not displayed -- confirm.
len(FEATURES)

# Print every label (with its count) that is neither an alias key, an alias target, nor one of FEATURES.
# NOTE(review): `names` is only assigned inside a fenced, non-executed snippet earlier in this notebook,
# so on a fresh kernel this loop presumably raises NameError -- confirm, and run that snippet first.
for k in names:
    if (k not in ALIASES)  and (k not in ALIASES.values()) and (k not in FEATURES):
        print k, ":", names[k]

It is okay to ignore fire escape as a label; I had them split that into two other labels but the old fire escape labels remain

In [None]:
with open('files.txt', 'w') as f:
    f.writelines([os.path.relpath(xml, 'labelme/Annotations') + '\n' for xml in xmls])
print "Updated files.txt"

In [None]:
!tail files.txt

# Generate Pre-Processed Training Data

In [None]:
from munch import Munch, munchify
import munch

In [None]:
!mkdir -p ./data/labelme-out

**NOTE:** This next script will take FOREVER, so before we run it I think we want to work out some things...
   - Crop in to the largest facade in the image, so we do not waste training time
   - Add some extra labels 

In [None]:
import pyfacades.labelme.annotation
reload(pyfacades.labelme.annotation)
from pyfacades.labelme.annotation import Annotation


In [None]:
import PIL
from PIL import Image, ImageDraw

def draw_objects(im, objects, fill=2, outline=3, vcuts=False):
    """Rasterize annotation objects into the label image `im` (modified in place).

    Objects whose polygon has three or fewer points are skipped.

    With `vcuts` False each object is drawn with `fill` inside and `outline`
    along its border. With `vcuts` True the object is drawn entirely in `fill`
    (border included) and only two vertical lines, at the left and right
    extremes returned by `o.bounds()`, are drawn in `outline`.

    Returns `im`.
    """
    canvas = Image.fromarray(im)
    for obj in objects:
        if len(obj.polygon.points) <= 3:
            continue

        if not vcuts:
            obj.draw(canvas, fill, outline)
            continue

        # Border gets the fill value, so the only outline pixels are the vertical cuts below
        obj.draw(canvas, fill, fill)
        pen = ImageDraw.Draw(canvas)
        top, left, bottom, right = obj.bounds()
        for x in (left, right):
            pen.line((x, top, x, bottom), fill=outline)
        del pen

    # Copy the drawing back into the caller's array
    im[...] = array(canvas)
    return im

In [None]:
from skimage.morphology import binary_dilation, disk
def thicken_outlines(im, radius=6, outline_color=3):
    """Dilate the pixels equal to `outline_color` with a disk of the given radius.

    The dilated region is painted with `outline_color`, overwriting whatever
    was there. `im` is modified in place and also returned.
    """
    grown = binary_dilation(im == outline_color, disk(radius))
    im[grown] = outline_color
    return im

In [None]:
def make_mask(a, name, radius=6, fill=2, outline=3, vcuts=False):
    """Build a uint8 label image for all objects of annotation `a` called `name`.

    The image has `a.imagesize.nrows` rows and `a.imagesize.ncols` columns and
    starts out as zeros. The matching objects are drawn with `draw_objects`
    (using `fill`, `outline` and `vcuts`), and the outline is then widened by
    `thicken_outlines` with the given `radius`.
    """
    shape = (a.imagesize.nrows, a.imagesize.ncols)
    selected = [obj for obj in a.objects if obj.name == name]

    mask = np.zeros(shape, dtype=np.uint8)
    draw_objects(mask, selected, fill=fill, outline=outline, vcuts=vcuts)
    thicken_outlines(mask, radius=radius, outline_color=outline)
    return mask

In [None]:
a = Annotation(xmls[0])
a, a.objects

In [None]:
# Values stored in every label mask
NEGATIVE = 0
UNKNOWN  = 1
POSITIVE = 2
OUTLINE  = 3


# Drawing parameters used for any feature that is not customized below
default = Munch()
default.fill = POSITIVE
default.outline = UNKNOWN
default.radius = 1 # 3 pixel thick border
default.vcuts = False

# Give every feature its OWN copy of the defaults. Seeding all features with the same
# `default` object and relying on munchify() to copy it makes the per-feature tweaks below
# depend on munchify's copying behavior (which may differ between munch versions); with a
# shared object, changing one feature would silently change all of them.
settings = Munch((str(f), Munch(default)) for f in FEATURES)

# Windows have a slightly thicker outline
settings.window.outline = OUTLINE
settings.window.radius  = 3

# Facades have a much thicker outline
settings.facade.outline = OUTLINE
settings.facade.radius = 10
settings.facade.vcuts=True

# The unknown area near the edge of 'sky' is thicker than normal
# the labelers had a hard time here I think
settings.sky.radius = 15
settings.sky.outline = UNKNOWN

#print munch.toYAML(settings)

In [None]:
with open('settings.yml', 'w') as f:
    f.write(munch.toYAML(settings))

In [None]:
def make_all_masks(a):
    """Build one label mask per feature that occurs in annotation `a`.

    The returned Munch holds:
      - 'unlabeled': a boolean image, True where an 'unlabeled' object was drawn
      - 'rgb': the image returned by `a.get_image()` as an array
      - one entry per feature name found in `a.objects` that is also in FEATURES,
        drawn with that feature's entry in `settings`; pixels inside the
        unlabeled region are set to 1 in each of these masks.

    NOTE(review): 'unlabeled' is itself listed in FEATURES, so when the annotation
    contains 'unlabeled' objects the boolean entry appears to be replaced by a
    mask drawn with `settings['unlabeled']` -- confirm that this is intended.
    """
    present = unique([obj.name for obj in a.objects if obj.name in FEATURES])
    unlabeled = make_mask(a, 'unlabeled', radius=0, outline=1, fill=1) == 1

    masks = Munch()
    masks.unlabeled = unlabeled
    masks.rgb = array(a.get_image())

    for feature in present:
        options = settings[feature]
        layer = make_mask(a, feature,
                          radius=options.radius,
                          fill=options.fill,
                          outline=options.outline,
                          vcuts=options.vcuts)
        layer[unlabeled] = 1
        masks[feature] = layer

    return masks
        

In [None]:
masks = make_all_masks(a)

In [None]:
import copy

In [None]:
def expand_bounds(bounds, hpercent, vpercent):
    """Grow a (ymin, xmin, ymax, xmax) box by a fraction of its own size.

    The left and right edges each move outwards by `hpercent` times the box
    width, the top and bottom edges by `vpercent` times the box height; both
    offsets are rounded to whole numbers. The result is not clipped to any image.
    """
    top, left, bottom, right = bounds
    pad_x = int(round(hpercent * (right - left)))
    pad_y = int(round(vpercent * (bottom - top)))
    return top - pad_y, left - pad_x, bottom + pad_y, right + pad_x

def transpose_bounds(bounds):
    """Reorder a (ymin, xmin, ymax, xmax) box into (xmin, ymin, xmax, ymax)."""
    y0, x0, y1, x1 = bounds
    return x0, y0, x1, y1

def clip_bounds(inner, outer):
    """Restrict the box `inner` to the box `outer`.

    Both are indexed as (min, min, max, max): the first two entries are raised
    to at least the corresponding `outer` entries, the last two are lowered to
    at most the corresponding `outer` entries.
    """
    low_0 = max(inner[0], outer[0])
    low_1 = max(inner[1], outer[1])
    high_0 = min(inner[2], outer[2])
    high_1 = min(inner[3], outer[3])
    return (low_0, low_1, high_0, high_1)

In [None]:
def cropped_masks(mask, bounds):
    """Crop every layer of `mask` to `bounds` (ymin, xmin, ymax, xmax).

    `bounds` is first clipped to the extent of `mask.rgb`, and its entries are
    truncated to int before slicing. The result is a new container of the same
    type as `mask` with the same keys; each value is a slice of the
    corresponding layer in `mask`.
    """
    # A shallow copy is enough: every key is replaced below, so deep-copying
    # all of the full-size arrays first would only waste time and memory.
    result = copy.copy(mask)
    mask_bounds = (0, 0, mask.rgb.shape[0], mask.rgb.shape[1])
    ymin, xmin, ymax, xmax = clip_bounds(bounds, mask_bounds)
    rows = slice(int(ymin), int(ymax))
    cols = slice(int(xmin), int(xmax))
    for key in mask:
        result[key] = mask[key][rows, cols]
    return result

In [None]:
def mask_negative(labels):
    """Return `labels` as a masked array in which every value below 1 is masked out."""
    below_one = np.ma.less(labels, 1)
    return np.ma.masked_where(below_one, labels)

In [None]:
# NOTE(review): this turns every warning into an exception for the rest of the kernel session,
# not only for this cell. `warnings` is not imported in any visible cell -- presumably it is
# provided by %pylab; confirm.
warnings.filterwarnings('error') # For debugging -- with %pdb on....

def extract_facades(a, masks):
    """Crop `masks` around every 'facade' object of annotation `a`.

    For each facade the bounding box is widened by 10% of its width on both
    sides, its bottom is limited to the image height, and its top is moved up
    so the box is at least 512 rows tall. A box that is taller than it is wide
    is then made square around its horizontal center (the left edge is kept at
    or above 0). The crop itself is done by `cropped_masks`.

    Returns a list with one cropped set of masks per facade.
    """
    facades = [obj for obj in a.objects if obj.name == 'facade']
    crops = []
    for facade in facades:
        top, left, bottom, right = expand_bounds(facade.bounds(), 0.10, 0.0)

        image_rows = masks.rgb.shape[0]
        if bottom > image_rows:
            bottom = image_rows
        if bottom - top < 512:
            top = bottom - 512

        height = bottom - top
        width = right - left
        if float(width) / height < 1:
            center = left + width / 2.
            left = max(0, center - height / 2.)
            right = left + height

        crops.append(cropped_masks(masks, [top, left, bottom, right]))
    return crops

In [None]:
# Show every cropped facade with its label masks overlaid.
facade_masks = extract_facades(a, masks)
for m2 in facade_masks:
    figure()
    imshow(m2.rgb)
    # A mask only exists for features present in the annotation, so skip the missing ones
    # instead of failing on the attribute lookup.
    for layer in ('facade', 'window', 'tree', 'sky'):
        if layer in m2:
            imshow(mask_negative(m2[layer]), alpha=0.25, vmin=0, vmax=3)

In [None]:
from pyfacades.util import split_tiles

In [None]:
print "The RGB shape is:", facade_masks[0].rgb.shape
print "The 'window' mask shape is:", facade_masks[0].window.shape

In [None]:
def tile_mask(mask, shape=(512, 512), overlap=16):
    """Split every layer of `mask` into tiles and regroup them tile by tile.

    Each layer is passed to `split_tiles` with the given `shape` and `overlap`.
    Returns a list with one Munch per tile, mapping each key of `mask` to that
    layer's tile.
    """
    per_layer = {}
    for key in mask:
        layer = mask[key]
        if key == 'rgb':
            # RGB is a 3D, channels_last array; make it channels_first only for the split
            pieces = list(split_tiles(layer.transpose(2, 0, 1), shape, overlap))
            per_layer[key] = [piece.transpose(1, 2, 0) for piece in pieces]
        else:
            # split_tiles assumes 3D, channels_first input, so wrap the layer in a
            # one-element stack and squeeze each resulting tile afterwards
            pieces = list(split_tiles(array([layer]), shape, overlap))
            per_layer[key] = [piece.squeeze() for piece in pieces]

    num_tiles = len(list(per_layer.values())[0])

    return [munchify(dict((key, pieces[index]) for key, pieces in per_layer.items()))
            for index in range(num_tiles)]

In [None]:
tiles = tile_mask(facade_masks[0])

In [None]:
print "Found", len(tiles), "tiles"
print "Found", tiles[0].keys(), "features"
print "The RGB data shape is:", tiles[0].rgb.shape
print "The 'window' shape is:", tiles[0].window.shape


In [None]:
# Draw every tile of every facade: one row per facade, one column per tile.
figure()
tiles_per_facade = list(map(tile_mask, facade_masks))
# Facades may yield different numbers of tiles; size the grid for the widest row.
ncols = max(list(map(len, tiles_per_facade)) + [1])
for f, tiles in enumerate(tiles_per_facade):
    for i in range(len(tiles)):
        # Offset by the facade's row; using i+1 alone would draw every facade over the first row
        subplot(len(facade_masks), ncols, f*ncols + i + 1)
        imshow(tiles[i].rgb/255.)
        # Only overlay the layers this tile actually has
        for layer in ('facade', 'window', 'sky', 'tree'):
            if layer in tiles[i]:
                imshow(mask_negative(tiles[i][layer]), alpha=0.3, vmin=0, vmax=3)
        noticks()

## Organization / Plan

- Each source image will be put into its own folder
- Within each folder, each facade will be put into a 2-digit, zero-padded, numbered folder (starting at 01)
- Within each facade, the tiles will be saved as '####.npz', where #### is a zero-padded index starting at 0001
- Within the NPZ, each layer will be saved with the corresponding feature name.
- Anything that does not have a label saved in the file should be assumed NEGATIVE
- Unknown regions are saved under the label 'unlabeled'


In [None]:
import logging

In [None]:
import pyfacades.labelme.annotation as anno

In [None]:
def has_facade(a):
    """Return True when at least one object in `a.objects` is named 'facade'."""
    return any(obj.name == 'facade' for obj in a.objects)

def restore_facade(a):
    """Add a synthetic facade to annotation `a`, flanked by two 'unlabeled' strips.

    Three full-height boxes are appended to `a.objects`, in this order:
      - 'unlabeled': the leftmost 100 columns
      - 'unlabeled': the rightmost 100 columns
      - 'facade': everything in between

    NOTE(review): all three objects get the same id (the number of objects
    before the call) -- confirm that nothing downstream needs unique ids.
    """
    ncols = a.imagesize.ncols
    nrows = a.imagesize.nrows
    object_id = len(a.objects)

    def box(name, xmin, xmax):
        """Describe a full-height box spanning columns xmin..xmax, labeled `name`."""
        spec = Munch()
        spec.name = name
        spec.deleted = 0
        spec.verified = 0
        spec.occluded = 'no'
        spec.attributes = None
        spec.parts = Munch()
        spec.parts.hasparts = None
        spec.parts.ispartof = None
        spec.date = u'17-Nov-2017 19:48:13'
        spec['id'] = object_id
        spec['type'] = u'bounding_box'
        spec.polygon = Munch()
        spec.polygon.username = 'generate-training-data(script)'
        spec.polygon.pt = [dict(x=xmin, y=0),
                           dict(x=xmax, y=0),
                           dict(x=xmax, y=nrows),
                           dict(x=xmin, y=nrows)]
        return spec

    specs = [box('unlabeled', 0, 100),
             box('unlabeled', ncols-100, ncols),
             box('facade', 100, ncols-100)]

    restored = [anno.Object() for _ in specs]
    for obj, spec in zip(restored, specs):
        obj.set_from_dict(spec)

    a.objects.extend(restored)

In [None]:
from time import clock

In [None]:
# Dedicated figure (number 100) that preprocess_single_image re-selects to render each tile's preview JPG
working_figure = figure(100)

# Per-image feature counts, keyed by file stem; filled in by preprocess_single_image
all_features = {}

def _draw_tile_overlays(draw, mask):
    """Draw one tile: its RGB image plus translucent overlays of the layers it contains.

    `draw` is an imshow-style callable (pylab's `imshow` or a bound `ax.imshow`).
    The 'facade' layer is required; 'window', 'sky' and 'tree' are drawn only if present.
    """
    draw(mask.rgb/255.)
    draw(mask_negative(mask.facade), alpha=0.3, vmin=0, vmax=3)
    for layer in ('window', 'sky', 'tree'):
        if layer in mask:
            draw(mask_negative(mask[layer]), alpha=0.3, vmin=0, vmax=3)


def preprocess_single_image(xml, output_dir,
                            skip_no_windows=True,  # It seems like some images have a facade, but nothing else
                            visualize=True,
                            redo=False):
    """Cut one annotated image into per-facade tiles and write them to disk.

    Every tile is saved as `<output_dir>/<stem>/<facade ##>/<tile ####>.npz`
    (one array per layer) with a preview `.jpg` next to it, where `<stem>` is
    the XML file name without extension. If the annotation has no 'facade'
    object, one is added with `restore_facade`. The per-feature object counts
    of the image are stored in the global `all_features` under `<stem>`.

    Parameters
    ----------
    xml : path of the annotation XML file.
    output_dir : root folder for the generated tiles.
    skip_no_windows : if True, images without any 'window' object are skipped.
    visualize : if True, status and results are drawn on the figure that is
        current when the function is called.
    redo : if False, an image whose output folder already exists is skipped.

    Returns
    -------
    (num_out, timings)
        `num_out` is the number of tiles written; when the image is skipped it
        is the number of .npz files already present for it, and 0 if the
        annotation could not be read. `timings` is a dict of `clock()`
        differences for the 'setup', 'tiles' and 'plot' stages (all zero when
        nothing was processed).
    """
    stem = os.path.splitext(os.path.basename(xml))[0]
    timings = dict(setup=0., tiles =0., plot=0.)

    if visualize:
        # Remember the caller's figure; the tile previews below switch to the working figure
        fig = gcf()

    if redo==False and os.path.isdir(os.path.join(output_dir, stem)):
        if visualize:
            subplot(111)
            title('Skipping {}, already processed!'.format(stem))
        return len(glob(os.path.join(output_dir, stem, '*', '*.npz'))), timings

    try:
        a = Annotation(xml)
    except IOError as e:
        logging.error('IOError:{}, when processing {}'.format(e, xml))
        return 0, timings

    if not has_facade(a):
        restore_facade(a)

    # Count how many objects of each known feature this image has
    features = dict(zip(FEATURES, [0]*len(FEATURES)))
    for o in a.objects:
        if o.name in features:
            features[o.name]+=1
    all_features[stem] = features

    if features['window'] == 0 and skip_no_windows:
        if visualize:
            subplot(111)
            title('Skipping {}, no windows!'.format(stem))
        return len(glob(os.path.join(output_dir, stem, '*', '*.npz'))), timings

    t1 = clock()
    all_masks = make_all_masks(a)
    useful_masks = extract_facades(a, all_masks)
    tiles = [tile_mask(useful_mask) for useful_mask in useful_masks]

    t2 = clock()

    num_out = 0
    for facade_index, facade_tiles in enumerate(tiles):
        for tile_index, mask in enumerate(facade_tiles):
            tile_dir = os.path.join(output_dir, stem, '{:02}'.format(facade_index+1))
            output_path = os.path.join(tile_dir, '{:04}.npz'.format(tile_index+1))
            try:
                os.makedirs(tile_dir)
            except OSError:
                # "Already exists" is expected; any other failure (permissions, disk) must surface
                if not os.path.isdir(tile_dir):
                    raise
            np.savez(output_path, **mask)

            # Render the preview on the shared working figure. It must be cleared first:
            # otherwise every tile's images pile up in the same axes, which leaks memory
            # and lets earlier tiles influence later previews.
            figure(working_figure.number,  figsize=(3,3))
            clf()
            _draw_tile_overlays(imshow, mask)
            noticks()
            axis('off')
            tight_layout()
            savefig(os.path.join(tile_dir, '{:04}.jpg'.format(tile_index+1)))
            num_out += 1
    t3 = clock()

    if visualize:
        figure(fig.number)
        clf()
        # One row per facade; size the columns for the facade with the most tiles so the
        # grid stays consistent even when facades have different tile counts.
        ncols = max([len(facade_tiles) for facade_tiles in tiles] + [1])
        for facade_index, facade_tiles in enumerate(tiles):
            for tile_index, mask in enumerate(facade_tiles):
                ax = subplot(len(tiles), ncols, facade_index*ncols + tile_index + 1)
                _draw_tile_overlays(ax.imshow, mask)
                noticks()

    t4 = clock()
    timings = dict(setup=t2-t1, tiles = t3-t2, plot=t4-t3)

    return num_out, timings
    

In [None]:
output_dir = './data/aeriels-24class'

In [None]:
xml

In [None]:
figure()
preprocess_single_image(xml, output_dir, redo=True)

In [None]:
figure()
imshow(imread('data/aeriels-24class/{}/01/0001.jpg'.format(os.path.splitext(os.path.basename(xml))[0] )))
#imshow(imread('data/aeriels-24class/madrid-1-6-orthographic/01/0001.jpg'))
axis('off')

In [None]:
from glob import glob
print len(glob('data/aeriels-24class/*/*/*.npz'))

In [None]:
empty_xmls = set()
odd_xmls = set()
for i, xml in enumerate(xmls):
    try:
        a = Annotation(xml)
    except IOError:
        pass
    facades = [o for o in a if o.name =='facade']
    if len(facades) == 0:
        objects = set([o.name for o in a.objects if o.name != 'unlabeled'])
        if len(objects) == 0:
            empty_xmls.add(xml)
            #print a.filename, "empty",
        else:
            odd_xmls.add(xml)
            #print a.filename, "odd", ','.join(objects),
        print '\r',i,  xml,

In [None]:
print len(empty_xmls), "empty xml files"
print len(odd_xmls), "odd xmls (no facade)"

In [None]:
import gc
print gc.collect(), "collected garbage"
print "Figures:",[m.num for m in matplotlib._pylab_helpers.Gcf.get_all_fig_managers()]

**NOTE:** Occasionally as I worked on this notebook I realized I had produced the output completely wrong and had to restart. Since I generally try not to reprocess files I have already produced, occasionally I typed this into a cell in order to get rid of all output. **_Be very careful with this!_**

    !rm -r ./data/aeriels-24class/*

I realized as I produced the data that many of the images had facades labeled, but no windows. According to our labeling process, it is possible that facades and sky would be identified (phase I labeling) and that the labelers did not get to labeling the other features on facades after I rectified them (phase II labeling).  After phase II labeling, absence of a label means it is not present, unless it is part of an 'unlabeled' region. 

I need to go back and remove any preprocessed files that do not have features; in particular I will remove any facade that does not include at least one window.

In [None]:
feature_counts_per_xml = {}

for k, xml in enumerate(xmls):
    if xml in empty_xmls:
        continue
        
    stem = os.path.splitext(os.path.basename(xml))[0]
    
    fc = dict(zip(FEATURES, [0]*len(FEATURES)))
    a = munch.munchify(xmltodict.parse(open(xml), force_list=['object'])).annotation
    for o in a.object:
        if o.name in fc:
            fc[o.name] += 1
        else:
            print xml
    feature_counts_per_xml[xml] = fc
    print '\r {} of {}'.format(k, len(xmls)),

In [None]:
# Annotation files named individually for the 'fire escape' label.
# The commas matter: adjacent string literals without them are concatenated
# into a single string, which made this a one-element list.
fire_escape_xmls = [
    './labelme/Annotations/batch2/ny_many-0110.xml',
    './labelme/Annotations/batch2/ny_many-0200.xml',
    './labelme/Annotations/batch2/ny_many-0393.xml',
    './labelme/Annotations/batch2/ny_many-0426.xml',
]

In [None]:
window_xmls = [xml for (xml, fc) in feature_counts_per_xml.iteritems() if fc['window'] > 0]

In [None]:
window_xmls[:3]

In [None]:
# NOTE(review): './lab' looks like a truncated path; unless that exact key exists this
# lookup raises KeyError -- confirm the intended key.
feature_counts_per_xml['./lab']

In [None]:
fig = figure(figsize=(10,8))
num_out = len(glob(os.path.join(output_dir, '*', '*', '*.npz')))
for k, xml in enumerate(window_xmls):
    if xml in empty_xmls:
        continue
    
    stem = os.path.splitext(os.path.basename(xml))[0]
    
    if os.path.isdir(os.path.join(output_dir, stem)):
        continue # Do not reprocess work done previously -- for some reason this crashed.
        
    print '\r{:>6} of {:6} ({} so far)'.format(k, len(xmls), num_out),
    n, t = preprocess_single_image(xml, output_dir) 
    num_out += n
    suptitle('{:>6} of {:6} ({} so far): i:{:.2f} t:{:.2f} p:{:.2f}\n{}'.format(k, len(window_xmls), num_out,
                                                                     t['setup'], t['tiles'], t['plot'],
                                                                     xml))
    fig.canvas.draw()
    gc.collect()

print '\r{:>6} of {:6} ({} so far)'.format(k, len(xmls), num_out),

In [None]:
def stem(f):
    """Return the file name of path `f` without its directory and without its last extension."""
    filename = os.path.basename(f)
    root, _extension = os.path.splitext(filename)
    return root

## Compress and Move data to a publicly accessible location

In [None]:
%pylab inline
import os  # crashed due to code in a (now deleted) cell -- too lazy to scroll up


In [None]:
if not os.path.isfile('{}/dataset.zip'.format(output_dir)):
    !zip {output_dir}/dataset.zip {output_dir}/*/*/*.npz

In [None]:
# if not os.path.isfile('{}/dataset-visualization.zip'.format(output_dir)):
#     !zip {output_dir}/dataset-visualization.zip {output_dir}/*/*/*.jpg

In [None]:
output_dir

In [None]:
!ls ./data/aeriels-24class/*/*/*.npz | tail

In [None]:
test = np.load('./data/aeriels-24class/regent_many-0421-facade-02-original/01/0001.npz')

In [None]:
test.keys()

In [None]:
type(test)

In [None]:
!du -h ./data/aeriels-24class/dataset.zip

Now open up a terminal and copy the data into a location that can be shared

    mkdir /mnt/S/Teams/Vision/fdb
    cp ./data/aeriels-24class/dataset.zip /mnt/S/Teams/Vision/fdb