# Generate Dataset Splits for our GSV Dataset

This notebook is the script we run to produce different splits of the data.

In [1]:
%pylab notebook
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os
from glob import glob
import pandas as pd
import shapely
import shapely.geometry
import tqdm as tq
import sys
import uuid

In [3]:
# Put the repository root (the parent of the working directory) at the front
# of the import path, unless it is already listed there.
REPO = os.path.abspath('..')
if sys.path.count(REPO) == 0:
    sys.path.insert(0, REPO)

In [4]:
import annotation
from annotation import Annotation

In [5]:
# Absolute path of the dataset folder, resolved relative to the current
# working directory. The XML glob, the relative paths stored per object and
# the objects pickle in this notebook are all built from it.
ROOT = os.path.abspath('../gsv24')   # Data is assumed to be in the repo

In [6]:
# Collect every annotation XML below ROOT/Annotations, including nested folders.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them:
# `xmls[0]` and anything iterated from this list is then the same on every run.
xmls = sorted(glob(f'{ROOT}/Annotations/**/*.xml', recursive=True))
# Fail here with a clear message rather than with an IndexError on `xmls[0]` later.
assert xmls, f'No annotation XML files found under {ROOT}/Annotations'

In [7]:
# Build an Annotation from the first XML path as a sample to look at;
# it is drawn in the following cell.
a = Annotation(xmls[0])

In [8]:
# Open a new figure and draw the sample annotation on that figure's axes.
fig = figure()
ax = fig.gca()
a.plot(ax)

<IPython.core.display.Javascript object>

## Give every object a safe ID -- one that is not repeated by any object in any other file

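_Code kept in markdown on purpose -- it should never be run again (re-running would assign a new ID to every object and re-save every annotation file)_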
```python
# For every annotation file: give each object a fresh uuid1-based hex id,
# then call save_annotation() on the file (presumably writing it back to disk).
# Running this again would replace every id that was assigned before.
for xml in tq.tqdm_notebook(xmls):
    a = Annotation(xml)
    for o in a.iter_objects():
        o.id = uuid.uuid1().hex
    a.save_annotation()
```

_Confirm that ids are unique_
```python
# Walk all objects of all annotation files and collect their ids;
# the assert fails on the first id that has been seen before.
uids = set()
for xml in tq.tqdm_notebook(xmls):
    a = Annotation(xml)
    for o in a.iter_objects():
        assert o.id not in uids
        uids.add(o.id)
```

In [10]:
# Accumulators for the extraction loop. They live in their own cell so the
# loop cell can be re-run without losing progress:
#   obs      -- object id -> record of that object
#   finished -- XML paths that have been processed completely
obs, finished = dict(), set()

In [11]:
# Build one record per usable object, keyed by the object's id.
# Files already in `finished` and ids already in `obs` are skipped, so the
# cell can be re-run after an interruption and continue where it stopped.
for xml in tq.tqdm_notebook(xmls):
    if xml in finished:
        continue
    a = Annotation(xml)
    for o in a.iter_objects():
        # Skip deleted objects and objects recorded on an earlier pass.
        if o.deleted or o.id in obs:
            continue
        pts = a.points(o)
        if np.isnan(pts).any():
            continue  # coordinates contain NaN -- no usable geometry
        shape = shapely.geometry.Polygon(pts)
        # Column 0 holds the x coordinates, column 1 the y coordinates.
        xs, ys = pts[:, 0], pts[:, 1]
        fields = {
            'label': o.name,
            # Paths are stored relative to the dataset root.
            'xml': os.path.relpath(xml, ROOT),
            'img': os.path.relpath(a.image_path, ROOT),
            # Axis-aligned bounding box of the object's points.
            'xmin': xs.min(),
            'ymin': ys.min(),
            'width': xs.max() - xs.min(),
            'height': ys.max() - ys.min(),
            'shape': shape,
        }
        obs[o.id] = fields

    # Only mark the file as done once all of its objects were handled.
    finished.add(xml)

HBox(children=(IntProgress(value=0, max=1020), HTML(value='')))




In [12]:
# Turn the collected records into a table: one row per object, indexed by
# the object id (the keys of `obs`), with the columns in a fixed order.
df = pd.DataFrame.from_dict(
    obs,
    orient='index',
    columns=['label', 'xml', 'img', 'xmin', 'ymin', 'width', 'height', 'shape'],
)

In [13]:
# How many objects carry each label.
df['label'].value_counts()

window                 26062
cornice                12326
sill                   10085
molding                 3050
balcony                 2332
door                    2321
deco                    2311
unlabeled               2115
facade                  2066
obstruction             2025
sign                    1759
pillar                  1645
shop                    1584
ledge                   1405
air-conditioner          960
tree                     877
fire-escape-balcony      586
fire-escape-ladder       546
awning                   522
bay                      161
unknown                  101
flag                      83
sky                       62
roof                       3
Name: label, dtype: int64

In [14]:
# Write the object table to `objects.pkl` directly under the dataset root.
df.to_pickle(f'{ROOT}/objects.pkl')

In [16]:
# Keep only the rows whose label is exactly 'window'.
windows = df.loc[df['label'] == 'window']

In [18]:
# Show one randomly chosen window row as a spot check.
windows.sample(n=1)

Unnamed: 0,label,xml,img,xmin,ymin,width,height,shape
40c3fd7230a211e99a70613cc4b52a17,window,Annotations/merged/bordeaux_france-001015-0000...,Images/merged/bordeaux_france-001015-000008-s4...,1049.0,1163.0,52.0,78.0,"POLYGON ((1049 1163, 1101 1163, 1101 1241, 104..."


In [68]:
# Show one randomly chosen row of the full object table.
df.sample(n=1)

{'name': 'door',
 'deleted': 0,
 'verified': 0,
 'occluded': 'no',
 'attributes': None,
 'parts': {'hasparts': None, 'ispartof': None},
 'date': '06-Jul-2017 20:37:56',
 'id': '3e0896c430a211e99a70613cc4b52a17',
 'type': 'bounding_box',
 'polygon': {'username': 'anonymous', 'pt': [{'x': '937.0', 'y': '1481.0'}]}}

In [36]:
import annotationeditor

In [115]:
# Lay out a new figure as two stacked axes without ticks: a strip of height
# `lbh` (in figure fractions) at the bottom for the label selector, and the
# rest above it for the editor.
figure()
lbh = 0.15
labels_selector_ax = plt.axes([0., 0., 1., lbh], xticks=[], yticks=[])
ax = plt.axes([0., lbh, 1., 1 - lbh], xticks=[], yticks=[])
ax.axis('equal')

# `ax` was created last, so it is also the current axes.
ae = annotationeditor.AnnotationEditor(ax=ax, facade=a, root=ROOT)
label_box = ae.create_label_box(labels_selector_ax)
ae.set_label_box(label_box)

<IPython.core.display.Javascript object>

RuntimeWarning: invalid value encountered in reduce

In [74]:
# NOTE(review): `o` is not defined in this cell; presumably it is whatever
# object an earlier loop left behind. The recorded run of this cell ended in
# "ValueError: ... is not in list" -- confirm that `o` belongs to the
# annotation the editor `ae` was created with before relying on this.
ae.set_active_object(o)
ae.poly_selector.fit_active()
# Left disabled on purpose (would delete the active object and save):
# ae.delete_active()
# ae.save()

ValueError: {'name': 'door', 'deleted': 0, 'verified': 0, 'occluded': 'no', 'attributes': None, 'parts': {'hasparts': None, 'ispartof': None}, 'date': '06-Jul-2017 20:37:56', 'id': '3e0896c430a211e99a70613cc4b52a17', 'type': 'bounding_box', 'polygon': {'username': 'anonymous', 'pt': [{'x': '937.0', 'y': '1481.0'}]}} is not in list

In [64]:
# NOTE(review): this saves whichever Annotation `a` currently refers to; `a`
# is reassigned inside the loops earlier in the notebook -- confirm it is the
# intended file before running.
a.save_annotation()