In [1]:
# !pip install pymongo

In [2]:
from pymongo import MongoClient, DESCENDING
from pathlib import Path
import datetime
# import json
import simplejson
from pydash import omit, find_index, merge
from functools import partial
from bson import json_util
import os

from IPython.display import display, Image, JSON
import ipywidgets as widgets
from ipywidgets import HBox, VBox, Button, ButtonStyle, AppLayout, Layout, Style

from lib.PersistentSet import PersistentSet

In [3]:
images_dir = Path('./images')
gallery_dir = Path('./gallery')
handmade_dir = Path('./handmade')
label_dir = handmade_dir/'labels'
label_dir.mkdir(exist_ok=True)
current_ids_filepath = Path('./labeling_current_ids.json')

In [4]:
mongo = MongoClient('172.17.0.1', 27017)
db = mongo['bad-vis']
posts = db['posts']
imagemeta = db['imagemeta']
visimages = db['visimages']
vislabels = db['vislabels']

# Backup labels

In [5]:
# if os.path.isfile(label_dir/'labels.json'):
#     os.rename(label_dir/'labels.json', label_dir/f"labels_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.json")

In [6]:
# json.dump([omit(l, '_id') for l in vislabels.find()], open(label_dir/'labels.json', 'w'), default=json_util.default)

# Maintenance

In [7]:
# for label in vislabels.find():
#     visimage = visimages.find_one({'image_id': label['image_id']})
#     if not visimage:
# #         print(f"missing {label['image_id']}")
#         duplicated = [i for i in visimages.find({'duplicated_images': label['image_id']})]
#         if len(duplicated) > 1:
#             pass
# #             print(f"found more than one duplicated {label['image_id']} {duplicated}")
#         if not duplicated:
# #             pass
#             print(f"missing in duplicated either {label['image_id']}")
#             print(json.dumps(omit(label, '_id'), default=datetime.datetime.isoformat))
#         else:
#             duplicated = duplicated[0]
# #             print(f"found in duplicated {label['image_id']} {duplicated['image_id']}")

# Labeling

In [8]:
remarks_output = widgets.Output()

In [9]:
remarks_output

Output()

In [10]:
box_height = 800

In [11]:
def find_post (image_id):
    return posts.find_one({'post_id': imagemeta.find_one({'image_id': image_id})['post_id']})

In [12]:
def find_label (image_id):
    image = visimages.find_one({'image_id': image_id})
    image_ids = [image_id] + [i for i in image['duplicated_images']]
    visLabels = [l for l in vislabels.find({'image_id': {'$in': image_ids}})]
    if len(visLabels) > 1:
        print(f"labels warning: more than 1 label {image_ids}")
    return visLabels[0] if visLabels else None

In [13]:
def make_image_box (visImage, width=600, height=800, layoutArgs={}):
    image = widgets.Image(value=open(visImage['image_path'], 'rb').read(), width=600, height=height, layout=Layout(object_position='center center', object_fit='contain'))
    boxItems = {
        'image': image
    }

    image_box_layout = Layout(height=f'{box_height}px', justify_content='center', **layoutArgs)
    imageBox = HBox([image], layout=image_box_layout)

    return (imageBox, boxItems)

In [14]:
def make_link (url, text):
    return widgets.HTML(value=f"<a href='{url}' target='_blank'>{text}</a>")

In [15]:
def make_info_box (visImage, layoutArgs={}):
    output = widgets.Output()

    link = make_link(visImage['url'], visImage['image_id'])
    dp_links = [
        make_link(find_post(dp_image_id)['url'], dp_image_id)
        for dp_image_id in visImage['duplicated_images']
    ]
    image_link = make_link(f"http://vpn2d.mydev:7000/projects/bad-vis-browser/{visImage['image_path']}", visImage['image_path'])

    description = widgets.Output(layout=Layout(**layoutArgs))
    with description:
        print(f"{find_index(label_img_ids, lambda x: x == visImage['image_id'])} / {len(label_img_ids)}")
        print(f"tags: {len(visImage['tags'])}")
        for t in visImage['tags']:
            print(t)

    currentLabelsOutput = widgets.Output(layout=Layout(**layoutArgs))

    outputs = {
        'description': description,
        'currentLabelsOutput': currentLabelsOutput,
        'output': output
    }

    infoBox = VBox([link, *dp_links, image_link, description, currentLabelsOutput, output])

    return (infoBox, outputs)

In [16]:
def ckb_box_layout (rows, cols):
    return Layout(
        flex_flow='column wrap',
        height=f"{rows*30+20}px",
        width=f"{cols*125}px",
        align_content='flex-start'
    )

def make_ckbs (title, options, n_rows=10, layoutArgs={}, on_ckb_change=None):
    ckb_layout = Layout(
        width='120px'
    )

    def ckb_handler (change):
        ckb = change['owner']
        if change['new']:
            ckb.icon = 'check'
            ckb.button_style = 'success'
        else:
            ckb.icon = ''
            ckb.button_style = ''
        if on_ckb_change:
            on_ckb_change(change)

    ckbs = []
    ckbsByName = {}
    for n in options:
        name = f"{title}:{n}"
        ckb = widgets.ToggleButton(value=False, description=n.capitalize(), tooltip=name, icon='', button_style='', disabled=False, indent=False, layout=ckb_layout)
        ckb.observe(ckb_handler, names='value')
        ckbs.append(ckb)
        ckbsByName[name] = ckb

    form = VBox([
        widgets.Label(value=f"{title.capitalize()}"),
        VBox(ckbs, layout=ckb_box_layout(rows=n_rows, cols=(-(len(ckbs) // -n_rows))))
    ], layout=Layout(**layoutArgs))

    return (form, ckbsByName)

def make_remarks (n_rows=10, layoutArgs={}):
    remarks = widgets.Textarea(value='', layout=Layout(height='300px', width=f'240px'), disabled=False)
    return (VBox([widgets.Label(value=f"{'remarks'.capitalize()}"), remarks], layout=Layout(**layoutArgs)), remarks)

def make_ckb_box (options, n_rows=10, layoutArgs={}, on_ckb_change=None):

    formCkbsBox, formCkbsByName = make_ckbs('form', options['form'], n_rows, layoutArgs, on_ckb_change)
    faultCkbsBox, faultCkbsByName = make_ckbs('fault', options['fault'], n_rows, layoutArgs, on_ckb_change)
    dataCkbsBox, dataCkbsByName = make_ckbs('data', options['data'], n_rows, layoutArgs, on_ckb_change)
    layoutCkbsBox, layoutCkbsByName = make_ckbs('layout', options['layout'], n_rows, layoutArgs, on_ckb_change)
    metaphorCkbsBox, metaphorCkbsByName = make_ckbs('metaphor', options['metaphor'], n_rows, layoutArgs, on_ckb_change)
    mediaCkbsBox, mediaCkbsByName = make_ckbs('media', options['media'], n_rows, layoutArgs, on_ckb_change)
    flagCkbsBox, flagCkbsByName = make_ckbs('flag', options['flag'], n_rows, layoutArgs, on_ckb_change)

    remarksBox, remarks = make_remarks(n_rows, layoutArgs=layoutArgs)

    boxItemsByName = {}
    for d in [{'remarks': remarks}, formCkbsByName, faultCkbsByName, dataCkbsByName, layoutCkbsByName, metaphorCkbsByName, mediaCkbsByName, flagCkbsByName]:
        for k, v in d.items():
            boxItemsByName[k] = v

    return (VBox([
        HBox([formCkbsBox, faultCkbsBox]),
        HBox([dataCkbsBox, layoutCkbsBox, metaphorCkbsBox, mediaCkbsBox, flagCkbsBox, remarksBox])
    ]), boxItemsByName)

In [17]:
def make_btn_box (on_update=None, on_prev=None, on_next=None):
    btns = []

    if on_prev:
        updateAndPrevBtn = widgets.Button(description="Save -> Prev", button_style="info")
        def update_and_prev (btn):
            if on_update:
                on_update()
            if on_prev:
                on_prev()
        updateAndPrevBtn.on_click(update_and_prev)
        btns.append(updateAndPrevBtn)

    updateBtn = widgets.Button(description="Save", button_style="success")
    def update (btn):
        if on_update:
            on_update()
    updateBtn.on_click(update)
    btns.append(updateBtn)

    if on_next:
        updateAndNextBtn = widgets.Button(description="Save -> Next", button_style="info")
        def update_and_next (btn):
            if on_update:
                on_update()
            if on_next:
                on_next()
        updateAndNextBtn.on_click(update_and_next)
        btns.append(updateAndNextBtn)

    btnItems = {
        'updateBtn': updateBtn,
        'updateAndPrevBtn': updateAndPrevBtn,
        'updateAndNextBtn': updateAndNextBtn
    }

    return (HBox(btns), btnItems)

In [18]:
def make_label_box (image_id, options, next_callback=None, prev_callback=None):
    visImage = visimages.find_one({'image_id': image_id})
    vislabel = find_label(image_id)

    labels = set(vislabel['labels']) if vislabel else set()

    layoutArgs = {
        'padding': '10px',
        'margin': '5px',
        'border': '3px solid lightblue'
    }

    imageBox, imageBoxItems = make_image_box(visImage, width=600, height=box_height, layoutArgs=layoutArgs)

    infoBox, infoBoxItems = make_info_box(visImage, layoutArgs=layoutArgs)

    def show_current_labels ():
        currentLabelsOutput = infoBoxItems['currentLabelsOutput']
        currentLabelsOutput.clear_output()
        with currentLabelsOutput:
            for l in sorted(list(labels)):
                print(l)

    def on_ckb_change (change):
        ckb = change['owner']
        labelName = ckb.tooltip
        if change['new']:
            labels.add(labelName)
        else:
            labels.remove(labelName)
        show_current_labels()

    ckbBox, boxItemsByName = make_ckb_box(options, layoutArgs=layoutArgs, on_ckb_change=on_ckb_change)

    remarks = boxItemsByName['remarks']

    def update_label ():
        vislabels.find_one_and_update({'image_id': image_id}, {
            '$set': {
                'labels': sorted(list(labels)),
                'remarks': remarks.value,
                'updatedAt': datetime.datetime.utcnow()
            },
            '$setOnInsert': {'createdAt': datetime.datetime.utcnow()}
        }, upsert=True)
        with remarks_output:
            if remarks.value:
                print(f"{remarks.value}")
        with infoBoxItems['output']:
            print(f"{visImage['short_image_id']} updated")

    def on_update ():
        update_label()

    def on_prev ():
        if prev_callback:
            prev_callback(image_id)

    def on_next ():
        if next_callback:
            next_callback(image_id)

    btnBox, btnBoxItems = make_btn_box(on_update=on_update, on_prev=on_prev, on_next=on_next)

    panelBoxLayout = Layout(
        justify_content= 'flex-start',
        align_items= 'flex-start',
        align_content= 'flex-start',
    )
    panelBox = VBox([ckbBox, btnBox], layout=panelBoxLayout)

    for l in labels:
        boxItemsByName[l].value = True

    with infoBoxItems['output']:
        print('ready')

    pane_widths = [2, 4, 1]
    return AppLayout(left_sidebar=imageBox, center=panelBox, right_sidebar=infoBox, pane_widths=pane_widths, height=f'{box_height+10}px')

In [19]:
current_ids = ''
if not os.path.isfile(current_ids_filepath):
    current_ids = PersistentSet()
    current_ids.set_file(current_ids_filepath)
else:
    current_ids = PersistentSet.load_set(current_ids_filepath)

In [20]:
visImages = [i for i in visimages.find()]
visImages.sort(key=lambda x: x['popularity_score'])
visImages.reverse()

def label_images ():
    for i in visImages:
        visLabel = vislabels.find_one({'image_id': i['image_id']})
        if visLabel == None:
            yield i['image_id']

label_img_ids = [i['image_id'] for i in visImages]

In [21]:
cell_outputs = [widgets.Output(layout=Layout(height=f'{box_height+30}px')) for i in range(len(current_ids))]

In [22]:
labelOptions = {
    'form': [
        'barchart', 'linechart', 'scatterplot', 'dotplot', 'barcodechart', 'areachart', 'histogram',
        'nodelink', 'radarchart', 'bubblechart', 'dumbbellplot',
        'piechart', 'donutchart',
        'pyramid', 'venn', 'choropleth', 'flowmap', 'map', 'chernoffface',
        'table', 'heatmap', 'treemap', 'quadrant',
        'boxplot', 'violinplot', 'parallelcoor', 'streamgraph',
        'pictogram', 'guagechart', 'sankeydiagram', 'chorddiagram', 'sunbrust', 'voronoi',
        'unknown'
    ],
    'data': [
        'categorical', 'quantitative', 'indexvalue', 'accumulated',
        'percentage', 'probability',
        'ordinal', 'ranking', 'sequential',
        'timeseries', 'cyclic',
        'geospatial', 'flow',
        'network', 'tree',
        'set', 'bitmap',
        'text', 'multivariate',
    ],
#     'encoding': [
#         'position', 'position(unaligned)', 'length', 'area', 'tilt',
#         'curvature', 'region', 'motion', 'shape',
#         'depth', 'volume',
#         'luminance', 'saturation', 'hue'
#     ],
#     'mark': [
#         'line', 'rectangle', 'point'
#     ],
    'layout': [
        'circular', 'infographics', 'stacked', 'map', 'juxtaposition', 'overlay', 'mixed'
    ],
    'metaphor': [
        'pictograph', 'periodictable', 'gear', 'clock'
    ],
    'media': [
        'inreallife', 'printed', 'handdrawn', 'tv', 'ads', 'news', 'NSFW'
    ],
    'fault': [
        'percentage', 'percentage:sum', 'percentage:encoding', 'percentage:wholepart',
        'label', 'description',
        'axis', 'axis:label', 'axis:flipped', 'axis:truncated', "axis:missing", 'axis:double', # axis:dual
        'legend',
        'color', 'color:over12',
        'scale', 'scale:log', 'scale:inconsistent', 'binning',
        'area',
        'picto:distortion', 'picto:area',
        'position', 'itemorder',
        'connection',
        '3d', 'animation',
        'cluttering', 'occulusion',
        'data', 'data:selective', 'data:questionable', 'data:missingvalues', 'data:redundant', 'data:prediction',
        'index:comparison', # index:crossbasiscomparison
        'parody', 'faultylogic', 'missingcontext',
        'confirmationbias', 'chartjunk', 'betteralternative',
        'faultystatistics', 'invalidcomparison',
        'map:population', 'invalidencoding',
        'unreadable', 'visuallyawful',
        'encoding', 'cannotaddup', 'legibility'
    ],
    'flag': [
        'needreview', 'invalid', 'notbad', 'starred'
    ]
}

In [23]:
def prev_id (image_id):
    i = image_id
    while i in current_ids:
        if i == label_img_ids[0]:
            break
        i = label_img_ids[max(0, find_index(label_img_ids, lambda x: x == i) - 1)]
    current_ids.persist_remove(image_id)
    current_ids.persist_add(i)
    return i

def next_id (image_id):
    i = image_id
    while i in current_ids:
        if i == label_img_ids[-1]:
            break
        i = label_img_ids[min(len(label_img_ids), find_index(label_img_ids, lambda x: x == i) + 1)]
    current_ids.persist_remove(image_id)
    current_ids.persist_add(i)
    return i

def prev_img (output, image_id):
    show_img(prev_id(image_id), output)

def next_img (output, image_id):
    show_img(next_id(image_id), output)

def show_img (image_id, output=None):
    output.clear_output()
    with output:
        display(make_label_box(image_id, options=labelOptions, next_callback=partial(next_img, output), prev_callback=partial(prev_img, output)))

for i, current_id in zip(range(len(current_ids)), list(current_ids)):
    show_img(current_id, cell_outputs[i])

In [24]:
VBox(cell_outputs)

VBox(children=(Output(layout=Layout(height='830px')), Output(layout=Layout(height='830px'))))

In [25]:
imageId = 'tumblr/wtf-viz/60472192241_0'
# imageId = 'reddit/dataisugly/cofock_0'
make_label_box(imageId, labelOptions)

TypeError: 'NoneType' object is not subscriptable

In [None]:
# visImages = [i for i in visimages.find()]
# visImages.sort(key=lambda x: x['popularity_score'])
# visImages.reverse()

# def label_images ():
#     for i in visImages:
#         visLabel = vislabels.find_one({'image_id': i['image_id']})
#         if visLabel == None:
#             yield i['image_id']

# cell_output = widgets.Output()
# label_img_ids = [i for i in label_images()]
# current = {'image_id': label_img_ids[0]}

In [None]:
# def prev_img (image_id):
#     show_img(label_img_ids[max(0, find_index(label_img_ids, lambda x: x == image_id) - 1)])

# def next_img (image_id):
#     show_img(label_img_ids[min(len(label_img_ids), find_index(label_img_ids, lambda x: x == image_id) + 1)])

# def show_img (image_id):
#     current['image_id'] = image_id
#     cell_output.clear_output()
#     with cell_output:
#         display(make_label_box(image_id, next_callback=next_img, prev_callback=prev_img))

# show_img(current['image_id'])

In [None]:
# cell_output

In [None]:
# label_img = label_images()

In [None]:
# for i in range(3):
#     try:
#         next_image_id = next(label_img)
#     except StopIteration:
#         print('StopIteration')
#         break

#     display(make_label_box(next_image_id))

In [None]:
# def make_colored_text (color, text):
#     return r'\(\color{' + color + '} {' + text + '}\)'