In [1]:
# !pip install imutils
# !pip install opencv-python

In [2]:
import json
import os
import shutil
from datetime import datetime
from itertools import chain
from pathlib import Path
from shutil import copyfile

import cv2
import imutils
from PIL import Image
from pydash import pick, omit
from pymongo import MongoClient
from tqdm.notebook import tqdm

from lib.parallel import parallel

In [3]:
# Source image directory (not referenced again in the cells visible here).
images_dir = Path('./images')
# Root of the generated gallery; the preview/thumbnail/meta/image_lists
# directories used below all live inside it.
gallery_dir = Path('./bad-vis-images')

In [4]:
# Remove the output of any previous run so stale files do not leak into this one.
# Pure-Python removal works on every platform, and ignore_errors=True keeps a
# first run (where the directories do not exist yet) quiet.
for stale_name in ('preview', 'thumbnail', 'meta', 'image_lists'):
    shutil.rmtree(gallery_dir/stale_name, ignore_errors=True)

In [5]:
# The gallery root has to exist before any of its sub-directories are created.
gallery_dir.mkdir(exist_ok=True)

# Full-size copies of the published images.
preview_dir = gallery_dir/'preview'
preview_dir.mkdir(exist_ok=True)

# Down-scaled JPEG thumbnails.
thumbnail_dir = gallery_dir/'thumbnail'
thumbnail_dir.mkdir(exist_ok=True)

# One JSON metadata file per published image.
meta_dir = gallery_dir/'meta'
meta_dir.mkdir(exist_ok=True)

# JSON image lists and the label option file.
imagelist_dir = gallery_dir/'image_lists'
imagelist_dir.mkdir(exist_ok=True)

In [6]:
# Connection settings default to the values that used to be hard-coded here;
# set MONGO_HOST / MONGO_PORT in the environment to point at another server.
mongo = MongoClient(
    os.environ.get('MONGO_HOST', '172.17.0.1'),
    int(os.environ.get('MONGO_PORT', '27017')),
)
db = mongo['bad-vis']
posts = db['posts']            # not referenced in the cells visible here
imagededup = db['imagededup']  # input: records fed to finalize_image
visimages = db['visimages']    # output: digests written by VisImage.makeMetaData
vislabels = db['vislabels']    # label documents looked up by VisImage.visLabel

In [7]:
# Drop the output collection before it is repopulated by VisImage.makeMetaData,
# so documents from an earlier run are not mixed with this run's.
visimages.drop()

In [8]:
# Upper bound, in pixels, for both thumbnail edges (used by VisImage.makeThumbnail).
thumbnail_size = 140

In [9]:
class VisImage ():
    """Gallery view over one image record.

    Wraps a record (a mapping) and derives the fields published with the
    gallery: file names, output paths, a numeric timestamp and the labels
    looked up in the ``vislabels`` collection. The ``make*`` methods write the
    preview copy, the thumbnail and the metadata for the record.
    """

    # Fields exported by digest(); fields the instance does not have are skipped.
    _attrs = [
        'id',
        'image_name',
        'image_path',
        'thumbnail_path',

        'datetime',
        'url',
        'title',
        'author',
        'source',
        'source_url',
        'source_platform',
        'popularity_score',

    #     'size',
    #     'width',
    #     'height',
        'phash',

        'labels',
        'remarks'
    ]

    def __init__ (self, image):
        """Keep the raw record and mirror its keys as instance attributes.

        'labels' and 'datetime' are not copied: both are served by read-only
        properties of this class.
        """
        self._image = image

        for k, v in image.items():
            if k in ['labels', 'datetime']:
                continue
            setattr(self, k, v)

    def digest (self):
        """Return a dict of every ``_attrs`` field available on this instance."""
        # A sentinel default lets each attribute (several are computed
        # properties) be evaluated once instead of once for hasattr() and
        # again for getattr(). Like hasattr(), only AttributeError is treated
        # as "missing".
        missing = object()
        values = ((a, getattr(self, a, missing)) for a in VisImage._attrs)
        return {a: v for a, v in values if v is not missing}

    @property
    def image_name (self):
        """File stem built from the record's 'id' and 'index_in_album'."""
        return f"{self._image['id']}_{self._image['index_in_album']}"

    @property
    def datetime (self):
        """POSIX timestamp parsed from the record's ISO-format 'datetime' string."""
        return datetime.fromisoformat(self._image['datetime']).timestamp()

    @property
    def visLabel (self):
        """Label document for this image or one of its duplicates, or None.

        The lookup result is cached on the instance, including the "nothing
        found" case, so the collection is queried at most once per image.
        """
        if not hasattr(self, '_labels'):
            image_ids = [self._image['image_id']] + list(self._image['duplicated_images'])
            visLabels = list(vislabels.find({'image_id': {'$in': image_ids}}))

            if len(visLabels) > 1:
                print(f"labels warning: more than 1 label {image_ids} {visLabels}")

            # Cache None as well: otherwise every later access repeats the query.
            self._labels = visLabels[0] if visLabels else None
        return self._labels

    @property
    def labels (self):
        """Labels to publish: stored labels minus 'flag:starred', plus
        'layout:animated' when the record is marked as animated."""
        visLabel = self.visLabel
        if not visLabel:
            return []
        labels = [l for l in visLabel['labels'] if l != 'flag:starred']
        if self._image.get('animated'):
            labels.append('layout:animated')
        return labels

    @property
    def exclude_from_list (self):
        """True when the image has no labels or carries a blocking flag."""
        labels = self.labels  # build the list once, not once per test
        if len(labels) == 0:
            return True
        return any(flag in labels for flag in ('flag:invalid', 'flag:needreview', 'flag:notbad'))

    @property
    def remarks (self):
        """Remarks stored with the label document, or '' when there is none."""
        if not self.visLabel:
            return ''
        return self.visLabel['remarks']

    @property
    def image_path (self):
        """Destination of the preview copy (keeps the record's extension)."""
        return str(preview_dir/f"{self.image_name}{self._image['ext']}")

    @property
    def thumbnail_path (self):
        """Destination of the JPEG thumbnail."""
        return str(thumbnail_dir/f"{self.image_name}.jpg")

    def makePreview (self):
        """Copy the original file to ``image_path``."""
        copyfile(self._image['file_path'], self.image_path)

    @staticmethod
    def _thumbnail_box (width, height, limit):
        """Scale (width, height) so the longer edge equals ``limit``.

        Each edge is clamped to the range 1..limit; without the lower bound a
        very elongated image rounds its short edge down to 0.
        """
        ratio = limit / max(width, height)

        def scale (edge):
            return max(1, min(limit, int(round(edge * ratio))))

        return scale(width), scale(height)

    def makeThumbnail (self):
        """Write an RGB JPEG thumbnail of the original file to ``thumbnail_path``."""
        box = self._thumbnail_box(self._image['width'], self._image['height'], thumbnail_size)

        # convert() returns a new in-memory image, so the source file can be
        # closed as soon as the conversion is done.
        with Image.open(self._image['file_path']) as source:
            im = source.convert('RGB')
        im.thumbnail(box)
        im.save(self.thumbnail_path, 'JPEG')

    def makeMetaData (self):
        """Write the digest to ``meta_dir`` as JSON and insert it into ``visimages``."""
        digest = self.digest()
        with open(meta_dir/f"{self.image_name}.json", 'w') as f:
            json.dump(digest, f)
        visimages.insert_one(digest)

In [10]:
def finalize_image (image):
    """Publish one image record to the gallery.

    Returns the VisImage wrapper once its preview, thumbnail and metadata have
    been written, or None when the image is excluded from the list.
    """
    candidate = VisImage(image)
    if not candidate.exclude_from_list:
        candidate.makePreview()
        candidate.makeThumbnail()
        candidate.makeMetaData()
        return candidate
    return None

In [11]:
# Apply finalize_image to every record of `imagededup` through the project helper
# `parallel` (lib.parallel). `total` is the estimated record count — presumably
# used for the progress bar; confirm in lib/parallel.py.
# Excluded records come back as None (see finalize_image).
visImages = parallel(finalize_image, imagededup.find(), total=imagededup.estimated_document_count())

HBox(children=(FloatProgress(value=0.0, max=5840.0), HTML(value='')))




In [12]:
# Keep only the truthy results, i.e. discard the None entries of excluded images.
visImages = list(filter(None, visImages))

# Image Lists

In [13]:
# Read the published documents back from MongoDB; from here on `visImages`
# holds plain dicts rather than VisImage objects.
visImages = list(visimages.find())

In [14]:
# Ascending by popularity_score (in place). Raises KeyError if a document has no
# 'popularity_score' — digest() omits fields the source record did not have.
visImages.sort(key=lambda x: x['popularity_score'])

In [15]:
# Flip the list in place. NOTE: reversing a stable ascending sort is not the same
# as sort(reverse=True) — documents with equal scores end up in the opposite
# relative order. Re-running this cell alone flips the list back.
visImages.reverse()

In [16]:
# Sanity check: number of documents loaded from `visimages`.
len(visImages)

500

In [17]:
# Remove MongoDB's '_id' before the documents are serialised with json.dump below.
# pop() with a default keeps the cell idempotent: running it a second time
# is a no-op instead of a KeyError.
for i in visImages:
    i.pop('_id', None)

In [18]:
# Complete list in one compact JSON file. The context manager closes (and thereby
# flushes) the file deterministically instead of leaving it to garbage collection.
with open(imagelist_dir/'all.json', 'w') as f:
    json.dump({'images': visImages}, f, separators=(',', ':'))

In [19]:
# Paginated lists: the first file holds up to 225 images (the same count as the
# 15 x 15 grid in the commented-out sprite cell), every later file up to 400.
# Each file is named after the index of its first image and links to the next one.
batch_cuts = [0] + list(range(225, len(visImages), 400)) + [len(visImages)]
for (batch_begin, batch_end) in zip(batch_cuts[:-1], batch_cuts[1:]):
    batch = {
        'images': visImages[batch_begin:batch_end],
        'next': f"image_lists/{batch_end}.json" if batch_end != len(visImages) else None
    }
    # Close every batch file explicitly so no handle is leaked per iteration.
    with open(imagelist_dir/f"{batch_begin}.json", 'w') as f:
        json.dump(batch, f, separators=(',', ':'))

# Labels Options

In [27]:
# Label vocabulary, grouped by category. Each group has:
#   'tag'     - category prefix; labels elsewhere in this notebook are written as
#               '<tag>:<option>' (e.g. 'flag:starred', 'layout:animated')
#   'name'    - human-readable category title
#   'options' - option names within the category
# The 'flag' group is kept last; the dump cell below leaves it out of the
# exported file.
# NOTE(review): VisImage.labels appends 'layout:animated', but 'animated' is not
# listed under 'layout' — confirm whether it should be.
# NOTE(review): 'occulusion', 'guagechart' and 'sunbrust' look misspelled; they are
# left untouched because they may have to match labels that are already stored —
# confirm before renaming.
labelOptions = [{
    'tag': 'fault',
    'name': 'Issues',
    'options': [
        'percentage:sum', 'percentage:encoding', 'percentage:wholepart',
        'label', 'description',
        'axis', 'axis:label', 'axis:flipped', 'axis:truncated', "axis:missing", 'axis:double', # axis:dual
        'legend',
        'color', 'color:over12',
        'scale', 'scale:log', 'scale:inconsistent', 'binning',
        'area',
        'picto:distortion', 'picto:area',
        'position', 'itemorder',
        'connection',
        '3d', 'animation',
        'cluttering', 'occulusion',
        'data', 'data:selective', 'data:questionable', 'data:missingvalues', 'data:redundant', 'data:prediction',
        'index:comparison', # index:crossbasiscomparison
        'parody', 'faultylogic', 'missingcontext',
        'confirmationbias', 'chartjunk', 'betteralternative',
        'faultystatistics', 'invalidcomparison',
        'map:population', 'invalidencoding',
        'unreadable', 'visuallyawful',
        'encoding', 'cannotaddup', 'legibility'
    ]}, {
    'tag': 'form',
    'name': 'Chart Type',
    'options': [
        'barchart', 'linechart', 'scatterplot', 'dotplot', 'barcodechart', 'areachart', 'histogram',
        'nodelink', 'radarchart', 'bubblechart', 'dumbbellplot',
        'piechart', 'donutchart',
        'pyramid', 'venn', 'choropleth', 'flowmap', 'map', 'chernoffface',
        'table', 'heatmap', 'treemap', 'quadrant',
        'boxplot', 'violinplot', 'parallelcoor', 'streamgraph',
        'pictogram', 'guagechart', 'sankeydiagram', 'chorddiagram', 'sunbrust', 'voronoi',
        'unknown'
    ]}, {
    'tag': 'data',
    'name': 'Data Types',
    'options': [
        'categorical', 'quantitative', 'indexvalue', 'accumulated',
        'percentage', 'probability',
        'ordinal', 'ranking', 'sequential',
        'timeseries', 'cyclic',
        'geospatial', 'flow',
        'network', 'tree',
        'set', 'bitmap',
        'text', 'multivariate',
    ]}, {
    'tag': 'media',
    'name': 'Medium',
    'options': [
        'inreallife', 'printed', 'handdrawn', 'tv', 'ads', 'news', 'NSFW'
    ]}, {
    'tag': 'layout',
    'name': 'Layout',
    'options': [
        'circular', 'infographics', 'stacked', 'map', 'juxtaposition', 'overlay', 'mixed'
    ]}, {
    'tag': 'metaphor',
    'name': 'Metaphor',
    'options': [
        'pictograph', 'periodictable', 'gear', 'clock'
    ]}, {
    'tag': 'flag',
    'name': 'Flag',
    'options': [
        'needreview', 'invalid', 'notbad', 'starred'
    ]}
]

In [28]:
# Dump every label group except 'flag'. Filtering on the tag (instead of slicing
# off the last element) keeps this correct if the groups are ever reordered, and
# the context manager closes the file deterministically.
with open(imagelist_dir/"labelOptions.json", 'w') as f:
    json.dump([group for group in labelOptions if group['tag'] != 'flag'], f, separators=(',', ':'))

# Sprite

In [22]:
# ims = []
# sprites_path = str(gallery_dir/'0.jpg')
# for i in range(225):
#     visImage = visImages[i]
#     visImage['sprites_path'] = sprites_path
#     visImage['sprites_pos_x'] = (i % 15) * thumbnail_size
#     visImage['sprites_pos_y'] = (i // 15) * thumbnail_size
#     im = cv2.imread(visImage['thumbnail_path'])
#     w, h = im.shape[:2]
#     new_im = cv2.copyMakeBorder(im, 0, thumbnail_size-w, 0, thumbnail_size-h, cv2.BORDER_CONSTANT, value=[0, 0, 0])
#     ims.append(new_im)
# montages = imutils.build_montages(ims, (140, 140), (15, 15))
# cv2.imwrite(sprites_path, montages[0])