In [1]:
import json
from pymongo import MongoClient
from collections import Counter
from pathlib import Path
import os
import datetime
from pydash import omit
from bson import json_util

from IPython.display import JSON

In [2]:
# Read the MongoDB connection string from the local credentials file. The
# context manager closes the file handle as soon as the JSON has been parsed,
# instead of leaving it open for the garbage collector to reclaim.
with open('./credentials/mongodb_credentials.json') as credentials_file:
    mongo_uri = json.load(credentials_file)['uri']

# Connect and grab the `vislabels` collection of the `bad-vis` database.
mongo = MongoClient(mongo_uri)
db = mongo['bad-vis']
vislabels = db['vislabels']

In [3]:
# Materialize every document of the collection into an in-memory list.
visLabels = list(vislabels.find())

# Backup Labels

In [4]:
# Directory layout for the label files: labels.json (and its timestamped
# backups) live in the `labels` folder under ./handmade.
handmade_dir = Path('./handmade')
label_dir = handmade_dir.joinpath('labels')

In [5]:
# Keep the previous export: when labels.json already exists, move it aside
# under a timestamped name (labels_YYYYmmddHHMMSS.json).
if os.path.isfile(label_dir/'labels.json'):
    backup_stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    os.rename(label_dir/'labels.json', label_dir/f"labels_{backup_stamp}.json")

In [6]:
# Export every label document, minus its `_id` field, to labels.json.
# The `with` block guarantees the file is flushed and closed even if
# serialization fails part-way; the original left the handle open.
# json_util.default is passed as the fallback serializer for values the
# json module cannot encode on its own.
with open(label_dir/'labels.json', 'w') as labels_file:
    json.dump([omit(l, '_id') for l in vislabels.find()], labels_file, default=json_util.default)

# Restore Labels

In [None]:
# if os.path.isfile(label_dir/'labels.json'):
#     labels = json_util.loads(open(label_dir/'labels.json').read())
# vislabels.insert_many(labels)

# Label Statistics

In [7]:
# Unique (position in visLabels, tag) pairs over all fetched documents.
idxLabelPairs = {
    (position, tag)
    for position, record in enumerate(visLabels)
    for tag in record['labels']
}
# How many times each tag occurs across all documents.
labelCnts = Counter(tag for record in visLabels for tag in record['labels'])

In [8]:
# idxLabelPairs

In [9]:
# Show every tag with its count, ordered from most to least frequent.
labelCnts.most_common()

[('data:categorical', 737),
 ('effect:confuesed', 558),
 ('data:quantitative', 483),
 ('effect:distorted', 477),
 ('data:percentage', 402),
 ('form:barchart', 289),
 ('data:timeseries', 245),
 ('form:linechart', 198),
 ('domain:public', 197),
 ('fault:scale:inconsistent', 184),
 ('form:piechart', 156),
 ('domain:health', 140),
 ('domain:politics', 134),
 ('data:geospatial', 126),
 ('domain:entertainment', 103),
 ('domain:business', 100),
 ('flag:notbad', 98),
 ('media:printed', 95),
 ('domain:computer', 94),
 ('form:choropleth', 85),
 ('domain:international', 84),
 ('fault:colormess', 80),
 ('fault:betteralternative', 77),
 ('fault:axis:truncated', 75),
 ('fault:3d', 69),
 ('form:pictogram', 68),
 ('domain:environment', 67),
 ('layout:juxtaposition', 64),
 ('media:tv', 63),
 ('form:donutchart', 59),
 ('domain:personal', 56),
 ('domain:education', 54),
 ('domain:sports', 52),
 ('fault:cluttering', 52),
 ('form:map', 50),
 ('fault:percentage:sum', 50),
 ('fault:axis:missing', 49),
 ('dat

# Labels

In [10]:
# Chart-form vocabulary: one entry per `form:*` tag, with the visual-mark
# subcategory it belongs to and a human-readable display name.
# NOTE(review): the tags 'form:guagechart' and 'form:sunbrust' are misspelled but
# are left untouched because they presumably match identifiers already stored on
# the labels — verify before renaming. Only the display names are corrected
# ('Gauge Chart', 'Sunburst Diagram').
chart_types = [
    # bar-like marks
    {'tag': 'form:barchart',             'subcategory': 'bar',    'name': 'Bar Chart'},
    {'tag': 'form:stackedbarchart',      'subcategory': 'bar',    'name': 'Stacked Bar Chart'},
    {'tag': 'form:dotplot',              'subcategory': 'bar',    'name': 'Dot Plot'},
    {'tag': 'form:histogram',            'subcategory': 'bar',    'name': 'Histogram'},
    {'tag': 'form:barcodechart',         'subcategory': 'bar',    'name': 'Barcode Chart'},
    {'tag': 'form:progressbar',          'subcategory': 'bar',    'name': 'Progress Bar'},

    # line-like marks
    {'tag': 'form:linechart',            'subcategory': 'line',   'name': 'Line Chart'},
    {'tag': 'form:nodelink',             'subcategory': 'line',   'name': 'Node-Link Diagram'},
    {'tag': 'form:parallelcoor',         'subcategory': 'line',   'name': 'Parallel Coordinates'},
    {'tag': 'form:radarchart',           'subcategory': 'line',   'name': 'Radar Chart'},
    {'tag': 'form:sankeydiagram',        'subcategory': 'line',   'name': 'Sankey Diagram'},
    {'tag': 'form:boxplot',              'subcategory': 'line',   'name': 'Box Plot'},
    {'tag': 'form:violinplot',           'subcategory': 'line',   'name': 'Violin Plot'},
    {'tag': 'form:timeline',             'subcategory': 'line',   'name': 'Time Line'},
    {'tag': 'form:flowchart',            'subcategory': 'line',   'name': 'Flow Chart'},

    # dot-like marks
    {'tag': 'form:connectedscatterplot', 'subcategory': 'dot',    'name': 'Connected Scatter Plot'},
    {'tag': 'form:scatterplot',          'subcategory': 'dot',    'name': 'Scatter Plot'},
    {'tag': 'form:dumbbellplot',         'subcategory': 'dot',    'name': 'Dumbbell Plot'},
    {'tag': 'form:polargraph',           'subcategory': 'dot',    'name': 'Polar Graph'},

    # circular marks
    {'tag': 'form:bubblechart',          'subcategory': 'circle', 'name': 'Bubble Chart'},
    {'tag': 'form:piechart',             'subcategory': 'circle', 'name': 'Pie Chart'},
    {'tag': 'form:donutchart',           'subcategory': 'circle', 'name': 'Donut Chart'},
    {'tag': 'form:guagechart',           'subcategory': 'circle', 'name': 'Gauge Chart'},
    {'tag': 'form:sunbrust',             'subcategory': 'circle', 'name': 'Sunburst Diagram'},
    {'tag': 'form:venn',                 'subcategory': 'circle', 'name': 'Venn Diagram'},
    {'tag': 'form:nestedbubble',         'subcategory': 'circle', 'name': 'Nested Bubbles'},
    {'tag': 'form:polararea',            'subcategory': 'circle', 'name': 'Polar Area Chart'},
    {'tag': 'form:chord',                'subcategory': 'circle', 'name': 'Chord Diagram'},

    # area marks
    {'tag': 'form:areachart',            'subcategory': 'area',   'name': 'Area Chart'},
    {'tag': 'form:stackedareachart',     'subcategory': 'area',   'name': 'Stacked Area Chart'},
    {'tag': 'form:streamgraph',          'subcategory': 'area',   'name': 'Stream Graph'},
    {'tag': 'form:treemap',              'subcategory': 'area',   'name': 'Treemap'},
    {'tag': 'form:voronoi',              'subcategory': 'area',   'name': 'Voronoi Diagram'},
    {'tag': 'form:pyramid',              'subcategory': 'area',   'name': 'Pyramid'},
    {'tag': 'form:proportionalarea',     'subcategory': 'area',   'name': 'Proportional Area Chart'},

    # maps
    {'tag': 'form:choropleth',           'subcategory': 'map',    'name': 'Choropleth'},
    {'tag': 'form:flowmap',              'subcategory': 'map',    'name': 'Flow Map'},
    {'tag': 'form:map',                  'subcategory': 'map',    'name': 'Map'},

    # tabular layouts
    {'tag': 'form:table',                'subcategory': 'table',  'name': 'Table'},
    {'tag': 'form:heatmap',              'subcategory': 'table',  'name': 'Heatmap'},
    {'tag': 'form:quadrant',             'subcategory': 'table',  'name': 'Quadrant'},

    # icon-based
    {'tag': 'form:pictogram',            'subcategory': 'icon',   'name': 'Pictogram'},
    {'tag': 'form:chernoffface',         'subcategory': 'icon',   'name': 'Chernoff Face'},

    # everything else
    {'tag': 'form:diagram',              'subcategory': 'other',  'name': 'Diagram'},
    {'tag': 'form:wordle',               'subcategory': 'other',  'name': 'Word Cloud'},
    {'tag': 'form:unknown',              'subcategory': 'other',  'name': 'Unclassified'},
]


In [11]:
# Data-type vocabulary. Every row below is (tag, subcategory, display name) and
# is expanded into a {'tag', 'subcategory', 'name'} dict, in that key order.
data_types = [
    {'tag': tag, 'subcategory': subcategory, 'name': name}
    for tag, subcategory, name in (
        # attribute level: the type can be told from a single column
        ('data:categorical',  'attribute', 'Categorical'),
        # a measurement of magnitude that supports arithmetic comparison
        ('data:quantitative', 'attribute', 'Quantitative'),
        # homogeneous range from a **minimum** to a **maximum** value
        ('data:sequential',   'attribute', 'Sequential'),
        # two sequences pointing in opposite directions that meet at a **common zero point**
        ('data:diverging',    'attribute', 'Diverging'),
        # no full-fledged arithmetic, but a well-defined ordering
        ('data:ordinal',      'attribute', 'Ordinal'),

        # time related
        ('data:timeseries',   'time',      'Time Series'),
        ('data:cyclic',       'time',      'Cyclic'),

        # coordinated: spatial meaning for x, y, or z
        ('data:geospatial',   'position',  'Geospatial'),
        ('data:flow',         'position',  'Flow'),
        ('data:bitmap',       'position',  'Bitmap'),
        ('data:positional',   'position',  'Positional'),

        # records are inter-linked; the set type is a containment relationship
        ('data:network',      'relation',  'Network'),
        ('data:tree',         'relation',  'Tree'),
        ('data:set',          'relation',  'Set'),

        # derived attributes
        ('data:ranking',      'derived',   'Ranked'),
        ('data:indexvalue',   'derived',   'Index Value'),
        ('data:accumulated',  'derived',   'Accumulated'),
        ('data:percentage',   'derived',   'Percentage'),
        ('data:probability',  'derived',   'Probability'),

        # long string
        ('data:text',         'other',     'Text'),
        # entity level: the type can be told from a single record
        ('data:multivariate', 'other',     'Multivariate'),
    )
]


In [12]:
# Layout vocabulary: (tag, display name) pairs expanded into {'tag', 'name'} dicts.
layout_types = [
    {'tag': tag, 'name': name}
    for tag, name in (
        ('layout:circular',      'Circular'),
        ('layout:juxtaposition', 'Juxtaposition'),
        ('layout:overlay',       'Superimposed'),
        ('layout:mixed',         'Multiple-views'),
        ('layout:periodictable', 'Periodictable'),
        ('layout:clock',         'Clock'),
        ('layout:calendar',      'Calendar'),

        # Retired entries, kept for reference:
        # ('layout:infographics', 'Infographics'),  # moved to media
        # ('layout:stacked',      'Stacked'),       # stacked area chart, stacked bar chart
        # ('layout:map',          'Map'),
    )
]


In [13]:
# Subject-domain vocabulary: (tag, display name) pairs expanded into {'tag', 'name'} dicts.
domain_types = [
    {'tag': tag, 'name': name}
    for tag, name in (
        ('domain:politics',      'Politics'),
        ('domain:business',      'Business'),
        ('domain:entertainment', 'Entertainment'),
        ('domain:scientific',    'Scientific'),
        ('domain:sports',        'Sports'),
        ('domain:health',        'Health'),
        ('domain:education',     'Education'),
        ('domain:catering',      'Food'),
        ('domain:computer',      'Computer'),
        ('domain:environment',   'Environment'),
        ('domain:international', 'International'),  # international affairs
        ('domain:public',        'Public'),         # general public affairs
        ('domain:personal',      'Personal'),       # personal affairs
        ('domain:unknown',       'Unknown'),
    )
]

In [14]:
# Media vocabulary: (tag, display name) pairs expanded into {'tag', 'name'} dicts.
media_types = [
    {'tag': tag, 'name': name}
    for tag, name in (
        ('media:tv',          'TV'),
        ('media:news',        'News Article'),
        ('media:ads',         'Ads'),
        ('media:printed',     'Printed'),
        ('media:handdrawn',   'Handdrawn'),
        ('media:inreallife',  'Inreallife'),
        ('media:NSFW',        'NSFW'),  # not safe for work
        # Infographic vs. multiple-views: an infographic is a message-bearing picture,
        # whereas multiple-views is for analytical purposes such as a dashboard.
        ('media:infographic', 'Infographic'),
        ('media:google',      'Google Products'),
    )
]

In [15]:
# Effect vocabulary: what a flawed chart does to its reader.
# The 'effect:confuesed' spelling is the tag that appears in the stored label
# counts, so it is kept verbatim; only the display name reads 'Confused'.
effect_types = [
    dict(tag='effect:reversed',   name='Reversed'),
    dict(tag='effect:distorted',  name='Distorted'),
    dict(tag='effect:confuesed',  name='Confused'),
    dict(tag='effect:suggestive', name='Suggestive'),
]

In [16]:
# Fault vocabulary, based on Surfacing Visualization Mirage: errors arise at
# different decision points, and the sections below follow those points.
# Each entry carries its tag, the subcategories it belongs to (the first is the
# section it is listed under here; the rest are cross-cutting ones such as axis or color),
# a display name and a user-facing description.
# NOTE(review): some tags are misspelled ('fault:occulusion',
# 'fault:spuriousocorrelation'); they are left untouched because they presumably
# match identifiers stored on the labels — verify before renaming.
fault_types = [
    # data curation
    {'tag': 'fault:data:selective',           'subcategories': ['data'],                   'name': 'Selective Data',             'description': 'Cherry-picking the scope of data, e.g., picking an arbitrary starting point of the time axis.'},
    {'tag': 'fault:data:missingvalues',       'subcategories': ['data'],                   'name': 'Missing Values',             'description': 'Null values in the chart that cause confusion.'},
    {'tag': 'fault:data:redundant',           'subcategories': ['data'],                   'name': 'Redundant Values',           'description': 'Items appear more than once in the chart.'},
    {'tag': 'fault:data:questionable',        'subcategories': ['data'],                   'name': 'Questionable Data',          'description': 'Dubious data source.'},
    {'tag': 'fault:data:hypothetical',        'subcategories': ['data'],                   'name': 'Hypothetical Data',          'description': 'Not real data, but drawn like backed by data.'},

    # data wrangling
    {'tag': 'fault:binning',                  'subcategories': ['derive'],                 'name': 'Arbitrary Binning',          'description': 'Items are grouped arbitrarily.'},
    {'tag': 'fault:itemorder',                'subcategories': ['derive'],                 'name': 'Items Out of Order',         'description': 'Items are not sorted, or out of order.'},
    {'tag': 'fault:percentage:sum',           'subcategories': ['derive'],                 'name': 'Percentage Not 100%',        'description': 'Percentages do not add up to 100% in a whole-part relationship.'},
    {'tag': 'fault:faultystatistics',         'subcategories': ['derive'],                 'name': 'Faulty Regression',          'description': 'Improper use of statistics, e.g., a regression line with high error.'},
    {'tag': 'fault:data:prediction',          'subcategories': ['derive'],                 'name': 'Unreliable Forecasting',     'description': 'Plotting future data points derived from unreliable prediction.'},
    # {'tag': 'fault:baselinealignment',        'subcategories': ['derive'],                 'name': 'Misaligned Baseline',        'description': 'Values are indexed or normalized by an unconventional method.'},

    # visualizing: graph drawing
    {'tag': 'fault:position',                 'subcategories': ['graph'],                  'name': 'Poor Positioning',           'description': 'Poor positioning, alignment, or rotation of items.'}, # Needs renaming: sounds like the computer's responsibility, but the examples are human errors
    {'tag': 'fault:encoding',                 'subcategories': ['graph'],                  'name': 'Plotting Error',             'description': 'Valid encoding choice but incorrectly drawn, e.g., a glitch.'},
    {'tag': 'fault:color:over12',             'subcategories': ['graph', 'color'],         'name': 'Too Many Colors',            'description': 'Too many colors, mostly related to the scalability issues.'},
    {'tag': 'fault:scale:inconsistent',       'subcategories': ['graph', 'scale'],         'name': 'Misrepresentation',          'description': 'The geometric shapes do not match the labelled values.'},
    {'tag': 'fault:scale:inconsistentacross', 'subcategories': ['graph', 'scale'],         'name': 'Inconsistent Scale Across',  'description': 'Different scales across related charts.'},
    {'tag': 'fault:missinglabel',             'subcategories': ['graph', 'text'],          'name': 'Missing Item Labels',        'description': 'Missing label for item details.'},
    {'tag': 'fault:wronglabel',               'subcategories': ['graph', 'text'],          'name': 'Incorrect Item Labels',      'description': 'Items are labeled with incorrect description.'},
    {'tag': 'fault:axis:missing',             'subcategories': ['graph', 'axis'],          'name': 'Missing Axis',               'description': 'Axis is missing for a coordinated chart.'},
    {'tag': 'fault:axis:label',               'subcategories': ['graph', 'axis', 'text'],  'name': 'Missing Axis Labels',        'description': 'Axis is not labeled with tick marks.'},
    {'tag': 'fault:axis:incorrectlabel',      'subcategories': ['graph', 'axis', 'text'],  'name': 'Incorrect Axis Labels',      'description': 'Axis is incorrectly labeled.'},
    {'tag': 'fault:inconsistentaxisinterval', 'subcategories': ['graph', 'axis'],          'name': 'Inconsistent Tick Interval', 'description': 'Intervals on an axis do not follow a scale.'},
    {'tag': 'fault:inconsistentinterval',     'subcategories': ['graph', 'axis'],          'name': 'Inconsistent Time Interval', 'description': 'Inconsistent time interval between data points.'},
    {'tag': 'fault:legend',                   'subcategories': ['graph', 'scale'],         'name': 'Missing Legend',             'description': 'Legend is needed but missing.'},

    # reading: reading difficulty
    {'tag': 'fault:betteralternative',        'subcategories': ['visual', 'encoding'],     'name': 'Ineffective Encoding',       'description': 'Chart is understandable, but unintuitive, there are better alternatives.'}, # Potentially expandable
    {'tag': 'fault:colormess',                'subcategories': ['visual', 'color'],        'name': 'Undistinguishable Colors',   'description': 'Categorical colors are undistinguishable with each other.'},
    {'tag': 'fault:animation',                'subcategories': ['visual'],                 'name': 'Unjustified Animation',      'description': 'Eye beats memory, comparing changes across animation frames is impossible.'},
    {'tag': 'fault:legendconfuse',            'subcategories': ['visual', 'scale'],        'name': 'Confusing Legend',           'description': 'Legend is not helpful in understanding the chart.'},
    {'tag': 'fault:labelconfuse',             'subcategories': ['visual', 'text'],         'name': 'Confusing Label',            'description': 'Labels are hard to understand or not understandable at all.'},
    {'tag': 'fault:cluttering',               'subcategories': ['visual'],                 'name': 'Cluttering',                 'description': 'Overwhelmed by visual clutter.'},
    {'tag': 'fault:occulusion',               'subcategories': ['visual'],                 'name': 'Occlusion',                  'description': 'Some items are not visible in the visualization because of blockage by another item.'},
    {'tag': 'fault:chartjunk',                'subcategories': ['visual'],                 'name': 'Chart Junk',                 'description': 'Chart ornament that causes negative effect in understanding.'},
    {'tag': 'fault:legibility',               'subcategories': ['visual', 'text'],         'name': 'Illegible Text',             'description': 'Text in the chart is illegible.'},
    {'tag': 'fault:visuallyawful',            'subcategories': ['visual'],                 'name': 'Visually Awful',             'description': 'Very disturbing while looking at it.'},
    {'tag': 'fault:scale',                    'subcategories': ['visual', 'scale'],        'name': 'Cryptic Scale',              'description': 'Data is drawn to an unconventional scale.'},

    # reading: perception distortion
    {'tag': 'fault:colorscale',               'subcategories': ['perception', 'color'],    'name': 'Ineffective Color Scheme',   'description': 'Ineffective color scheme, e.g., categorical colors for continuous variable.'}, # Categorical color for continuous variable
    {'tag': 'fault:colorconvention',          'subcategories': ['perception', 'color'],    'name': 'Color Mismatch',             'description': 'Mismatch between colors and the objects that they represent, e.g., blue for apples.'},
    {'tag': 'fault:mapprojection',            'subcategories': ['perception', 'encoding'], 'name': 'Distorted Map Projection',   'description': 'Distort map projection to encode information.'},
    {'tag': 'fault:scale:log',                'subcategories': ['perception', 'scale'],    'name': 'Incorrect Use of Log Scale', 'description': 'Incorrect use of log scale.'},
    {'tag': 'fault:axis:double',              'subcategories': ['perception', 'axis'],     'name': 'Dual Axes',                  'description': 'Plotting more than one different scale on a single dimension, most often the y-axis.'},
    {'tag': 'fault:axis:flipped',             'subcategories': ['perception', 'axis'],     'name': 'Flipped Axis',               'description': 'Axis values are increasing in an unconventional direction.'},
    {'tag': 'fault:axis:truncated',           'subcategories': ['perception', 'axis'],     'name': 'Truncated Axis',             'description': 'Non-zero starting point on an axis that exaggerates or understates differences between items.'},
    {'tag': 'fault:colorcorrelation',         'subcategories': ['perception', 'color'],    'name': 'False Color Correlation',    'description': 'Items with same or similar colors are not related.'},
    {'tag': 'fault:3d',                       'subcategories': ['perception'],             'name': 'Unjustified 3D',             'description': 'Unjustified use of 3D, it distorts the perceived size of the items.'},
    {'tag': 'fault:area',                     'subcategories': ['perception', 'encoding'], 'name': 'Area Encoding',              'description': 'Incorrect use of area encoding that exaggerates or understates differences between items.'},
    {'tag': 'fault:picto:distortion',         'subcategories': ['perception', 'encoding'], 'name': 'Area Encoded Pictogram',     'description': 'Pictogram is improperly scaled to reflect value differences.'},

    # reading: message
    {'tag': 'fault:annotation',               'subcategories': ['logical', 'text'],        'name': 'Misleading Annotation',      'description': 'Misleading annotation or highlight that suggests unfaithful message.'},
    {'tag': 'fault:nonsequitur',              'subcategories': ['logical'],                'name': 'Disguised Visualization',    'description': 'Looks like a visualization, but not visualizing any data, also known as non-sequitur.'}, # Fake visualization, non-sequitur
    {'tag': 'fault:description',              'subcategories': ['logical', 'text'],        'name': 'False Description',          'description': 'Description and visualization do not add up.'},
    {'tag': 'fault:connection',               'subcategories': ['logical', 'encoding'],    'name': 'Invalid Connection',         'description': 'Lines or arrows are connecting unrelated dots.'},
    {'tag': 'fault:cannotunderstand',         'subcategories': ['logical'],                'name': 'Not Understandable',         'description': 'Cannot decode any data nor information.'},
    {'tag': 'fault:percentage:wholepart',     'subcategories': ['logical'],                'name': 'Faulty Whole Part',          'description': 'Not whole-part relationship but draw as a chart for whole-part relationship like pie chart.'},
    {'tag': 'fault:faultylogic',              'subcategories': ['logical'],                'name': 'Faulty Logic',               'description': 'Conveying an illogical message.'}, # Potentially expandable
    {'tag': 'fault:missingcontext',           'subcategories': ['logical'],                'name': 'Missing Context',            'description': 'Need background information in order to understand the message.'},
    {'tag': 'fault:invalidcomparison',        'subcategories': ['logical'],                'name': 'Invalid Comparison',         'description': 'Comparison between items that are incomparable.'},
    {'tag': 'fault:map:population',           'subcategories': ['logical'],                'name': 'Just a Population Map',      'description': 'The observed pattern is nothing more than the pattern of population density.'},
    {'tag': 'fault:spuriousocorrelation',     'subcategories': ['logical'],                'name': 'Spurious Correlation',       'description': 'Suggesting causal relationships from correlations.'},

    # Retired entries, kept for reference:
    # {'tag': 'fault:data',                     'subcategories': ['data'],                 'name': 'data'},
    # {'tag': 'fault:aggregation',              'subcategories': ['derive'],               'name': 'Improper Aggregation'},
    # {'tag': 'fault:locationinfo',             'subcategories': ['derive'],               'name': 'Wrong Location Information'},
    # {'tag': 'fault:picto:area',               'subcategories': ['perception'],           'name': 'picto:area'},
    # {'tag': 'fault:percentage:encoding',      'subcategories': ['design'],               'name': 'percentage:encoding'}, # combine with Plotting error
    # {'tag': 'fault:label',                    'subcategories': ['graph'],                'name': 'Labels'}, # Review
    # {'tag': 'fault:axis',                     'subcategories': ['graph'],                'name': 'Axis'}, # Either missing axis labels or missing axis
    # {'tag': 'fault:unreadable',               'subcategories': ['visual'],               'name': 'Incomprehensible'}, # combine with invalid encoding choice
    # {'tag': 'fault:confirmationbias',         'subcategories': ['logical'],              'name': 'Confirmation Bias'}, # relabel
    # {'tag': 'fault:cannotaddup',              'subcategories': ['logical'],              'name': 'Cannot Add Up'}, # relabel
    # {'tag': 'fault:index:comparison',         'subcategories': ['logical'],              'name': 'index:comparison'}, # combine with invalid comparison
    # {'tag': 'fault:invalidencoding',          'subcategories': ['design', 'encoding'],   'name': 'Invalid Encoding Choice',    'description': 'Only understandable by extracting the data from the chart, but not its visual form.'}, # combine with not understandable

    # {'tag': 'fault:parody',                   'subcategories': ['other'],                'name': 'parody'}, # move to flag

]


In [17]:
# Review-flag vocabulary: (tag, display name) pairs expanded into {'tag', 'name'} dicts.
flag_types = [
    {'tag': tag, 'name': name}
    for tag, name in (
        ('flag:needreview', 'Need Review'),
        ('flag:duplicated', 'Duplicated'),
        ('flag:invalid',    'Invalid'),
        ('flag:notbad',     'Not Bad'),
        ('flag:starred',    'Starred'),
        ('flag:parody',     'Parody'),
    )
]


In [18]:
# Long-form, user-facing descriptions keyed by tag (a subset of the form:* and
# data:* tags). Spelling and grammar are corrected here, and the box plot entry
# now says 25th/75th percentile instead of the incorrect "25th/75th quartile".
descriptions = [
    {'tag': 'form:barchart',     'description': 'Data encoded with length. Chart lies when length is disproportional to the encoded value.'},
    {'tag': 'form:linechart',    'description': 'Can be seen as a scatterplot with connected dots. Most widely used with time series data. Chart lies when the context is hidden, changes being magnified or diminished.'},
    {'tag': 'form:scatterplot',  'description': 'Direct encoding of attributes into X and Y axes. Mostly used to find correlation. Chart lies when an agenda of a trend or correlation is imposed.'},
    {'tag': 'form:dotplot',      'description': 'Similar to bar charts. Data encoded with number of dots, but compared as length since the dots are aligned. Mostly used for observing distribution.'},
    {'tag': 'form:barcodechart', 'description': 'Can be seen as a 1-D scatter plot. Most commonly used to show frequency distribution.'},
    {'tag': 'form:areachart',    'description': 'Can be seen as a line chart with color filled area under the curve. Most widely used to convey the accumulated volume. Chart lies when the area is distorted by truncating the Y-axis.'},
    {'tag': 'form:histogram',    'description': 'Similar to bar chart, but breaking down the continuous X-axis into bins in order to observe the distribution. Chart lies when the bin size is uneven or being manipulated.'},
    {'tag': 'form:nodelink',     'description': 'Dots are connected to show their relationships. Most common diagram for network data. The X-axis or Y-axis positions of nodes have no meaning.'},
    {'tag': 'form:parallelcoor', 'description': 'Multiple Y-axes in one chart, each line is an instance in data and each Y-axis is the attribute values. Most widely used in multi-dimensional data.'},
    {'tag': 'form:radarchart',   'description': 'Can be seen as a parallel coordinates plot wrapped in a circle or regular polygon. The area enclosed by the lines conveys the sum of attribute values. Also common in comparison to spot the skyline (see "skyline problem").'},
    {'tag': 'form:bubblechart',  'description': 'Similar to scatter plot, but with size encoding on each circle. Most common issue is occlusion, also inherits the issues of size encoding.'},
    {'tag': 'form:boxplot',      'description': 'A box depicts the statistics summary of the underlying data. The whiskers show the maximum/minimum and box shows 25th percentile, median and 75th percentile.'},
    {'tag': 'form:violinplot',   'description': 'Similar to box plot. In addition, the width encodes the density distribution of the underlying data. Skewed distribution can be observed.'},
    {'tag': 'data:ordinal',      'description': 'Ordinal values are values that represent the order between items. These values can be used comparatively but not arithmetically, like alphabetical order.'},
    {'tag': 'data:timeseries',   'description': 'Time series data concern changes over time, most commonly with a fixed interval. The time values can be absolute (Nov 23, 2012) or relative (Day 0, Day 1, ...).'},
    {'tag': 'data:cyclic',       'description': 'Cyclical data are series of data with repeated sequences that can be aligned. Examples are running laps, seasonal and monthly cycle.'}
]

In [19]:
# Flatten the subcategory lists of all fault types, then show how many fault
# types fall under each subcategory, most frequent first.
subcats = []
for fault in fault_types:
    subcats.extend(fault['subcategories'])
Counter(subcats).most_common()

[('graph', 13),
 ('visual', 11),
 ('perception', 11),
 ('logical', 11),
 ('text', 8),
 ('axis', 8),
 ('scale', 6),
 ('data', 5),
 ('derive', 5),
 ('color', 5),
 ('encoding', 5)]

In [20]:
# Metadata for the subcategories referenced by the tag vocabularies.
# Fault subcategories carry a `group` ('grammar' or 'pipeline'); form and data
# subcategories do not. Form subcategories have no description.
subcategories = [
    # fault / grammar
    dict(category='fault', group='grammar', subcategory='axis', name='Axis', description='Issues related to the axes of coordinated planes'),
    dict(category='fault', group='grammar', subcategory='scale', name='Legends and Scales', description='Issues related to legends and the use of scales'),
    dict(category='fault', group='grammar', subcategory='color', name='Color', description='Issues in the use of colors'),
    dict(category='fault', group='grammar', subcategory='text', name='Text', description='Issues related to titles, annotations and labels'),
    dict(category='fault', group='grammar', subcategory='encoding', name='Encoding', description='Issues related to visual encoding'),

    # fault / pipeline
    dict(category='fault', group='pipeline', subcategory='data', name='Data', description='Issues related to the underlying data, its sources and completeness'),
    dict(category='fault', group='pipeline', subcategory='derive', name='Data Transformation', description='Issues introduced when processing data or applying statistical transformations'),
    # {'category': 'fault', 'group': 'pipeline', 'subcategory': 'design',     'name': 'Design Choices',      'description': 'Issues in designer\'s visual design choices'},
    dict(category='fault', group='pipeline', subcategory='graph', name='Chart Making', description='Issues introduced in the graph drawing process, leads to missing graph elements or incomplete graph'),
    dict(category='fault', group='pipeline', subcategory='visual', name='Reading', description='Difficulties in reading'),
    dict(category='fault', group='pipeline', subcategory='perception', name='Perception', description='Issues related to the weaknesses in human perception'),
    dict(category='fault', group='pipeline', subcategory='logical', name='Message', description='Visualizations that are trying to convey nonsense or illogical messages'),

    # form
    dict(category='form', subcategory='bar', name='Bar'),
    dict(category='form', subcategory='line', name='Line'),
    dict(category='form', subcategory='dot', name='Dot'),
    dict(category='form', subcategory='circle', name='Circle'),
    dict(category='form', subcategory='area', name='Area'),
    dict(category='form', subcategory='map', name='Map'),
    dict(category='form', subcategory='table', name='Table'),
    dict(category='form', subcategory='icon', name='Icon'),
    dict(category='form', subcategory='other', name='Other'),

    # data
    dict(category='data', subcategory='attribute', name='Attribute', description='Basic attributes'),
    dict(category='data', subcategory='derived', name='Derived', description='Derived attributes'),
    dict(category='data', subcategory='time', name='Time', description='Time related data'),
    dict(category='data', subcategory='position', name='Coordinated', description='Spatial meaning in x-, y-, or z-axes'),
    dict(category='data', subcategory='relation', name='Relational', description='Items are inter-linked'),
    dict(category='data', subcategory='other', name='Other', description='Other uncategorized data types'),
]


In [21]:
# List every subcategory that carries a description as "<name><TAB><description>";
# entries without a 'description' key (e.g. the chart-form ones) are skipped.
for s in filter(lambda entry: 'description' in entry, subcategories):
    print(s['name'], s['description'], sep='\t')

Axis	Issues related to the axes of coordinated planes
Legends and Scales	Issues related to legends and the use of scales
Color	Issues in the use of colors
Text	Issues related to titles, annotations and labels
Encoding	Issues related to visual encoding
Data	Issues related to the underlying data, its sources and completeness
Data Transformation	Issues introduced when processing data or applying statistical transformations
Chart Making	Issues introduced in the graph drawing process, leads to missing graph elements or incomplete graph
Reading	Difficulties in reading
Perception	Issues related to the weaknesses in human perception
Message	Visualizations that are trying to convey nonsense or illogical messages
Attribute	Basic attributes
Derived	Derived attributes
Time	Time related data
Coordinated	Spatial meaning in x-, y-, or z-axes
Relational	Items are inter-linked
Other	Other uncategorized data types


In [22]:
# The two alternative groupings of the 'fault' subcategories. Each record
# keeps the key order category -> group -> name -> description.
groups = [
    {'category': 'fault', 'group': group_key, 'name': title, 'description': blurb}
    for group_key, title, blurb in (
        ('grammar', 'Graphical Components', 'Issues grouped by different components of a chart'),
        ('pipeline', 'Visual Analytics Pipeline', 'Issues grouped by different stages of visual analytics'),
    )
]

In [23]:
# One record per label category: its display name, its list of tag options and,
# for fault/form/data, the subcategory records that share the same category.
# The extra flags (hierarchical, grouped, sorted, group_subcat_threshold) are
# only stored here; their meaning is defined by whatever consumes the export
# (NOTE(review): presumably the labelling front end -- confirm).
labelOptions = [
    dict(
        category='fault', name='Issues', options=fault_types,
        subcategories=[sub for sub in subcategories if sub['category'] == 'fault'],
        hierarchical=True, grouped=True, sorted=True,
    ),
    dict(
        category='form', name='Chart Types', options=chart_types,
        subcategories=[sub for sub in subcategories if sub['category'] == 'form'],
        group_subcat_threshold=10,
    ),
    dict(
        category='data', name='Data Types', options=data_types,
        subcategories=[sub for sub in subcategories if sub['category'] == 'data'],
    ),
    dict(category='domain', name='Domains', options=domain_types),
    dict(category='media', name='Medium', options=media_types),
    dict(category='layout', name='Layouts', options=layout_types),
    dict(category='effect', name='Effects', options=effect_types),
    dict(category='flag', name='Flags', options=flag_types),
]


In [24]:
# Attach the usage count of each tag to its option record (in place).
# labelCnts is a Counter, so tags that never occur get a count of 0.
for category_entry in labelOptions:
    for option in category_entry['options']:
        tag = option['tag']
        option['count'] = labelCnts[tag]

In [25]:
# Show the assembled category/option structure (now including per-tag counts)
# through IPython's JSON display object for visual inspection.
JSON(labelOptions)

<IPython.core.display.JSON object>

In [26]:
# Flattened view of labelOptions:
#   categories    - each category record without its nested 'options' / 'subcategories'
#   subcategories - the full subcategory list, unchanged
#   groups        - the fault grouping records, unchanged
#   tags          - every option of every category, stamped with its category
labelTags = dict(
    categories=[
        {
            field: value
            for field, value in entry.items()
            if field not in ['options', 'subcategories']
        }
        for entry in labelOptions
    ],
    subcategories=subcategories,
    groups=groups,
    tags=[
        {**option, 'category': entry['category']}
        for entry in labelOptions
        for option in entry['options']
    ],
)

In [27]:
# Show the flattened categories / subcategories / groups / tags structure
# through IPython's JSON display object for visual inspection.
JSON(labelTags)

<IPython.core.display.JSON object>

# Export Labels

In [28]:
# Write the nested category/option structure as compact JSON (no whitespace
# between separators). The context manager closes -- and therefore flushes --
# the file deterministically instead of leaving that to garbage collection.
with open('tmp/labelOptions.json', 'w') as fp:
    json.dump(labelOptions, fp, separators=(',', ':'))

In [29]:
# Write the flattened tag structure as compact JSON (no whitespace between
# separators). The context manager closes -- and therefore flushes -- the file
# deterministically instead of leaving that to garbage collection.
with open('tmp/labelTags.json', 'w') as fp:
    json.dump(labelTags, fp, separators=(',', ':'))

# Replace Labels

In [30]:
# Disabled one-off migration, kept for reference. For every document in
# `vislabels` that has a label containing `current_label`, it removes those
# labels, appends `new_label` if it is not already present, and prints the
# label list before and after so the change can be reviewed.
# NOTE: `current_label in l` is a substring test on each label string, so any
# label that merely contains `current_label` is matched and removed as well.
# NOTE: the `update_one` write-back carries an extra comment marker, so
# un-commenting the loop alone only previews the change without touching the DB.
# # replace labels
# current_label = 'fault:invalidencoding'
# new_label = 'fault:cannotunderstand'

# for l in vislabels.find():
# #     if 'form:barchart' in l['labels'] and 'layout:stacked' in l['labels']:
#     if len([l for l in l['labels'] if current_label in l]) > 0:
#         ori_length = len(l['labels'])
#         print(len(l['labels']), l['_id'], l['image_id'], l['labels'])

#         l['labels'] = [l for l in l['labels'] if current_label not in l]
#         if new_label not in l['labels']:
#             l['labels'].append(new_label)

#         new_length = len(l['labels'])
#         print(new_length - ori_length, new_label in l['labels'], len(l['labels']), l['labels'])

#         # vislabels.update_one({'_id': l['_id']}, {'$set': {'labels': l['labels']}})
# #     break