In [None]:
# !pip install altair_saver
# !conda install -y -c conda-forge vega-cli vega-lite-cli
# !conda install -y -c conda-forge altair_saver
# !npm install vega-lite vega-cli canvas

In [2]:
import json
import os
import shutil
from pathlib import Path

import altair as alt
import pandas as pd
import pydash as _
from altair import datum
from altair_saver import save
from IPython.display import JSON

In [3]:
# Output directory for every exported chart.
charts_dir = Path('./charts')
# Start from an empty directory on each run. The former `!rm -r charts_dir`
# handed the literal word "charts_dir" to the shell (no `{}` interpolation), so
# it failed with "No such file or directory" and stale charts were never removed.
shutil.rmtree(charts_dir, ignore_errors=True)
charts_dir.mkdir(exist_ok=True)

rm: cannot remove 'charts_dir': No such file or directory


In [4]:
# Load the label metadata and the image list. Each file is opened in a `with`
# block so its handle is closed right away (a bare `json.load(open(...))` leaves
# it open until garbage collection), and is decoded as UTF-8 instead of the
# platform's locale encoding.
with open('../image_lists/labelTags.json', encoding='utf-8') as label_tags_file:
    labelTags = json.load(label_tags_file)
with open('../image_lists/labelOptions.json', encoding='utf-8') as label_options_file:
    labelOptions = json.load(label_options_file)
with open('../image_lists/all.json', encoding='utf-8') as all_images_file:
    # Only the 'images' entry of all.json is used by this notebook.
    allImages = json.load(all_images_file)['images']

In [5]:
# Show the parsed labelTags structure using IPython's JSON display object.
JSON(labelTags)

<IPython.core.display.JSON object>

In [6]:
# Show the parsed labelOptions structure using IPython's JSON display object.
JSON(labelOptions)

<IPython.core.display.JSON object>

In [7]:
# JSON(allImages)

In [8]:
from pyppeteer import launch
async def svg2png(in_svg_path, out_png_path, width=1334):
    """Rasterize an SVG file to a PNG by screenshotting it in headless Chromium.

    Args:
        in_svg_path: Path of the SVG file to render (relative or absolute).
        out_png_path: Path the PNG screenshot is written to.
        width: Viewport width in pixels. Defaults to 1334, the value that was
            previously hard-coded. Pass ``None`` to use the width declared on
            the SVG root element instead.

    The viewport height is always read from the SVG root element.
    """
    browser = await launch(args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        # as_uri() yields a well-formed file:// URL: it percent-encodes spaces
        # and other special characters, which plain string concatenation did not.
        await page.goto(Path(os.path.abspath(in_svg_path)).as_uri())
        dimensions = await page.evaluate('''() => {
        return {
            width: document.documentElement.width.baseVal.value,
            height: document.documentElement.height.baseVal.value
        }
    }''')
        viewport_width = int(dimensions['width']) if width is None else int(width)
        await page.setViewport({'width': viewport_width, 'height': int(dimensions['height'])})
        await page.screenshot({'path': str(out_png_path)})
    finally:
        # Always shut the browser down, even when navigation or the screenshot
        # fails, so no Chromium process is left running.
        await browser.close()

## Statistics

In [9]:
# One record per tag in the 'fault' category. Each record carries the tag id,
# its name and count, a "name (count)" label, and the display name of the tag's
# first subcategory as declared in labelOptions[0].
issuetypes = []
for tag_info in labelTags['tags']:
    if tag_info['category'] != 'fault':
        continue
    record = {
        **_.pick(tag_info, ['tag', 'name', 'count']),
        'namecount': f"{tag_info['name']} ({tag_info['count']})",
    }
    matched_option = _.find(
        labelOptions[0]['subcategories'],
        lambda x: x['subcategory'] == tag_info['subcategories'][0],
    )
    record['subcategory'] = matched_option['name']
    issuetypes.append(record)

In [10]:
# Tabulate the issue-type records held in `issuetypes`.
# NOTE(review): the next cell begins by rebuilding this same frame, so this cell
# looks redundant — confirm before removing.
df_issuetypes = pd.DataFrame(issuetypes)

In [11]:
# Names of the subcategories whose group is 'pipeline', in labelOptions order.
pipeline_options = [o for o in labelOptions[0]['subcategories'] if o['group'] == 'pipeline']
subcat_order = _.map_(pipeline_options, 'name')

# Sort by count (descending) within each subcategory, then lay the subcategory
# groups out in `subcat_order` and restore 'subcategory' as a regular column.
df_issuetypes = (
    pd.DataFrame(issuetypes)
    .sort_values(['subcategory', 'count'], ascending=[True, False])
    .set_index('subcategory')
    .loc[subcat_order]
    .reset_index(level=0)
)

In [12]:
# Shared bar-chart definition: count on x (fixed 0-200 domain), "name (count)"
# labels on y, longest bar first. Each panel below filters it to one subcategory.
issuetypes_base = alt.Chart(df_issuetypes).mark_bar().encode(
    x=alt.X('count:Q', title=None, scale=alt.Scale(domain=[0, 200])),
    y=alt.Y('namecount:N', sort='-x', title=None)
)


def _issuetypes_panel(subcategory, color, title, x_title=None):
    """Build one bar-chart panel restricted to a single subcategory.

    Args:
        subcategory: Value of the `subcategory` column to keep.
        color: Bar colour for this panel.
        title: Panel title.
        x_title: Optional x-axis title. When omitted the panel keeps the
            untitled x axis of `issuetypes_base`.

    Returns:
        The filtered, coloured and titled Altair chart.
    """
    panel = issuetypes_base.transform_filter(
        (datum.subcategory == subcategory)
    ).mark_bar(color=color).properties(title=title)
    if x_title is not None:
        # Work on a copy so the shared base encoding is never mutated.
        x_axis = issuetypes_base.encoding.x.copy()
        x_axis.title = x_title
        panel = panel.encode(x=x_axis)
    return panel


# Only the bottom panel of each column carries the x-axis title.
issuetypes_data = _issuetypes_panel(
    'Data', 'gray', 'Data: Issues related to the underlying data')

issuetypes_derive = _issuetypes_panel(
    'Data Transformation', 'lightskyblue',
    'Data Transformation: Issues introduced when processing data')

issuetypes_graph = _issuetypes_panel(
    'Graph Drawing', 'ForestGreen',
    'Graph Drawing: Issues Introduced in the graph drawing process',
    x_title='Number of Visualizations (Non-exclusive)')

issuetypes_visual = _issuetypes_panel(
    'Reading', 'gold', 'Reading: Difficulties in reading visualizations')

issuetypes_perception = _issuetypes_panel(
    'Perception', 'brown', 'Perception: Issues related to human perception')

issuetypes_logical = _issuetypes_panel(
    'Message', 'purple',
    'Message: Visualizations that are trying to convey illogical message',
    x_title='Number of Visualizations (Non-exclusive)')

In [13]:
# Two side-by-side columns, each stacking three of the subcategory panels.
issuetypes_chart = alt.hconcat(
    alt.vconcat(issuetypes_data, issuetypes_derive, issuetypes_graph),
    alt.vconcat(issuetypes_visual, issuetypes_perception, issuetypes_logical),
)
issuetypes_chart

In [14]:
save(issuetypes_chart, f"{charts_dir/'issuetypes.svg'}")
await svg2png(f"{charts_dir/'issuetypes.svg'}", f"{charts_dir/'issuetypes.png'}")

In [15]:
# One record per tag in the 'data' category, with the tag's subcategory key
# resolved to the display name declared in labelOptions[2].
datatypes = []
for tag_info in labelTags['tags']:
    if tag_info['category'] != 'data':
        continue
    record = {
        **_.pick(tag_info, ['tag', 'name', 'count']),
        'namecount': f"{tag_info['name']} ({tag_info['count']})",
    }
    matched_option = _.find(
        labelOptions[2]['subcategories'],
        lambda x: x['subcategory'] == tag_info['subcategory'],
    )
    record['subcategory'] = matched_option['name']
    datatypes.append(record)

# Subcategory display names in labelOptions order; reused for the colour legend.
subcat_order = _.map_(labelOptions[2]['subcategories'], 'name')

# Count-descending within each subcategory, groups arranged in `subcat_order`.
df_datatypes = (
    pd.DataFrame(datatypes)
    .sort_values(['subcategory', 'count'], ascending=[True, False])
    .set_index('subcategory')
    .loc[subcat_order]
    .reset_index(level=0)
)

In [16]:
# Bar chart of the data-type tags, coloured by subcategory.
# The y sort list has to contain values of the encoded field (`namecount`). It
# was previously built from the `name` column, whose values never equal a
# "name (count)" label, so the requested order matched nothing.
datatypes_chart = alt.Chart(df_datatypes).mark_bar().encode(
    x=alt.X(field='count', type='quantitative', title='Number of Visualizations (Non-exclusive)', scale=alt.Scale(domain=[0, 750])),
    y=alt.Y(field='namecount', type='nominal', sort=list(df_datatypes['namecount']), title=None),
    color=alt.Color(field='subcategory', type='nominal', legend=alt.Legend(title=None), sort=subcat_order)
)
datatypes_chart

In [17]:
save(datatypes_chart, f"{charts_dir/'datatypes.svg'}")
await svg2png(f"{charts_dir/'datatypes.svg'}", f"{charts_dir/'datatypes.png'}")

In [18]:
# One record per tag in the 'form' category, with the tag's subcategory key
# resolved to the display name declared in labelOptions[1].
charttypes = []
for tag_info in labelTags['tags']:
    if tag_info['category'] != 'form':
        continue
    record = {
        **_.pick(tag_info, ['tag', 'name', 'count']),
        'namecount': f"{tag_info['name']} ({tag_info['count']})",
    }
    matched_option = _.find(
        labelOptions[1]['subcategories'],
        lambda x: x['subcategory'] == tag_info['subcategory'],
    )
    record['subcategory'] = matched_option['name']
    charttypes.append(record)

# Subcategory display names in labelOptions order; reused for the colour legend.
subcat_order = _.map_(labelOptions[1]['subcategories'], 'name')

# Count-descending within each subcategory, groups arranged in `subcat_order`.
df_charttypes = (
    pd.DataFrame(charttypes)
    .sort_values(['subcategory', 'count'], ascending=[True, False])
    .set_index('subcategory')
    .loc[subcat_order]
    .reset_index(level=0)
)

In [19]:
# Bar chart of the chart-type tags, coloured by subcategory.
# The y sort list has to contain values of the encoded field (`namecount`). It
# was previously built from the `name` column, whose values never equal a
# "name (count)" label, so the requested order matched nothing.
charttypes_chart = alt.Chart(df_charttypes).mark_bar().encode(
    x=alt.X(field='count', type='quantitative', title='Number of Visualizations (Non-exclusive for multiple view visualizations)', scale=alt.Scale(domain=[0, 300])),
    y=alt.Y(field='namecount', type='nominal', sort=list(df_charttypes['namecount']), title=None),
    color=alt.Color(field='subcategory', type='nominal', legend=alt.Legend(title=None), sort=subcat_order)
)
charttypes_chart

In [20]:
save(charttypes_chart, f"{charts_dir/'charttypes.svg'}")
await svg2png(f"{charts_dir/'charttypes.svg'}", f"{charts_dir/'charttypes.png'}")

In [21]:
# One record per tag in the 'domain' category. The label of the tag named
# 'Unknown' gets a leading '*'.
domains = []
for tag_info in labelTags['tags']:
    if tag_info['category'] != 'domain':
        continue
    picked = _.pick(tag_info, ['tag', 'name', 'count'])
    if tag_info['name'] != 'Unknown':
        label = f"{tag_info['name']} ({tag_info['count']})"
    else:
        label = f"*{tag_info['name']} ({tag_info['count']})"
    domains.append({**picked, 'namecount': label})

# Largest domain first.
df_domains = pd.DataFrame(domains).sort_values(['count'], ascending=[False])
df_domains

Unnamed: 0,tag,name,count,namecount
11,domain:public,Public,168,Public (168)
5,domain:health,Health,130,Health (130)
0,domain:politics,Politics,124,Politics (124)
1,domain:business,Business,94,Business (94)
2,domain:entertainment,Entertainment,87,Entertainment (87)
8,domain:computer,Computer,85,Computer (85)
10,domain:international,International,77,International (77)
9,domain:environment,Environment,59,Environment (59)
6,domain:education,Education,50,Education (50)
4,domain:sports,Sports,45,Sports (45)


In [22]:
# Bar chart of visualization counts per data domain.
# The y sort list has to contain values of the encoded field (`namecount`). It
# was previously built from the `name` column, whose values never equal a
# "name (count)" label, so the requested order matched nothing.
domains_chart = alt.Chart(df_domains).mark_bar(color='gray').encode(
    x=alt.X('count:Q', title='Number of Visualizations'),
    y=alt.Y('namecount:N', sort=list(df_domains['namecount']), title=None)
).properties(title='Data Domains')
domains_chart

In [23]:
save(domains_chart, f"{charts_dir/'domains.svg'}")
await svg2png(f"{charts_dir/'domains.svg'}", f"{charts_dir/'domains.png'}")

In [24]:
# One record per tag in the 'media' category, labelled "name (count)".
medium = []
for tag_info in labelTags['tags']:
    if tag_info['category'] != 'media':
        continue
    picked = _.pick(tag_info, ['tag', 'name', 'count'])
    medium.append({**picked, 'namecount': f"{tag_info['name']} ({tag_info['count']})"})

# Most frequent medium first.
df_medium = pd.DataFrame(medium).sort_values(['count'], ascending=[False])
df_medium

Unnamed: 0,tag,name,count,namecount
3,media:printed,Printed,89,Printed (89)
0,media:tv,TV,61,TV (61)
2,media:ads,Ads,19,Ads (19)
7,media:infographic,Infographic,18,Infographic (18)
1,media:news,News Article,14,News Article (14)
8,media:google,Google Products,14,Google Products (14)
4,media:handdrawn,Handdrawn,7,Handdrawn (7)
5,media:inreallife,Inreallife,2,Inreallife (2)
6,media:NSFW,NSFW,0,NSFW (0)


In [25]:
# Gray bars of visualization counts per medium, longest bar first.
medium_chart = (
    alt.Chart(df_medium)
    .mark_bar(color='gray')
    .encode(
        x=alt.X(field='count', type='quantitative', title='Number of Visualizations'),
        y=alt.Y(field='namecount', type='nominal', sort='-x', title=None),
    )
    .properties(title='Visualization Medium')
)
medium_chart

In [26]:
save(medium_chart, f"{charts_dir/'medium.svg'}")
await svg2png(f"{charts_dir/'medium.svg'}", f"{charts_dir/'medium.png'}")

In [27]:
# One record per tag in the 'effect' category, labelled "name (count)".
effects = []
for tag_info in labelTags['tags']:
    if tag_info['category'] != 'effect':
        continue
    picked = _.pick(tag_info, ['tag', 'name', 'count'])
    effects.append({**picked, 'namecount': f"{tag_info['name']} ({tag_info['count']})"})

# Most frequent effect first.
df_effects = pd.DataFrame(effects).sort_values(['count'], ascending=[False])
df_effects

Unnamed: 0,tag,name,count,namecount
2,effect:confuesed,Confused,547,Confused (547)
1,effect:distorted,Distorted,472,Distorted (472)
3,effect:suggestive,Suggestive,25,Suggestive (25)
0,effect:reversed,Reversed,10,Reversed (10)


In [28]:
# Gray bars of visualization counts per perceived effect, longest bar first.
effects_chart = (
    alt.Chart(df_effects)
    .mark_bar(color='gray')
    .encode(
        x=alt.X(field='count', type='quantitative', title='Number of Visualizations'),
        y=alt.Y(field='namecount', type='nominal', sort='-x', title=None),
    )
    .properties(title='Perceived Effects')
)
effects_chart

In [29]:
save(effects_chart, f"{charts_dir/'effects.svg'}")
await svg2png(f"{charts_dir/'effects.svg'}", f"{charts_dir/'effects.png'}")

## Co-occurrence

### Charts and Issues

In [30]:
# For every (issue type, chart type) pair, count the images whose labels
# contain both tags.
records = [
    {
        'Issue Types': issuetype['namecount'],
        'issuetype_tag': issuetype['tag'],
        'Chart Types': charttype['namecount'],
        'charttype_tag': charttype['tag'],
        'count': sum(
            1 for image in allImages
            if issuetype['tag'] in image['labels'] and charttype['tag'] in image['labels']
        ),
    }
    for issuetype in issuetypes
    for charttype in charttypes
]

In [31]:
# Tabulate the records currently held in `records`, one row per record.
df_charttypes_issuetypes = pd.DataFrame(records)

In [32]:
# Shared x/y layout. Only pairs with a non-zero count are plotted, and both
# axes follow the row order of the corresponding summary frames.
base = alt.Chart(
    df_charttypes_issuetypes.loc[df_charttypes_issuetypes['count'] > 0]
).encode(
    x=alt.X(field='Issue Types', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_issuetypes['namecount'])),
    y=alt.Y(field='Chart Types', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_charttypes['namecount'])),
)

# Cell colour encodes the count on the 'blues' scheme over a fixed 0-100 domain.
heatmap = base.mark_rect().encode(
    color=alt.Color(
        field='count',
        type='quantitative',
        scale=alt.Scale(scheme='blues', domain=[0, 100]),
        legend=alt.Legend(direction='vertical', title='# of Vis'),
    )
)

# The count itself is overlaid on each cell as white text.
text = base.mark_text(baseline='middle').encode(
    text=alt.Text(field='count', type='quantitative'),
    color=alt.value('white'),
)

# Layer the labels over the coloured cells.
charttypes_issuetypes_chart = alt.layer(heatmap, text)
charttypes_issuetypes_chart

In [33]:
save(charttypes_issuetypes_chart, f"{charts_dir/'charttypes_issuetypes.svg'}")
await svg2png(f"{charts_dir/'charttypes_issuetypes.svg'}", f"{charts_dir/'charttypes_issuetypes.png'}")

### Data and Issues

In [34]:
# For every (issue type, data type) pair, count the images whose labels
# contain both tags.
records = [
    {
        'Issue Types': issuetype['namecount'],
        'issuetype_tag': issuetype['tag'],
        'Data Types': datatype['namecount'],
        'datatype_tag': datatype['tag'],
        'count': sum(
            1 for image in allImages
            if issuetype['tag'] in image['labels'] and datatype['tag'] in image['labels']
        ),
    }
    for issuetype in issuetypes
    for datatype in datatypes
]

In [35]:
# Tabulate the records currently held in `records`, one row per record.
df_datatypes_issuetypes = pd.DataFrame(records)

In [48]:
# Shared x/y layout. Only pairs with a non-zero count are plotted, and both
# axes follow the row order of the corresponding summary frames.
base = alt.Chart(
    df_datatypes_issuetypes.loc[df_datatypes_issuetypes['count'] > 0]
).encode(
    x=alt.X(field='Issue Types', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_issuetypes['namecount'])),
    y=alt.Y(field='Data Types', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_datatypes['namecount'])),
)

# Cell colour encodes the count on the 'blues' scheme over a fixed 0-100 domain.
heatmap = base.mark_rect().encode(
    color=alt.Color(
        field='count',
        type='quantitative',
        scale=alt.Scale(scheme='blues', domain=[0, 100]),
        legend=alt.Legend(direction='vertical', title='# of Vis'),
    )
)

# The count itself is overlaid on each cell as white text.
text = base.mark_text(baseline='middle').encode(
    text=alt.Text(field='count', type='quantitative'),
    color=alt.value('white'),
)

# Layer the labels over the coloured cells and fix the chart width at 1334.
datatypes_issuetypes_chart = alt.layer(heatmap, text).properties(width=1334)
datatypes_issuetypes_chart

In [49]:
save(datatypes_issuetypes_chart, f"{charts_dir/'datatypes_issuetypes.svg'}")
await svg2png(f"{charts_dir/'datatypes_issuetypes.svg'}", f"{charts_dir/'datatypes_issuetypes.png'}")

### Domains and Issues

In [38]:
# For every (issue type, domain) pair, count the images whose labels contain
# both tags.
records = [
    {
        'Issue Types': issuetype['namecount'],
        'issuetype_tag': issuetype['tag'],
        'Domains': domain['namecount'],
        'domain_tag': domain['tag'],
        'count': sum(
            1 for image in allImages
            if issuetype['tag'] in image['labels'] and domain['tag'] in image['labels']
        ),
    }
    for issuetype in issuetypes
    for domain in domains
]

In [39]:
# Tabulate the records currently held in `records`, one row per record.
df_domains_issuetypes = pd.DataFrame(records)

In [40]:
# Shared x/y layout. Only pairs with a non-zero count are plotted, and both
# axes follow the row order of the corresponding summary frames.
base = alt.Chart(
    df_domains_issuetypes.loc[df_domains_issuetypes['count'] > 0]
).encode(
    x=alt.X(field='Issue Types', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_issuetypes['namecount'])),
    y=alt.Y(field='Domains', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_domains['namecount'])),
)

# Cell colour encodes the count on the 'blues' scheme over a fixed 0-100 domain.
heatmap = base.mark_rect().encode(
    color=alt.Color(
        field='count',
        type='quantitative',
        scale=alt.Scale(scheme='blues', domain=[0, 100]),
        legend=alt.Legend(direction='vertical', title='# of Vis'),
    )
)

# The count itself is overlaid on each cell as white text.
text = base.mark_text(baseline='middle').encode(
    text=alt.Text(field='count', type='quantitative'),
    color=alt.value('white'),
)

# Layer the labels over the coloured cells.
domains_issuetypes_chart = alt.layer(heatmap, text)
domains_issuetypes_chart

In [41]:
save(domains_issuetypes_chart, f"{charts_dir/'domains_issuetypes.svg'}")
await svg2png(f"{charts_dir/'domains_issuetypes.svg'}", f"{charts_dir/'domains_issuetypes.png'}")

### Domains and Charts

In [42]:
# For every (chart type, domain) pair, count the images whose labels contain
# both tags.
records = [
    {
        'Chart Types': charttype['namecount'],
        'charttype_tag': charttype['tag'],
        'Domains': domain['namecount'],
        'domain_tag': domain['tag'],
        'count': sum(
            1 for image in allImages
            if charttype['tag'] in image['labels'] and domain['tag'] in image['labels']
        ),
    }
    for charttype in charttypes
    for domain in domains
]
df_domains_charttypes = pd.DataFrame(records)

# Shared x/y layout. Only pairs with a non-zero count are plotted, and both
# axes follow the row order of the corresponding summary frames.
base = alt.Chart(
    df_domains_charttypes.loc[df_domains_charttypes['count'] > 0]
).encode(
    x=alt.X(field='Chart Types', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_charttypes['namecount'])),
    y=alt.Y(field='Domains', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_domains['namecount'])),
)

# Cell colour encodes the count on the 'oranges' scheme over a fixed 0-100 domain.
heatmap = base.mark_rect().encode(
    color=alt.Color(
        field='count',
        type='quantitative',
        scale=alt.Scale(scheme='oranges', domain=[0, 100]),
        legend=alt.Legend(direction='vertical', title='# of Vis'),
    )
)

# The count itself is overlaid on each cell as white text.
text = base.mark_text(baseline='middle').encode(
    text=alt.Text(field='count', type='quantitative'),
    color=alt.value('white'),
)

# Layer the labels over the coloured cells.
domains_charttypes_chart = alt.layer(heatmap, text)
domains_charttypes_chart

In [43]:
save(domains_charttypes_chart, f"{charts_dir/'domains_charttypes.svg'}")
await svg2png(f"{charts_dir/'domains_charttypes.svg'}", f"{charts_dir/'domains_charttypes.png'}")

### Effects and Issues

In [44]:
# For every (issue type, effect) pair, count the images whose labels contain
# both tags.
records = [
    {
        'Issue Types': issuetype['namecount'],
        'issuetype_tag': issuetype['tag'],
        'Effects': effect['namecount'],
        'effect_tag': effect['tag'],
        'count': sum(
            1 for image in allImages
            if issuetype['tag'] in image['labels'] and effect['tag'] in image['labels']
        ),
    }
    for issuetype in issuetypes
    for effect in effects
]
df_effects_issuetypes = pd.DataFrame(records)

# Shared x/y layout. Only pairs with a non-zero count are plotted, and both
# axes follow the row order of the corresponding summary frames.
base = alt.Chart(
    df_effects_issuetypes.loc[df_effects_issuetypes['count'] > 0]
).encode(
    x=alt.X(field='Issue Types', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_issuetypes['namecount'])),
    y=alt.Y(field='Effects', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_effects['namecount'])),
)

# Cell colour encodes the count on the 'blues' scheme over a fixed 0-100 domain.
heatmap = base.mark_rect().encode(
    color=alt.Color(
        field='count',
        type='quantitative',
        scale=alt.Scale(scheme='blues', domain=[0, 100]),
        legend=alt.Legend(direction='vertical', title='# of Vis'),
    )
)

# The count itself is overlaid on each cell as white text.
text = base.mark_text(baseline='middle').encode(
    text=alt.Text(field='count', type='quantitative'),
    color=alt.value('white'),
)

# Layer the labels over the coloured cells.
effects_issuetypes_chart = alt.layer(heatmap, text)
effects_issuetypes_chart

In [45]:
save(effects_issuetypes_chart, f"{charts_dir/'effects_issuetypes.svg'}")
await svg2png(f"{charts_dir/'effects_issuetypes.svg'}", f"{charts_dir/'effects_issuetypes.png'}")

### Effects and Charts

In [46]:
# For every (chart type, effect) pair, count the images whose labels contain
# both tags.
records = [
    {
        'Chart Types': charttype['namecount'],
        'charttype_tag': charttype['tag'],
        'Effects': effect['namecount'],
        'effect_tag': effect['tag'],
        'count': sum(
            1 for image in allImages
            if charttype['tag'] in image['labels'] and effect['tag'] in image['labels']
        ),
    }
    for charttype in charttypes
    for effect in effects
]
df_effects_charttypes = pd.DataFrame(records)

# Shared x/y layout. Only pairs with a non-zero count are plotted, and both
# axes follow the row order of the corresponding summary frames.
base = alt.Chart(
    df_effects_charttypes.loc[df_effects_charttypes['count'] > 0]
).encode(
    x=alt.X(field='Chart Types', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_charttypes['namecount'])),
    y=alt.Y(field='Effects', type='nominal', scale=alt.Scale(paddingInner=0),
            sort=list(df_effects['namecount'])),
)

# Cell colour encodes the count on the 'oranges' scheme over a fixed 0-100 domain.
heatmap = base.mark_rect().encode(
    color=alt.Color(
        field='count',
        type='quantitative',
        scale=alt.Scale(scheme='oranges', domain=[0, 100]),
        legend=alt.Legend(direction='vertical', title='# of Vis'),
    )
)

# The count itself is overlaid on each cell as white text.
text = base.mark_text(baseline='middle').encode(
    text=alt.Text(field='count', type='quantitative'),
    color=alt.value('white'),
)

# Layer the labels over the coloured cells.
effects_charttypes_chart = alt.layer(heatmap, text)
effects_charttypes_chart

In [47]:
save(effects_charttypes_chart, f"{charts_dir/'effects_charttypes.svg'}")
await svg2png(f"{charts_dir/'effects_charttypes.svg'}", f"{charts_dir/'effects_charttypes.png'}")