In [None]:
%load_ext autoreload
%autoreload 2
%aimport theme
import pandas as pd
import altair as alt
import os
from theme import apply_theme
from constants import EVALUATION_DATE_FOLDER
alt.data_transformers.disable_max_rows(); # Allow using rows more than 5000

# Visualize
For previous code, refer to `./deprecated/04-Visualize.ipynb`.

## Data Portals

In [None]:
"""
Merge data for visualization
"""
# All three input tables live in the same dated output folder.
_output_dir = os.path.join('..', 'output', EVALUATION_DATE_FOLDER)

df = pd.read_csv(os.path.join(_output_dir, 'data-portal_evaluation.csv'))

# Columns that remain after dropping the identifier, status and summary-count
# columns are kept as the individual error/alert types. This is captured
# before the merges below add further columns to `df`.
ERROR_TYPES = list(df.columns)
for _summary_column in (
    'page_id', 'is_success', 'error_count', 'alert_count',
    'contrast_count', 'category_count', 'count',
):
    ERROR_TYPES.remove(_summary_column)

# Attach the resource `id` and the `page_type` of every evaluated page
df_pages = pd.read_csv(os.path.join(_output_dir, 'data-portal_pages.csv'))
df = pd.merge(df, df_pages[['id', 'page_id', 'page_type']], on='page_id', how='left')

# Attach the per-resource metadata
df_meta = pd.read_csv(os.path.join(_output_dir, 'data-portal_metadata.csv'))
df = pd.merge(df, df_meta, on='id', how='left')

# Number of evaluated pages that are not home pages
len(df.loc[df['page_type'] != 'home'])

In [None]:
# Derived columns reused by the charts below:
# errors and contrast errors combined, and whether a page has any of them.
df['error_and_contrast_count'] = df['error_count'].add(df['contrast_count'])
df['has_error'] = df['error_and_contrast_count'] > 0

In [None]:
# Single color passed as `color=` to every chart mark in this notebook
THEME_COLOR = '#409F7A'

In [None]:
# Home pages only. Taken as an explicit copy because later cells add columns
# (`citation_group`, `is_us`, `wcag`) to this frame; assigning to a filtered
# slice of `df` raises SettingWithCopyWarning and may not stick.
df_home = df[df['page_type'] == 'home'].copy()

In [None]:
# Largest `count` value among the home pages
df_home['count'].max()

In [None]:
# Histogram of the per-page `count` value over home pages.
# The x domain is clamped to [0, 2200] and the y axis is logarithmic.
point = (
    alt.Chart(df_home)
    .mark_bar(color=THEME_COLOR, filled=True)
    .encode(
        alt.X('count:Q').bin(maxbins=400).scale(domain=[0, 2200], clamp=True),
        alt.Y('count()').scale(type='log'),
        alt.Tooltip('count()'),
    )
    .properties(width=500)
)

point

In [None]:
# Top countries by the number of resources (home pages).
# `value_counts()` already sorts by frequency in descending order, so the
# labels are read from its index. The previous `reset_index()` route relied on
# the count column being called 'count', which only holds from pandas 2.0 on.
COUNTRY_N = 20
TOP_COUNTRY_BY_COUNT = df_home['country'].value_counts().head(COUNTRY_N).index.tolist()
TOP_COUNTRY_BY_COUNT

In [None]:
FIELD = 'count'

# Order the top countries by their median FIELD value, highest first.
COUNTRIES_SORT = (
    df_home.loc[df_home['country'].isin(TOP_COUNTRY_BY_COUNT), ['country', FIELD]]
    .groupby('country')
    .median()
    .sort_values(by=FIELD, ascending=False)
    .index
    .tolist()
)

# Median FIELD per country
point = (
    alt.Chart(df_home[df_home.country.isin(COUNTRIES_SORT)])
    .mark_bar(color=THEME_COLOR, filled=True)
    .encode(
        alt.X(f'median({FIELD}):Q', title='The Median Number of Errors & Alerts'),
        alt.Y('country:N', sort=COUNTRIES_SORT).axis(grid=True),
        alt.Tooltip(['count(count):Q']),
    )
    .properties(width=500)
)

# Share of pages with at least one error per country.
# NOTE(review): `per` is built here but is not part of the displayed `chart`.
per = (
    alt.Chart(df_home[df_home.country.isin(COUNTRIES_SORT)])
    .mark_bar(color=THEME_COLOR, filled=True)
    .encode(
        alt.X('mean(has_error):Q', title='Percent Pages w/ Errors & Alerts'),
        alt.Y('country:N', sort=COUNTRIES_SORT).axis(grid=True),
        alt.Tooltip(['count(count):Q']),
    )
    .properties(width=500)
)

# Number of pages per country, derived from `point` with the x channel replaced
bar = (
    point
    .mark_bar(color=THEME_COLOR)
    .encode(
        alt.X(f'count({FIELD}):Q'),
        alt.Y('country:N', sort=COUNTRIES_SORT),
    )
    .properties(width=100)
)

chart = bar | point

apply_theme(chart)

In [None]:
# Top host institutions by the number of resources (home pages).
# `value_counts()` already sorts by frequency in descending order, so the
# labels are read from its index. The previous `reset_index()` route relied on
# the count column being called 'count', which only holds from pandas 2.0 on.
N = 20
TOP = df_home['host_institution'].value_counts().head(N).index.tolist()
TOP

In [None]:
FIELD = 'count'
C = 'host_institution'

# Order the top institutions by their median FIELD value, highest first.
SORT = (
    df_home.loc[df_home[C].isin(TOP), [C, FIELD]]
    .groupby(C)
    .median()
    .sort_values(by=FIELD, ascending=False)
    .index
    .tolist()
)

# Median FIELD per institution
point = (
    alt.Chart(df_home[df_home[C].isin(SORT)])
    .mark_bar(color=THEME_COLOR, filled=True)
    .encode(
        alt.X(f'median({FIELD}):Q', title='The Median Number of Errors & Alerts'),
        alt.Y(f'{C}:N', sort=SORT, title=None).axis(grid=True),
        alt.Tooltip(['count(count):Q']),
    )
    .properties(width=500)
)

# Number of pages per institution, derived from `point` with the x channel replaced
bar = (
    point
    .mark_bar(color=THEME_COLOR)
    .encode(
        alt.X(f'count({FIELD}):Q'),
        alt.Y(f'{C}:N', sort=SORT, title=None),
    )
    .properties(width=100)
)

chart = bar | point

apply_theme(chart)

In [None]:
# Median `count` of home pages for each founding year, drawn as a line
plot = (
    alt.Chart(df_home)
    .mark_line(color=THEME_COLOR)
    .encode(
        alt.X('founded_year:O'),
        alt.Y('median(count):Q'),
    )
    .properties(width=500)
)
apply_theme(plot, x_label_angle=90)

## Impact

In [None]:
# Histogram of `citation` over home pages; the x domain is clamped to [0, 1000]
plot = (
    alt.Chart(df_home)
    .mark_bar(color=THEME_COLOR, filled=True)
    .encode(
        alt.Y('count()'),
        alt.X('citation:Q').bin(maxbins=1000).scale(domain=[0, 1000], clamp=True),
    )
    .properties(width=400, height=400)
)
apply_theme(plot)

In [None]:
# Quartile cut points of `citation` over home pages.
# Missing values are dropped first: NaN sorts to the end of the list, so
# leaving it in would inflate the length, shift every cut point upwards and
# could even make NaN itself a cut point.
hs = df_home['citation'].dropna().sort_values().tolist()

q1 = hs[len(hs) // 4]
q2 = hs[len(hs) // 2]
q3 = hs[len(hs) * 3 // 4]

print(q1, q2, q3)

In [None]:
# Bucket every home page by the citation quartile it falls into.
# Pages without a citation value get no bucket and are left out of the chart:
# every `<=` comparison against NaN is False, so they would otherwise be
# labelled as the top quartile.
df_home['citation_group'] = df_home['citation'].apply(
    lambda x: None if pd.isna(x)
    else '0-25%' if x <= q1
    else '25-50%' if x <= q2
    else '50-75%' if x <= q3
    else '75%-100%'
)

# Share of pages with at least one error per citation quartile.
# The y axis shows mean(has_error), so it is titled as a percentage of pages.
plot = (
    alt.Chart(df_home[df_home['citation_group'].notna()])
    .mark_bar(color=THEME_COLOR, filled=True)
    .encode(
        alt.Y('mean(has_error):Q', title='Percent Pages w/ Errors & Alerts').scale(domain=[0, 1]).axis(format='%'),
        alt.X('citation_group:N'),
        alt.Tooltip(['count(count):Q']),
    )
    .properties(width=400, height=400)
)

apply_theme(plot)

In [None]:
# Bucket every home page by the citation quartile it falls into.
# Pages without a citation value get no bucket and are left out of the chart:
# every `<=` comparison against NaN is False, so they would otherwise be
# labelled as the top quartile.
df_home['citation_group'] = df_home['citation'].apply(
    lambda x: None if pd.isna(x)
    else '0-25%' if x <= q1
    else '25-50%' if x <= q2
    else '50-75%' if x <= q3
    else '75%-100%'
)

# Median `count` per citation quartile
plot = (
    alt.Chart(df_home[df_home['citation_group'].notna()])
    .mark_bar(color=THEME_COLOR, filled=True)
    .encode(
        alt.Y('median(count):Q', title='Median Errors & Alerts'),
        alt.X('citation_group:N'),
        alt.Tooltip(['count(count):Q']),
    )
    .properties(width=400, height=400)
)

apply_theme(plot)

## NIH Portals vs. EBI Portals

In [None]:
# Host institutions treated as NIH.
# The commented query below is kept as a record of how candidates can be listed:
# NIH_CANDIDATES = df_home[
#     (df_home.country == 'United States') &
#     (~df_home.host_institution.str.contains('University', na=False)) &
#     (df_home.host_institution.str.contains('National', na=False))
# ].host_institution.value_counts().reset_index().sort_values(by='count', ascending=False).host_institution.tolist()
#
# NOTE(review): 'National Institute of Standards and Technology' belongs to the
# U.S. Department of Commerce rather than NIH — confirm it should be counted here.
# NOTE(review): 'National Institute of Health' (singular) looks like a spelling
# variant of 'National Institutes of Health' in the metadata — confirm.
NIH_INSTS = [
    'National Center for Biotechnology Information',
    'National Cancer Institute',
    'National Heart, Lung, and Blood Institute',
    'National Center for Advancing Translational Sciences',
    'National Institutes of Health',
    'National Human Genome Research Institute',
    'National Institute of Environmental Health Sciences',
    'National Library of Medicine',
    'National Institute of Standards and Technology',
    'National Institute of Health',
    'National Institute on Aging',
    'National Institute of Neurological Disorders & Stroke',
    'National Institute of Child Health and Human Development',
    'National Eye Institute',
    'National Institute of Allergy and Infectious Diseases',
    'National Institute of Arthritis and Musculoskeletal and Skin Diseases'
]

# Host institutions treated as EBI
EBI_INSTS = [
    'European Bioinformatics Institute'
]

# Home pages hosted by either group. Taken as an explicit copy because a column
# is added right below; assigning to a filtered slice raises
# SettingWithCopyWarning.
df_insts = df_home[df_home['host_institution'].isin(NIH_INSTS + EBI_INSTS)].copy()

# Every remaining row is NIH or EBI by construction, so "not NIH" means EBI.
df_insts['nih_vs_ebi'] = df_insts['host_institution'].isin(NIH_INSTS).map({True: 'NIH', False: 'EBI'})

In [None]:
# Median `count` for NIH vs. EBI home pages
plot = (
    alt.Chart(df_insts)
    .mark_bar(color=THEME_COLOR, size=50)
    .encode(
        alt.Y('median(count):Q', title='Median Errors & Alerts').scale(padding=2),
        alt.X('nih_vs_ebi:N', title=None, sort=['NIH', 'EBI']),
    )
    .properties(width=200)
)

# Share of pages with at least one error
per = (
    alt.Chart(df_insts)
    .mark_bar(color=THEME_COLOR, size=50)
    .encode(
        alt.Y('mean(has_error):Q', title='Percent Pages w/ Errors & Alerts').scale(domain=[0, 1]).axis(format='%'),
        alt.X('nih_vs_ebi:N', title=None, sort=['NIH', 'EBI']),
    )
    .properties(width=200)
)

# Number of pages in each group: same chart as `plot` with the y channel replaced
bar = plot.encode(alt.Y('count()'))

apply_theme(plot | per | bar, x_label_angle=0)

## US vs. Others

In [None]:
# Label each home page as 'US' or 'Non-US' from its `country` value
df_home['is_us'] = df_home['country'].eq('United States').map({True: 'US', False: 'Non-US'})

In [None]:
# Median `count` for US vs. non-US home pages
plot = (
    alt.Chart(df_home)
    .mark_bar(color=THEME_COLOR, size=50)
    .encode(
        alt.Y('median(count):Q', title='Median Errors & Alerts'),
        alt.X('is_us:N', title=None),
    )
    .properties(width=200)
)

# Share of pages with at least one error
per = (
    alt.Chart(df_home)
    .mark_bar(color=THEME_COLOR, size=50)
    .encode(
        alt.Y('mean(has_error):Q', title='Percent Pages w/ Errors & Alerts').scale(domain=[0, 1]).axis(format='%'),
        alt.X('is_us:N', title=None),
    )
    .properties(width=200)
)

# Number of pages in each group: same chart as `plot` with the y channel replaced
bar = plot.encode(alt.Y('count()'))

apply_theme(plot | per | bar, x_label_angle=0)

## NIH Portals vs. Non-NIH US Portals

In [None]:
# US home pages, labelled as 'NIH' or 'Others' by host institution.
# Taken as an explicit copy because a column is added right below; assigning to
# a filtered slice raises SettingWithCopyWarning.
df_us = df_home[df_home['country'] == 'United States'].copy()
df_us['is_nih'] = df_us['host_institution'].isin(NIH_INSTS).map({True: 'NIH', False: 'Others'})

In [None]:
# Median `count` for NIH vs. other US home pages
plot = (
    alt.Chart(df_us)
    .mark_bar(color=THEME_COLOR, size=50)
    .encode(
        alt.Y('median(count):Q', title='Median Errors & Alerts'),
        alt.X('is_nih:N', title=None),
    )
    .properties(width=200)
)

# Share of pages with at least one error
per = (
    alt.Chart(df_us)
    .mark_bar(color=THEME_COLOR, size=50)
    .encode(
        alt.Y('mean(has_error):Q', title='Percent Pages w/ Errors & Alerts').scale(domain=[0, 1]).axis(format='%'),
        alt.X('is_nih:N', title=None),
    )
    .properties(width=200)
)

# Number of pages in each group: same chart as `plot` with the y channel replaced
bar = plot.encode(alt.Y('count()'))

apply_theme(plot | per | bar, x_label_angle=0)

## NIH Breakdown

In [None]:
# US home pages that were labelled 'NIH'
df_nih = df_us.loc[df_us['is_nih'] == 'NIH']

In [None]:
# One bar per NIH resource (`short_name`), longest bar first; the tooltip shows the URL
plot = (
    alt.Chart(df_nih)
    .mark_bar(color=THEME_COLOR)
    .encode(
        alt.X('count:Q', title='Errors & Alerts'),
        alt.Y('short_name:N', title=None, sort='-x'),
        tooltip=['url'],
    )
)

apply_theme(plot, x_label_angle=0)

## Temporal
- WCAG 1.0: 1999-2007
- WCAG 2.0: 2008-2017
- WCAG 2.1: 2018-2022
- WCAG 2.2: 2023-

In [None]:
# Label every home page with a WCAG period based on its `founded_year`.
# The first cut-off is 1999 (not 1998) so that a resource founded in 1998 lands
# in the '-1998' bucket, as that label promises, instead of '1999-2007'.
# NOTE(review): a missing founded_year fails every `<` test and therefore ends
# up in '2018- (WCAG 2.1)' — confirm whether such rows exist and should be excluded.
df_home['wcag'] = df_home.founded_year.apply(
    lambda x: '-1998 (Before WCAG 1.0)' if x < 1999
    else '1999-2007 (WCAG 1.0)' if x < 2008
    else '2008-2017 (WCAG 2.0)' if x < 2018
    else '2018- (WCAG 2.1)'
)

In [None]:
# Median `count` per WCAG period (pages from before WCAG 1.0 are left out)
plot = (
    alt.Chart(df_home[df_home.wcag != '-1998 (Before WCAG 1.0)'])
    .mark_bar(color=THEME_COLOR)
    .encode(
        alt.X('wcag:N'),
        alt.Y('median(count):Q', title='Median Errors & Alerts'),
    )
    .properties(width=400, height=400)
)

# Share of pages with at least one error per WCAG period.
# This plots mean(has_error), so it is titled and formatted as a percentage,
# matching the other "Percent Pages" charts in this notebook.
per = (
    alt.Chart(df_home[df_home.wcag != '-1998 (Before WCAG 1.0)'])
    .mark_bar(color=THEME_COLOR)
    .encode(
        alt.X('wcag:N'),
        alt.Y('mean(has_error):Q', title='Percent Pages w/ Errors & Alerts').scale(domain=[0, 1]).axis(format='%'),
    )
    .properties(width=400, height=400)
)

# Number of pages per WCAG period: same chart as `plot` with the y channel replaced
bar = plot.encode(
    alt.Y('count()'),
    alt.X('wcag:N'),
)

apply_theme(plot | per | bar, x_label_angle=-90)

## By Page Type

In [None]:
def mean_errors_in_page_by_page_type(df):
    """Chart the mean `error_and_contrast_count` per page type with an error bar.

    Args:
        df: frame with `page_type` and `error_and_contrast_count` columns.

    Returns:
        Whatever `apply_theme` returns for the layered point + error-bar chart.
    """
    page_type_order = ['home', 'documentation', 'search_result', 'search', 'data_entity']
    base = alt.Chart(df)

    # Filled dot at the mean of each page type
    point = base.mark_point(filled=True, color=THEME_COLOR, size=100).encode(
        alt.X('mean(error_and_contrast_count):Q', title='Mean Errors'),
        alt.Y('page_type:N', title='Page Type', sort=page_type_order),
    )
    # Error bar computed by Altair from the raw per-page values
    error_bar = base.mark_errorbar(color=THEME_COLOR, thickness=2).encode(
        alt.X('error_and_contrast_count:Q', title='Mean Errors'),
        alt.Y('page_type:N', title='Page Type', sort=page_type_order),
    )

    layered = (point + error_bar).properties(
        title='Average # of Errors in a Page',
        height=300,
        width=400,
    )
    return apply_theme(layered)
mean_errors_in_page_by_page_type(df)

In [None]:
def error_pages_by_page_type(df):
    """Bar chart of the share of pages with errors per page type, with labels.

    Note: a later cell in this notebook defines another function under the
    same name; this definition is the one in effect until that cell runs.

    Args:
        df: frame with `page_type` and boolean `has_error` columns.

    Returns:
        Whatever `apply_theme` returns for the bar + text-label chart.
    """
    share_with_error = 'mean(has_error):Q'

    bar = (
        alt.Chart(df)
        .mark_bar(color=THEME_COLOR)
        .encode(
            alt.Y(share_with_error, title='Percentage of Pages').axis(format='%', tickCount=20),
            alt.X('page_type:N', title='Page Type', sort='-y'),
            alt.Tooltip(share_with_error),
        )
        .properties(title='Proportion of Pages with Errors', width=500)
    )

    # White percentage label drawn just below the top of each bar
    text = bar.mark_text(size=20, baseline='top', dy=10, color='white').encode(
        alt.Text(share_with_error, format='.1%')
    )
    return apply_theme(bar + text, x_label_angle=-90)
error_pages_by_page_type(df)

## By Error Type

In [None]:
# Long format: one row per (page, error type) holding that type's count
id_vars = ['page_id', 'page_type']
df_long = pd.melt(
    df[ERROR_TYPES + id_vars],
    id_vars=id_vars,
    value_vars=ERROR_TYPES,
    var_name='error_type',
    value_name='error_count',
)
df_long.head(3)

In [None]:
# The error types with the largest total count across all pages.
# Only `error_count` is summed: summing the whole frame also aggregates the
# string identifier columns (`page_id`, `page_type`), which is wasted work and
# behaves differently across pandas versions.
TOP_ERROR_TYPE_N = 10
TOP_ERROR_TYPES = (
    df_long.groupby('error_type')['error_count']
    .sum()
    .sort_values(ascending=False)
    .head(TOP_ERROR_TYPE_N)
    .index
    .tolist()
)
TOP_ERROR_TYPES

In [None]:
# Total count for each of the top error types, tallest bar first
top_error_rows = df_long[df_long.error_type.isin(TOP_ERROR_TYPES)]
apply_theme(
    alt.Chart(top_error_rows)
    .mark_bar(color=THEME_COLOR)
    .encode(
        alt.Y('sum(error_count):Q', title='Total Errors & Alerts'),
        alt.X('error_type:N', title='Error Type', sort='-y'),
    )
    .properties(width=400),
    x_label_angle=-90,
)

In [None]:
# Maps each error/alert type column to a comma-separated list of the groups it
# is counted under ('Perceivable', 'Operable', 'Understandable', 'Robust', or
# 'Other'). A later cell looks up every error type in this dict and tests group
# membership with `in` on the string value.
wcag_level_map = {
	'error_label_missing':'Perceivable, Operable, Understandable',
	'error_language_missing':'Understandable',
	'error_button_empty':'Perceivable, Operable',
	'alert_alt_suspicious':'Perceivable',
	'alert_hPerceivable_missing':'Perceivable, Operable',  # NOTE(review): key looks like a garbled 'alert_h1_missing' (also listed at the end) — confirm and remove if unused
	'alert_link_redundant':'Operable',
	'alert_link_pdf':'Perceivable',
	'alert_text_small':'Perceivable',
	'contrast_contrast':'Perceivable',
	'alert_select_missing_label':'Perceivable, Operable, Understandable',
	'alert_region_missing':'Perceivable, Operable',
	'error_alt_missing':'Perceivable',
	'error_link_empty':'Operable',
	'error_aria_menu_broken':'Operable, Robust',
	'alert_alt_duplicate':'Perceivable',
	'alert_label_orphaned':'Perceivable, Operable, Understandable',
	'alert_heading_skipped':'Perceivable, Operable',
	'alert_link_suspicious':'Operable',
	'alert_event_handler':'Operable',
	'alert_text_justified':'Understandable',
	'alert_heading_missing':'Perceivable, Operable',
	'alert_link_internal_broken':'Operable',
	'error_alt_link_missing':'Perceivable, Operable',
	'alert_youtube_video':'Perceivable',
	'alert_title_redundant':'Understandable',
	'error_th_empty':'Perceivable',
	'alert_noscript':'Other',
	'alert_image_title':'Perceivable',
	'alert_label_title':'Perceivable, Operable, Understandable',
	'alert_accesskey':'Operable',
	'alert_table_layout':'Perceivable',
	'error_heading_empty':'Operable, Understandable',
	'alert_alt_redundant':'Perceivable',
	'alert_legend_missing':'Perceivable, Operable, Understandable',
	'alert_tabindex':'Operable',
	'alert_heading_possible':'Perceivable, Operable',
	'alert_underline':'Understandable',
	'error_link_skip_broken':'Operable',
	'alert_fieldset_missing':'Perceivable, Operable, Understandable',
	'alert_plugin':'Perceivable',
	'alert_alt_long':'Perceivable',
	'alert_javascript_jumpmenu':'Operable, Understandable',
	'error_alt_spacer_missing':'Perceivable',
	'error_title_invalid':'Operable',
	'error_aria_reference_broken':'Perceivable',
	'alert_table_caption_possible':'Perceivable',
	'alert_link_powerpoint':'Perceivable',
	'alert_link_excel':'Perceivable',
	'alert_audio_video':'Perceivable',
	'alert_list_possible':'Perceivable',
	'error_label_multiple':'Perceivable, Operable, Understandable',
	'error_meta_refresh':'Operable',
	'error_alt_area_missing':'Perceivable, Operable',
	'alert_flash':'Perceivable, Operable',
	'error_alt_map_missing':'Perceivable',
	'error_label_empty':'Perceivable, Operable, Understandable',
	'alert_html5_video_audio':'Perceivable',
	'error_alt_input_missing':'Perceivable, Operable',
	'alert_link_word':'Perceivable',
	'alert_longdesc':'Perceivable',
	'error_marquee':'Operable',
	'error_blink':'Operable',
	'alert_applet': 'Other',
	'alert_h1_missing': 'Perceivable, Operable'
}

In [None]:
# Long format: one row per (page, error type) holding that type's count
id_vars = ['page_id', 'page_type']
df_long = df[ERROR_TYPES + id_vars].melt(id_vars=id_vars, value_vars=ERROR_TYPES, var_name='error_type', value_name='error_count')

# Fail early, naming the offenders, when an error type has no group mapping
# (the per-row lookup used before raised a bare KeyError for the first one).
unmapped_error_types = sorted(set(ERROR_TYPES) - set(wcag_level_map))
if unmapped_error_types:
    raise KeyError(f'No entry in wcag_level_map for: {unmapped_error_types}')

df_long['wcag_level'] = df_long['error_type'].map(wcag_level_map)

# One column per group: the row's error_count when its error type belongs to
# that group, otherwise 0. Membership is a plain substring test on the
# comma-separated `wcag_level` string, done column-wise instead of row by row.
for level in ['Perceivable', 'Operable', 'Understandable', 'Robust', 'Other']:
    in_level = df_long['wcag_level'].str.contains(level, regex=False)
    df_long[level] = df_long['error_count'].where(in_level, 0)

df_long.head(3)

In [None]:
# Second melt: one row per (page, error type, group) holding the count
# attributed to that group.
LEVELS = ['Perceivable', 'Operable', 'Understandable', 'Robust', 'Other']
id_vars = ['page_id', 'page_type', 'error_type']
df_long_level = pd.melt(
    df_long[LEVELS + id_vars],
    id_vars=id_vars,
    value_vars=LEVELS,
    var_name='level',
    value_name='count',
)
df_long_level.head(3)

In [None]:
def average_errors_by_error_type_and_page_type(df_long_level):
    """Facet, by page type, the summed `count` per group for non-home pages.

    Despite the name, the bars show totals (`sum(count)`) per `level`, not
    averages per error type. A later cell in this notebook defines another
    function under the same name.

    Args:
        df_long_level: long frame with `page_type`, `level` and `count` columns.

    Returns:
        Whatever `apply_theme` returns for the faceted bar chart.
    """
    non_home_rows = df_long_level[df_long_level.page_type != 'home']
    bar = (
        alt.Chart(non_home_rows)
        .mark_bar(color=THEME_COLOR)
        .encode(
            alt.X('sum(count):Q', title='Total Errors & Alerts'),
            alt.Y('level:N', title='Error Groups', sort=LEVELS),
        )
        .properties(width=200)
    )
    # One column of bars per page type
    faceted = bar.facet(alt.Column('page_type:N', title=None))
    return apply_theme(faceted)
average_errors_by_error_type_and_page_type(df_long_level)

In [None]:
def num_of_errors_by_resources(df):
    """Bar chart of the summed `error_count` per resource, one column per page type.

    The title now describes what is plotted: the chart aggregates with
    `sum(error_count)` per `short_name`, not an average per page.

    Args:
        df: frame with `error_count`, `short_name` and `page_type` columns.

    Returns:
        Whatever `apply_theme` returns for the chart.
    """
    bar = alt.Chart(df).mark_bar(color=THEME_COLOR).encode(
        # symlog keeps zero-valued resources drawable while compressing large totals
        alt.X('sum(error_count):Q', title='The Number of Errors').scale(type='symlog').axis(tickCount=20, orient='top'),
        alt.Y('short_name:N', title=None, sort='-x'),
        alt.Column('page_type:N', title='Page Type'),
    )
    return apply_theme(
        bar.properties(
            title='Total # of Errors by Resource and Page Type',
            width=200
        )
    )
num_of_errors_by_resources(df)

In [None]:
def average_errors_by_error_type_and_page_type(df_long):
    """Facet, by page type, the mean `error_count` per error type with error bars.

    This replaces the function of the same name defined in an earlier cell.

    Args:
        df_long: long frame with `page_type`, `error_type` and `error_count` columns.

    Returns:
        Whatever `apply_theme` returns for the faceted point + error-bar chart.
    """
    base = alt.Chart(df_long)

    # Dot at the mean; the x domain is clamped to [0, 200]
    point = (
        base
        .mark_point(filled=True, color=THEME_COLOR)
        .encode(
            alt.X('mean(error_count):Q', title='Average Number of Errors').scale(domain=[0, 200], clamp=True),
            alt.Y('error_type:N', title='Error Type'),
        )
        .properties(width=200)
    )
    # Error bar computed by Altair from the raw per-page values
    error_bar = base.mark_errorbar(color=THEME_COLOR, thickness=2).encode(
        alt.X('error_count:Q', title='Average Number of Errors'),
        alt.Y('error_type:N', title='Error Type').axis(grid=True),
    )

    faceted = (point + error_bar).facet(alt.Column('page_type:N', title='Page Type'))
    return apply_theme(
        faceted.properties(title='Average # of Errors in a Page by Page Type')
    )
average_errors_by_error_type_and_page_type(df_long)

In [None]:
# Flag the (page, error type) rows where that error type occurs at least once
df_long['has_error'] = df_long['error_count'] > 0

In [None]:
def error_pages_by_page_type(df_long):
    """Facet, by page type, the share of pages showing each error type.

    This replaces the function of the same name defined in an earlier cell.

    Args:
        df_long: long frame with `page_type`, `error_type` and boolean
            `has_error` columns.

    Returns:
        Whatever `apply_theme` returns for the faceted bar chart.
    """
    bar = (
        alt.Chart(df_long)
        .mark_bar(color=THEME_COLOR)
        .encode(
            alt.X('mean(has_error):Q', title='Percentage of Pages').scale(domain=[0, 1]).axis(format='%'),
            alt.Y('error_type:N', title='Error Type'),
        )
        .properties(width=200)
    )
    # One column of bars per page type
    faceted = bar.facet(alt.Column('page_type:N', title='Page Type'))
    return apply_theme(
        faceted.properties(title='Proportion of Error Pages by Page Type')
    )
error_pages_by_page_type(df_long)