In [None]:
%load_ext autoreload
%autoreload 2
%aimport theme
import pandas as pd
import altair as alt
import json
from theme import apply_theme
alt.data_transformers.disable_max_rows(); # Allow using rows more than 5000

## Load Evaluation Data

In [None]:
# Load the merged journal evaluation results into `jdf`, then print the column
# names and display the first rows as a quick sanity check of the file's layout.
jdf = pd.read_json('../output/journals-merged.json')
print(jdf.columns)
jdf.head()

In [None]:
# Load the merged data-portal evaluation results into `pdf`, then print the
# column names and display the first rows as a quick sanity check.
pdf = pd.read_json('../output/portals-merged.json')
print(pdf.columns)
pdf.head()

In [None]:
# Load the merged visualization evaluation results into `vdf`, then print the
# column names and display the first rows as a quick sanity check.
vdf = pd.read_json('../output/visualizations-merged.json')
print(vdf.columns)
vdf.head()

In [None]:
# Bar chart of the summed "alert" issue counts per journal publisher,
# with publishers ordered by descending bar height.
alt.Chart(jdf.loc[jdf['issueType'] == 'alert']).mark_bar().encode(
    alt.X('publisher', sort='-y'),
    alt.Y('sum(issueCount)'),
)

## Percent of Resources with Accessibility Issues

In [None]:
# One bar-chart panel per resource type, showing the share of resources that
# have (True) / do not have (False) at least one "error" or "contrast" issue.
# Each entry: (panel title, evaluation frame, accent colour for the True bar).
mapping = [
    ('Journals', jdf, '#CC7DAA'),
    ('Data Portals', pdf, '#409F7A'),
    ('Visualizations', vdf, '#3275B4')
]

charts = []
for (title, data, color) in mapping:

    # Denominator: every distinct resource of this type, with or without issues.
    resource_ids = data.baId.unique()
    N = len(resource_ids)

    filtered = data[data.issueType.isin(['error', 'contrast'])].copy()

    # Sum only `issueCount`. A frame-wide `.sum()` also tries to aggregate the
    # text columns, which is wasteful and can fail on recent pandas versions.
    issue_totals = filtered.groupby(by='baId')['issueCount'].sum()

    # A resource with no "error"/"contrast" rows at all has zero such issues.
    # Re-adding it keeps the two bars consistent with the denominator N
    # (otherwise it would be counted in N but appear in neither bar).
    issue_totals = issue_totals.reindex(resource_ids, fill_value=0)

    grouped = (
        (issue_totals >= 1)
        .value_counts()
        .rename_axis('hasIssues')
        .reset_index(name='resourceCount')
    )
    grouped['percentHasIssues'] = grouped.resourceCount / N

    chart = alt.Chart(grouped).mark_bar().encode(
        x=alt.X('hasIssues:N'),
        # Only the left-most panel (Journals) shows y-axis labels; the y scale is shared.
        y=alt.Y('percentHasIssues:Q', title=None, axis=alt.Axis(format='%', labels=(title == 'Journals')), scale=alt.Scale(domain=[0, 1])),
        # Pin the domain so False is always grey and True always the accent colour.
        # With an inferred domain, a panel containing only True bars would map
        # True to the first range entry (grey).
        color=alt.Color('hasIssues:N', legend=None, scale=alt.Scale(domain=[False, True], range=['#A0A0A0', color])),
        tooltip=['percentHasIssues']
    )

    chart = chart.properties(
        title={
            "text": f'{title}',
            "subtitle": [
                f'N={N}',
            ],
        },
        width=300
    )
    charts.append(chart)
apply_theme(alt.hconcat(*charts, spacing=10).resolve_scale(color='independent', y='shared')).properties(
    title='Percent of Resources with Accessibility Issues (\"Error\" & \"Contrast\")'
)

## Outstanding Accessibility Issues

In [None]:
# One horizontal bar-chart panel per resource type, showing the average
# `issueCount` of every "error"/"contrast" issue kind.
# Each entry: (panel title, evaluation frame, bar colour).
mapping = [
    ('Journals', jdf, '#CC7DAA'),
    ('Data Portals', pdf, '#409F7A'),
    ('Visualizations', vdf, '#3275B4')
]

charts = []
for (title, data, color) in mapping:
    # Keep exactly the issue types named in the figure title. This matches the
    # other cells, which filter on "error" | "contrast", not on "not alert".
    df = data[data.issueType.isin(['error', 'contrast'])].copy()
    df['issueLongDescription'] = df['issueDesc'] + ' (' + df['issueId'] + ')'

    # Average only `issueCount`. A frame-wide `.mean()` raises a TypeError on
    # text columns with pandas >= 2.0 (numeric_only now defaults to False).
    df = df.groupby(by='issueLongDescription')[['issueCount']].mean().reset_index()

    # Bar order: most frequent issue kind first.
    sort = df.sort_values(by='issueCount', ascending=False).issueLongDescription.tolist()

    chart = alt.Chart(df).mark_bar().encode(
        # Only the left-most panel (Journals) shows the issue labels.
        y=alt.Y('issueLongDescription', sort=sort, title=None, axis=alt.Axis(labels=(title == 'Journals'), grid=True)),
        # `df` already has one row per issue kind, so this mean is the value itself.
        x=alt.X('mean(issueCount)'),
        # A single-colour range paints every bar of the panel in the panel colour.
        color=alt.Color('issueLongDescription', scale=alt.Scale(range=[color]), legend=None)
    )
    charts.append(chart.properties(title=f'{title}'))
apply_theme(alt.hconcat(*charts, spacing=10).resolve_scale(color='independent', y='shared')).properties(
    title='Average Number of Accessibility Issues (\"Error\" & \"Contrast\")'
)

## Accessibility by Data Portal Founded Year
Note: we do not have a time variable to track the accessibility over time

In [None]:
# Scatter plot of data portals: founded year vs. mean number of
# "error"/"contrast" issues, with point size = number of portals in that year.
df = pdf.copy()
df = df[df.issueType.isin(['error', 'contrast'])]
# Drop portals without a founded year (stored as an empty string), then keep
# only those founded in 1998 or later.
df = df[df.foundedYear != '']
df = df[df.foundedYear.astype('int') >= 1998]

# One row per portal: total issue count (plus the number of contributing rows).
grouped = df.groupby(by=['baId', 'foundedYear']).agg({'issueCount': 'sum', 'dbId': 'count'}).reset_index()

chart = alt.Chart(grouped).mark_point(
    opacity=1,
    filled=True
).encode(
    # NOTE(review): ':T' parses the year as a date; this assumes `foundedYear`
    # holds year strings such as '2005' rather than integers. TODO confirm.
    x=alt.X('foundedYear:T'),
    y=alt.Y('mean(issueCount)'),
    size=alt.Size('count(baId):Q'),
    tooltip=['count(baId)']
)
# The data plotted here is `pdf` (data portals), so the title must say so;
# it previously read "Journals".
chart.properties(title='Accessibility of Data Portals by Founded Year')

In [None]:
# Per founded year: the share of data portals that have no "error"/"contrast"
# issues, coloured by how many portals were founded in that year.
df = pdf.copy()
df = df[df.issueType.isin(['error', 'contrast'])]
df = df[df.foundedYear != '']

# One row per portal with its total issue count, flagged by whether it has any.
grouped = df.groupby(by=['baId', 'foundedYear']).agg({'issueCount': 'sum'}).reset_index()
grouped['hasIssues'] = grouped['issueCount'].gt(0)

# After count(), `issueCount` holds the NUMBER OF PORTALS per (hasIssues, year).
grouped = grouped.groupby(by=['hasIssues', 'foundedYear']).count().reset_index()

# Despite its name, `totalIssueCount` is the number of portals founded in each year.
tdf = grouped.groupby(by='foundedYear')['issueCount'].sum().rename('totalIssueCount').to_frame()
grouped = grouped.set_index('foundedYear').join(tdf).reset_index()

# `issuePercent` is the share of that year's portals in each hasIssues group;
# only the issue-free group is kept for plotting.
grouped['issuePercent'] = grouped['issueCount'] / grouped['totalIssueCount']
grouped = grouped[grouped['hasIssues'].eq(False)]
MAX_COUNT_PER_YEAR = grouped['totalIssueCount'].max()

chart = alt.Chart(grouped).mark_bar(opacity=1, filled=True).encode(
    x=alt.X('foundedYear'),
    y=alt.Y(
        'issuePercent',
        axis=alt.Axis(format='%'),
        title='Percent of Resources w/o Issues',
    ),
    color=alt.Color(
        'totalIssueCount',
        scale=alt.Scale(scheme='bluegreen', domain=[0, MAX_COUNT_PER_YEAR]),
        legend=alt.Legend(gradientLength=120, gradientThickness=20, titlePadding=20),
        title='# of Resources',
    ),
    tooltip=['totalIssueCount'],
).properties(width=800)

apply_theme(
    chart.properties(title='The Percent of Data Portals without Accessibility Issues by Founded Year'),
    x_label_angle=310,
)
# grouped

## Accessibility vs. Impact

In [None]:
# Scatter plots of impact score vs. number of "error"/"contrast" issues, one
# panel per resource type, each with a dashed regression trend line.
# Set `highlight` to False to draw every panel in its own colour without
# emphasizing a group of interest.
highlight = True
# Each entry: (panel title, frame, impact column, issue column, name column, colour).
mapping = [
    ('Journals', jdf, 'hIndex', 'issueCount', 'title', '#CC7DAA'),
    ('Data Portals', pdf, 'citation', 'issueCount', 'shortName', '#409F7A'),
    ('Visualizations', vdf, 'githubStars', 'issueCount', 'name', '#3275B4')
]

# Group of interest per panel, as (column, value). A resource type without an
# entry is drawn without emphasis. Previously such a panel was skipped
# entirely (`continue`) whenever `highlight` was on, so the Visualizations
# chart silently vanished from the figure.
highlight_groups = {
    'Journals': ('publisher', 'Nature Publishing Group'),
    'Data Portals': ('hostInstitution', 'National Cancer Institute'),
}

charts = []
for (title, data, impactField, issueField, nameField, color) in mapping:

    filtered = data[data.issueType.isin(['error', 'contrast'])].copy()
    # Subtitle count: all distinct resources of this type, before filtering.
    N = len(data.baId.unique())

    # Emphasis applies only when it is enabled AND a group is defined for this panel.
    group = highlight_groups.get(title) if highlight else None
    emphasize = group is not None

    filtered['highlight'] = True
    if emphasize:
        group_field, group_value = group
        filtered['highlight'] = filtered[group_field] == group_value

    # Base layer: one point per resource (grouped by the name field), grey
    # when a highlighted overlay will be drawn on top, panel colour otherwise.
    chart = alt.Chart(
        filtered
    ).mark_point(
        opacity=0.3,
        size=70
    ).encode(
        x=alt.X(f'mean({impactField}):Q', title=impactField, scale=alt.Scale(type='symlog')),
        y=alt.Y(f'sum({issueField}):Q', title=issueField, scale=alt.Scale(type='symlog')),
        color=alt.Color(f'{nameField}:N', scale=alt.Scale(range=['grey' if emphasize else color]), legend=None),
        tooltip=[nameField]
    )

    # NOTE(review): the regression is fitted on the rows of `filtered`, while
    # the points above show per-resource mean/sum aggregates. Confirm that
    # fitting on un-aggregated rows is intended.
    trend = chart.transform_regression(
        impactField,
        issueField,
        method='log'
    ).mark_line(
        opacity=1,
        stroke='black',
        strokeDash=[3, 3]
    ).encode(
        x=alt.X(impactField, title=impactField, scale=alt.Scale(type='symlog')),
        y=alt.Y(issueField, title=issueField, scale=alt.Scale(type='symlog'))
    )

    if emphasize:
        # Overlay layer: larger filled points; only `highlight == True` is in
        # the colour domain and receives the panel colour.
        overlay = chart.mark_point(filled=True, opacity=0.7, size=140).encode(
            color=alt.Color('highlight:N', scale=alt.Scale(domain=[True], range=[color]), legend=None)
        )
        chart = (chart + overlay).resolve_scale(color='independent')

    charts.append((chart + trend).properties(
        title={
            "text": title,
            "subtitle": f'N={N}'
        },
        width=340,
        height=300
    ))
apply_theme(alt.hconcat(*charts).resolve_scale(color='independent', y='shared')).properties(
    title='Impact Scores by Number of Accessibility Issues (\"Error\" & \"Contrast\")'
)