In [None]:
%load_ext autoreload
%autoreload 2
%aimport theme
import pandas as pd
import altair as alt
from theme import apply_theme
alt.data_transformers.disable_max_rows(); # Allow using rows more than 5000

In [None]:
# Date label for the evaluation data; it is interpolated into the
# "Data as of ..." line of the chart subtitles in the cells below.
DATA_DATE = 'Feb 24, 2023'

## Load Evaluation Data

In [None]:
# Load the merged journal evaluation records into `jdf`.
# Later cells group these rows by `baId` and sum `issueCount`.
jdf = pd.read_json('../output/journals-merged.json')
print(jdf.columns)  # list the available fields
jdf.head()  # preview the first rows

In [None]:
# Load the merged data-portal evaluation records into `pdf`.
# Later cells group these rows by `baId` and sum `issueCount`.
pdf = pd.read_json('../output/portals-merged.json')
print(pdf.columns)  # list the available fields
pdf.head()  # preview the first rows

In [None]:
# Visualization resources are not used because their evaluation results are of lower quality
# vdf = pd.read_json('../output/visualizations-merged.json')
# print(vdf.columns)
# vdf.head()

In [None]:
# Largest per-resource total of non-"alert" issues, for journals and data portals.
# `issueCount` is selected before aggregating so that only this column is summed:
# a frame-wide `.sum()` also tries to add up every text/metadata column, which is
# slow and can raise a TypeError on pandas >= 2.0 (numeric_only defaults to False).
MAX_ISSUES_JOURNAL = jdf[jdf.issueType != 'alert'].groupby('baId').issueCount.sum().max()

MAX_ISSUES_PORTAL = pdf[pdf.issueType != 'alert'].groupby('baId').issueCount.sum().max()

print(f'The most errors detected on a single journal portal was {MAX_ISSUES_JOURNAL}!')
print(f'The most errors detected on a single data portal was {MAX_ISSUES_PORTAL}!')

## Percent of Resources with Accessibility Issues

In [None]:
# Share of resources with at least one "error" or "contrast" issue, drawn as one
# bar-chart panel per resource type.
mapping = [
    ('Journals', jdf, '#CC7DAA'),
    ('Data Portals', pdf, '#409F7A'),
    # ('Visualizations', vdf, '#3275B4')
]

charts = []
for idx, (title, data, color) in enumerate(mapping):

    resource_ids = data.baId.unique()
    N = len(resource_ids)

    # The accessibility issues we count
    filtered = data[data.issueType.isin(['error', 'contrast'])]

    # Total counted issues per resource. Only `issueCount` is summed (a frame-wide
    # sum would also aggregate every text column and can fail on pandas >= 2.0).
    # Resources without any "error"/"contrast" row are kept with a total of 0 so
    # that they are reported as "No Issues" instead of silently dropping out of
    # a percentage whose denominator (N) still includes them.
    issue_totals = (
        filtered.groupby('baId').issueCount.sum()
        .reindex(resource_ids, fill_value=0)
    )

    # Label every resource, then express the label counts as a share of N
    labels = issue_totals.ge(1).map({True: 'Has Issues', False: 'No Issues'})
    shares = labels.value_counts() / N
    grouped = pd.DataFrame({
        'hasIssues': shares.index,
        'percentHasIssues': shares.values,
    })

    # Only the first (left-most) panel carries the y-axis title and labels
    is_first = idx == 0
    chart = alt.Chart(grouped).mark_bar().encode(
        x=alt.X('hasIssues:N', title=None),
        y=alt.Y(
            'percentHasIssues:Q',
            title='The percentage of resources' if is_first else None,
            axis=alt.Axis(format='%', labels=is_first),
            scale=alt.Scale(domain=[0, 1])
        ),
        color=alt.Color('hasIssues:N', legend=None, scale=alt.Scale(range=[color, '#A0A0A0'], domain=['Has Issues', 'No Issues'])),
        tooltip=['percentHasIssues']
    )

    # White percentage labels inside the bars; bars at or below 20% get no label
    text = chart.mark_text(
        dy=20,
        size=20,
    ).encode(
        text=alt.Text('percentHasIssues', format='.1%'),
        color=alt.value('white')
    ).transform_filter(
        alt.datum['percentHasIssues'] > 0.2
    )

    chart = chart + text

    chart = chart.properties(
        title={
            "text": f'{title}',
            "subtitle": f'N={N}',
        },
        width=300
    )
    charts.append(chart)

apply_theme(
    alt.hconcat(*charts, spacing=10).resolve_scale(color='independent', y='shared').properties(
        title={
            "text": 'Percentage of Resources with Accessibility Issues',
            "subtitle": [
                'Only \"error\" and \"contrast\" issues are counted',
                f'Data as of {DATA_DATE}',
            ]
        }
))

In [None]:
# Mean number of non-"alert" issues per resource, with 95% confidence intervals,
# drawn as one panel per resource type.
mapping = [
    ('Journals', jdf, '#CC7DAA'),
    ('Data Portals', pdf, '#409F7A'),
    # ('Visualizations', vdf, '#3275B4')
]

charts = []
for idx, (title, data, color) in enumerate(mapping):
    resource_ids = data.baId.unique()
    N = len(resource_ids)

    # Per-resource issue totals. Only `issueCount` is summed (a frame-wide sum would
    # also aggregate text columns and can fail on pandas >= 2.0). Resources that
    # have no non-alert row are kept with 0 issues so the mean covers all N
    # resources reported in the subtitle.
    totals = (
        data[data.issueType != 'alert']
        .groupby('baId').issueCount.sum()
        .reindex(resource_ids, fill_value=0)
    )
    df = pd.DataFrame({'baId': totals.index, 'issueCount': totals.values})

    chart = alt.Chart(df).mark_point(
        filled=True,
        size=200
    ).encode(
        x=alt.X('issueCount', aggregate='mean', title='Average number of accessibility issues (95% CIs)'),
        color=alt.value(color)
    ).properties(
        width=400,
        height=70
    )

    # Confidence interval of the mean, drawn underneath the point
    error_bars = chart.mark_errorbar(extent='ci').encode(
        x=alt.X('issueCount:Q', title='Average number of accessibility issues (95% CIs)'),
        color=alt.value('black')
    )

    chart = error_bars + chart

    chart = chart.properties(title={
        "text": f'{title}',
        "subtitle": f'N={N}',
    })

    charts.append(chart)

apply_theme(alt.vconcat(*charts, spacing=10).resolve_scale(color='independent', x='shared')).properties(
    title={
        "text": 'The Average Number of Accessibility Issues',
        "subtitle": [
            'Only \"error\" and \"contrast\" issues are counted',
            f'Data as of {DATA_DATE}',
        ]
    }
)

In [None]:
# Mean number of issues of any type per resource, with 95% confidence intervals,
# drawn as one panel per resource type.
mapping = [
    ('Journals', jdf, '#CC7DAA'),
    ('Data Portals', pdf, '#409F7A'),
    # ('Visualizations', vdf, '#3275B4')
]

charts = []
for idx, (title, data, color) in enumerate(mapping):
    N = len(data.baId.unique())

    # Per-resource issue totals over every issue type. Only `issueCount` is summed:
    # a frame-wide sum would also aggregate text columns and can fail on
    # pandas >= 2.0.
    df = data.groupby('baId').issueCount.sum().reset_index()

    chart = alt.Chart(df).mark_point(
        filled=True,
        size=200
    ).encode(
        x=alt.X('issueCount', aggregate='mean', title='Average number of accessibility issues (95% CIs)'),
        color=alt.value(color)
    ).properties(
        width=400,
        height=70
    )

    # Confidence interval of the mean, drawn underneath the point
    error_bars = chart.mark_errorbar(extent='ci').encode(
        x=alt.X('issueCount:Q', title='Average number of accessibility issues (95% CIs)'),
        color=alt.value('black')
    )

    chart = error_bars + chart

    chart = chart.properties(title={
        "text": f'{title}',
        "subtitle": f'N={N}',
    })

    charts.append(chart)

apply_theme(alt.vconcat(*charts, spacing=10).resolve_scale(color='independent', x='shared')).properties(
    title={
        "text": 'The Average Number of Accessibility Issues',
        "subtitle": [
            'All of \"error\", \"contrast\", and \"alert\" issues are counted',
            f'Data as of {DATA_DATE}',
        ]
    }
)

In [None]:
# Histogram-like bars: how many resources have a given total number of
# "error"/"contrast" issues. One panel per resource type.
mapping = [
    ('Journals', jdf, 'hIndex', 'issueCount', 'title', '#CC7DAA'),
    ('Data Portals', pdf, 'citation', 'issueCount', 'shortName', '#409F7A'),
    # ('Visualizations', vdf, 'githubStars', 'issueCount', 'name', '#3275B4')
]

charts = []
for (title, df, impactField, issueField, nameField, color) in mapping:

    resource_ids = df.baId.unique()
    N = len(resource_ids)

    filtered = df[df.issueType.isin(['error', 'contrast'])]

    # Issue total per resource; resources without any counted row are kept at 0 so
    # the bars add up to the N shown in the subtitle.
    totals = (
        filtered.groupby('baId')[issueField].sum()
        .reindex(resource_ids, fill_value=0)
    )

    # Number of resources per distinct issue total, stored in an explicit
    # `resourceCount` column (rather than reading a row count out of the impact
    # column of a frame-wide aggregate).
    counts = totals.value_counts().sort_index()
    agg = pd.DataFrame({issueField: counts.index, 'resourceCount': counts.values})

    chart = alt.Chart(
        agg
    ).mark_bar(stroke='black').encode(
        x=alt.X(f'{issueField}:Q', scale=alt.Scale(type='symlog', constant=10, nice=False), title='The number of issues encountered'),
        y=alt.Y('sum(resourceCount):Q', title='The number of resources'),
        color=alt.value(color)
    ).properties(
        width=500
    )

    chart = chart.properties(
        title={
            "text": title,
            "subtitle": f'N={N}'
        }
    )

    charts.append(chart)

apply_theme(alt.hconcat(*charts).resolve_scale(color='independent', y='shared')).properties(
    title={
        "text": 'The Number of Resources by the Number of Accessibility Issues',
        "subtitle": [
            'Only \"error\" and \"contrast\" issues are counted',
            f'Data as of {DATA_DATE}',
        ]
    }
)

In [None]:
# Histogram-like bars: how many resources have a given total number of issues
# (all issue types). One panel per resource type.
mapping = [
    ('Journals', jdf, 'hIndex', 'issueCount', 'title', '#CC7DAA'),
    ('Data Portals', pdf, 'citation', 'issueCount', 'shortName', '#409F7A'),
    # ('Visualizations', vdf, 'githubStars', 'issueCount', 'name', '#3275B4')
]

charts = []
for (title, df, impactField, issueField, nameField, color) in mapping:

    N = len(df.baId.unique())

    # Issue total per resource. Only the issue column is summed: a frame-wide sum
    # would also aggregate text columns and can fail on pandas >= 2.0.
    totals = df.groupby('baId')[issueField].sum()

    # Number of resources per distinct issue total, stored in an explicit
    # `resourceCount` column (rather than reading a row count out of the impact
    # column of a frame-wide aggregate).
    counts = totals.value_counts().sort_index()
    agg = pd.DataFrame({issueField: counts.index, 'resourceCount': counts.values})

    chart = alt.Chart(
        agg
    ).mark_bar(stroke='black').encode(
        x=alt.X(f'{issueField}:Q', scale=alt.Scale(type='symlog', constant=10, nice=False), title='The number of issues encountered'),
        y=alt.Y('sum(resourceCount):Q', title='The number of resources'),
        color=alt.value(color)
    ).properties(
        width=500
    )

    chart = chart.properties(
        title={
            "text": title,
            "subtitle": f'N={N}'
        }
    )

    charts.append(chart)

apply_theme(alt.hconcat(*charts).resolve_scale(color='independent', y='shared')).properties(
    title={
        "text": 'The Number of Resources by the Number of Accessibility Issues',
        "subtitle": [
            'All of \"error\", \"contrast\", and \"alert\" issues are counted',
            f'Data as of {DATA_DATE}',
        ]
    }
)

## Popular Accessibility Issues

In [None]:
# Mean count of each non-"alert" issue (with 95% CIs), one panel per resource
# type, rows ordered by the mean count.
mapping = [
    ('Journals', jdf, '#CC7DAA'),
    ('Data Portals', pdf, '#409F7A'),
    # ('Visualizations', vdf, '#3275B4')
]

charts = []
for idx, (title, data, color) in enumerate(mapping):
    df = data[data.issueType != 'alert'].copy()
    df['issueLongDescription'] = df['issueDesc'] + ' (' + df['issueId'] + ')'

    # Row order: issues with the highest mean count first. The mean is taken on
    # `issueCount` only; a frame-wide `.mean()` raises a TypeError on the text
    # columns with pandas >= 2.0.
    sort = (
        df.groupby('issueLongDescription').issueCount.mean()
        .sort_values(ascending=False)
        .index.tolist()
    )

    N = len(data.baId.unique())

    # Only the first (left-most) panel shows the issue labels
    chart = alt.Chart(df).mark_point(
        filled=True
    ).encode(
        y=alt.Y('issueLongDescription', sort=sort, title=None, axis=alt.Axis(labels=(idx == 0), grid=True)),
        x=alt.X('issueCount', aggregate='mean', title='Average number of accessibility issues (95% CIs)'),
        color=alt.value(color),
        tooltip=[
            'count(issueCount)'
        ]
    )

    # Confidence interval of the mean, drawn underneath the point
    error_bars = chart.mark_errorbar(extent='ci').encode(
        x=alt.X('issueCount:Q', title='Average number of accessibility issues (95% CIs)'),
        color=alt.value('black')
    )

    chart = error_bars + chart

    chart = chart.properties(title={
        "text": f'{title}',
        "subtitle": f'N={N}',
    })

    charts.append(chart)

apply_theme(alt.hconcat(*charts, spacing=10).resolve_scale(color='independent', y='shared')).properties(
    title={
        "text": 'The Average Number of Accessibility Issues Encountered in Each Resource',
        "subtitle": [
            'Only \"error\" and \"contrast\" issues are counted',
            f'Data as of {DATA_DATE}',
        ],
        "dx":200
    }
)

In [None]:
# For every non-"alert" issue: the share of resources that exhibit it.
# Journals are drawn on a mirrored x-axis (left), data portals on the right.
mapping = [
    ('Journals', jdf, '#CC7DAA'),
    ('Data Portals', pdf, '#409F7A'),
    # ('Visualizations', vdf, '#3275B4')
]

charts = []
for idx, (title, data, color) in enumerate(mapping):
    N = len(data.baId.unique())

    df = data[(data.issueType != 'alert') & (data.issueDesc != 'No Issues Found')].copy()
    df['issueLongDescription'] = df['issueDesc'] + ' (' + df['issueId'] + ')'

    # Issue total per (resource, issue). Only `issueCount` is summed: a frame-wide
    # sum would also aggregate text columns and can fail on pandas >= 2.0.
    per_resource = (
        df.groupby(['baId', 'issueLongDescription'])['issueCount'].sum().reset_index()
    )
    per_resource['hasIssues'] = per_resource.issueCount.ge(1).map(
        {True: 'Has Issues', False: 'No Issues'}  # More readable names
    )

    # Number of resources per issue; after count(), `issueCount` holds that number
    df = per_resource.groupby(['hasIssues', 'issueLongDescription']).count().reset_index()
    # Keep only resources that actually have the issue; a zero-count group would
    # otherwise be stacked into the same bar and inflate the percentage.
    df = df[df.hasIssues == 'Has Issues']
    df['percentHasIssues'] = df.issueCount / N

    sort = df.sort_values(by='percentHasIssues', ascending=False).issueLongDescription.unique().tolist()

    is_first = idx == 0
    chart = alt.Chart(df).mark_bar(
        filled=True
    ).encode(
        y=alt.Y('issueLongDescription', sort=sort, title=None, axis=alt.Axis(labels=is_first, grid=True)),
        x=alt.X('percentHasIssues', title='The percentage of resources', axis=alt.Axis(format='%'), scale=alt.Scale(domain=[0, 0.8], reverse=is_first)),
        color=alt.value(color)
    )

    # Percentage label placed just beyond the end of each bar
    text = chart.mark_text(
        dx=-10 if is_first else 10,
        align='right' if is_first else 'left'
    ).encode(
        text=alt.Text('percentHasIssues', format='.1%')
    )

    # Dashed reference line at 50%
    rule = alt.Chart().mark_rule(color='grey', strokeDash=[5, 5]).encode(x=alt.datum(0.5))

    # NOTE(review): the bar layer appears twice (below and above the rule);
    # the layer order is kept exactly as it was.
    chart = chart + rule + chart + text

    chart = chart.properties(title={
        "text": f'{title}',
        "subtitle": f'N={N}',
    })

    charts.append(chart)

apply_theme(alt.hconcat(*charts, spacing=0).resolve_scale(color='independent', y='shared')).properties(
    title={
        "text": 'The Percentage of Resources with Corresponding Accessibility Issues',
        "subtitle": [
            'Only \"error\" and \"contrast\" issues are counted',
            f'Data as of {DATA_DATE}',
        ],
        "dx":200
    }
)

In [None]:
# For every "alert" issue: the share of resources that exhibit it.
# Journals are drawn on a mirrored x-axis (left), data portals on the right.
mapping = [
    ('Journals', jdf, '#CC7DAA'),
    ('Data Portals', pdf, '#409F7A'),
    # ('Visualizations', vdf, '#3275B4')
]

charts = []
for idx, (title, data, color) in enumerate(mapping):
    N = len(data.baId.unique())

    df = data[(data.issueType == 'alert') & (data.issueDesc != 'No Issues Found')].copy()
    df['issueLongDescription'] = df['issueDesc'] + ' (' + df['issueId'] + ')'

    # Issue total per (resource, issue). Only `issueCount` is summed: a frame-wide
    # sum would also aggregate text columns and can fail on pandas >= 2.0.
    per_resource = (
        df.groupby(['baId', 'issueLongDescription'])['issueCount'].sum().reset_index()
    )
    per_resource['hasIssues'] = per_resource.issueCount.ge(1).map(
        {True: 'Has Issues', False: 'No Issues'}  # More readable names
    )

    # Number of resources per issue; after count(), `issueCount` holds that number
    df = per_resource.groupby(['hasIssues', 'issueLongDescription']).count().reset_index()
    # Keep only resources that actually have the issue; a zero-count group would
    # otherwise be stacked into the same bar and inflate the percentage.
    df = df[df.hasIssues == 'Has Issues']
    df['percentHasIssues'] = df.issueCount / N

    sort = df.sort_values(by='percentHasIssues', ascending=False).issueLongDescription.unique().tolist()

    is_first = idx == 0
    chart = alt.Chart(df).mark_bar(
        filled=True
    ).encode(
        y=alt.Y('issueLongDescription', sort=sort, title=None, axis=alt.Axis(labels=is_first, grid=True)),
        x=alt.X('percentHasIssues', title='The percentage of resources', axis=alt.Axis(format='%'), scale=alt.Scale(domain=[0, 0.8], reverse=is_first)),
        color=alt.value(color)
    )

    # Percentage label placed just beyond the end of each bar
    text = chart.mark_text(
        dx=-10 if is_first else 10,
        align='right' if is_first else 'left'
    ).encode(
        text=alt.Text('percentHasIssues', format='.1%')
    )

    # Dashed reference line at 50%
    rule = alt.Chart().mark_rule(color='grey', strokeDash=[5, 5]).encode(x=alt.datum(0.5))

    # NOTE(review): the bar layer appears twice (below and above the rule);
    # the layer order is kept exactly as it was.
    chart = chart + rule + chart + text

    chart = chart.properties(title={
        "text": f'{title}',
        "subtitle": f'N={N}',
    })

    charts.append(chart)

apply_theme(alt.hconcat(*charts, spacing=0).resolve_scale(color='independent', y='shared')).properties(
    title={
        "text": 'The Percentage of Resources with Corresponding Accessibility Issues',
        "subtitle": [
            'Only \"alert\" issues are counted',
            f'Data as of {DATA_DATE}',
        ],
        "dx":200
    }
)

## Accessibility by Data Portal Founded Year
Note: we do not have a time variable for tracking accessibility over time.

In [None]:
# Per founded year: the share of data portals that have at least one
# "error" or "contrast" issue.

# Defined here instead of relying on the `N` left behind by an earlier cell's loop
N = len(pdf.baId.unique())

portals = pdf[pdf.foundedYear != '']

# How many resources are there in each founded year?
# Counted over every portal with a founded year, before the issue-type filter,
# so that portals without any counted issue still belong to the denominator.
resource_year = (
    portals.groupby('foundedYear').baId.nunique()
    .rename('resourcesInFoundedYear')
    .reset_index()
)

print(resource_year.resourcesInFoundedYear.sum())

# How many resources have any issues in each founded year?
# Only `issueCount` is summed: a frame-wide sum would also aggregate text columns
# and can fail on pandas >= 2.0.
issues = portals[portals.issueType.isin(['error', 'contrast'])]
issue_year = issues.groupby(['baId', 'foundedYear']).issueCount.sum().reset_index()
issue_year = issue_year[issue_year.issueCount > 0]
# After count(), `issueCount` holds the number of portals with issues in that year
issue_year = issue_year.groupby('foundedYear').issueCount.count().reset_index()

# merge
df = issue_year.merge(resource_year, on='foundedYear', how='left')

# Calculate percentage
df['issuePercentage'] = df.issueCount / df.resourcesInFoundedYear

chart = alt.Chart(df).mark_bar(
).encode(
    x=alt.X('foundedYear:O', title='Founded Year', axis=alt.Axis(labelAngle=270)),
    y=alt.Y('issuePercentage', axis=alt.Axis(format='%', orient='right', tickCount=3), title=None), # title='The percentage of resources'
    color=alt.Color('issueCount', scale=alt.Scale(scheme='bluegreen'), legend=alt.Legend(orient='right', titleLimit=400, titleOrient='top',gradientLength=220, gradientThickness=20, titlePadding=20), title='The Number of Resources'),
).properties(
    width=800,
    height=200,
    title={
        "text": 'Data Portals',
        "subtitle": f'N={N}',
    }
)

apply_theme(
    chart,
    header_label_orient='left'
).properties(
    title={
        "text": 'The Percentage of Data Portals with Accessibility Issues on Each Founded Year',
        "subtitle": [
            'Only \"error\" and \"contrast\" issues are counted',
            f'Data as of {DATA_DATE}',
        ]
    }
)

In [None]:
# Per founded year: the share of data portals that have at least one "error" issue.

# Defined here instead of relying on the `N` left behind by an earlier cell's loop
N = len(pdf.baId.unique())

portals = pdf[pdf.foundedYear != '']

# How many resources are there in each founded year?
# Counted over every portal with a founded year, before the issue-type filter,
# so that portals without any "error" row still belong to the denominator.
resource_year = (
    portals.groupby('foundedYear').baId.nunique()
    .rename('resourcesInFoundedYear')
    .reset_index()
)

print(resource_year.resourcesInFoundedYear.sum())

# How many resources have any issues in each founded year?
# Only `issueCount` is summed: a frame-wide sum would also aggregate text columns
# and can fail on pandas >= 2.0.
issues = portals[portals.issueType == 'error']
issue_year = issues.groupby(['baId', 'foundedYear']).issueCount.sum().reset_index()
issue_year = issue_year[issue_year.issueCount > 0]
# After count(), `issueCount` holds the number of portals with issues in that year
issue_year = issue_year.groupby('foundedYear').issueCount.count().reset_index()

# merge
df = issue_year.merge(resource_year, on='foundedYear', how='left')

# Calculate percentage
df['issuePercentage'] = df.issueCount / df.resourcesInFoundedYear

chart = alt.Chart(df).mark_bar(
).encode(
    x=alt.X('foundedYear:O', title='Founded Year', axis=alt.Axis(labelAngle=270)),
    y=alt.Y('issuePercentage', axis=alt.Axis(format='%', orient='right', tickCount=3), title=None), # title='The percentage of resources'
    color=alt.Color('issueCount', scale=alt.Scale(scheme='bluegreen'), legend=alt.Legend(orient='right', titleLimit=400, titleOrient='top',gradientLength=220, gradientThickness=20, titlePadding=20), title='The Number of Resources'),
).properties(
    width=800,
    height=200,
    title={
        "text": 'Data Portals',
        "subtitle": f'N={N}',
    }
)

apply_theme(
    chart,
    header_label_orient='left'
).properties(
    title={
        "text": 'The Percentage of Data Portals with Accessibility Issues on Each Founded Year',
        "subtitle": [
            'Only \"error\" issues are counted',
            f'Data as of {DATA_DATE}',
        ]
    }
)

In [None]:
# Per founded year and per issue: the share of data portals that exhibit the
# issue ("error" and "contrast" only), drawn as one chart row per issue.

# Defined here instead of relying on the `N` left behind by an earlier cell's loop
N = len(pdf.baId.unique())

portals = pdf[pdf.foundedYear != ''].copy()
portals['issueLongDescription'] = portals['issueDesc'] + ' (' + portals['issueId'] + ')'

# How many resources are there in each founded year?
# Counted over every portal with a founded year, before the issue-type filter,
# so that portals without any counted issue still belong to the denominator.
resource_year = (
    portals.groupby('foundedYear').baId.nunique()
    .rename('resourcesInFoundedYear')
    .reset_index()
)

print(resource_year.resourcesInFoundedYear.sum())

# How many resources have each issue in each founded year?
# Only `issueCount` is summed: a frame-wide sum would also aggregate text columns
# and can fail on pandas >= 2.0.
issues = portals[portals.issueType.isin(['error', 'contrast'])]
issue_year = (
    issues.groupby(['baId', 'foundedYear', 'issueLongDescription']).issueCount.sum().reset_index()
)
# Keep only portals that actually have the issue, as the other founded-year cells do
issue_year = issue_year[issue_year.issueCount > 0]
# After count(), `issueCount` holds the number of portals with that issue in that year
issue_year = (
    issue_year.groupby(['foundedYear', 'issueLongDescription']).issueCount.count().reset_index()
)

# merge
df = issue_year.merge(resource_year, on='foundedYear', how='left')

# Calculate percentage
df['issuePercentage'] = df.issueCount / df.resourcesInFoundedYear

df = df[~df.issueLongDescription.str.contains('No Issues Found')]

# Row order: issues affecting the most portals first. Computed from this cell's
# own data; the `sort` list left behind by the previous chart cell describes
# "alert" issues and does not match the issues shown here.
sort = (
    df.groupby('issueLongDescription').issueCount.sum()
    .sort_values(ascending=False)
    .index.tolist()
)

chart = alt.Chart(df).mark_bar(
).encode(
    x=alt.X('foundedYear:O', title='Founded Year', axis=alt.Axis(labelAngle=270)),
    y=alt.Y('issuePercentage', axis=alt.Axis(format='%', orient='right', tickCount=3), title=None), # title='The percentage of resources'
    color=alt.Color('issueCount', scale=alt.Scale(scheme='bluegreen'), legend=alt.Legend(orient='right', titleLimit=400, titleOrient='top',gradientLength=220, gradientThickness=20, titlePadding=20), title='The Number of Resources'),
    row=alt.Row('issueLongDescription', sort=sort, header=alt.Header(labelAngle=0, labelAlign='left', labelAnchor='middle', labelBaseline='middle'), title=None),
).properties(
    width=800,
    height=50,
    title={
        "text": 'Data Portals',
        "subtitle": f'N={N}',
    }
)

apply_theme(
    chart,
    header_label_orient='left'
).properties(
    title={
        "text": 'The Percentage of Data Portals with Accessibility Issues on Each Founded Year',
        "subtitle": [
            'Only \"error\" and \"contrast\" issues are counted',
            f'Data as of {DATA_DATE}',
        ],
        "dx": 130
    }
)

## Accessibility vs. Impact

In [None]:
# Host-institution names used to pick the highlighted data portals in the
# "Accessibility vs. Impact" charts (matched with `hostInstitution.isin`).
# NOTE(review): every entry starts with "National" and several are clearly not
# NIH institutes (e.g. 'National Museum of Natural History') -- confirm that
# this is the intended highlight set.
NIH_INST = [
    'National Cancer Institute',
    'National Center for Biotechnology Information',
    'National Institute of Agrobiological Resources',
    'National Center for Protein Sciences',
    'National Institute for Research in Reproductive Health',
    'National Institute of Allergy and Infectious Diseases',
    'National Institute of Agrobiological Sciences',
    'National Institute of Biomedical Technologies',
    'National Institute of Genetics',
    'National Institute of Biomedical Innovation',
    'National Center for Advancing Translational Sciences',
    'National Center for Scientific Research',
    'National Institute of Advanced Industrial Science and Technology',
    'National Institutes of Natural Sciences',
    'National Human Genome Research Institute',
    'National Institute of Environmental Health Sciences',
    'National Institute of Agricultural Botany',
    'National Agricultural Library',
    'National Institute of Recherche Agronomique',
    'National Library of Medicine',
    'National Aeronautics and Space Act',
    'National Institute of Plant Genome Research',
    'National Institute of Health',
    'National Institutes of Biomedical Innovation, Health and Nutrition',
    'National Research Council Plant Biotechnology Institute',
    'National Research Council',
    'National Heart, Lung, and Blood Institute',
    'National Institute of Biology',
    'National Institute of Standards and Technology',
    'National Center for Computational Toxicology',
    'National Center for Genome Resources',
    'National Institute of Technology and Evaluation',
    'National Museum of Natural History',
    'National Institute on Aging',
    'National Institute of Crop Science',
    'National Institute for Agricultural Research (INRA)',
    'National Centre for Mathematics and Interdisciplinary Sciences, Chinese Academy of Sciences',
    'National Institute of Agronomic Research',
    'National Cancer Center',
    'National Evolutionary Synthesis Center',
    'National Eye Institute',
    'National Institute of Chemistry',
    'National Center for Genetic Engineering and Biotechnology',
    # This name was split by a raw line break inside the string literal, which is
    # a syntax error. The line break is kept as an explicit "\n" escape so the
    # value is unchanged, and the single-line spelling is listed as well so the
    # match works whether or not the data carries the line break.
    'National Center for \nToxicological Research',
    'National Center for Toxicological Research',
    'National Conservatory of Arts and Crafts',
    'National Institutes of Health',
    'National Neuroscience Institute',
    'National Agri Food Biotechnology Institute (NABI)',
]

In [None]:
# Scatter plot of accessibility issues against impact score, one point per
# resource, with a log-regression trend line and a marginal distribution of the
# issue totals on the right. One panel per resource type.
highlight = True
mapping = [
    ('Journals', jdf, 'hIndex', 'issueCount', 'title', '#CC7DAA'),
    ('Data Portals', pdf, 'citation', 'issueCount', 'shortName', '#409F7A'),
    # ('Visualizations', vdf, 'githubStars', 'issueCount', 'name', '#3275B4')
]

charts = []
for (title, df, impactField, issueField, nameField, color) in mapping:

    filtered = df[df.issueType.isin(['error', 'contrast'])].copy()

    N = len(df.baId.unique())

    # Do we want to highlight certain groups of interest?
    filtered['highlight'] = True
    if highlight:
        if title == 'Journals':
            filtered['highlight'] = filtered.publisher == 'Nature Publishing Group'
        elif title == 'Data Portals':
            filtered['highlight'] = filtered.hostInstitution.isin(NIH_INST)
        else:
            continue

    # One point per resource name: mean impact vs. summed issue count
    chart = alt.Chart(
        filtered
    ).mark_point(
        opacity=0.3,
        size=70
    ).encode(
        x=alt.X(f'mean({impactField}):Q', title=impactField, scale=alt.Scale(type='symlog')),
        y=alt.Y(f'sum({issueField}):Q', title=issueField, scale=alt.Scale(type='symlog')),
        color=alt.Color(f'{nameField}:N', scale=alt.Scale(range=['grey' if highlight else color]), legend=None),
        tooltip=[nameField]
    )

    # NOTE(review): the regression is fitted on the rows of `filtered` as they
    # are (several rows can share a resource), while the points above show
    # per-resource aggregates -- confirm this is intended.
    trend = chart.transform_regression(
        impactField,
        issueField,
        method='log'
    ).mark_line(
        opacity=1,
        stroke='black',
        strokeDash=[3 ,3]
    ).encode(
        x=alt.X(impactField, title=impactField, scale=alt.Scale(type='symlog', constant=4)),
        y=alt.Y(issueField, title=issueField, scale=alt.Scale(type='symlog', constant=4))
    )

    if highlight:
        # Same points again, coloured by the highlight flag
        overlay = chart.mark_point(filled=True, opacity=0.7, size=140).encode(
            color=alt.Color(f'highlight:N', scale=alt.Scale(domain=[True], range=[color]), legend=None)
        )
        chart = (chart + overlay).resolve_scale(color='independent')

    # Marginal chart: number of resources per distinct issue total. Only the issue
    # column is summed (a frame-wide sum would also aggregate text columns and can
    # fail on pandas >= 2.0), and the count lives in an explicit `resourceCount`
    # column instead of being read out of the impact column.
    totals = filtered.groupby('baId')[issueField].sum()
    counts = totals.value_counts().sort_index()
    agg = pd.DataFrame({issueField: counts.index, 'resourceCount': counts.values})

    right = alt.Chart(
        agg
    ).mark_area().encode(
        x=alt.X('sum(resourceCount):Q', title=None),
        y=alt.Y(f'{issueField}:Q', scale=alt.Scale(type='symlog', nice=False), axis=None),
        color=alt.value('grey' if highlight else color)
    ).properties(
        width=60
    )

    chart = (chart + trend).properties(
        title={
            "text": title,
            "subtitle": f'N={N}'
        },
        width=800,
        height=700
    )

    chart = alt.hconcat(chart, right).resolve_scale(y='shared')

    charts.append(chart)
apply_theme(alt.hconcat(*charts).resolve_scale(color='independent', y='shared')).properties(
    title={
        "text": 'The Number of Accessibility Issues by Impact Scores (Individual Resources)',
        "subtitle": [
            'Only \"error\" and \"contrast\" issues are counted',
            f'Data as of {DATA_DATE}',
        ]
    }
)

In [None]:
# Scatter plot of average accessibility issues against average impact score,
# one point per publisher / host institution, sized by the number of resources
# in the group, with a log-regression trend line.
highlight = True
mapping = [
    ('Journal Publishers', jdf, 'hIndex', 'issueCount', 'title', 'publisher', '#CC7DAA'),
    ('Data Portal Host Institutions', pdf, 'citation', 'issueCount', 'shortName', 'hostInstitution', '#409F7A'),
    # ('Visualizations', vdf, 'githubStars', 'issueCount', 'name', '#3275B4')
]

charts = []
for idx, (title, df, impactField, issueField, nameField, groupField, color) in enumerate(mapping):

    counted = df[df.issueType.isin(['error', 'contrast'])]

    N = len(df[groupField].unique())

    # Step 1: one row per resource (mean impact, total issues).
    per_resource = (
        counted.groupby([groupField, nameField])
        .agg({impactField: 'mean', issueField: 'sum'})
        .reset_index()
    )
    # Step 2: one row per group (mean impact, mean issues, number of resources;
    # after this step the `nameField` column holds that number).
    filtered = (
        per_resource.groupby(groupField)
        .agg({impactField: 'mean', issueField: 'mean', nameField: 'count'})
        .reset_index()
    )

    # Do we want to highlight certain groups of interest?
    filtered['highlight'] = True
    if highlight:
        if title == 'Journal Publishers':
            filtered['highlight'] = filtered.publisher == 'Nature Publishing Group'
        elif title == 'Data Portal Host Institutions':
            filtered['highlight'] = filtered.hostInstitution.isin(NIH_INST)
        else:
            continue

    chart = alt.Chart(
        filtered
    ).mark_point(
        opacity=0.6,
        filled=True,
        size=70
    ).encode(
        x=alt.X(f'mean({impactField}):Q', title=impactField, scale=alt.Scale(type='symlog')),
        y=alt.Y(f'sum({issueField}):Q', title='The average number of accessibility issues', scale=alt.Scale(type='symlog'), axis=alt.Axis(
            orient='left'
        )),
        color=alt.Color(f'{groupField}:N', scale=alt.Scale(range=['grey' if highlight else color]), legend=None),
        size=alt.Size(nameField, scale=alt.Scale(range=[10, 1000], zero=False), title='The number of Resources', legend=alt.Legend(titleLimit=500)),
        tooltip=[groupField, alt.Tooltip(nameField, title='count')]
    )

    # Log-regression of the group-level issue average on the group-level impact
    trend = chart.transform_regression(
        impactField,
        issueField,
        method='log'
    ).mark_line(
        opacity=1,
        stroke='black',
        strokeDash=[3 ,3]
    ).encode(
        x=alt.X(impactField, title=impactField, scale=alt.Scale(type='symlog', constant=4)),
        y=alt.Y(issueField, title='The average number of accessibility issues', scale=alt.Scale(type='symlog', constant=4))
    )

    if highlight:
        # Same points again, coloured by the highlight flag
        overlay = chart.encode(
            color=alt.Color(f'highlight:N', scale=alt.Scale(domain=[True], range=[color]), legend=None)
        )
        chart = (chart + overlay).resolve_scale(color='independent')

    chart = (chart + trend).properties(
        title={
            "text": title,
            "subtitle": f'N={N}'
        },
        width=800,
        height=700
    )

    charts.append(chart)
apply_theme(
    alt.hconcat(*charts, spacing=10).resolve_scale(color='independent', y='shared')
).properties(
    title={
        "text": 'The Number of Accessibility Issues by Impact Scores (Organizers)',
        "subtitle": [
            'Only \"error\" and \"contrast\" issues are counted',
            f'Data as of {DATA_DATE}',
        ]
    }
)

In [None]:
# Scatter plot of average accessibility issues against average impact score,
# one labelled point per country, sized by the number of resources in the
# country, with a log-regression trend line.

# No highlight groups exist for the country view. The previous highlight branch
# looked up `publisher` / `hostInstitution`, which are not present after grouping
# by country, so it could only fail; the flag is kept at False for later readers.
highlight = False
mapping = [
    ('Journal Countries', jdf, 'hIndex', 'issueCount', 'title', 'country', '#CC7DAA'),
    ('Data Portal Countries', pdf, 'citation', 'issueCount', 'shortName', 'country', '#409F7A'),
    # ('Visualizations', vdf, 'githubStars', 'issueCount', 'name', '#3275B4')
]

charts = []
for idx, (title, df, impactField, issueField, nameField, groupField, color) in enumerate(mapping):

    counted = df[df.issueType.isin(['error', 'contrast'])]

    N = len(df[groupField].unique())

    # Step 1: one row per resource (mean impact, total issues).
    per_resource = (
        counted.groupby([groupField, nameField])
        .agg({impactField: 'mean', issueField: 'sum'})
        .reset_index()
    )
    # Step 2: one row per country (mean impact, mean issues, number of resources;
    # after this step the `nameField` column holds that number).
    filtered = (
        per_resource.groupby(groupField)
        .agg({impactField: 'mean', issueField: 'mean', nameField: 'count'})
        .reset_index()
    )

    chart = alt.Chart(
        filtered
    ).mark_point(
        opacity=0.6,
        filled=True,
        size=70
    ).encode(
        x=alt.X(f'mean({impactField}):Q', title=impactField, scale=alt.Scale(type='symlog')),
        y=alt.Y(f'sum({issueField}):Q', title='The average number of accessibility issues', scale=alt.Scale(type='symlog'), axis=alt.Axis(
            orient='left'
        )),
        color=alt.Color(f'{groupField}:N', scale=alt.Scale(range=[color]), legend=None),
        size=alt.Size(nameField, scale=alt.Scale(range=[10, 1000], zero=False), title='The number of Resources', legend=alt.Legend(titleLimit=500)),
        tooltip=[groupField, alt.Tooltip(nameField, title='count')]
    )

    # Country name next to each point
    text = chart.mark_text(
        dy=10,
        align='left'
    ).encode(
        text=groupField,
        size=alt.value(10)
    )

    # Log-regression of the country-level issue average on the country-level impact
    trend = chart.transform_regression(
        impactField,
        issueField,
        method='log'
    ).mark_line(
        opacity=1,
        stroke='black',
        strokeDash=[3 ,3]
    ).encode(
        x=alt.X(impactField, title=impactField, scale=alt.Scale(type='symlog', constant=4)),
        y=alt.Y(issueField, title='The average number of accessibility issues', scale=alt.Scale(type='symlog', constant=4))
    )

    chart = (chart + text + trend).properties(
        title={
            "text": title,
            "subtitle": f'N={N}'
        },
        width=800,
        height=700
    )

    charts.append(chart)
apply_theme(
    alt.hconcat(*charts, spacing=10).resolve_scale(color='independent', y='shared')
).properties(
    title={
        "text": 'The Number of Accessibility Issues by Impact Scores (Country)',
        "subtitle": [
            'Only \"error\" and \"contrast\" issues are counted',
            f'Data as of {DATA_DATE}',
        ]
    }
)