# Figures

In [2]:
import pandas as pd
import altair as alt

In [3]:
# Define custom theme to be applied to all plots
def theme():
    return {
        "config": {
            "title": {
                "dy": 1,
                "fontSize": 22,
                "fontWeight": 400,
                "align": "center",
                "anchor": "middle",
                "subtitleColor": "grey",
                "subtitleFontSize": 22
            },
            "view": {
                "fill": "#FCFCFC",
            },
            "header": {
                "titleFontSize": 23,
                "labelFontSize": 23,
                "labelFontWeight": 400,
            },
            "axis": {
                "titleFontSize": 23,
                "titleFontWeight": 400,
                "labelFontSize": 20,
                "labelFontWeight": 400,
                "labelLimit": 1000,
                "domainWidth": 1.5,
                "domainColor": "black",
                "tickColor": "black",
                "domain": False
            },
            "axisX": {
                "domain": True
            },
            "legend": {
                "titleFontSize": 23,
                "titleFontWeight": 400,
                "labelFontSize": 23,
                "labelLimit": 1000,
                "strokeColor": '#F4F6F7',
                "padding": 15
            }
        }
    }

alt.themes.register("theme", theme)
alt.themes.enable("theme")

ThemeRegistry.enable('theme')

In [37]:
def consistency(df=None):
    df.resource_category = df.resource_category.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')

In [203]:
"""
The name of the folder you created under `../data/`
"""
TIME_STAMP_FOLDER_NAME = '08-01-2024'

In [204]:
COLORS = {
    'Data Portals': '#56B4E9',
    'Journal Websites': '#CC79A7',
    'US Government Websites': '#009E73'
}

## Calculate Baseline

Median FF of US Government Websites

In [205]:
df = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv')
BASELINE = df[df.resource_category == 'government'].failure_rate.median()

  df = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv')


## Calculate Num Pages

In [206]:
NUM_GOV_PAGES = len(df[df.resource_category == 'government'])
NUM_GOV_PAGES

852

In [207]:
NUM_DP_PAGES = len(df[df.resource_category == 'data-portal'])
NUM_DP_PAGES

3112

In [208]:
NUM_JW_PAGES = len(df[df.resource_category == 'journal'])
NUM_JW_PAGES

5099

## Estimated Failure Rates
This data is shared by the collaborator and not directly created from the notebooks in this repository.

In [209]:
df = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/2024-08-19_failure_rate_meta.csv')

### By Resource Category

In [210]:
df_overall = df[
    # (df.continent == 'ALL') &
    (df.country == 'ALL') &
    (df.publisher == 'ALL') |
    (df.web_type == 'government') &
    (df.publisher == 'ALL')
]

df_overall.web_type = df_overall.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')

df_overall

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_overall.web_type = df_overall.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')


Unnamed: 0,web_type,continent,country,publisher,fail_rate_meta,se,ci_95L,ci_95U,total_violations,total_checks,units
0,Journal Websites,ALL,ALL,ALL,0.028875,0.013659,0.028133,0.029635,208657,4980299,4994
194,US Government Websites,North America,United States,ALL,0.014602,0.037293,0.013587,0.015692,11517,718179,4994
195,Data Portals,ALL,ALL,ALL,0.062697,0.016597,0.060812,0.064636,11517,718179,4994


In [211]:
base = alt.Chart(df_overall).mark_circle(
    size=100,
    opacity=1
).encode(
    alt.Y('web_type:N', title=None, sort=['US Government Websites', 'Journal Websites', 'Data Portals']),
    alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(format='.0%', tickCount=5).scale(domain=[0, 0.1]),
    alt.Color('web_type:N').scale(range=list(COLORS.values()), domain=list(COLORS.keys())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
    ]
).properties(
    height=200,
    width=300
)

error = base.mark_errorbar(
    thickness=2,
    color='black'
).encode(
    alt.X('ci_95L', title='Estimated failure rate'),
    alt.X2('ci_95U'),
    color=alt.value('black')
)

plot = base + error + base

"""
Save for the manuscript figures and website plots
"""
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/resource-category-estimated.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/resource-category-estimated.json')

plot

### By Hosting Institutions

In [212]:
df_org = df[(df.web_type != 'government') & (df.publisher != 'ALL') & (df.publisher.notnull())]
df_org.web_type = df_org.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')
df_org

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_org.web_type = df_org.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')


Unnamed: 0,web_type,continent,country,publisher,fail_rate_meta,se,ci_95L,ci_95U,total_violations,total_checks,units
73,Journal Websites,,,ABV-press Publishing house,0.219781,0.032217,0.209145,0.230800,1797,8203,6
74,Journal Websites,,,Academic Press Inc.,0.019512,0.119018,0.015515,0.024513,72,3690,18
75,Journal Websites,,,Adis International Ltd,0.001061,0.333519,0.000552,0.002039,9,8797,8
76,Journal Websites,,,Advanstar Communications Inc.,0.064402,0.169771,0.047031,0.087600,668,8594,5
77,Journal Websites,,,Agricultural Research Communication Centre,0.024727,0.084092,0.021049,0.029029,145,5867,5
...,...,...,...,...,...,...,...,...,...,...,...
332,Data Portals,,,Wellcome Sanger Institute,0.045122,0.267720,0.027201,0.073953,405,13404,14
333,Data Portals,,,Wuhan University,0.034161,0.540175,0.012121,0.092526,82,2726,5
334,Data Portals,,,Yale University,0.032111,0.368054,0.015871,0.063892,195,6282,12
335,Data Portals,,,Yonsei University,0.062245,0.554024,0.021918,0.164303,112,4604,7


In [213]:
top_10_journal_publishers_by_size = df_org[(df_org.web_type == 'Journal Websites') & (df_org.units > 52)]
len(top_10_journal_publishers_by_size)

10

In [214]:
top_10_data_portal_publishers_by_size = df_org[(df_org.web_type == 'Data Portals') & (df_org.units > 18) & (df_org.publisher != 'China Agricultural University')]
len(top_10_data_portal_publishers_by_size)

10

In [215]:
df_org_filtered = pd.concat([top_10_journal_publishers_by_size, top_10_data_portal_publishers_by_size])
df_org_filtered

Unnamed: 0,web_type,continent,country,publisher,fail_rate_meta,se,ci_95L,ci_95U,total_violations,total_checks,units
80,Journal Websites,,,American Psychological Association,0.07483,0.038585,0.06976,0.080237,726,9702,66
84,Journal Websites,,,Bentham Science Publishers B.V.,0.031073,0.03425,0.029114,0.033159,3659,122483,96
85,Journal Websites,,,BioMed Central Ltd.,0.014046,0.045373,0.012867,0.015333,1632,115260,118
89,Journal Websites,,,Cambridge University Press,0.022314,0.053734,0.020128,0.02473,4193,200216,72
107,Journal Websites,,,Elsevier BV,0.020424,0.091866,0.017116,0.024355,593,26524,83
113,Journal Websites,,,Elsevier Masson s.r.l.,0.023805,0.073998,0.020658,0.027419,551,19256,65
161,Journal Websites,,,Springer International Publishing AG,0.002299,0.186732,0.001595,0.003312,190,54309,53
166,Journal Websites,,,Springer Netherlands,0.001038,0.292201,0.000585,0.001838,330,120296,108
167,Journal Websites,,,Springer New York,0.001201,0.090977,0.001005,0.001435,121,112851,103
174,Journal Websites,,,Springer Verlag,0.001244,0.096412,0.00103,0.001503,210,162815,148


In [216]:
base = alt.Chart(df_org_filtered).mark_circle(size=100, opacity=1).encode(
    alt.Y('publisher:N', title=None).sort(field="fail_rate_meta", op="max", order="ascending"),
    alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(tickCount=3, format='.0%').scale(domain=[0, 0.12]),
    alt.Color('web_type:N').scale(range=list(COLORS.values())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('publisher:N', title='Publisher'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
        alt.Tooltip('units', title='Size'),
    ]
    # alt.Size('units:Q')
    # alt.Column('web_type:N')
).properties(
    width=300,
    height=300
)

error = base.mark_errorbar(

).encode(
    alt.X('ci_95L:Q', title='Estimated failure rate'),
    alt.X2('ci_95U:Q')
)

text = base.mark_text(
    dx=15,
    dy=-9
).encode(
    alt.Text('units'),
    color=alt.value('black')
)

gov = base.mark_rule(
    color='black',
    size=1,
    strokeDash=[4, 4]
).encode(
    alt.Y(),
    alt.X(f'baseline:Q', title='Estimated failure rate'),
    alt.Size(),
    alt.YOffset(),
    color=alt.value('black')
).transform_calculate(
    baseline=f"{BASELINE}"
)

plot = (base + error + text + gov)

plot = (
    plot.transform_filter("datum.web_type == 'Data Portals'").properties(title='Data Portals') |
    plot.transform_filter("datum.web_type == 'Journal Websites'").properties(title='Journal Websites')
)

# .facet(
#     alt.Column('web_type:N', title=None)
# ).resolve_scale(y='independent')

"""
Save for the manuscript figures and website plots
"""
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/publisher-estimated.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/publisher-estimated.json')

plot

### Country

In [217]:
df_country = df[(df.web_type != 'government') & (df.country != 'ALL') & (df.country.notnull())]

In [218]:
top_10_journal_countries_by_size = df_country[(df_country.web_type == 'journal') & (df_country.units > 144)]
len(top_10_journal_countries_by_size)
# top_10_journal_countries_by_size

10

In [219]:
top_10_data_countries_by_size = df_country[(df_country.web_type == 'data_portal') & (df_country.units > 60)]
len(top_10_data_countries_by_size)
# top_10_data_countries_by_size
# df_country.web_type.unique().tolist()

10

In [220]:
countries_data_portals = top_10_data_countries_by_size.country.unique().tolist()
# [
#     'Spain', 'China', 'Germany', 'India', 'Italy', 'Japan', 'Canada', 'France', 'United States', 'United Kingdom'
# ]
countries_journals = top_10_journal_countries_by_size.country.unique().tolist()
# [
#     'China', 'Germany', 'India', 'Spain', 'Italy', 'Switzerland', 'Netherlands', 'France', 'United States', 'United Kingdom'
# ]
df_country_filtered = df_country[
    (df_country.web_type == 'data_portal') & (df_country.country.isin(countries_data_portals))|
    (df_country.web_type == 'journal') & (df_country.country.isin(countries_journals))
]

In [221]:
df_country_filtered.web_type = df_country_filtered.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_country_filtered.web_type = df_country_filtered.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')


In [222]:
base = alt.Chart(df_country_filtered).mark_circle(size=100, opacity=1).encode(
    alt.Y('country:N', title=None).sort(field="fail_rate_meta", op="max", order="ascending"),
    alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(format='.0%').scale(domain=[0, 0.15]),
    alt.Color('web_type:N').scale(range=list(COLORS.values()), domain=list(COLORS.keys())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
        alt.Tooltip('units', title='Size'),
    ]
    # alt.Size('units:Q')
    # alt.Column('web_type:N')
).properties(
    width=300,
    height=300
)

error = base.mark_errorbar(

).encode(
    alt.X('ci_95L:Q', title='Estimated failure rate'),
    alt.X2('ci_95U:Q')
)

text = base.mark_text(
    dx=15,
    dy=-9
).encode(
    alt.Text('units'),
    color=alt.value('black')
)

gov = base.mark_rule(
    color='black',
    size=1,
    strokeDash=[4, 4]
).encode(
    alt.Y(),
    alt.X(f'baseline:Q', title='Estimated failure rate'),
    alt.Size(),
    alt.YOffset(),
    color=alt.value('black')
).transform_calculate(
    baseline=f"{BASELINE}"
)

plot = (base + error + text + gov)

plot = (
    plot.transform_filter("datum.web_type == 'Data Portals'").properties(title='Data Portals') |
    plot.transform_filter("datum.web_type == 'Journal Websites'").properties(title='Journal Websites')
)

"""
Save for the manuscript figures and website plots
"""
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/country-estimated.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/country-estimated.json')

plot

### Continent

In [223]:
df_continents = df[(df.web_type != 'government') & (df.continent != 'ALL') & (df.continent.notnull())]
df_continents.web_type = df_continents.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')
df_continents

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_continents.web_type = df_continents.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')


Unnamed: 0,web_type,continent,country,publisher,fail_rate_meta,se,ci_95L,ci_95U,total_violations,total_checks,units
67,Journal Websites,Africa,,ALL,0.036003,0.076621,0.031139,0.041594,2731,65869,71
68,Journal Websites,Asia,,ALL,0.046265,0.027786,0.04392,0.048728,64754,1027465,1119
69,Journal Websites,Europe,,ALL,0.025141,0.018207,0.024281,0.02603,98399,2644802,2555
70,Journal Websites,North America,,ALL,0.020773,0.034739,0.019432,0.022204,26162,841936,898
71,Journal Websites,Oceania,,ALL,0.043518,0.065256,0.038494,0.049163,4330,111250,95
72,Journal Websites,South America,,ALL,0.034672,0.057704,0.031079,0.038663,12281,288977,256
372,Data Portals,Africa,,,0.053976,0.184923,0.038192,0.075768,392,9268,16
373,Data Portals,Asia,,,0.07474,0.029666,0.070817,0.078861,34204,480909,1034
374,Data Portals,Europe,,,0.059757,0.029328,0.056608,0.063069,34500,776663,976
375,Data Portals,North America,,,0.05429,0.028878,0.051456,0.05727,43961,935868,916


In [224]:
base = alt.Chart(df_continents).mark_circle(size=100, opacity=1).encode(
    alt.Y('continent:N', title=None).sort(field="fail_rate_meta", op="max", order="ascending"),
    alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(format='.0%', tickCount=4).scale(domain=[0, 0.1]),
    alt.Color('web_type:N').scale(range=list(COLORS.values())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('continent:N', title='Continent'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
        alt.Tooltip('units', title='Size'),
    ]
    # alt.Size('units:Q')
    # alt.Column('web_type:N')
).properties(
    width=300,
    height=300
)

error = base.mark_errorbar(

).encode(
    alt.X('ci_95L:Q', title='Estimated failure rate'),
    alt.X2('ci_95U:Q')
)

text = base.mark_text(
    dx=15,
    dy=-9
).encode(
    alt.Text('units'),
    color=alt.value('black')
)

gov = base.mark_rule(
    color='black',
    size=1,
    strokeDash=[4, 4]
).encode(
    alt.Y(),
    alt.X(f'baseline:Q', title='Estimated failure rate'),
    alt.Size(),
    alt.YOffset(),
    color=alt.value('black')
).transform_calculate(
    baseline=f"{BASELINE}"
)

plot = (base + error + text + gov)

plot = (
    plot.transform_filter("datum.web_type == 'Data Portals'").properties(title='Data Portals') |
    plot.transform_filter("datum.web_type == 'Journal Websites'").properties(title='Journal Websites')
)

# .facet(
#     alt.Column('web_type:N', title=None)
# ).resolve_scale(y='independent')

# plot = apply_theme(plot)

"""
Save for the manuscript figures and website plots
"""
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/continent-estimated.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/continent-estimated.json')

plot

## Raw Accessibility Results

In [225]:
df_pages = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv')
df_pages.resource_category = df_pages.resource_category.apply(lambda x: 'Journal Websites' if x == 'journal' else 'Data Portals' if x == 'data-portal' else 'US Government Websites')
df_pages.head(1)

  df_pages = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv')


Unnamed: 0,resource_category,website_id,page_id,page_type,page_url,db_id,short_name,accession,full_name,description,...,coverage,categories,areas,country,url,continent,violations,passes,total_checks,failure_rate
0,Data Portals,-INED-COVID,0,home,https://dc-covid.site.ined.fr/en,8047.0,INED COVID,DBC008047,The demography of COVID-19 deaths database,The databse provides COVID-19 death counts alo...,...,,,,France,https://dc-covid.site.ined.fr/en,Europe,7,626,633,0.011058


### Histogram

In [275]:
# alt.data_transformers.enable("vegafusion")
# plot = None
# for category in df_pages.resource_category.unique().tolist():
    
#     _ = (
#         alt.Chart(
#             df_pages[df_pages.failure_rate > 0]
#         ).mark_bar(
#             # opacity=0.01
#             color=COLORS[category],
#             stroke='white',
#             strokeWidth=0.5
#         ).encode(
#             alt.X(f'failure_rate:Q', title='Failure rate').bin(extent=[0, 1], step=0.008).scale(domain=[0, 0.5], clamp=True).axis(format='%', zindex=10, tickCount=1),
#             alt.Y('count()', title="The number of webpages").scale(type='linear').axis(tickCount=5),
#         ).properties(
#             title={
#                 "text": category,
#                 "fontWeight": 600,
#                 "color": "black"
#             },
#             height=300,
#             width=400
#         )
#     )

#     baseline = (
#         _.mark_rule(
#             color='black',
#             size=2,
#             # size=500 / len(COUNTRY_SORT),
#             strokeDash=[4, 4]
#         ).encode(
#             alt.X(f'baseline:Q', title='Failure rate'),
#             y=alt.Y()
#         ).transform_calculate(
#             baseline=f"{BASELINE}"
#         )
#     )
    
#     _ = _ + baseline

#     plot = _ if plot is None else plot | _

# plot = plot.properties(
#     # title={
#     #     # 'text': 'The Distribution of Failure Rates',
#     #     # 'subtitle': '* Dashed line represents the average failure rate of US government websites',
#     #     'subtitleColor': 'grey'
#     # }
# )

# # plot = apply_theme(plot)
# # plot.save('../output/plots/ff-dist.png')
# plot

# alt.Chart(
#     df_pages[df_pages.failure_rate > 0]
# ).mark_rule(
#     color='black',
#     size=2,
#     # size=500 / len(COUNTRY_SORT),
#     strokeDash=[4, 4]
# ).encode(
#     alt.X(f'baseline:Q', title='Failure rate'),
#     y=alt.Y()
# )

In [281]:
with alt.data_transformers.enable("vegafusion"):
    df_pages['baseline'] = BASELINE
    
    plot = alt.Chart(
       df_pages
    ).mark_bar(
        
    ).encode(
        alt.X('failure_rate:Q', title='Failure rate').bin(extent=[0, 1], step=0.008).scale(domain=[0, 0.5], clamp=True).axis(format='%', zindex=1, tickCount=3, tickColor='white', offset=-10),
        alt.Y('count()', title="The number of webpages").scale(type='linear').axis(tickCount=5),
        alt.Color('resource_category:N', legend=None).scale(domain=list(COLORS.keys()), range=list(COLORS.values())),
        alt.Column('resource_category:N', title=None)
    ).properties(
        height=300,
        width=400
    )

     
    # gov = plot.mark_rule(
    #     color='black',
    #     size=1,
    #     strokeDash=[4, 4]
    # ).encode(
    #     x=alt.X(f'baseline:Q', title="The number of webpages"),
    #     y=alt.Y(),
    #     # alt.Size(),
    #     # alt.YOffset(),
    #     color=alt.value('black')
    # ).transform_calculate(
    #     baseline=f"{BASELINE}"
    # )

    # plot = alt.layer(plot, gov)
    plot = plot.resolve_scale(y='independent')
    # plot = (plot + gov).facet(column=)
    
    """
    Save for the manuscript figures and website plots
    """
    plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/histogram.png', scale_factor=8)
    # plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/histogram.json') # large data cannot be inlined

    # plot.resolve_scale(y='independent').display()
    plot.display()

### Most Common Issues

In [227]:
issues = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports.csv')
issues = issues[issues.issue_name != '-']
issues = issues[~issues.issue_name.isna()]
issues.issue_name = issues.issue_name.apply(lambda x: x.replace('Bas', 'Bad'))
issues = issues[issues.resource_category != 'government']
issues['issue_exist'] = issues.violations.apply(lambda x: 0 if x <= 0 else 1)

  issues = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports.csv')


In [228]:
issues.head(1)

Unnamed: 0,resource_category,website_id,page_id,page_type,page_url,issue_id,issue_desc,issue_impact,issue_help,issue_url,...,issue_data_related,issue_data_related_rule,issue_pour_category,issue_wcag_level,issue_difficulty_to_fix,issue_missing_label_related,country,url,continent,issue_exist
0,data-portal,-LacFamPred,0,home,http://proteininformatics.org/mkumar/blacfampred/,color-contrast,Ensures the contrast between foreground and ba...,serious,Elements must meet minimum color contrast rati...,https://dequeuniversity.com/rules/axe/4.9/colo...,...,O,related to data representations,,AA,Moderate,X,India,http://proteininformatics.org/mkumar/blacfampred,Asia,1


In [201]:
issues_agg = issues.groupby(
    [
        'issue_name',
        'resource_category'
    ],
    dropna=False
).agg({
    'issue_exist': 'sum'
}).reset_index()
issues_agg.rename(columns={ 'issue_exist': 'num_pages_with_issues' }, inplace=True)

In [202]:
issues_agg['failure_rate'] = issues_agg.num_pages_with_issues
issues_agg.failure_rate /= issues_agg.resource_category.apply(lambda x: { 'data-portal': NUM_DP_PAGES, 'government': NUM_GOV_PAGES, 'journal': NUM_JW_PAGES }[x])

In [203]:
top_10_issues_dp = issues_agg[issues_agg.resource_category == 'data-portal'].sort_values(by='failure_rate', ascending=False)
top_10_issues_dp = top_10_issues_dp.issue_name.tolist()[:10]
top_10_issues_dp

['Landmark Has Bad Structure (not contained with landmark)',
 'Landmark Has Bad Structure (multiple main)',
 'Low Color Contrast',
 'Lang Is Missing',
 'Image Has No Label',
 'Heading 1 Missing',
 'Link Has No Label',
 'Heading Has Bad Structure (order)',
 'Link Not Clearly Visible',
 'Landmark Has Bad Structure (duplicated)']

In [204]:
top_10_issues_jw = issues_agg[issues_agg.resource_category == 'journal'].sort_values(by='failure_rate', ascending=False)
top_10_issues_jw = top_10_issues_jw.issue_name.tolist()[:10]
top_10_issues_jw

['Landmark Has Bad Structure (not contained with landmark)',
 'Low Color Contrast',
 'Link Has No Label',
 'Image Has No Label',
 'Landmark Has Bad Structure (multiple main)',
 'Landmark Has Bad Structure (duplicated)',
 'Heading Has Bad Structure (order)',
 'Heading 1 Missing',
 'Lang Is Missing',
 'Link Not Clearly Visible']

In [205]:
issues_agg_filtered = pd.concat([
    issues_agg[(issues_agg.resource_category == 'data-portal') & (issues_agg.issue_name.isin(top_10_issues_dp))],
    issues_agg[(issues_agg.resource_category == 'journal') & (issues_agg.issue_name.isin(top_10_issues_jw))]
])

In [206]:
issues_agg_filtered.resource_category = issues_agg_filtered.resource_category.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')

In [217]:
def top_issue_bar(df=None, category=None, sort=None):
    return alt.Chart(df[df.resource_category == category]).mark_bar().encode(
        alt.X('failure_rate:Q', title='The proportion of pages with issues').axis(format='%').scale(domain=[0, 1]),
        alt.Y('issue_name', sort=sort, title=None),
        # alt.Column('resource_category:N'),
        alt.Color('resource_category:N', legend=None).scale(domain=list(COLORS.keys()), range=list(COLORS.values()))
    ).properties(
        title={
            'text': category,
            'dy': -10
        },
        width=400,
        height=400
    )

d = top_issue_bar(issues_agg_filtered, 'Data Portals', top_10_issues_dp)
j = top_issue_bar(issues_agg_filtered, 'Journal Websites', top_10_issues_jw)

plot = alt.hconcat(d, j)

"""
Save for the manuscript figures and website plots
"""
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/top-issues.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/top-issues.json')

plot

### Overall Impact

In [182]:
labels_criticality = issues.groupby(['resource_category', 'website_id', 'page_id', 'issue_overall_impact']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_criticality.issue_exist = labels_criticality.issue_exist.apply(lambda x: 0 if x == 0 else 1)

labels_criticality = labels_criticality.groupby(['resource_category', 'issue_overall_impact']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_criticality['proportion'] = labels_criticality.issue_exist
labels_criticality.proportion /= labels_criticality.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

consistency(labels_criticality)

labels_criticality.issue_overall_impact = labels_criticality.issue_overall_impact.apply(lambda x: x.title())
labels_criticality.issue_overall_impact = labels_criticality.issue_overall_impact.apply(lambda x: 'Severe' if x == 'Critical' else x)

labels_criticality

Unnamed: 0,resource_category,issue_overall_impact,issue_exist,proportion
0,Data Portals,Severe,2329,0.748393
1,Data Portals,Minor,1860,0.597686
2,Data Portals,Moderate,3061,0.983612
3,Journal Websites,Severe,3521,0.690528
4,Journal Websites,Minor,2668,0.52324
5,Journal Websites,Moderate,5038,0.988037


In [184]:
base = alt.Chart(
    labels_criticality
).mark_bar(size=80).encode(
    alt.X("issue_overall_impact", title=None).scale(domain=['Severe', 'Moderate', 'Minor']).axis(labelAngle=0),
    alt.Y("proportion", title='The proportion of pages').axis(format='%', tickCount=6).scale(domain=[0, 1]),
    alt.Color("resource_category", title=None, legend=None).scale(domain=list(COLORS.keys()), range=list(COLORS.values()))
).properties(
    height=300,
    width=400
)

text = base.mark_text(dy=20, size=24).encode(
    alt.Text("proportion", format='.1%'),
    color=alt.value('white')
)

plot = alt.layer(base, text).facet(column=alt.Column("resource_category", title=None), spacing=0).properties(
    title={
        "text": 'The Proportion of Pages with Overall Impact of Issues',
        "dy": -10
    }
)

"""
Save for the manuscript figures and website plots
"""
NAME = 'issues-overall-impact'
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/{NAME}.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/{NAME}.json')

plot

### Criticality

In [192]:
issues.rename(columns={'issue_severity': 'issue_criticality'}, inplace=True)

In [193]:
issues.issue_criticality = issues.issue_criticality.apply(lambda x: 'O' if x == 'severe' else 'X')
labels_issues = issues[issues.issue_criticality == 'O'].groupby(['resource_category', 'website_id', 'page_id', 'issue_criticality']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_issues.issue_exist = labels_issues.issue_exist.apply(lambda x: 0 if x == 0 else 1)

labels_issues = labels_issues.groupby(['resource_category', 'issue_criticality']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_issues['proportion'] = labels_issues.issue_exist
labels_issues.proportion /= labels_issues.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

labels_issues_op = labels_issues.copy()
labels_issues_op.issue_criticality = 'X'
labels_issues_op.issue_exist -= labels_issues_op.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)
labels_issues_op.issue_exist *= -1
labels_issues_op['proportion'] = labels_issues_op.issue_exist
labels_issues_op.proportion /= labels_issues_op.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

labels_issues = pd.concat([labels_issues, labels_issues_op])
labels_issues.issue_criticality = labels_issues.issue_criticality.apply(lambda x: 'Yes' if x == 'O' else 'No')



consistency(labels_issues)

labels_issues.sort_values(by='issue_criticality', ascending=True, inplace=True)

labels_issues

Unnamed: 0,resource_category,issue_criticality,issue_exist,proportion
0,Data Portals,No,27,0.008676
1,Journal Websites,No,25,0.004903
0,Data Portals,Yes,3085,0.991324
1,Journal Websites,Yes,5074,0.995097


In [194]:
def pie(df=None, category=None):
    base = alt.Chart(
        df[df.resource_category == category]
    ).mark_arc().encode(
        alt.Theta("proportion").stack(True),
        alt.Color("issue_criticality", title=None, legend=None).scale(domain=['Yes', 'No'], range=[COLORS[category], 'lightgrey'])
    )
    text = base.mark_text(radius=50, size=24).encode(
        alt.Text("proportion:Q", format='.1%'),
        alt.Color("issue_criticality", title=None, legend=None).scale(domain=['Yes', 'No'], range=['white', 'lightgrey']),
        alt.Opacity("issue_criticality", title=None, legend=None).scale(domain=['Yes', 'No'], range=[1, 0]),
    )
    return alt.layer(base, text).resolve_scale(color='independent').properties(
        title={
            "text": category,
            "dy": -10
        }
    )

plot = alt.hconcat(
    pie(labels_issues, 'Data Portals'),
    pie(labels_issues, 'Journal Websites'),
    spacing=50
).resolve_scale(
    color='independent'
).properties(
    title={
        "text": "The Proportion of Pages with Critical Issues",
        "dy": -10
    }
)

"""
Save for the manuscript figures and website plots
"""
NAME = 'issues-criticality'
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/{NAME}.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/{NAME}.json')

plot

### Missing Labels

In [195]:
labels_issues = issues[issues.issue_missing_label_related == 'O'].groupby(['resource_category', 'website_id', 'page_id', 'issue_missing_label_related']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_issues.issue_exist = labels_issues.issue_exist.apply(lambda x: 0 if x == 0 else 1)

labels_issues = labels_issues.groupby(['resource_category', 'issue_missing_label_related']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_issues['proportion'] = labels_issues.issue_exist
labels_issues.proportion /= labels_issues.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

labels_issues_op = labels_issues.copy()
labels_issues_op.issue_missing_label_related = 'X'
labels_issues_op.issue_exist -= labels_issues_op.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)
labels_issues_op.issue_exist *= -1
labels_issues_op['proportion'] = labels_issues_op.issue_exist
labels_issues_op.proportion /= labels_issues_op.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

labels_issues = pd.concat([labels_issues, labels_issues_op])
labels_issues.issue_missing_label_related = labels_issues.issue_missing_label_related.apply(lambda x: 'Yes' if x == 'O' else 'No')



consistency(labels_issues)

labels_issues.sort_values(by='issue_missing_label_related', ascending=True, inplace=True)

labels_issues

Unnamed: 0,resource_category,issue_missing_label_related,issue_exist,proportion
0,Data Portals,No,884,0.284062
1,Journal Websites,No,1744,0.342028
0,Data Portals,Yes,2228,0.715938
1,Journal Websites,Yes,3355,0.657972


In [196]:
def pie(df=None, category=None):
    base = alt.Chart(
        df[df.resource_category == category]
    ).mark_arc().encode(
        alt.Theta("proportion").stack(True),
        alt.Color("issue_missing_label_related", title=None, legend=None).scale(domain=['Yes', 'No'], range=[COLORS[category], 'lightgrey'])
    )
    text = base.mark_text(radius=50, size=24).encode(
        alt.Text("proportion:Q", format='.1%'),
        alt.Color("issue_missing_label_related", title=None, legend=None).scale(domain=['Yes', 'No'], range=['white', 'lightgrey'])
    )
    return alt.layer(base, text).resolve_scale(color='independent').properties(
        title={
            "text": category,
            "dy": -10
        }
    )

plot = alt.hconcat(
    pie(labels_issues, 'Data Portals'),
    pie(labels_issues, 'Journal Websites'),
    spacing=50
).resolve_scale(
    color='independent'
).properties(
    title={
        "text": "The Proportion of Pages with Missing Labels",
        "dy": -10
    }
)

"""
Save for the manuscript figures and website plots
"""
NAME = 'issues-missle-labels'
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/{NAME}.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/{NAME}.json')

plot

### WCAG Level

In [197]:
labels_criticality = issues.groupby(['resource_category', 'website_id', 'page_id', 'issue_wcag_level']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_criticality.issue_exist = labels_criticality.issue_exist.apply(lambda x: 0 if x == 0 else 1)

labels_criticality = labels_criticality.groupby(['resource_category', 'issue_wcag_level']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_criticality['proportion'] = labels_criticality.issue_exist
labels_criticality.proportion /= labels_criticality.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

consistency(labels_criticality)

labels_criticality

Unnamed: 0,resource_category,issue_wcag_level,issue_exist,proportion
0,Data Portals,A,2936,0.943445
1,Data Portals,AA,2000,0.642674
2,Journal Websites,A,3767,0.738772
3,Journal Websites,AA,3048,0.597764


In [198]:
base = alt.Chart(
    labels_criticality
).mark_bar(size=80).encode(
    alt.X("issue_wcag_level", title=None).scale(domain=['A', 'AA']).axis(labelAngle=0),
    alt.Y("proportion", title='The proportion of pages').axis(format='%', tickCount=6).scale(domain=[0, 1]),
    alt.Color("resource_category", title=None, legend=None).scale(domain=list(COLORS.keys()), range=list(COLORS.values()))
).properties(
    height=300,
    width=400
)

text = base.mark_text(dy=20, size=24).encode(
    alt.Text("proportion", format='.1%'),
    color=alt.value('white')
)

plot = alt.layer(base, text).facet(column=alt.Column("resource_category", title=None), spacing=0).properties(
    title={
        "text": 'The Proportion of Pages by WCAG Levels',
        "dy": -10
    }
)

"""
Save for the manuscript figures and website plots
"""
NAME = 'issues-wcag-levels'
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/{NAME}.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/{NAME}.json')

plot

### Difficulty To Fix

In [199]:
labels_criticality = issues.groupby(['resource_category', 'website_id', 'page_id', 'issue_difficulty_to_fix']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_criticality.issue_exist = labels_criticality.issue_exist.apply(lambda x: 0 if x == 0 else 1)

labels_criticality = labels_criticality.groupby(['resource_category', 'issue_difficulty_to_fix']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_criticality['proportion'] = labels_criticality.issue_exist
labels_criticality.proportion /= labels_criticality.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

consistency(labels_criticality)

labels_criticality

Unnamed: 0,resource_category,issue_difficulty_to_fix,issue_exist,proportion
0,Data Portals,Difficult,3065,0.984897
1,Data Portals,Easy,323,0.103792
2,Data Portals,Moderate,2797,0.898779
3,Journal Websites,Difficult,5056,0.991567
4,Journal Websites,Easy,732,0.143558
5,Journal Websites,Moderate,3854,0.755834


In [200]:
base = alt.Chart(
    labels_criticality
).mark_bar(size=80).encode(
    alt.X("issue_difficulty_to_fix", title=None).scale(domain=['Difficult', 'Moderate', 'Easy']).axis(labelAngle=0),
    alt.Y("proportion", title='The proportion of pages').axis(format='%', tickCount=6).scale(domain=[0, 1]),
    alt.Color("resource_category", title=None, legend=None).scale(domain=list(COLORS.keys()), range=list(COLORS.values()))
).properties(
    height=300,
    width=400
)

text = base.mark_text(dy=20, size=24).encode(
    alt.Text("proportion", format='.1%'),
    color=alt.value('white')
)

plot = alt.layer(base, text).facet(column=alt.Column("resource_category", title=None), spacing=0).properties(
    title={
        "text": 'The Proportion of Pages by Difficulty to Fix in Post-deployment',
        "dy": -10
    }
)

"""
Save for the manuscript figures and website plots
"""
NAME = 'issues-difficulty-to-fix'
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/{NAME}.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/{NAME}.json')

plot

### Data-related Issues

In [201]:
data_issues = issues[issues.issue_data_related == 'O'].groupby(['resource_category', 'website_id', 'page_id', 'issue_data_related']).agg({
    'issue_exist': 'sum'
}).reset_index()

data_issues.issue_exist = data_issues.issue_exist.apply(lambda x: 0 if x == 0 else 1)

data_issues = data_issues.groupby(['resource_category', 'issue_data_related']).agg({
    'issue_exist': 'sum'
}).reset_index()

data_issues['proportion'] = data_issues.issue_exist
data_issues.proportion /= data_issues.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

labels_data_op = data_issues.copy()
labels_data_op.issue_data_related = 'X'
labels_data_op.issue_exist -= labels_data_op.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)
labels_data_op.issue_exist *= -1
labels_data_op['proportion'] = labels_data_op.issue_exist
labels_data_op.proportion /= labels_data_op.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

data_issues = pd.concat([data_issues, labels_data_op])
data_issues.issue_data_related = data_issues.issue_data_related.apply(lambda x: 'Yes' if x == 'O' else 'No')



consistency(data_issues)

data_issues.sort_values(by='issue_data_related', ascending=True, inplace=True)

data_issues

Unnamed: 0,resource_category,issue_data_related,issue_exist,proportion
0,Data Portals,No,27,0.008676
1,Journal Websites,No,25,0.004903
0,Data Portals,Yes,3085,0.991324
1,Journal Websites,Yes,5074,0.995097


In [202]:
def pie(df=None, category=None):
    base = alt.Chart(
        df[df.resource_category == category]
    ).mark_arc().encode(
        alt.Theta("proportion").stack(True),
        alt.Color("issue_data_related", title=None, legend=None).scale(domain=['Yes', 'No'], range=[COLORS[category], 'lightgrey'])
    )
    text = base.mark_text(radius=50, size=24).encode(
        alt.Text("proportion:Q", format='.1%'),
        alt.Color("issue_data_related", title=None, legend=None).scale(domain=['Yes', 'No'], range=['white', 'lightgrey']),
        alt.Opacity("issue_data_related", title=None, legend=None).scale(domain=['Yes', 'No'], range=[1, 0]),
    )
    return alt.layer(base, text).resolve_scale(color='independent').properties(
        title={
            "text": category,
            "dy": -10
        }
    )

plot = alt.hconcat(
    pie(data_issues, 'Data Portals'),
    pie(data_issues, 'Journal Websites'),
    spacing=50
).resolve_scale(
    color='independent'
).properties(
    title={
        "text": "The Proportion of Pages with Data-related Issues",
        "dy": -10
    }
)

"""
Save for the manuscript figures and website plots
"""
NAME = 'issues-data-related'
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/{NAME}.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/{NAME}.json')

plot

## Deprecated Below

In [None]:
PAGE_COLUMNS = ['resource_category', 'website_id', 'page_id', 'page_type']
ISSUE_ORIGINAL_COLUMNS = ['issue_id', 'issue_desc', 'issue_impact', 'issue_help', 'issue_url']
ISSUE_COLUMNS = [
    'issue_id',
    'issue_desc',
    'issue_impact',
    'issue_help',
    'issue_url',
    'issue_name',
    'issue_filter',
    'issue_overall_impact',
    'issue_note_overall_impact_hdv',
    'issue_severity',
    'issue_data_related',
    'issue_data_related_rule',
    'issue_pour_category',
    'issue_wcag_level',
    'issue_difficulty_to_fix',
    'issue_missing_label_related'
]

## Aggregate Data By Unique Website

In [None]:
"""
Merge Accessibility Status, Metadata, and Detailed Issues
"""
data_portal_metadata = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/input/data-portal/database-commons.csv', dtype={"page_id": "string"})
journal_metadata = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/input/journal/sjr2022.csv', dtype={"page_id": "string"})
reports = pd.read_csv(f"../data/{TIME_STAMP_FOLDER_NAME}/results/accessibility-status.csv", dtype={"page_id": "string"})
issue = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/unique-issues-additional-labels-aug-9-2024.csv')

data_portal_metadata['resource_category'] = 'data-portal'
journal_metadata['resource_category'] = 'journal'


reports = reports.merge(data_portal_metadata, how='left', on=PAGE_COLUMNS)
reports = reports.merge(journal_metadata, how='left', on=PAGE_COLUMNS)

reports = reports.merge(issue, how='left', on=ISSUE_ORIGINAL_COLUMNS)

reports.head(3)

In [None]:
"""
Store the column names for metadata of pages
"""
PAGE_METADATA_COLUMNS = [a for a in reports.columns.tolist() if a not in (PAGE_COLUMNS + ISSUE_COLUMNS + ['violations', 'passes', 'total_checks', 'failure_rate'])] + ['resource_category']
# PAGE_METADATA_COLUMNS

In [None]:
"""
Group by Page
"""
reports_aggregated = reports.groupby(
    PAGE_METADATA_COLUMNS,
    dropna=False 
).agg({
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
}).reset_index()

In [None]:
"""
Failure Rate
"""
reports_aggregated['failure_rate'] = reports_aggregated.violations / reports_aggregated.total_checks

In [None]:
NUM_DATA_PORTALS = len(reports[reports.resource_category == 'data-portal'].website_id.unique().tolist())
NUM_JOURNALS = len(reports[reports.resource_category == 'journal'].website_id.unique().tolist())
(NUM_DATA_PORTALS, NUM_JOURNALS)

In [None]:
reports_aggregated.columns

## Visualize

In [None]:
# COLORS = {
#     'data-portal': '#56B4E9',
#     'journal': '#CC79A7',
#     'government': '#009E73'
# }
# # reports_aggregated

In [None]:
def histogram(df=None, resource_category=None):
    df_copy = df[df.resource_category == resource_category].copy()

    # df_copy = df_copy[~df_copy.titl.isna()]
    
    return (
        alt.Chart(
           df_copy
        ).mark_bar(
            color=COLORS[resource_category]
        ).encode(
            alt.X(f'failure_rate:Q', title='Failure rate').bin(extent=[0, 1], step=0.008).scale(domain=[0, 0.5], clamp=True).axis(format='%', zindex=10, tickCount=10),
            alt.Y('count()', title="The number of webpages").scale(type='linear').axis(tickCount=5),
            # alt.Tooltip(['title:N', 'failure_rate:Q']),
            # yOffset="jitter:Q",
        ).transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        ).properties(
            title={
                "text": resource_category.replace('-', ' ').title(),
                "fontWeight": 600,
                "color": "black"
            },
            height=300,
            width=400
        )
    )

# baseline = (
#     _.mark_rule(
#         color='black',
#         size=2,
#         # size=500 / len(COUNTRY_SORT),
#         strokeDash=[4, 4]
#     ).encode(
#         alt.X(f'baseline:Q', title='Failure rate'),
#         y=alt.Y()
#     ).transform_calculate(
#         baseline=f"{US_GOV_FR_MEAN}"
#     )
# )
    
# _ = _ + baseline

#     plot = _ if plot is None else plot | _

plot = alt.hconcat(
    histogram(df_pages, 'data-portal'),
    histogram(df_pages, 'journal'),
    histogram(df_pages, 'government')
)

plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/histogram.png')
plot

In [None]:
pd.set_option("display.precision", 100)
reports_aggregated[(reports_aggregated.failure_rate > 0.0363636) & (reports_aggregated.failure_rate < 0.0363637)]
# reports_aggregated

In [None]:
reports[reports.page_url == 'http://n.neurology.org/']

In [None]:
reports[reports.page_url == 'http://arjournals.annualreviews.org/loi/ecolsys']

In [None]:
reports

## Aggregate Data By Unique Issue

In [None]:
reports['issue_exist'] = reports.violations.apply(lambda x: 0 if x <= 0 else 1)
reports_by_issues = reports.groupby(
    ['resource_category'] + ISSUE_COLUMNS,
    dropna=False
).agg({
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    # 'page_url': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

reports_by_issues['failure_rate'] = reports_by_issues.violations / reports_by_issues.total_checks
reports_by_issues.rename(columns={'page_url': 'page_count'}, inplace=True)
reports_by_issues['page_proportion'] = reports_by_issues.issue_exist
reports_by_issues['page_proportion'] /= reports_by_issues.resource_category.apply(lambda x: NUM_DATA_PORTALS if x == 'data-portal' else NUM_JOURNALS)

# grouped.to_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/reports/report.csv', index=False)
reports_by_issues.head(3)

In [None]:
alt.Chart(reports_by_issues[reports_by_issues.resource_category == 'data-portal']).mark_bar(
    color=COLORS['data-portal']
).encode(
    alt.X('page_proportion:Q', title='Proportion of webpages with issues').axis(format='%', orient='top'),
    alt.Y('issue_name:N', sort='-x', title=None).axis(titlePadding=40),
    alt.Color('issue_overall_impact:N').scale(domain=['critical', 'moderate', 'minor'], range=['#d95f02', '#E69F00', 'grey']),
    # alt.Color('resource_category:N').scale(domain=list(COLORS.keys()), range=list(COLORS.values())),
    # alt.Column('resource_category:N')
).properties(
    height=1600,
    width=600
)

In [None]:
import pygwalker as pyg

In [None]:
pyg.walk(reports_by_issues[reports_by_issues.resource_category == 'data-portal'])

## Statistics

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_overall_impact',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp.columns.tolist()

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_missing_label_related',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_severity',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_data_related',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_wcag_level',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_difficulty_to_fix',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')