# Figures

In [None]:
import pandas as pd
import altair as alt

In [None]:
# Show the installed Altair version in the cell output.
# NOTE(review): the method-chained encoding syntax used below (`.axis(...)`, `.scale(...)`) presumably needs Altair 5+ — confirm.
alt.__version__

In [None]:
# Notebook-wide Altair theme: typography, axis and legend styling shared by all plots
def theme():
    """Build the Vega-Lite configuration used as this notebook's chart theme.

    Returns:
        dict: a ``{"config": {...}}`` mapping with settings for the title,
        view, header, axes (with an x-axis override) and legend.
    """
    regular = 400       # font weight used for every text element
    large_font = 23     # size of axis/legend/header titles and most labels
    label_limit = 1000  # `labelLimit` value shared by axes and legends

    title_cfg = dict(
        dy=1,
        fontSize=22,
        fontWeight=regular,
        align="center",
        anchor="middle",
        subtitleColor="grey",
        subtitleFontSize=22,
    )
    view_cfg = dict(fill="#FCFCFC")
    header_cfg = dict(
        titleFontSize=large_font,
        labelFontSize=large_font,
        labelFontWeight=regular,
    )
    # The axis domain line is switched off by default ...
    axis_cfg = dict(
        titleFontSize=large_font,
        titleFontWeight=regular,
        labelFontSize=20,
        labelFontWeight=regular,
        labelLimit=label_limit,
        domainWidth=1.5,
        domainColor="black",
        tickColor="black",
        domain=False,
    )
    # ... and switched back on for the x axis only.
    axis_x_cfg = dict(domain=True)
    legend_cfg = dict(
        titleFontSize=large_font,
        titleFontWeight=regular,
        labelFontSize=large_font,
        labelLimit=label_limit,
        strokeColor='#F4F6F7',
        padding=15,
    )

    return {
        "config": {
            "title": title_cfg,
            "view": view_cfg,
            "header": header_cfg,
            "axis": axis_cfg,
            "axisX": axis_x_cfg,
            "legend": legend_cfg,
        }
    }

# Register the `theme` function under the name "theme" and make it the active theme
alt.themes.register("theme", theme)
alt.themes.enable("theme")

In [None]:
def consistency(df=None):
    """Replace raw `resource_category` codes with display labels, in place.

    'journal' becomes 'Journal Websites', 'government' becomes
    'US Government Websites', and any other raw value becomes 'Data Portals'.
    Values that already are one of the three display labels are left
    untouched, so applying the function twice to the same frame is harmless
    (previously a second call turned every label into 'Data Portals').

    Args:
        df: DataFrame with a `resource_category` column; it is modified in place.

    Returns:
        None.
    """
    raw_to_label = {
        'journal': 'Journal Websites',
        'government': 'US Government Websites',
    }
    fallback = 'Data Portals'
    display_labels = set(raw_to_label.values()) | {fallback}

    def to_label(value):
        # Already converted: keep it, so re-running a cell does not corrupt the labels.
        if value in display_labels:
            return value
        return raw_to_label.get(value, fallback)

    df.resource_category = df.resource_category.apply(to_label)

In [None]:
# Name of the folder you created under `../data/`.
# Every input and output path in this notebook is built from it.
TIME_STAMP_FOLDER_NAME = '08-01-2024'

In [None]:
# Display label of each resource category -> fill colour used in every figure.
# The key order matters: charts pass `list(COLORS.keys())` and `list(COLORS.values())`
# as the colour scale's domain and range.
# NOTE(review): the hex values appear to come from the Okabe-Ito colour-blind-safe palette — confirm.
COLORS = {
    'Data Portals': '#56B4E9',
    'Journal Websites': '#CC79A7',
    'US Government Websites': '#009E73'
}

In [None]:
def save_figure(plot=None, name=None, data=None):
    """Write a chart to disk in several formats and display it.

    The chart is saved as PNG (scale factor 8) and SVG under
    `../data/<TIME_STAMP_FOLDER_NAME>/figures/` and as JSON under
    `../data/<TIME_STAMP_FOLDER_NAME>/website/`, then displayed.

    Args:
        plot: chart object exposing `save(...)` and `display()`.
        name: file stem shared by all outputs.
        data: optional DataFrame; when given it is written next to the JSON
            file as `<name>.csv`.
    """
    snapshot_dir = f'../data/{TIME_STAMP_FOLDER_NAME}'
    figure_stem = f'{snapshot_dir}/figures/{name}'
    website_stem = f'{snapshot_dir}/website/{name}'

    plot.save(f'{figure_stem}.png', scale_factor=8)
    plot.save(f'{figure_stem}.svg')
    plot.save(f'{website_stem}.json')
    plot.display()

    if data is None:
        return
    data.to_csv(f'{website_stem}.csv')

## Calculate Baseline

Median failure rate of US Government Websites, used as the baseline in the figures below.

In [None]:
# Per-page results; the baseline is the median failure rate across government pages
df = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv')
BASELINE = df.loc[df.resource_category == 'government', 'failure_rate'].median()
BASELINE

## Calculate Num Pages

In [None]:
# Number of government pages in reports-by-page.csv
NUM_GOV_PAGES = int((df.resource_category == 'government').sum())
NUM_GOV_PAGES

In [None]:
# Number of journal pages in reports-by-page.csv
NUM_JW_PAGES = int((df.resource_category == 'journal').sum())
NUM_JW_PAGES

In [None]:
# Number of data-portal pages in reports-by-page.csv
NUM_DP_PAGES = int((df.resource_category == 'data-portal').sum())
NUM_DP_PAGES

In [None]:
# Percentage of data-portal pages whose failure rate exceeds the government baseline
int(((df.resource_category == 'data-portal') & (df.failure_rate > BASELINE)).sum()) / NUM_DP_PAGES * 100

## Estimated Failure Rates
These data were shared by a collaborator and are not generated by the notebooks in this repository.

In [None]:
# Load the estimated failure rates.
# NOTE: this rebinds `df`, which until now held reports-by-page.csv; re-running the
# baseline / page-count cells after this one would make them read this table instead.
df = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/2024-08-19_failure_rate_meta.csv')

### By Resource Category

In [None]:
# Rows with the 'ALL' aggregate for both country and publisher, plus the government
# row(s) with publisher 'ALL'.
# The parentheses spell out the precedence the original relied on (`&` binds tighter
# than `|`); `.copy()` makes the relabelling below act on an independent frame
# instead of a slice of `df` (avoids SettingWithCopyWarning).
df_overall = df[
    # (df.continent == 'ALL') &
    ((df.country == 'ALL') & (df.publisher == 'ALL')) |
    ((df.web_type == 'government') & (df.publisher == 'ALL'))
].copy()

# Raw web_type codes -> display labels (anything that is neither 'journal' nor 'government' becomes 'Data Portals')
df_overall['web_type'] = df_overall.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')

df_overall

In [None]:
# One point per resource category at its estimated failure rate
base = alt.Chart(df_overall).mark_circle(size=100, opacity=1).encode(
    y=alt.Y('web_type:N', title=None, sort=['US Government Websites', 'Journal Websites', 'Data Portals']),
    x=alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(format='.0%', tickCount=5).scale(domain=[0, 0.1]),
    color=alt.Color('web_type:N').scale(range=list(COLORS.values()), domain=list(COLORS.keys())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
    ],
).properties(width=300, height=200)

# Black error bar from ci_95L to ci_95U (presumably the 95% confidence bounds — confirm)
error = base.mark_errorbar(thickness=2, color='black').encode(
    x=alt.X('ci_95L', title='Estimated failure rate'),
    x2=alt.X2('ci_95U'),
    color=alt.value('black'),
)

# `base` is layered a second time so the points are drawn on top of the error bars
plot = base + error + base

# Save for the manuscript figures and website plots
save_figure(plot, 'resource-category-estimated', df_overall)

### By Hosting Institutions

In [None]:
# Per-publisher rows of the non-government categories (drops the 'ALL' aggregate and missing publishers).
# `.copy()` makes the relabelling below act on an independent frame rather than a
# slice of `df` (avoids SettingWithCopyWarning).
df_org = df[(df.web_type != 'government') & (df.publisher != 'ALL') & (df.publisher.notnull())].copy()
df_org['web_type'] = df_org.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')
df_org

In [None]:
# Journal publishers with more than 52 units.
# NOTE(review): the threshold looks hand-picked so that ten publishers remain — re-check when the data changes.
top_10_journal_publishers_by_size = df_org.loc[
    (df_org.web_type == 'Journal Websites') & (df_org.units > 52)
]
len(top_10_journal_publishers_by_size.index)

In [None]:
# Total number of journal publisher rows
int((df_org.web_type == 'Journal Websites').sum())

In [None]:
# Data-portal publishers with more than 18 units, with one publisher excluded by name.
# NOTE(review): neither the threshold nor the exclusion is explained in this notebook — confirm both.
top_10_data_portal_publishers_by_size = df_org.loc[
    (df_org.web_type == 'Data Portals')
    & (df_org.units > 18)
    & (df_org.publisher != 'China Agricultural University')
]
len(top_10_data_portal_publishers_by_size.index)

In [None]:
# Total number of data-portal publisher rows
int((df_org.web_type == 'Data Portals').sum())

In [None]:
# Stack the two publisher selections (journals first, then data portals) into the frame that is plotted
df_org_filtered = pd.concat([top_10_journal_publishers_by_size, top_10_data_portal_publishers_by_size])
df_org_filtered

In [None]:
# One point per publisher at its estimated failure rate.
# The colour scale gets an explicit domain (as in the resource-category and country
# plots) so each category keeps its colour from COLORS regardless of which
# categories are present in the data.
base = alt.Chart(df_org_filtered).mark_circle(size=100, opacity=1).encode(
    alt.Y('publisher:N', title=None).sort(field="fail_rate_meta", op="max", order="ascending"),
    alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(tickCount=3, format='.0%').scale(domain=[0, 0.12]),
    alt.Color('web_type:N').scale(range=list(COLORS.values()), domain=list(COLORS.keys())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('publisher:N', title='Publisher'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
        alt.Tooltip('units', title='Size'),
    ]
).properties(
    width=300,
    height=300
)

# Error bar from ci_95L to ci_95U
error = base.mark_errorbar().encode(
    alt.X('ci_95L:Q', title='Estimated failure rate'),
    alt.X2('ci_95U:Q')
)

# Number of units printed next to each point
text = base.mark_text(
    dx=15,
    dy=-9
).encode(
    alt.Text('units'),
    color=alt.value('black')
)

# Dashed vertical rule at the government baseline; the empty Y/Size/YOffset
# channels override the encodings inherited from `base`.
gov = base.mark_rule(
    color='black',
    size=1,
    strokeDash=[4, 4]
).encode(
    alt.Y(),
    alt.X('baseline:Q', title='Estimated failure rate'),
    alt.Size(),
    alt.YOffset(),
    color=alt.value('black')
).transform_calculate(
    baseline=f"{BASELINE}"
)

plot = (base + error + text + gov)

# Side-by-side panels, one per resource category
plot = (
    plot.transform_filter("datum.web_type == 'Data Portals'").properties(title='Data Portals') |
    plot.transform_filter("datum.web_type == 'Journal Websites'").properties(title='Journal Websites')
)

# Save for the manuscript figures and website plots
save_figure(plot, 'publisher-estimated', df_org_filtered)

### Country

In [None]:
# Per-country rows of the non-government categories (drops the 'ALL' aggregate and missing countries)
df_country = df.loc[
    (df.web_type != 'government')
    & (df.country != 'ALL')
    & df.country.notna()
]

In [None]:
# Journal rows with more than 144 units.
# NOTE(review): the threshold looks hand-picked so that ten countries remain — re-check when the data changes.
top_10_journal_countries_by_size = df_country.loc[
    (df_country.web_type == 'journal') & (df_country.units > 144)
]
len(top_10_journal_countries_by_size.index)

In [None]:
# Total number of journal country rows
int((df_country.web_type == 'journal').sum())

In [None]:
# Data-portal rows with more than 60 units. This table spells the category
# 'data_portal' (underscore), unlike 'data-portal' in the per-page reports.
# NOTE(review): the threshold looks hand-picked so that ten countries remain — re-check when the data changes.
top_10_data_countries_by_size = df_country.loc[
    (df_country.web_type == 'data_portal') & (df_country.units > 60)
]
len(top_10_data_countries_by_size.index)

In [None]:
# Total number of data-portal country rows
int((df_country.web_type == 'data_portal').sum())

In [None]:
# Country names taken from the two selections above
countries_data_portals = list(top_10_data_countries_by_size.country.unique())
countries_journals = list(top_10_journal_countries_by_size.country.unique())

# Keep a row when its country belongs to the selection of its own category
df_country_filtered = df_country.loc[
    ((df_country.web_type == 'data_portal') & df_country.country.isin(countries_data_portals))
    | ((df_country.web_type == 'journal') & df_country.country.isin(countries_journals))
]

In [None]:
# Raw web_type codes -> display labels.
# `.assign` builds a new frame instead of writing into a slice of `df_country`
# (avoids SettingWithCopyWarning), and values that already are display labels
# (the keys of COLORS) are kept, so re-running this cell does not turn every
# label into 'Data Portals'.
df_country_filtered = df_country_filtered.assign(
    web_type=df_country_filtered.web_type.apply(
        lambda x: x if x in COLORS else 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals'
    )
)

In [None]:
# One point per country at its estimated failure rate
base = alt.Chart(df_country_filtered).mark_circle(size=100, opacity=1).encode(
    y=alt.Y('country:N', title=None).sort(field="fail_rate_meta", op="max", order="ascending"),
    x=alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(format='.0%').scale(domain=[0, 0.15]),
    color=alt.Color('web_type:N').scale(range=list(COLORS.values()), domain=list(COLORS.keys())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
        alt.Tooltip('units', title='Size'),
    ],
).properties(width=300, height=300)

# Error bar from ci_95L to ci_95U
error = base.mark_errorbar().encode(
    x=alt.X('ci_95L:Q', title='Estimated failure rate'),
    x2=alt.X2('ci_95U:Q'),
)

# Number of units printed next to each point
text = base.mark_text(dx=15, dy=-9).encode(
    text=alt.Text('units'),
    color=alt.value('black'),
)

# Dashed vertical rule at the government baseline; the empty y/size/yOffset
# channels override the encodings inherited from `base`.
gov = base.mark_rule(color='black', size=1, strokeDash=[4, 4]).encode(
    y=alt.Y(),
    x=alt.X('baseline:Q', title='Estimated failure rate'),
    size=alt.Size(),
    yOffset=alt.YOffset(),
    color=alt.value('black'),
).transform_calculate(
    baseline=f"{BASELINE}"
)

plot = base + error + text + gov

# Side-by-side panels, one per resource category
plot = (
    plot.transform_filter("datum.web_type == 'Data Portals'").properties(title='Data Portals')
    | plot.transform_filter("datum.web_type == 'Journal Websites'").properties(title='Journal Websites')
)

# Save for the manuscript figures and website plots
save_figure(plot, 'country-estimated', df_country_filtered)

### Continent

In [None]:
# Per-continent rows of the non-government categories (drops the 'ALL' aggregate and missing continents).
# `.copy()` makes the relabelling below act on an independent frame rather than a
# slice of `df` (avoids SettingWithCopyWarning).
df_continents = df[(df.web_type != 'government') & (df.continent != 'ALL') & (df.continent.notnull())].copy()
df_continents['web_type'] = df_continents.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')
df_continents

In [None]:
# One point per continent at its estimated failure rate.
# The colour scale gets an explicit domain (as in the resource-category and country
# plots) so each category keeps its colour from COLORS regardless of which
# categories are present in the data.
base = alt.Chart(df_continents).mark_circle(size=100, opacity=1).encode(
    alt.Y('continent:N', title=None).sort(field="fail_rate_meta", op="max", order="ascending"),
    alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(format='.0%', tickCount=4).scale(domain=[0, 0.1]),
    alt.Color('web_type:N').scale(range=list(COLORS.values()), domain=list(COLORS.keys())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('continent:N', title='Continent'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
        alt.Tooltip('units', title='Size'),
    ]
).properties(
    width=300,
    height=300
)

# Error bar from ci_95L to ci_95U
error = base.mark_errorbar().encode(
    alt.X('ci_95L:Q', title='Estimated failure rate'),
    alt.X2('ci_95U:Q')
)

# Number of units printed next to each point
text = base.mark_text(
    dx=15,
    dy=-9
).encode(
    alt.Text('units'),
    color=alt.value('black')
)

# Dashed vertical rule at the government baseline; the empty Y/Size/YOffset
# channels override the encodings inherited from `base`.
gov = base.mark_rule(
    color='black',
    size=1,
    strokeDash=[4, 4]
).encode(
    alt.Y(),
    alt.X('baseline:Q', title='Estimated failure rate'),
    alt.Size(),
    alt.YOffset(),
    color=alt.value('black')
).transform_calculate(
    baseline=f"{BASELINE}"
)

plot = (base + error + text + gov)

# Side-by-side panels, one per resource category
plot = (
    plot.transform_filter("datum.web_type == 'Data Portals'").properties(title='Data Portals') |
    plot.transform_filter("datum.web_type == 'Journal Websites'").properties(title='Journal Websites')
)

# Save for the manuscript figures and website plots
save_figure(plot, 'continent-estimated', df_continents)

## Raw Accessibility Results

In [None]:
# Per-page results with display labels. Here every value other than 'journal' and
# 'data-portal' becomes 'US Government Websites'.
df_pages = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv')
df_pages['resource_category'] = df_pages.resource_category.apply(
    lambda x: {'journal': 'Journal Websites', 'data-portal': 'Data Portals'}.get(x, 'US Government Websites')
)
df_pages.head(1)

In [None]:
# Spot-check specific pages. Only the last expression of a cell is displayed, so the
# three separate lookups are merged into one filter; otherwise the first two results
# are computed and silently discarded. `na=False` treats a missing URL as "no match".
df_pages[
    df_pages.page_url.isin([
        'https://portal.hubmapconsortium.org/',
        'https://www.encodeproject.org/',
    ])
    | df_pages.page_url.str.contains('cbioportal', na=False)
]

### Histogram

In [None]:
# alt.data_transformers.enable("vegafusion")
# plot = None
# for category in df_pages.resource_category.unique().tolist():
    
#     _ = (
#         alt.Chart(
#             df_pages[df_pages.failure_rate > 0]
#         ).mark_bar(
#             # opacity=0.01
#             color=COLORS[category],
#             stroke='white',
#             strokeWidth=0.5
#         ).encode(
#             alt.X(f'failure_rate:Q', title='Failure rate').bin(extent=[0, 1], step=0.008).scale(domain=[0, 0.5], clamp=True).axis(format='%', zindex=10, tickCount=1),
#             alt.Y('count()', title="The number of webpages").scale(type='linear').axis(tickCount=5),
#         ).properties(
#             title={
#                 "text": category,
#                 "fontWeight": 600,
#                 "color": "black"
#             },
#             height=300,
#             width=400
#         )
#     )

#     baseline = (
#         _.mark_rule(
#             color='black',
#             size=2,
#             # size=500 / len(COUNTRY_SORT),
#             strokeDash=[4, 4]
#         ).encode(
#             alt.X(f'baseline:Q', title='Failure rate'),
#             y=alt.Y()
#         ).transform_calculate(
#             baseline=f"{BASELINE}"
#         )
#     )
    
#     _ = _ + baseline

#     plot = _ if plot is None else plot | _

# plot = plot.properties(
#     # title={
#     #     # 'text': 'The Distribution of Failure Rates',
#     #     # 'subtitle': '* Dashed line represents the average failure rate of US government websites',
#     #     'subtitleColor': 'grey'
#     # }
# )

# # plot = apply_theme(plot)
# # plot.save('../output/plots/ff-dist.png')
# plot

# alt.Chart(
#     df_pages[df_pages.failure_rate > 0]
# ).mark_rule(
#     color='black',
#     size=2,
#     # size=500 / len(COUNTRY_SORT),
#     strokeDash=[4, 4]
# ).encode(
#     alt.X(f'baseline:Q', title='Failure rate'),
#     y=alt.Y()
# )

In [None]:
# The VegaFusion data transformer is enabled for this block only
# (presumably because df_pages is too large for the default transformer — confirm).
with alt.data_transformers.enable("vegafusion"):
    # Constant column holding the government baseline (not referenced by the chart below)
    df_pages['baseline'] = BASELINE

    # Histogram of per-page failure rates, one column per resource category,
    # each with its own y scale
    plot = alt.Chart(df_pages).mark_bar().encode(
        x=alt.X('failure_rate:Q', title='Failure rate').bin(extent=[0, 1], step=0.008).scale(domain=[0, 0.5], clamp=True).axis(format='%', zindex=1, tickCount=3, tickColor='white', offset=-10),
        y=alt.Y('count()', title="The number of webpages").scale(type='linear').axis(tickCount=5),
        color=alt.Color('resource_category:N', legend=None).scale(domain=list(COLORS.keys()), range=list(COLORS.values())),
        column=alt.Column('resource_category:N', title=None),
    ).properties(
        width=400,
        height=300,
    ).resolve_scale(y='independent')

    # Save for the manuscript figures; no website JSON is written here because
    # the large data cannot be inlined
    plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/histogram.png', scale_factor=8)

    plot.display()

In [None]:
# Number of US government pages with a failure rate above 0.5
int(((df_pages.failure_rate > 0.5) & (df_pages.resource_category == 'US Government Websites')).sum())

### Most Common Issues

In [None]:
# Issue-level report rows for the non-government categories.
issues = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports.csv')

# Drop placeholder ('-') and missing issue names as well as government rows in a
# single mask, then `.copy()` so the column assignments below act on an
# independent frame instead of a filtered slice (avoids SettingWithCopyWarning).
issues = issues[
    (issues.issue_name != '-')
    & issues.issue_name.notna()
    & (issues.resource_category != 'government')
].copy()

# NOTE(review): replaces the substring 'Bas' with 'Bad' anywhere in an issue name —
# presumably a typo fix in the source data; confirm no legitimate name contains 'Bas'.
issues['issue_name'] = issues.issue_name.apply(lambda x: x.replace('Bas', 'Bad'))

# 1 unless `violations` is <= 0
issues['issue_exist'] = issues.violations.apply(lambda x: 0 if x <= 0 else 1)

In [None]:
# Peek at the first row to see the available columns
issues.head(1)

In [None]:
# Sum of the `issue_exist` flag per (issue, category), stored as `num_pages_with_issues`
issues_agg = (
    issues
    .groupby(['issue_name', 'resource_category'], dropna=False)
    .agg(num_pages_with_issues=('issue_exist', 'sum'))
    .reset_index()
)

In [None]:
# Divide each count by the number of pages in its category
issues_agg['failure_rate'] = issues_agg.num_pages_with_issues / issues_agg.resource_category.apply(
    lambda x: {'data-portal': NUM_DP_PAGES, 'government': NUM_GOV_PAGES, 'journal': NUM_JW_PAGES}[x]
)

In [None]:
# Names of the ten data-portal issues with the highest failure rate
top_10_issues_dp = (
    issues_agg[issues_agg.resource_category == 'data-portal']
    .sort_values(by='failure_rate', ascending=False)
    .head(10)
    .issue_name
    .tolist()
)
top_10_issues_dp

In [None]:
# Names of the ten journal issues with the highest failure rate
top_10_issues_jw = (
    issues_agg[issues_agg.resource_category == 'journal']
    .sort_values(by='failure_rate', ascending=False)
    .head(10)
    .issue_name
    .tolist()
)
top_10_issues_jw

In [None]:
# Keep, for each category, only the rows of its own top-10 issues (data portals first, then journals)
issues_agg_filtered = pd.concat([
    issues_agg[(issues_agg.resource_category == category) & issues_agg.issue_name.isin(top_issues)]
    for category, top_issues in (
        ('data-portal', top_10_issues_dp),
        ('journal', top_10_issues_jw),
    )
])

In [None]:
# Raw category codes -> display labels.
# Values that already are display labels (the keys of COLORS) are kept, so
# re-running this cell does not turn every label into 'Data Portals'.
issues_agg_filtered = issues_agg_filtered.assign(
    resource_category=issues_agg_filtered.resource_category.apply(
        lambda x: x if x in COLORS else 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals'
    )
)

In [None]:
def top_issue_bar(df=None, category=None, sort=None):
    """Horizontal bar chart of issue failure rates for one resource category.

    Args:
        df: frame with `resource_category`, `issue_name` and `failure_rate` columns.
        category: display label of the category to plot; also used as the chart title.
        sort: ordering passed to the y axis (the callers pass a list of issue names).

    Returns:
        A 400x400 Altair bar chart coloured with the shared COLORS palette.
    """
    rows = df[df.resource_category == category]
    palette = alt.Color('resource_category:N', legend=None).scale(
        domain=list(COLORS.keys()), range=list(COLORS.values())
    )
    bars = alt.Chart(rows).mark_bar().encode(
        x=alt.X('failure_rate:Q', title='The proportion of pages with issues').axis(format='%').scale(domain=[0, 1]),
        y=alt.Y('issue_name', sort=sort, title=None),
        color=palette,
    )
    return bars.properties(
        title={'text': category, 'dy': -10},
        width=400,
        height=400,
    )

# One panel per category, each sorted by its own top-10 list
d = top_issue_bar(df=issues_agg_filtered, category='Data Portals', sort=top_10_issues_dp)
j = top_issue_bar(df=issues_agg_filtered, category='Journal Websites', sort=top_10_issues_jw)

# `|` concatenates the two panels horizontally
plot = d | j

# Save for the manuscript figures and website plots
save_figure(plot, 'top-issues', issues_agg_filtered)

### Image and Table-related

In [None]:
# Flag every issue row whose name mentions "image" (case-insensitive)
img = issues.copy()
img['issue_exist'] = img.violations.apply(lambda x: 0 if x <= 0 else 1)
img['is_image_related'] = 'No'
img.loc[img.issue_name.str.contains('Image', case=False), 'is_image_related'] = 'Yes'

# Collapse to one row per (page, flag, category): 1 if the page has at least one such issue
img_agg = img.groupby(
    [
        'website_id', 'page_id',
        'is_image_related',
        'resource_category'
    ],
    dropna=False
).agg({
    'issue_exist': 'sum'
}).reset_index()
img_agg.issue_exist = img_agg.issue_exist.apply(lambda x: 0 if x <= 0 else 1)

# Count the pages per (flag, category)
img_agg = img_agg.groupby(
    [
        'is_image_related',
        'resource_category'
    ],
    dropna=False
).agg({
    'issue_exist': 'sum'
}).reset_index()
img_agg = img_agg.rename(columns={ 'issue_exist': 'num_pages_with_issues' })

img_agg = img_agg[img_agg.is_image_related == 'Yes']

# Complement rows: pages of the category without an image-related issue
# (category total minus the 'Yes' count)
img_agg_no = img_agg.copy()
img_agg_no.is_image_related = 'No'
img_agg_no.num_pages_with_issues -= img_agg_no.resource_category.apply(lambda x: { 'data-portal': NUM_DP_PAGES, 'government': NUM_GOV_PAGES, 'journal': NUM_JW_PAGES }[x])
img_agg_no.num_pages_with_issues *= -1

img_agg = pd.concat(
    [img_agg,
    img_agg_no]
)

# Share of the category's pages
img_agg['percentage'] = img_agg.num_pages_with_issues
img_agg.percentage /= img_agg.resource_category.apply(lambda x: { 'data-portal': NUM_DP_PAGES, 'government': NUM_GOV_PAGES, 'journal': NUM_JW_PAGES }[x])

consistency(img_agg)

# Keep only the 'Yes' rows for plotting. `.copy()` makes the label assignment
# below act on an independent frame (avoids SettingWithCopyWarning).
img_agg = img_agg.rename(columns={'is_image_related': 'is_related'})
img_agg = img_agg[img_agg.is_related == 'Yes'].copy()
img_agg['is_related'] = 'Image-related'
img_agg

In [None]:
# Flag every issue row whose name mentions "table" (case-insensitive)
tbl = issues.copy()
tbl['issue_exist'] = tbl.violations.apply(lambda x: 0 if x <= 0 else 1)
tbl['is_table_related'] = 'No'
tbl.loc[tbl.issue_name.str.contains('Table', case=False), 'is_table_related'] = 'Yes'

# Collapse to one row per (page, flag, category): 1 if the page has at least one such issue
tbl_agg = tbl.groupby(
    [
        'website_id', 'page_id',
        'is_table_related',
        'resource_category'
    ],
    dropna=False
).agg({
    'issue_exist': 'sum'
}).reset_index()
tbl_agg.issue_exist = tbl_agg.issue_exist.apply(lambda x: 0 if x <= 0 else 1)

# Count the pages per (flag, category)
tbl_agg = tbl_agg.groupby(
    [
        'is_table_related',
        'resource_category'
    ],
    dropna=False
).agg({
    'issue_exist': 'sum'
}).reset_index()
tbl_agg = tbl_agg.rename(columns={ 'issue_exist': 'num_pages_with_issues' })

tbl_agg = tbl_agg[tbl_agg.is_table_related == 'Yes']

# Complement rows: pages of the category without a table-related issue.
# The page totals are looked up from this frame's own categories; the original
# read them from `img_agg_no`, a frame built in another cell, which only worked
# when both frames happened to share the same index labels.
tbl_agg_no = tbl_agg.copy()
tbl_agg_no.is_table_related = 'No'
tbl_agg_no.num_pages_with_issues -= tbl_agg_no.resource_category.apply(lambda x: { 'data-portal': NUM_DP_PAGES, 'government': NUM_GOV_PAGES, 'journal': NUM_JW_PAGES }[x])
tbl_agg_no.num_pages_with_issues *= -1

tbl_agg = pd.concat(
    [tbl_agg,
    tbl_agg_no]
)

# Share of the category's pages
tbl_agg['percentage'] = tbl_agg.num_pages_with_issues
tbl_agg.percentage /= tbl_agg.resource_category.apply(lambda x: { 'data-portal': NUM_DP_PAGES, 'government': NUM_GOV_PAGES, 'journal': NUM_JW_PAGES }[x])

consistency(tbl_agg)

# Keep only the 'Yes' rows for plotting. `.copy()` makes the label assignment
# below act on an independent frame (avoids SettingWithCopyWarning).
tbl_agg = tbl_agg.rename(columns={'is_table_related': 'is_related'})
tbl_agg = tbl_agg[tbl_agg.is_related == 'Yes'].copy()
tbl_agg['is_related'] = 'Table-related'
tbl_agg

In [None]:
# Image-related rows followed by table-related rows; this is the data behind the bar chart below
agg = pd.concat([img_agg, tbl_agg])
agg

In [None]:
# Left panel: data portals, with the percentage axis shown
dp = alt.Chart(agg[agg.resource_category == 'Data Portals']).mark_bar(size=70).encode(
    x=alt.X('is_related', title=None).axis(labelAngle=0),
    y=alt.Y('percentage').axis(format='%').scale(domain=[0, 0.6]),
    color=alt.Color('resource_category', legend=None).scale(domain=list(COLORS.keys()), range=list(COLORS.values())),
).properties(title='Data Portals', width=300)

# Percentage label above each bar
dpt = dp.mark_text(size=24, dy=-13).encode(
    text=alt.Text('percentage', format='.1%'),
    color=alt.value('black'),
)

# Right panel: journal websites, with axis labels and ticks hidden
jw = alt.Chart(agg[agg.resource_category == 'Journal Websites']).mark_bar(size=70).encode(
    x=alt.X('is_related', title=None).axis(labelAngle=0),
    y=alt.Y('percentage', title=None).axis(format='%', labels=False, ticks=False),
    color=alt.Color('resource_category', legend=None).scale(domain=list(COLORS.keys()), range=list(COLORS.values())),
).properties(title='Journal Websites', width=300)

jwt = jw.mark_text(size=24, dy=-13).encode(
    text=alt.Text('percentage', format='.1%'),
    color=alt.value('black'),
)

# The panels touch (spacing=0) and share one y scale
plot = alt.hconcat(dp + dpt, jw + jwt, spacing=0).resolve_scale(y='shared')

save_figure(plot, 'issues-table-and-image-related', agg)

### Overall Impact

In [None]:
# Per page and impact level: 1 if the summed `issue_exist` flag is non-zero
labels_criticality = (
    issues
    .groupby(['resource_category', 'website_id', 'page_id', 'issue_overall_impact'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)
labels_criticality['issue_exist'] = labels_criticality.issue_exist.apply(lambda count: int(count != 0))

# Number of such pages per (category, impact level)
labels_criticality = (
    labels_criticality
    .groupby(['resource_category', 'issue_overall_impact'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)

# Divide by the number of pages in the category
labels_criticality['proportion'] = labels_criticality.issue_exist / labels_criticality.resource_category.apply(
    lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES
)

consistency(labels_criticality)

# Title-case the impact level and display 'Critical' as 'Severe'
labels_criticality['issue_overall_impact'] = labels_criticality.issue_overall_impact.apply(
    lambda x: 'Severe' if x.title() == 'Critical' else x.title()
)

labels_criticality

In [None]:
# One bar per impact level, ordered Severe -> Moderate -> Minor
base = alt.Chart(labels_criticality).mark_bar(
    size=80, cornerRadiusTopLeft=3, cornerRadiusTopRight=3
).encode(
    x=alt.X("issue_overall_impact", title=None).scale(domain=['Severe', 'Moderate', 'Minor']).axis(labelAngle=0),
    y=alt.Y("proportion", title='The proportion of pages').axis(format='%', tickCount=6).scale(domain=[0, 1]),
    color=alt.Color("resource_category", title=None, legend=None).scale(domain=list(COLORS.keys()), range=list(COLORS.values())),
).properties(width=340, height=300)

# White percentage label, shifted down by dy=20
text = base.mark_text(dy=20, size=24).encode(
    text=alt.Text("proportion", format='.1%'),
    color=alt.value('white'),
)

# One column per resource category
plot = alt.layer(base, text).facet(
    column=alt.Column("resource_category", title=None),
    spacing=0,
).properties(
    title={"text": 'The Proportion of Pages with Overall Impact of Issues', "dy": -10}
)

# Save for the manuscript figures and website plots
save_figure(plot, 'issues-overall-impact', labels_criticality)

### Criticality

In [None]:
# The report column `issue_severity` is called `issue_criticality` from here on
issues = issues.rename(columns={'issue_severity': 'issue_criticality'})

In [None]:
# Recode criticality to 'O' (severe) / 'X' (everything else).
# An existing 'O' is kept as 'O', so re-running this cell no longer turns every
# row into 'X' (the original only recognised the raw value 'severe').
issues.issue_criticality = issues.issue_criticality.apply(lambda x: 'O' if x in ('severe', 'O') else 'X')

# Per page: 1 if the summed `issue_exist` flag over its severe issues is non-zero
labels_issues = issues[issues.issue_criticality == 'O'].groupby(['resource_category', 'website_id', 'page_id', 'issue_criticality']).agg({
    'issue_exist': 'sum'
}).reset_index()

labels_issues.issue_exist = labels_issues.issue_exist.apply(lambda x: 0 if x == 0 else 1)

# Number of such pages per category
labels_issues = labels_issues.groupby(['resource_category', 'issue_criticality']).agg({
    'issue_exist': 'sum'
}).reset_index()

# Divide by the number of pages in the category
labels_issues['proportion'] = labels_issues.issue_exist
labels_issues.proportion /= labels_issues.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

# Complement rows ('X'): category total minus the pages counted above
labels_issues_op = labels_issues.copy()
labels_issues_op.issue_criticality = 'X'
labels_issues_op.issue_exist -= labels_issues_op.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)
labels_issues_op.issue_exist *= -1
labels_issues_op['proportion'] = labels_issues_op.issue_exist
labels_issues_op.proportion /= labels_issues_op.resource_category.apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)

labels_issues = pd.concat([labels_issues, labels_issues_op])
labels_issues.issue_criticality = labels_issues.issue_criticality.apply(lambda x: 'Yes' if x == 'O' else 'No')

consistency(labels_issues)

# 'No' rows first, then 'Yes'
labels_issues = labels_issues.sort_values(by='issue_criticality', ascending=True)

labels_issues

In [None]:
def pie(df=None, category=None):
    """Pie chart of the 'Yes'/'No' split of `issue_criticality` for one category.

    Args:
        df: frame with `resource_category`, `issue_criticality` and `proportion` columns.
        category: display label of the category to plot; also the chart title and
            the COLORS key used for the 'Yes' slice.

    Returns:
        A layered Altair chart: the arcs plus a percentage label. The 'No' label
        gets opacity 0, so only the 'Yes' percentage is visible.
    """
    answers = ['Yes', 'No']
    rows = df[df.resource_category == category]

    base = alt.Chart(rows).mark_arc().encode(
        theta=alt.Theta("proportion").stack(True),
        color=alt.Color("issue_criticality", title=None, legend=None).scale(domain=answers, range=[COLORS[category], 'lightgrey']),
    )
    text = base.mark_text(radius=50, size=24).encode(
        text=alt.Text("proportion:Q", format='.1%'),
        color=alt.Color("issue_criticality", title=None, legend=None).scale(domain=answers, range=['white', 'lightgrey']),
        opacity=alt.Opacity("issue_criticality", title=None, legend=None).scale(domain=answers, range=[1, 0]),
    )

    # Independent colour scales: the arcs and the labels use different ranges
    layered = alt.layer(base, text).resolve_scale(color='independent')
    return layered.properties(title={"text": category, "dy": -10})

# One pie per category, side by side, each with its own colour scale
plot = alt.hconcat(
    *(pie(labels_issues, category) for category in ('Data Portals', 'Journal Websites')),
    spacing=50,
).resolve_scale(
    color='independent'
).properties(
    title={"text": "The Proportion of Pages with Critical Issues", "dy": -10}
)

# Save for the manuscript figures and website plots
save_figure(plot, 'issues-criticality', labels_issues)

### Missing Labels

In [None]:
# Per page: 1 if the summed `issue_exist` flag over its missing-label issues ('O') is non-zero
labels_issues = (
    issues[issues.issue_missing_label_related == 'O']
    .groupby(['resource_category', 'website_id', 'page_id', 'issue_missing_label_related'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)
labels_issues['issue_exist'] = labels_issues.issue_exist.apply(lambda count: int(count != 0))

# Number of such pages per category
labels_issues = (
    labels_issues
    .groupby(['resource_category', 'issue_missing_label_related'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)

# Divide by the number of pages in the category
labels_issues['proportion'] = labels_issues.issue_exist / labels_issues.resource_category.apply(
    lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES
)

# Complement rows ('X'): category total minus the pages counted above
labels_issues_op = labels_issues.copy()
labels_issues_op['issue_missing_label_related'] = 'X'
labels_issues_op['issue_exist'] = labels_issues_op.resource_category.apply(
    lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES
) - labels_issues_op.issue_exist
labels_issues_op['proportion'] = labels_issues_op.issue_exist / labels_issues_op.resource_category.apply(
    lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES
)

labels_issues = pd.concat([labels_issues, labels_issues_op])
labels_issues['issue_missing_label_related'] = labels_issues.issue_missing_label_related.apply(
    lambda x: 'Yes' if x == 'O' else 'No'
)

consistency(labels_issues)

# 'No' rows first, then 'Yes'
labels_issues = labels_issues.sort_values(by='issue_missing_label_related', ascending=True)

labels_issues

In [None]:
def pie(df=None, category=None):
    """
    Build a Yes/No pie chart of missing-label issues for one resource category.

    NOTE(review): `pie` is defined more than once in this notebook; the
    definition executed last is the one later cells will call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `resource_category`, `proportion` and
        `issue_missing_label_related` ('Yes'/'No') columns.
    category : str
        Value of `resource_category` to plot; also the key into `COLORS`
        and the chart title.

    Returns
    -------
    A layered Altair chart: the arcs plus their percentage labels.
    """
    flag_domain = ['Yes', 'No']

    # Arcs: 'Yes' uses the category colour, 'No' is light grey.
    arcs = alt.Chart(df[df.resource_category == category]).mark_arc().encode(
        theta=alt.Theta("proportion").stack(True),
        color=alt.Color("issue_missing_label_related", title=None, legend=None).scale(
            domain=flag_domain, range=[COLORS[category], 'lightgrey']
        )
    )

    # Labels: white on the 'Yes' slice, light grey on the (light grey) 'No' slice.
    labels = arcs.mark_text(radius=50, size=24).encode(
        text=alt.Text("proportion:Q", format='.1%'),
        color=alt.Color("issue_missing_label_related", title=None, legend=None).scale(
            domain=flag_domain, range=['white', 'lightgrey']
        )
    )

    # The two layers use different colour ranges, so their colour scales must not be merged.
    layered = alt.layer(arcs, labels).resolve_scale(color='independent')
    return layered.properties(
        title={
            "text": category,
            "dy": -10
        }
    )

# Draw one pie per resource category and place them side by side.
plot = (
    alt.hconcat(
        *(pie(labels_issues, category) for category in ('Data Portals', 'Journal Websites')),
        spacing=50
    )
    # Resolve colour independently so each pie keeps its own colour scale.
    .resolve_scale(color='independent')
    .properties(
        title={
            "text": "The Proportion of Pages with Missing Labels",
            "dy": -10
        }
    )
)

"""
Save for the manuscript figures and website plots
"""
# NOTE(review): 'missle' looks like a typo for 'missing'; the name is kept as-is
# because files already published under it may be referenced elsewhere - confirm before renaming.
save_figure(plot, 'issues-missle-labels', labels_issues)

### WCAG Level

In [None]:
# Sum `issue_exist` per page and WCAG level.
labels_criticality = (
    issues
    .groupby(['resource_category', 'website_id', 'page_id', 'issue_wcag_level'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)

# Collapse the per-page sum to a 0/1 flag: does the page have an issue of that level?
labels_criticality['issue_exist'] = labels_criticality['issue_exist'].apply(lambda n: 1 if n != 0 else 0)

# Number of flagged pages per resource category and WCAG level.
labels_criticality = (
    labels_criticality
    .groupby(['resource_category', 'issue_wcag_level'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)

# Share of flagged pages; every category other than 'data-portal' is divided by NUM_JW_PAGES.
labels_criticality['proportion'] = labels_criticality['issue_exist'] / labels_criticality['resource_category'].apply(
    lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES
)

# Rename the raw category ids to their display names (mutates the frame in place).
consistency(labels_criticality)

labels_criticality

In [None]:
# Bars: share of pages per WCAG level, coloured by resource category.
base = alt.Chart(labels_criticality).mark_bar(size=80).encode(
    x=alt.X("issue_wcag_level", title=None).scale(domain=['A', 'AA']).axis(labelAngle=0),
    y=alt.Y("proportion", title='The proportion of pages').axis(format='%', tickCount=6).scale(domain=[0, 1]),
    color=alt.Color("resource_category", title=None, legend=None).scale(
        domain=list(COLORS), range=list(COLORS.values())
    )
).properties(height=300, width=400)

# Labels: the percentage in white, shifted down by `dy`.
text = base.mark_text(dy=20, size=24).encode(
    text=alt.Text("proportion", format='.1%'),
    color=alt.value('white')
)

# `base + text` layers the two charts; one facet column per resource category.
plot = (base + text).facet(
    column=alt.Column("resource_category", title=None),
    spacing=0
).properties(
    title={
        "text": 'The Proportion of Pages by WCAG Levels',
        "dy": -10
    }
)

"""
Save for the manuscript figures and website plots
"""
save_figure(plot, 'issues-wcag-levels', labels_criticality)

### Difficulty To Fix

In [None]:
# Sum `issue_exist` per page and difficulty-to-fix class.
# NOTE(review): the name `labels_criticality` is reused from the WCAG section above.
labels_criticality = (
    issues
    .groupby(['resource_category', 'website_id', 'page_id', 'issue_difficulty_to_fix'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)

# Collapse the per-page sum to a 0/1 flag: does the page have an issue of that class?
labels_criticality['issue_exist'] = labels_criticality['issue_exist'].apply(lambda n: 1 if n != 0 else 0)

# Number of flagged pages per resource category and difficulty class.
labels_criticality = (
    labels_criticality
    .groupby(['resource_category', 'issue_difficulty_to_fix'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)

# Share of flagged pages; every category other than 'data-portal' is divided by NUM_JW_PAGES.
labels_criticality['proportion'] = labels_criticality['issue_exist'] / labels_criticality['resource_category'].apply(
    lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES
)

# Rename the raw category ids to their display names (mutates the frame in place).
consistency(labels_criticality)

labels_criticality

In [None]:
# Bars: share of pages per difficulty class, coloured by resource category.
base = alt.Chart(labels_criticality).mark_bar(size=80).encode(
    x=alt.X("issue_difficulty_to_fix", title=None).scale(domain=['Difficult', 'Moderate', 'Easy']).axis(labelAngle=0),
    y=alt.Y("proportion", title='The proportion of pages').axis(format='%', tickCount=6).scale(domain=[0, 1]),
    color=alt.Color("resource_category", title=None, legend=None).scale(
        domain=list(COLORS), range=list(COLORS.values())
    )
).properties(height=300, width=400)

# Labels: the percentage in white, shifted down by `dy`.
text = base.mark_text(dy=20, size=24).encode(
    text=alt.Text("proportion", format='.1%'),
    color=alt.value('white')
)

# `base + text` layers the two charts; one facet column per resource category.
plot = (base + text).facet(
    column=alt.Column("resource_category", title=None),
    spacing=0
).properties(
    title={
        "text": 'The Proportion of Pages by Difficulty to Fix in Post-deployment',
        "dy": -10
    }
)

"""
Save for the manuscript figures and website plots
"""
save_figure(plot, 'issues-difficulty-to-fix', labels_criticality)

### Data-related Issues

In [None]:
# Sum `issue_exist` per page, keeping only issues flagged ('O') as data related.
data_issues = (
    issues[issues.issue_data_related == 'O']
    .groupby(['resource_category', 'website_id', 'page_id', 'issue_data_related'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)

# Collapse the per-page sum to a 0/1 flag: does the page have at least one such issue?
data_issues['issue_exist'] = data_issues['issue_exist'].apply(lambda n: 1 if n != 0 else 0)

# Number of flagged pages per resource category.
data_issues = (
    data_issues
    .groupby(['resource_category', 'issue_data_related'])
    .agg({'issue_exist': 'sum'})
    .reset_index()
)

# Share of flagged pages; every category other than 'data-portal' is divided by NUM_JW_PAGES.
data_issues['proportion'] = data_issues['issue_exist'] / data_issues['resource_category'].apply(
    lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES
)

# Complementary rows ('X'): the pages of each category that are NOT flagged.
labels_data_op = data_issues.copy()
labels_data_op['issue_data_related'] = 'X'
labels_data_op['issue_exist'] = (
    labels_data_op['issue_exist']
    - labels_data_op['resource_category'].apply(lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES)
) * -1
labels_data_op['proportion'] = labels_data_op['issue_exist'] / labels_data_op['resource_category'].apply(
    lambda x: NUM_DP_PAGES if x == 'data-portal' else NUM_JW_PAGES
)

# Stack both halves and switch to the 'Yes'/'No' labels used by the chart.
data_issues = pd.concat([data_issues, labels_data_op])
data_issues['issue_data_related'] = data_issues['issue_data_related'].apply(
    lambda flag: 'Yes' if flag == 'O' else 'No'
)

# Rename the raw category ids to their display names (mutates the frame in place).
consistency(data_issues)

data_issues = data_issues.sort_values(by='issue_data_related', ascending=True)

data_issues

In [None]:
def pie(df=None, category=None):
    """
    Build a Yes/No pie chart of data-related issues for one resource category.

    NOTE(review): `pie` is defined more than once in this notebook; the
    definition executed last is the one later cells will call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `resource_category`, `proportion` and
        `issue_data_related` ('Yes'/'No') columns.
    category : str
        Value of `resource_category` to plot; also the key into `COLORS`
        and the chart title.

    Returns
    -------
    A layered Altair chart: the arcs plus their percentage labels.
    """
    flag_domain = ['Yes', 'No']

    # Arcs: 'Yes' uses the category colour, 'No' is light grey.
    arcs = alt.Chart(df[df.resource_category == category]).mark_arc().encode(
        theta=alt.Theta("proportion").stack(True),
        color=alt.Color("issue_data_related", title=None, legend=None).scale(
            domain=flag_domain, range=[COLORS[category], 'lightgrey']
        )
    )

    # Labels: only the 'Yes' percentage is visible; opacity 0 hides the 'No' label.
    labels = arcs.mark_text(radius=50, size=24).encode(
        text=alt.Text("proportion:Q", format='.1%'),
        color=alt.Color("issue_data_related", title=None, legend=None).scale(
            domain=flag_domain, range=['white', 'lightgrey']
        ),
        opacity=alt.Opacity("issue_data_related", title=None, legend=None).scale(
            domain=flag_domain, range=[1, 0]
        )
    )

    # The two layers use different colour ranges, so their colour scales must not be merged.
    layered = alt.layer(arcs, labels).resolve_scale(color='independent')
    return layered.properties(
        title={
            "text": category,
            "dy": -10
        }
    )

# Draw one pie per resource category and place them side by side.
plot = (
    alt.hconcat(
        *(pie(data_issues, category) for category in ('Data Portals', 'Journal Websites')),
        spacing=50
    )
    # Resolve colour independently so each pie keeps its own colour scale.
    .resolve_scale(color='independent')
    .properties(
        title={
            "text": "The Proportion of Pages with Data-related Issues",
            "dy": -10
        }
    )
)

"""
Save for the manuscript figures and website plots
"""
save_figure(plot, 'issues-data-related', data_issues)

## Deprecated Below

In [None]:
# Columns that identify one page.
PAGE_COLUMNS = [
    'resource_category',
    'website_id',
    'page_id',
    'page_type',
]

# Issue columns used as the merge key with the additional-labels table.
ISSUE_ORIGINAL_COLUMNS = [
    'issue_id',
    'issue_desc',
    'issue_impact',
    'issue_help',
    'issue_url',
]

# All issue columns: the original ones followed by the extra label columns.
ISSUE_COLUMNS = ISSUE_ORIGINAL_COLUMNS + [
    'issue_name',
    'issue_filter',
    'issue_overall_impact',
    'issue_note_overall_impact_hdv',
    'issue_severity',
    'issue_data_related',
    'issue_data_related_rule',
    'issue_pour_category',
    'issue_wcag_level',
    'issue_difficulty_to_fix',
    'issue_missing_label_related',
]

## Aggregate Data By Unique Website

In [None]:
"""
Merge Accessibility Status, Metadata, and Detailed Issues
"""
# Metadata tables; `page_id` is read as a string, and each table is tagged with its category.
data_portal_metadata = pd.read_csv(
    f'../data/{TIME_STAMP_FOLDER_NAME}/input/data-portal/database-commons.csv',
    dtype={"page_id": "string"}
).assign(resource_category='data-portal')
journal_metadata = pd.read_csv(
    f'../data/{TIME_STAMP_FOLDER_NAME}/input/journal/sjr2022.csv',
    dtype={"page_id": "string"}
).assign(resource_category='journal')

# Accessibility results and the per-issue additional labels.
reports = pd.read_csv(
    f"../data/{TIME_STAMP_FOLDER_NAME}/results/accessibility-status.csv",
    dtype={"page_id": "string"}
)
issue = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/unique-issues-additional-labels-aug-9-2024.csv')

# Left-join so every result row is kept: page metadata first, then the issue labels.
reports = (
    reports
    .merge(data_portal_metadata, how='left', on=PAGE_COLUMNS)
    .merge(journal_metadata, how='left', on=PAGE_COLUMNS)
    .merge(issue, how='left', on=ISSUE_ORIGINAL_COLUMNS)
)

reports.head(3)

In [None]:
"""
Store the column names for metadata of pages
"""
# Every column that is neither a page key, an issue column nor a count/rate;
# 'resource_category' is excluded via PAGE_COLUMNS and then appended at the end.
PAGE_METADATA_COLUMNS = [
    column
    for column in reports.columns
    if column not in {*PAGE_COLUMNS, *ISSUE_COLUMNS, 'violations', 'passes', 'total_checks', 'failure_rate'}
] + ['resource_category']
# PAGE_METADATA_COLUMNS

In [None]:
"""
Group by Page
"""
# Sum the check counts over all rows sharing the same page metadata;
# dropna=False keeps groups whose metadata contains missing values.
reports_aggregated = (
    reports
    .groupby(PAGE_METADATA_COLUMNS, dropna=False)[['violations', 'passes', 'total_checks']]
    .sum()
    .reset_index()
)

In [None]:
"""
Failure Rate
"""
# Fraction of checks that are violations.
reports_aggregated['failure_rate'] = reports_aggregated['violations'].div(reports_aggregated['total_checks'])

In [None]:
# Number of distinct website ids per resource category.
NUM_DATA_PORTALS = len(reports.loc[reports.resource_category == 'data-portal', 'website_id'].unique())
NUM_JOURNALS = len(reports.loc[reports.resource_category == 'journal', 'website_id'].unique())
(NUM_DATA_PORTALS, NUM_JOURNALS)

In [None]:
# Inspect the column names of the page-level aggregate.
reports_aggregated.columns

## Visualize

In [None]:
# COLORS = {
#     'data-portal': '#56B4E9',
#     'journal': '#CC79A7',
#     'government': '#009E73'
# }
# # reports_aggregated

In [None]:
def histogram(df=None, resource_category=None):
    """
    Histogram of page failure rates for one resource category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `resource_category` and `failure_rate` columns.
    resource_category : str
        Raw category id used to filter `df` ('data-portal', 'journal' or
        'government'); also shown, title-cased, as the chart title.

    Returns
    -------
    An Altair bar chart counting webpages per failure-rate bin.

    Raises
    ------
    KeyError
        If no colour can be found for `resource_category`.
    """
    # COLORS is keyed by display name, while the frames (and the callers) use
    # raw ids. Indexing COLORS with the raw id raised KeyError, so translate
    # the id first; a COLORS dict that is keyed by raw id still works.
    display_names = {
        'data-portal': 'Data Portals',
        'journal': 'Journal Websites',
        'government': 'US Government Websites',
    }
    if resource_category in COLORS:
        bar_color = COLORS[resource_category]
    else:
        bar_color = COLORS[display_names[resource_category]]

    df_copy = df[df.resource_category == resource_category].copy()

    return (
        alt.Chart(
            df_copy
        ).mark_bar(
            color=bar_color
        ).encode(
            # Bins are computed over [0, 1] but the axis is clamped to [0, 0.5].
            alt.X('failure_rate:Q', title='Failure rate').bin(extent=[0, 1], step=0.008).scale(domain=[0, 0.5], clamp=True).axis(format='%', zindex=10, tickCount=10),
            alt.Y('count()', title="The number of webpages").scale(type='linear').axis(tickCount=5),
        ).properties(
            title={
                "text": resource_category.replace('-', ' ').title(),
                "fontWeight": 600,
                "color": "black"
            },
            height=300,
            width=400
        )
    )

# baseline = (
#     _.mark_rule(
#         color='black',
#         size=2,
#         # size=500 / len(COUNTRY_SORT),
#         strokeDash=[4, 4]
#     ).encode(
#         alt.X(f'baseline:Q', title='Failure rate'),
#         y=alt.Y()
#     ).transform_calculate(
#         baseline=f"{US_GOV_FR_MEAN}"
#     )
# )
    
# _ = _ + baseline

#     plot = _ if plot is None else plot | _

# One failure-rate histogram per resource category, side by side.
plot = alt.hconcat(
    *(histogram(df_pages, category) for category in ('data-portal', 'journal', 'government'))
)

plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/histogram.png')
plot

In [None]:
# Raise the float display precision to inspect one specific failure rate.
# NOTE: set_option is global, so it also affects every cell rendered after this one.
pd.set_option("display.precision", 100)
reports_aggregated.loc[
    lambda d: (d.failure_rate > 0.0363636) & (d.failure_rate < 0.0363637)
]
# reports_aggregated

In [None]:
# Look up all rows recorded for this page URL.
reports.loc[lambda d: d.page_url == 'http://n.neurology.org/']

In [None]:
# Look up all rows recorded for this page URL.
reports.loc[lambda d: d.page_url == 'http://arjournals.annualreviews.org/loi/ecolsys']

In [None]:
# Show the merged `reports` frame for inspection.
reports

## Aggregate Data By Unique Issue

In [None]:
# 1 when the row has at least one violation, otherwise 0 (adds a column to `reports`).
reports['issue_exist'] = reports.violations.apply(lambda x: 0 if x <= 0 else 1)

# One row per (resource category, issue); dropna=False keeps issues with missing labels.
reports_by_issues = reports.groupby(
    ['resource_category'] + ISSUE_COLUMNS,
    dropna=False
).agg({
    # sorted(): a bare set has a different string order from one kernel to the
    # next, which made the joined values non-reproducible.
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    # 'page_url': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

reports_by_issues['failure_rate'] = reports_by_issues.violations / reports_by_issues.total_checks
# `page_url` holds a count after the aggregation, so name it accordingly.
reports_by_issues = reports_by_issues.rename(columns={'page_url': 'page_count'})
# NOTE(review): the numerator counts rows with violations while the denominator is the
# number of distinct websites; confirm there is exactly one row per website and issue.
reports_by_issues['page_proportion'] = reports_by_issues.issue_exist / reports_by_issues.resource_category.apply(
    lambda x: NUM_DATA_PORTALS if x == 'data-portal' else NUM_JOURNALS
)

# grouped.to_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/reports/report.csv', index=False)
reports_by_issues.head(3)

In [None]:
# Horizontal bars: share of data-portal pages affected by each issue, most frequent first.
alt.Chart(reports_by_issues[reports_by_issues.resource_category == 'data-portal']).mark_bar(
    # COLORS is keyed by display name; the raw id 'data-portal' raised KeyError.
    # A COLORS dict keyed by raw id is still honoured.
    color=COLORS['data-portal'] if 'data-portal' in COLORS else COLORS['Data Portals']
).encode(
    alt.X('page_proportion:Q', title='Proportion of webpages with issues').axis(format='%', orient='top'),
    alt.Y('issue_name:N', sort='-x', title=None).axis(titlePadding=40),
    # The colour encoding takes precedence over the mark colour set above.
    alt.Color('issue_overall_impact:N').scale(domain=['critical', 'moderate', 'minor'], range=['#d95f02', '#E69F00', 'grey']),
    # alt.Color('resource_category:N').scale(domain=list(COLORS.keys()), range=list(COLORS.values())),
    # alt.Column('resource_category:N')
).properties(
    height=1600,
    width=600
)

In [None]:
import pygwalker as pyg

In [None]:
# Explore the data-portal rows of the per-issue table with pygwalker.
pyg.walk(reports_by_issues.loc[reports_by_issues.resource_category == 'data-portal'])

## Statistics

In [None]:
# Explicit copy: assigning through `.loc` into a filtered slice without
# `.copy()` triggers pandas' SettingWithCopyWarning.
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id of rows without violations so that only websites with
# at least one violation are counted below.
dp.loc[dp.violations == 0, 'website_id'] = ''

# One row per overall-impact level; dropna=False keeps rows without a level.
ioi = dp.groupby(
    'issue_overall_impact',
    dropna=False
).agg({
    # ',,,' separates the ids so they can be split apart again below;
    # sorted() keeps the joined strings identical between kernel runs.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the distinct non-empty website ids. The former `len(split) - 1` assumed
# the '' placeholder was always present and under-counted by one for groups in
# which every row had a violation.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# List the column names of the data-portal subset.
list(dp.columns)

In [None]:
# Explicit copy: assigning through `.loc` into a filtered slice without
# `.copy()` triggers pandas' SettingWithCopyWarning.
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id of rows without violations so that only websites with
# at least one violation are counted below.
dp.loc[dp.violations == 0, 'website_id'] = ''

# One row per missing-label flag; dropna=False keeps rows without a flag.
ioi = dp.groupby(
    'issue_missing_label_related',
    dropna=False
).agg({
    # ',,,' separates the ids so they can be split apart again below;
    # sorted() keeps the joined strings identical between kernel runs.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the distinct non-empty website ids. The former `len(split) - 1` assumed
# the '' placeholder was always present and under-counted by one for groups in
# which every row had a violation.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# Explicit copy: assigning through `.loc` into a filtered slice without
# `.copy()` triggers pandas' SettingWithCopyWarning.
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id of rows without violations so that only websites with
# at least one violation are counted below.
dp.loc[dp.violations == 0, 'website_id'] = ''

# One row per severity value; dropna=False keeps rows without a severity.
ioi = dp.groupby(
    'issue_severity',
    dropna=False
).agg({
    # ',,,' separates the ids so they can be split apart again below;
    # sorted() keeps the joined strings identical between kernel runs.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the distinct non-empty website ids. The former `len(split) - 1` assumed
# the '' placeholder was always present and under-counted by one for groups in
# which every row had a violation.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# Explicit copy: assigning through `.loc` into a filtered slice without
# `.copy()` triggers pandas' SettingWithCopyWarning.
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id of rows without violations so that only websites with
# at least one violation are counted below.
dp.loc[dp.violations == 0, 'website_id'] = ''

# One row per data-related flag; dropna=False keeps rows without a flag.
ioi = dp.groupby(
    'issue_data_related',
    dropna=False
).agg({
    # ',,,' separates the ids so they can be split apart again below;
    # sorted() keeps the joined strings identical between kernel runs.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the distinct non-empty website ids. The former `len(split) - 1` assumed
# the '' placeholder was always present and under-counted by one for groups in
# which every row had a violation.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# Explicit copy: assigning through `.loc` into a filtered slice without
# `.copy()` triggers pandas' SettingWithCopyWarning.
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id of rows without violations so that only websites with
# at least one violation are counted below.
dp.loc[dp.violations == 0, 'website_id'] = ''

# One row per WCAG level; dropna=False keeps rows without a level.
ioi = dp.groupby(
    'issue_wcag_level',
    dropna=False
).agg({
    # ',,,' separates the ids so they can be split apart again below;
    # sorted() keeps the joined strings identical between kernel runs.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the distinct non-empty website ids. The former `len(split) - 1` assumed
# the '' placeholder was always present and under-counted by one for groups in
# which every row had a violation.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# Explicit copy: assigning through `.loc` into a filtered slice without
# `.copy()` triggers pandas' SettingWithCopyWarning.
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id of rows without violations so that only websites with
# at least one violation are counted below.
dp.loc[dp.violations == 0, 'website_id'] = ''

# One row per difficulty class; dropna=False keeps rows without a class.
ioi = dp.groupby(
    'issue_difficulty_to_fix',
    dropna=False
).agg({
    # ',,,' separates the ids so they can be split apart again below;
    # sorted() keeps the joined strings identical between kernel runs.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the distinct non-empty website ids. The former `len(split) - 1` assumed
# the '' placeholder was always present and under-counted by one for groups in
# which every row had a violation.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')