# Accessibility Report

In [None]:
import pandas as pd
import altair as alt
# Route the data of every chart below through the "vegafusion" data transformer.
# NOTE(review): presumably enabled so large frames are not rejected by Altair's
# default row limit — confirm against the installed Altair/VegaFusion versions.
alt.data_transformers.enable("vegafusion")

In [None]:
# Define custom theme to be applied to all plots
def theme():
    """Return the chart theme shared by every plot in this notebook.

    The result is a fresh dict with a single ``"config"`` key whose sections
    (title, view, header, axis, legend) hold the styling options.
    """
    # Chart titles: centred, regular-weight Arial with a grey subtitle.
    title_config = {
        "dy": 1,
        "fontFamily": "Arial",
        "fontSize": 22,
        "fontWeight": 400,
        "align": "center",
        "anchor": "middle",
        "subtitleColor": "grey",
        "subtitleFontSize": 22,
    }

    # Plot area background.
    view_config = {"fill": "#FCFCFC"}

    # Facet headers.
    header_config = {
        "titleFontSize": 30,
        "labelFontSize": 30,
        "labelFontWeight": 300,
    }

    # Axes: the domain line is switched off and ticks are drawn in white.
    axis_config = {
        "fontFamily": "SF Pro Text",
        "titleFontSize": 23,
        "titleFontWeight": 400,
        "labelFontSize": 20,
        "labelFontWeight": 400,
        "labelLimit": 1000,
        "domainWidth": 1.5,
        "domainColor": "black",
        "tickColor": "white",
        "domain": False,
    }

    # Legends share the axis font family and title sizing.
    legend_config = {
        "fontFamily": "SF Pro Text",
        "titleFontSize": 23,
        "titleFontWeight": 400,
        "labelFontSize": 23,
        "labelLimit": 1000,
        "strokeColor": '#F4F6F7',
        "padding": 15,
    }

    return {
        "config": {
            "title": title_config,
            "view": view_config,
            "header": header_config,
            "axis": axis_config,
            "legend": legend_config,
        }
    }

# Register the theme() callable under the name "theme", then make it the active theme.
# NOTE(review): recent Altair releases deprecate `alt.themes` in favour of
# `alt.theme` — confirm against the Altair version this notebook is pinned to.
alt.themes.register("theme", theme)
alt.themes.enable("theme")

In [None]:
"""
The name of the folder you created under `../data/`
"""
# Interpolated into every `../data/<folder>/...` path read by the cells below.
# NOTE(review): the value looks like a date, but its MM-DD vs DD-MM order is not
# stated anywhere in this notebook — confirm.
TIME_STAMP_FOLDER_NAME = '08-01-2024'

In [None]:
# Columns used as the join key between the report rows and the page metadata.
PAGE_COLUMNS = [
    'resource_category',
    'website_id',
    'page_id',
    'page_type',
]

# Issue columns used as the join key when the extra issue labels are merged in.
ISSUE_ORIGINAL_COLUMNS = [
    'issue_id',
    'issue_desc',
    'issue_impact',
    'issue_help',
    'issue_url',
]

# Every issue-level column: the join-key columns followed by the additional labels.
ISSUE_COLUMNS = ISSUE_ORIGINAL_COLUMNS + [
    'issue_name',
    'issue_filter',
    'issue_overall_impact',
    'issue_note_overall_impact_hdv',
    'issue_severity',
    'issue_data_related',
    'issue_data_related_rule',
    'issue_pour_category',
    'issue_wcag_level',
    'issue_difficulty_to_fix',
    'issue_missing_label_related',
]

## Aggregate Data By Unique Website

In [None]:
"""
Merge Accessibility Status, Metadata, and Detailed Issues
"""
# All inputs live under the time-stamped data folder.
_data_dir = f'../data/{TIME_STAMP_FOLDER_NAME}'
# page_id is read as text in every table that is joined on it.
_page_id_as_text = {"page_id": "string"}

data_portal_metadata = pd.read_csv(
    f'{_data_dir}/input/data-portal/database-commons.csv',
    dtype=_page_id_as_text,
)
journal_metadata = pd.read_csv(
    f'{_data_dir}/input/journal/sjr2022.csv',
    dtype=_page_id_as_text,
)
reports = pd.read_csv(
    f"{_data_dir}/results/accessibility-status.csv",
    dtype=_page_id_as_text,
)
issue = pd.read_csv(f'{_data_dir}/results/unique-issues-additional-labels-aug-9-2024.csv')

# Tag each metadata table with its category; resource_category is part of the
# page join key, so each table only matches report rows of its own category.
data_portal_metadata['resource_category'] = 'data-portal'
journal_metadata['resource_category'] = 'journal'

# Left joins keep every report row: page metadata first, then the issue labels.
reports = (
    reports
    .merge(data_portal_metadata, how='left', on=PAGE_COLUMNS)
    .merge(journal_metadata, how='left', on=PAGE_COLUMNS)
    .merge(issue, how='left', on=ISSUE_ORIGINAL_COLUMNS)
)

reports.head(3)

In [None]:
"""
Store the column names for metadata of pages
"""
# Everything that is not a page key, an issue column or a check counter is
# treated as page metadata; resource_category is then appended as a final key.
_non_metadata_columns = set(PAGE_COLUMNS) | set(ISSUE_COLUMNS) | {'violations', 'passes', 'total_checks', 'failure_rate'}
PAGE_METADATA_COLUMNS = [name for name in reports.columns if name not in _non_metadata_columns]
PAGE_METADATA_COLUMNS.append('resource_category')
# PAGE_METADATA_COLUMNS

In [None]:
"""
Group by Page
"""
# Sum the check counters over all rows that share the same page metadata.
# dropna=False keeps groups whose metadata contains missing values.
reports_aggregated = (
    reports
    .groupby(PAGE_METADATA_COLUMNS, dropna=False)
    .agg(dict.fromkeys(['violations', 'passes', 'total_checks'], 'sum'))
    .reset_index()
)

In [None]:
"""
Failure Rate
"""
# Share of all checks that ended in a violation, per aggregated row.
reports_aggregated['failure_rate'] = reports_aggregated['violations'].div(reports_aggregated['total_checks'])

In [None]:
# Number of distinct website ids per resource category; dropna=False counts a
# missing id as one value, exactly as taking len() of unique() does.
NUM_DATA_PORTALS = reports.loc[reports['resource_category'] == 'data-portal', 'website_id'].nunique(dropna=False)
NUM_JOURNALS = reports.loc[reports['resource_category'] == 'journal', 'website_id'].nunique(dropna=False)
(NUM_DATA_PORTALS, NUM_JOURNALS)

In [None]:
# Inspect which columns survived the aggregation (grouping keys, summed counters, failure_rate).
reports_aggregated.columns

## Visualize

In [None]:
# Fill colour per series key; histogram() looks colours up by resource category.
COLORS = {
    "data-portal": "#56B4E9",
    "journal": "#CC79A7",
    "g": "#009E73",
}
# reports_aggregated

In [None]:
def histogram(df=None, resource_category=None):
    """Build a failure-rate histogram for one resource category.

    Args:
        df: DataFrame with at least ``resource_category`` and ``failure_rate``
            columns; only rows matching ``resource_category`` are plotted.
        resource_category: Category to plot. It must be a key of ``COLORS``
            (it selects the bar colour) and is also turned into the chart
            title (hyphens replaced by spaces, title-cased).

    Returns:
        An ``alt.Chart`` bar chart: ``failure_rate`` binned in steps of 0.008
        on x, row count on y.

    Raises:
        ValueError: If ``df`` or ``resource_category`` is omitted.
        KeyError: If ``resource_category`` has no entry in ``COLORS``.
    """
    # Both parameters default to None only for keyword-style calls; fail with a
    # clear message instead of an opaque "NoneType is not subscriptable".
    if df is None or resource_category is None:
        raise ValueError("histogram() requires both `df` and `resource_category`")

    # Copy so the chart owns its data, independent of the caller's frame.
    subset = df[df.resource_category == resource_category].copy()

    failure_rate_axis = (
        alt.X('failure_rate:Q', title='Failure rate')
        .bin(extent=[0, 1], step=0.008)
        # Only the 0-50% range is shown; values outside it are clamped to the domain.
        .scale(domain=[0, 0.5], clamp=True)
        .axis(format='%', zindex=10, tickCount=10)
    )
    page_count_axis = (
        alt.Y('count()', title="The number of webpages")
        .scale(type='linear')
        .axis(tickCount=5)
    )

    # The former `jitter` calculate-transform was dropped: no encoding consumed
    # the field, so it only added per-row work to the chart specification.
    return (
        alt.Chart(subset)
        .mark_bar(color=COLORS[resource_category])
        .encode(failure_rate_axis, page_count_axis)
        .properties(
            title={
                "text": resource_category.replace('-', ' ').title(),
                "fontWeight": 600,
                "color": "black"
            },
            height=300,
            width=400
        )
    )

# baseline = (
#     _.mark_rule(
#         color='black',
#         size=2,
#         # size=500 / len(COUNTRY_SORT),
#         strokeDash=[4, 4]
#     ).encode(
#         alt.X(f'baseline:Q', title='Failure rate'),
#         y=alt.Y()
#     ).transform_calculate(
#         baseline=f"{US_GOV_FR_MEAN}"
#     )
# )
    
# _ = _ + baseline

#     plot = _ if plot is None else plot | _

# Side-by-side histograms, one per resource category, data portals first.
plot = alt.hconcat(*(
    histogram(reports_aggregated, category)
    for category in ('data-portal', 'journal')
))

# plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/histogram.png')
plot

In [None]:
# Show floats at maximum precision so near-identical failure rates can be told apart.
# This changes the display setting for the rest of the session.
pd.options.display.precision = 100
# Rows whose failure rate lies strictly between the two bounds.
reports_aggregated.loc[reports_aggregated['failure_rate'].between(0.0363636, 0.0363637, inclusive='neither')]
# reports_aggregated

In [None]:
# Spot-check: every report row recorded for this page URL.
reports.loc[reports['page_url'] == 'http://n.neurology.org/']

In [None]:
# Spot-check: every report row recorded for this page URL.
reports.loc[reports['page_url'] == 'http://arjournals.annualreviews.org/loi/ecolsys']

In [None]:
# Display the merged report table (one row per page and issue, with metadata and labels).
reports

## Aggregate Data By Unique Issue

In [None]:
# Flag each report row: 0 when it has no violations (violations <= 0), else 1.
# The column is added to `reports` itself and reused by the Statistics cells.
reports['issue_exist'] = reports.violations.apply(lambda x: 0 if x <= 0 else 1)

# One row per (resource category, issue); dropna=False keeps issues whose
# label columns are missing.
reports_by_issues = reports.groupby(
    ['resource_category'] + ISSUE_COLUMNS,
    dropna=False
).agg({
    # sorted() makes the joined strings reproducible: plain set iteration order
    # for strings can differ from one Python session to the next.
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    # 'page_url': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

reports_by_issues['failure_rate'] = reports_by_issues.violations / reports_by_issues.total_checks
# The counted page_url column is really the number of rows with a URL.
reports_by_issues = reports_by_issues.rename(columns={'page_url': 'page_count'})

# Rows with the issue, divided by the number of websites in the category
# (anything that is not 'data-portal' is divided by NUM_JOURNALS).
# NOTE(review): the numerator counts report rows while the denominator counts
# distinct website ids — confirm each website contributes one page per issue,
# otherwise this ratio can exceed 1.
reports_by_issues['page_proportion'] = (
    reports_by_issues.issue_exist
    / reports_by_issues.resource_category.apply(lambda x: NUM_DATA_PORTALS if x == 'data-portal' else NUM_JOURNALS)
)

# grouped.to_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/reports/report.csv', index=False)
reports_by_issues.head(3)

In [None]:
# Horizontal bars for data portals: one bar per issue, longest first, coloured
# by the issue's overall impact.
# NOTE(review): only 'critical', 'moderate' and 'minor' are in the colour
# domain — confirm no other impact value occurs in the data.
alt.Chart(
    reports_by_issues.loc[reports_by_issues['resource_category'] == 'data-portal']
).mark_bar(
    color=COLORS['data-portal']
).encode(
    x=alt.X('page_proportion:Q', title='Proportion of webpages with issues').axis(format='%', orient='top'),
    y=alt.Y('issue_name:N', sort='-x', title=None).axis(titlePadding=40),
    color=alt.Color('issue_overall_impact:N').scale(
        domain=['critical', 'moderate', 'minor'],
        range=['#d95f02', '#E69F00', 'grey'],
    ),
    # alt.Color('resource_category:N').scale(domain=list(COLORS.keys()), range=list(COLORS.values())),
    # alt.Column('resource_category:N')
).properties(height=1600, width=600)

In [None]:
import pygwalker as pyg

In [None]:
# Open the data-portal rows of the per-issue table in the interactive PyGWalker explorer.
pyg.walk(reports_by_issues.loc[reports_by_issues['resource_category'] == 'data-portal'])

## Statistics

In [None]:
# Data-portal summary per `issue_overall_impact` value.
# Requires the `issue_exist` column added to `reports` by the per-issue cell.
# Take an explicit copy: the assignment below must not target a slice of
# `reports` (that triggers SettingWithCopyWarning).
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id on rows without violations so those rows do not add a
# website to the tally below.
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_overall_impact',
    dropna=False
).agg({
    # sorted() keeps the joined strings reproducible between sessions.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the non-empty ids. The previous `len(split) - 1` assumed the ''
# placeholder was always present and undercounted by one for groups in which
# every row had violations.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
# Share of data-portal websites with at least one violation in the group.
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# List the columns available on the data-portal subset for the grouped summaries in this section.
dp.columns.tolist()

In [None]:
# Data-portal summary per `issue_missing_label_related` value.
# Requires the `issue_exist` column added to `reports` by the per-issue cell.
# Take an explicit copy: the assignment below must not target a slice of
# `reports` (that triggers SettingWithCopyWarning).
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id on rows without violations so those rows do not add a
# website to the tally below.
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_missing_label_related',
    dropna=False
).agg({
    # sorted() keeps the joined strings reproducible between sessions.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the non-empty ids. The previous `len(split) - 1` assumed the ''
# placeholder was always present and undercounted by one for groups in which
# every row had violations.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
# Share of data-portal websites with at least one violation in the group.
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# Data-portal summary per `issue_severity` value.
# Requires the `issue_exist` column added to `reports` by the per-issue cell.
# Take an explicit copy: the assignment below must not target a slice of
# `reports` (that triggers SettingWithCopyWarning).
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id on rows without violations so those rows do not add a
# website to the tally below.
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_severity',
    dropna=False
).agg({
    # sorted() keeps the joined strings reproducible between sessions.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the non-empty ids. The previous `len(split) - 1` assumed the ''
# placeholder was always present and undercounted by one for groups in which
# every row had violations.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
# Share of data-portal websites with at least one violation in the group.
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# Data-portal summary per `issue_data_related` value.
# Requires the `issue_exist` column added to `reports` by the per-issue cell.
# Take an explicit copy: the assignment below must not target a slice of
# `reports` (that triggers SettingWithCopyWarning).
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id on rows without violations so those rows do not add a
# website to the tally below.
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_data_related',
    dropna=False
).agg({
    # sorted() keeps the joined strings reproducible between sessions.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the non-empty ids. The previous `len(split) - 1` assumed the ''
# placeholder was always present and undercounted by one for groups in which
# every row had violations.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
# Share of data-portal websites with at least one violation in the group.
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# Data-portal summary per `issue_wcag_level` value.
# Requires the `issue_exist` column added to `reports` by the per-issue cell.
# Take an explicit copy: the assignment below must not target a slice of
# `reports` (that triggers SettingWithCopyWarning).
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id on rows without violations so those rows do not add a
# website to the tally below.
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_wcag_level',
    dropna=False
).agg({
    # sorted() keeps the joined strings reproducible between sessions.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the non-empty ids. The previous `len(split) - 1` assumed the ''
# placeholder was always present and undercounted by one for groups in which
# every row had violations.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
# Share of data-portal websites with at least one violation in the group.
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
# Data-portal summary per `issue_difficulty_to_fix` value.
# Requires the `issue_exist` column added to `reports` by the per-issue cell.
# Take an explicit copy: the assignment below must not target a slice of
# `reports` (that triggers SettingWithCopyWarning).
dp = reports[reports.resource_category == 'data-portal'].copy()
# Blank the website id on rows without violations so those rows do not add a
# website to the tally below.
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_difficulty_to_fix',
    dropna=False
).agg({
    # sorted() keeps the joined strings reproducible between sessions.
    'website_id': lambda x: ',,,'.join(sorted(set(x))),
    'page_type': lambda x: ','.join(sorted(set(x))),
    'page_id': lambda x: ','.join(sorted(set(x))),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

# Count the non-empty ids. The previous `len(split) - 1` assumed the ''
# placeholder was always present and undercounted by one for groups in which
# every row had violations.
ioi['websites'] = ioi.website_id.apply(lambda x: sum(1 for w in x.split(',,,') if w))
# Share of data-portal websites with at least one violation in the group.
ioi['page_proportion'] = ioi.websites / NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')