# Accessibility Report (WIP)

In [1]:
import pandas as pd
import altair as alt
# alt.data_transformers.enable("vegafusion")

In [2]:
# Define custom theme to be applied to all plots
def theme():
    return {
        "config": {
            "title": {
                "dy": 1,
                "fontSize": 22,
                "fontWeight": 400,
                "align": "center",
                "anchor": "middle",
                "subtitleColor": "grey",
                "subtitleFontSize": 22
            },
            "view": {
                "fill": "#FCFCFC",
            },
            "header": {
                "titleFontSize": 30,
                "labelFontSize": 30,
                "labelFontWeight": 300,
            },
            "axis": {
                "titleFontSize": 23,
                "titleFontWeight": 400,
                "labelFontSize": 20,
                "labelFontWeight": 400,
                "labelLimit": 1000,
                "domainWidth": 1.5,
                "domainColor": "black",
                "tickColor": "white",
                "domain": False
            },
            "axisX": {
                "domain": True
            },
            "legend": {
                "titleFontSize": 23,
                "titleFontWeight": 400,
                "labelFontSize": 23,
                "labelLimit": 1000,
                "strokeColor": '#F4F6F7',
                "padding": 15
            }
        }
    }

alt.themes.register("theme", theme)
alt.themes.enable("theme")

ThemeRegistry.enable('theme')

In [3]:
"""
The name of the folder you created under `../data/`
"""
TIME_STAMP_FOLDER_NAME = '08-01-2024'

In [4]:
COLORS = {
    'Data Portals': '#56B4E9',
    'Journal Websites': '#CC79A7',
    'US Government Websites': '#009E73'
}

## Median FF of US Government Websites

In [26]:
df = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv')
BASELINE = df[df.resource_category == 'government'].failure_rate.median()

  df = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv')


## Estimated Failure Rates
This data is shared by the collaborator and not directly created from the notebooks in this repository.

In [98]:
df = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/failure_rate_meta_2024_08_12.csv')

### By Resource Category

In [13]:
df_overall = df[
    # (df.continent == 'ALL') &
    (df.country == 'ALL') &
    (df.publisher == 'ALL') |
    (df.web_type == 'government') &
    (df.publisher == 'ALL')
]

df_overall.web_type = df_overall.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')

df_overall

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_overall.web_type = df_overall.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')


Unnamed: 0,web_type,country,publisher,fail_rate_meta,se,ci_95L,ci_95U,total_violations,total_checks,units
0,Journal Websites,ALL,ALL,0.028172,0.0137,0.027446,0.028917,208412,4979617,4941
187,US Government Websites,ALL,ALL,0.01372,0.037062,0.012771,0.014739,11435,717945,4941
188,Data Portals,ALL,ALL,0.061157,0.016527,0.059323,0.063043,11435,717945,4941


In [14]:
base = alt.Chart(df_overall).mark_circle(
    size=60,
    opacity=1
).encode(
    alt.Y('web_type:N', title=None, sort=['US Government Websites', 'Journal Websites', 'Data Portals']),
    alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(format='.0%'),
    alt.Color('web_type:N').scale(range=list(COLORS.values()), domain=list(COLORS.keys())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
    ]
).properties(
    height=200,
    width=300
)

error = base.mark_errorbar(
    thickness=2,
    color='black'
).encode(
    alt.X('ci_95L', title='Estimated failure rate'),
    alt.X2('ci_95U'),
    color=alt.value('black')
)

plot = base + error + base

"""
Save for the manuscript figures and website plots
"""
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/resource-category-estimated.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/resource-category-estimated.json')

plot

### By Hosting Institutions

In [15]:
df_org = df[(df.web_type != 'government') & (df.publisher != 'ALL') & (df.publisher.notnull())]
df_org.web_type = df_org.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')
df_org

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_org.web_type = df_org.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')


Unnamed: 0,web_type,country,publisher,fail_rate_meta,se,ci_95L,ci_95U,total_violations,total_checks,units
67,Journal Websites,ALL,ABV-press Publishing house,0.219781,0.032217,0.209145,0.230800,1797,8203,6
68,Journal Websites,ALL,Academic Press Inc.,0.019512,0.119018,0.015515,0.024513,72,3690,18
69,Journal Websites,ALL,Adis International Ltd,0.001061,0.333519,0.000552,0.002039,9,8797,8
70,Journal Websites,ALL,Advanstar Communications Inc.,0.064402,0.169771,0.047031,0.087600,668,8594,5
71,Journal Websites,ALL,Agricultural Research Communication Centre,0.024727,0.084092,0.021049,0.029029,145,5867,5
...,...,...,...,...,...,...,...,...,...,...
322,Data Portals,ALL,Wellcome Sanger Institute,0.045122,0.267720,0.027201,0.073953,405,13404,14
323,Data Portals,ALL,Wuhan University,0.034161,0.540175,0.012121,0.092526,82,2726,5
324,Data Portals,ALL,Yale University,0.032111,0.368054,0.015871,0.063892,195,6282,12
325,Data Portals,ALL,Yonsei University,0.062245,0.554024,0.021918,0.164303,112,4604,7


In [39]:
top_10_journal_publishers_by_size = df_org[(df_org.web_type == 'Journal Websites') & (df_org.units > 52)]
len(top_10_journal_publishers_by_size)

10

In [48]:
top_10_data_portal_publishers_by_size = df_org[(df_org.web_type == 'Data Portals') & (df_org.units > 18) & (df_org.publisher != 'China Agricultural University')]
len(top_10_data_portal_publishers_by_size)

10

In [49]:
df_org_filtered = pd.concat([top_10_journal_publishers_by_size, top_10_data_portal_publishers_by_size])
df_org_filtered

Unnamed: 0,web_type,country,publisher,fail_rate_meta,se,ci_95L,ci_95U,total_violations,total_checks,units
74,Journal Websites,ALL,American Psychological Association,0.07483,0.038585,0.06976,0.080237,726,9702,66
78,Journal Websites,ALL,Bentham Science Publishers B.V.,0.031073,0.03425,0.029114,0.033159,3659,122483,96
79,Journal Websites,ALL,BioMed Central Ltd.,0.014046,0.045373,0.012867,0.015333,1632,115260,118
83,Journal Websites,ALL,Cambridge University Press,0.022314,0.053734,0.020128,0.02473,4193,200216,72
101,Journal Websites,ALL,Elsevier BV,0.020424,0.091866,0.017116,0.024355,593,26524,83
107,Journal Websites,ALL,Elsevier Masson s.r.l.,0.023805,0.073998,0.020658,0.027419,551,19256,65
154,Journal Websites,ALL,Springer International Publishing AG,0.002299,0.186732,0.001595,0.003312,190,54309,53
159,Journal Websites,ALL,Springer Netherlands,0.001038,0.292201,0.000585,0.001838,330,120296,108
160,Journal Websites,ALL,Springer New York,0.001201,0.090977,0.001005,0.001435,121,112851,103
167,Journal Websites,ALL,Springer Verlag,0.001244,0.096412,0.00103,0.001503,210,162815,148


In [58]:
base = alt.Chart(df_org_filtered).mark_circle(size=100, opacity=1).encode(
    alt.Y('publisher:N', title=None).sort(field="fail_rate_meta", op="max", order="descending"),
    alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(format='.0%').scale(domain=[0, 0.12]),
    alt.Color('web_type:N').scale(range=list(COLORS.values())).legend(None),
    tooltip=[
        alt.Tooltip('web_type:N', title='Resource Category'),
        alt.Tooltip('publisher:N', title='Publisher'),
        alt.Tooltip('fail_rate_meta', format='.4f', title='Estimated Failure Rate'),
        alt.Tooltip('units', title='Size'),
    ]
    # alt.Size('units:Q')
    # alt.Column('web_type:N')
).properties(
    height=300
)

error = base.mark_errorbar(

).encode(
    alt.X('ci_95L:Q', title='Estimated failure rate'),
    alt.X2('ci_95U:Q')
)

text = base.mark_text(
    dx=15,
    dy=-9
).encode(
    alt.Text('units'),
    color=alt.value('black')
)

gov = base.mark_rule(
    color='black',
    size=1,
    strokeDash=[4, 4]
).encode(
    alt.Y(),
    alt.X(f'baseline:Q', title='Estimated failure rate'),
    alt.Size(),
    alt.YOffset(),
    color=alt.value('black')
).transform_calculate(
    baseline=f"{BASELINE}"
)

plot = (base + error + text + gov)

plot = (
    plot.transform_filter("datum.web_type == 'Data Portals'").properties(title='Data Portals') |
    plot.transform_filter("datum.web_type == 'Journal Websites'").properties(title='Journal Websites')
)

# .facet(
#     alt.Column('web_type:N', title=None)
# ).resolve_scale(y='independent')

"""
Save for the manuscript figures and website plots
"""
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/publisher-estimated.png', scale_factor=8)
plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/website/publisher-estimated.json')

plot

### Country

In [99]:
df_country = df[(df.web_type != 'government') & (df.country != 'ALL') & (df.country.notnull())]

In [89]:
top_10_journal_countries_by_size = df_country[(df_country.web_type == 'journal') & (df_country.units > 143) & (df_country.country != 'Poland')]
len(top_10_journal_countries_by_size)
# top_10_journal_countries_by_size

10

In [100]:
top_10_data_countries_by_size = df_country[(df_country.web_type == 'data_portal') & (df_country.units > 14) & (df_country.country != 'Poland')]
len(top_10_data_countries_by_size)
top_10_data_countries_by_size
df_country.web_type.unique().tolist()

['journal']

In [63]:
countries_data_portals = [
    'Spain', 'China', 'Germany', 'India', 'Italy', 'Japan', 'Canada', 'France', 'United States', 'United Kingdom'
]
countries_journals = [
    'China', 'Germany', 'India', 'Spain', 'Italy', 'Switzerland', 'Netherlands', 'France', 'United States', 'United Kingdom'
]
df_country_filtered = df_country[
    (df_country.web_type == 'data_portal') & (df_country.country.isin(countries_data_portals))|
    (df_country.web_type == 'journal') & (df_country.country.isin(countries_journals))
]

In [64]:
df_country_filtered.web_type = df_country_filtered.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_country_filtered.web_type = df_country_filtered.web_type.apply(lambda x: 'Journal Websites' if x == 'journal' else 'US Government Websites' if x == 'government' else 'Data Portals')


In [68]:
base = alt.Chart(df_country_filtered).mark_circle(size=100, opacity=1).encode(
    alt.Y('country:N', title=None).sort(field="fail_rate_meta", op="max", order="descending"),
    alt.X('fail_rate_meta:Q', title='Estimated failure rate').axis(format='.0%'),
    alt.Color('web_type:N').scale(range=list(COLORS.values()), domain=list(COLORS.keys())).legend(None),
    # alt.Size('units:Q')
    # alt.Column('web_type:N')
).properties(
    height=300
)

error = base.mark_errorbar(

).encode(
    alt.X('ci_95L:Q', title='Estimated failure rate'),
    alt.X2('ci_95U:Q')
)

text = base.mark_text(
    dx=15,
    dy=-9
).encode(
    alt.Text('units'),
    color=alt.value('black')
)

gov = base.mark_rule(
    color='black',
    size=1,
    strokeDash=[4, 4]
).encode(
    alt.Y(),
    alt.X(f'baseline:Q', title='Estimated failure rate'),
    alt.Size(),
    alt.YOffset(),
    color=alt.value('black')
).transform_calculate(
    baseline=f"{BASELINE}"
)

plot = (base + error + text + gov)

plot = (
    plot.transform_filter("datum.web_type == 'Data Portals'").properties(title='Data Portals') |
    plot.transform_filter("datum.web_type == 'Journal Websites'").properties(title='Journal Websites')
)

# .facet(
#     alt.Column('web_type:N', title=None)
# ).resolve_scale(y='independent')

# plot = apply_theme(plot)

plot

## Raw Accessibility Results

In [None]:
df_pages = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv')

In [None]:
PAGE_COLUMNS = ['resource_category', 'website_id', 'page_id', 'page_type']
ISSUE_ORIGINAL_COLUMNS = ['issue_id', 'issue_desc', 'issue_impact', 'issue_help', 'issue_url']
ISSUE_COLUMNS = [
    'issue_id',
    'issue_desc',
    'issue_impact',
    'issue_help',
    'issue_url',
    'issue_name',
    'issue_filter',
    'issue_overall_impact',
    'issue_note_overall_impact_hdv',
    'issue_severity',
    'issue_data_related',
    'issue_data_related_rule',
    'issue_pour_category',
    'issue_wcag_level',
    'issue_difficulty_to_fix',
    'issue_missing_label_related'
]

## Aggregate Data By Unique Website

In [None]:
"""
Merge Accessibility Status, Metadata, and Detailed Issues
"""
data_portal_metadata = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/input/data-portal/database-commons.csv', dtype={"page_id": "string"})
journal_metadata = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/input/journal/sjr2022.csv', dtype={"page_id": "string"})
reports = pd.read_csv(f"../data/{TIME_STAMP_FOLDER_NAME}/results/accessibility-status.csv", dtype={"page_id": "string"})
issue = pd.read_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/results/unique-issues-additional-labels-aug-9-2024.csv')

data_portal_metadata['resource_category'] = 'data-portal'
journal_metadata['resource_category'] = 'journal'


reports = reports.merge(data_portal_metadata, how='left', on=PAGE_COLUMNS)
reports = reports.merge(journal_metadata, how='left', on=PAGE_COLUMNS)

reports = reports.merge(issue, how='left', on=ISSUE_ORIGINAL_COLUMNS)

reports.head(3)

In [None]:
"""
Store the column names for metadata of pages
"""
PAGE_METADATA_COLUMNS = [a for a in reports.columns.tolist() if a not in (PAGE_COLUMNS + ISSUE_COLUMNS + ['violations', 'passes', 'total_checks', 'failure_rate'])] + ['resource_category']
# PAGE_METADATA_COLUMNS

In [None]:
"""
Group by Page
"""
reports_aggregated = reports.groupby(
    PAGE_METADATA_COLUMNS,
    dropna=False 
).agg({
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
}).reset_index()

In [None]:
"""
Failure Rate
"""
reports_aggregated['failure_rate'] = reports_aggregated.violations / reports_aggregated.total_checks

In [None]:
NUM_DATA_PORTALS = len(reports[reports.resource_category == 'data-portal'].website_id.unique().tolist())
NUM_JOURNALS = len(reports[reports.resource_category == 'journal'].website_id.unique().tolist())
(NUM_DATA_PORTALS, NUM_JOURNALS)

In [None]:
reports_aggregated.columns

## Visualize

In [None]:
COLORS = {
    'data-portal': '#56B4E9',
    'journal': '#CC79A7',
    'government': '#009E73'
}
# reports_aggregated

In [None]:
def histogram(df=None, resource_category=None):
    df_copy = df[df.resource_category == resource_category].copy()

    # df_copy = df_copy[~df_copy.titl.isna()]
    
    return (
        alt.Chart(
           df_copy
        ).mark_bar(
            color=COLORS[resource_category]
        ).encode(
            alt.X(f'failure_rate:Q', title='Failure rate').bin(extent=[0, 1], step=0.008).scale(domain=[0, 0.5], clamp=True).axis(format='%', zindex=10, tickCount=10),
            alt.Y('count()', title="The number of webpages").scale(type='linear').axis(tickCount=5),
            # alt.Tooltip(['title:N', 'failure_rate:Q']),
            # yOffset="jitter:Q",
        ).transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        ).properties(
            title={
                "text": resource_category.replace('-', ' ').title(),
                "fontWeight": 600,
                "color": "black"
            },
            height=300,
            width=400
        )
    )

# baseline = (
#     _.mark_rule(
#         color='black',
#         size=2,
#         # size=500 / len(COUNTRY_SORT),
#         strokeDash=[4, 4]
#     ).encode(
#         alt.X(f'baseline:Q', title='Failure rate'),
#         y=alt.Y()
#     ).transform_calculate(
#         baseline=f"{US_GOV_FR_MEAN}"
#     )
# )
    
# _ = _ + baseline

#     plot = _ if plot is None else plot | _

plot = alt.hconcat(
    histogram(df_pages, 'data-portal'),
    histogram(df_pages, 'journal'),
    histogram(df_pages, 'government')
)

plot.save(f'../data/{TIME_STAMP_FOLDER_NAME}/figures/histogram.png')
plot

In [None]:
pd.set_option("display.precision", 100)
reports_aggregated[(reports_aggregated.failure_rate > 0.0363636) & (reports_aggregated.failure_rate < 0.0363637)]
# reports_aggregated

In [None]:
reports[reports.page_url == 'http://n.neurology.org/']

In [None]:
reports[reports.page_url == 'http://arjournals.annualreviews.org/loi/ecolsys']

In [None]:
reports

## Aggregate Data By Unique Issue

In [None]:
reports['issue_exist'] = reports.violations.apply(lambda x: 0 if x <= 0 else 1)
reports_by_issues = reports.groupby(
    ['resource_category'] + ISSUE_COLUMNS,
    dropna=False
).agg({
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    # 'page_url': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

reports_by_issues['failure_rate'] = reports_by_issues.violations / reports_by_issues.total_checks
reports_by_issues.rename(columns={'page_url': 'page_count'}, inplace=True)
reports_by_issues['page_proportion'] = reports_by_issues.issue_exist
reports_by_issues['page_proportion'] /= reports_by_issues.resource_category.apply(lambda x: NUM_DATA_PORTALS if x == 'data-portal' else NUM_JOURNALS)

# grouped.to_csv(f'../data/{TIME_STAMP_FOLDER_NAME}/reports/report.csv', index=False)
reports_by_issues.head(3)

In [None]:
alt.Chart(reports_by_issues[reports_by_issues.resource_category == 'data-portal']).mark_bar(
    color=COLORS['data-portal']
).encode(
    alt.X('page_proportion:Q', title='Proportion of webpages with issues').axis(format='%', orient='top'),
    alt.Y('issue_name:N', sort='-x', title=None).axis(titlePadding=40),
    alt.Color('issue_overall_impact:N').scale(domain=['critical', 'moderate', 'minor'], range=['#d95f02', '#E69F00', 'grey']),
    # alt.Color('resource_category:N').scale(domain=list(COLORS.keys()), range=list(COLORS.values())),
    # alt.Column('resource_category:N')
).properties(
    height=1600,
    width=600
)

In [None]:
import pygwalker as pyg

In [None]:
pyg.walk(reports_by_issues[reports_by_issues.resource_category == 'data-portal'])

## Statistics

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_overall_impact',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp.columns.tolist()

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_missing_label_related',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_severity',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_data_related',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_wcag_level',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')

In [None]:
dp = reports[reports.resource_category == 'data-portal']
dp.loc[dp.violations == 0, 'website_id'] = ''

ioi = dp.groupby(
    'issue_difficulty_to_fix',
    dropna=False
).agg({
    'website_id': lambda x: ',,,'.join(set(x)),
    'page_type': lambda x: ','.join(set(x)),
    'page_id': lambda x: ','.join(set(x)),
    'page_url': 'count',
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
    'issue_exist': 'sum'
}).reset_index()

ioi['websites'] = ioi.website_id.apply(lambda x: len(x.split(',,,')) - 1)
ioi['page_proportion'] = ioi.websites
ioi['page_proportion'] /= NUM_DATA_PORTALS

ioi.head(3)

# dp.to_csv('./test.csv')