In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from data_load import data_load 
import altair as alt

In [None]:
# Load the datasets. The cells below index the result by key ('d', 'j', 'g',
# 'd_agg'), so despite its name `df` is used as a keyed collection of tables.
df = data_load()

In [None]:
# Write each raw table to its own CSV file, leaving out the row index.
export_targets = {
    'd': 'data_portals.csv',
    'j': 'journal_websites.csv',
    'g': 'us_government_websites.csv',
}
for export_key, export_path in export_targets.items():
    df[export_key].to_csv(export_path, index=False)

In [None]:
# Pull out the table stored under 'd_agg' and display it.
# NOTE(review): presumably a per-portal aggregate of the 'd' table — confirm in data_load.
toy = df['d_agg']
toy

In [None]:
# Show which columns are available in `toy`.
toy.columns

In [None]:
# Percentage of rows in `toy` whose failure_rate is above 0.015346.
# NOTE(review): the variable name suggests 0.015346 is the failure rate estimated
# for government websites — confirm where this constant comes from.
is_above_gov_estimate = toy.failure_rate > 0.015346
higher_than_gov_est_fr = int(is_above_gov_estimate.sum())
all_data_portals = len(toy.index)

higher_than_gov_est_fr / all_data_portals * 100

In [None]:
# Scatter of mean failure_rate against zindex, with short_name shown on hover.
(
    alt.Chart(toy)
    .mark_circle()
    .encode(
        alt.X('zindex:Q'),
        alt.Y('mean(failure_rate):Q'),
        alt.Tooltip(['short_name']),
    )
)

In [None]:
# Work on independent copies so that columns added in later cells do not
# modify the tables held in `df`.
d = df['d'].copy()
j = df['j'].copy()

In [None]:
# Flag every row that reports at least one violation: 1 if violations > 0, else 0.
# A vectorised comparison replaces the per-element Python lambda; rows where the
# comparison is False (including NaN violations) are flagged 0, as before.
d['issue_exist'] = (d.violations > 0).astype(int)
j['issue_exist'] = (j.violations > 0).astype(int)

# Sanity check: list the columns now present on the data-portal table.
d.head(2).columns

In [None]:
# Number of distinct page_id values per table; used in later cells as the
# denominator when turning page counts into proportions.
num_page_ids_d = len(d['page_id'].unique())
num_page_ids_j = len(j['page_id'].unique())
num_page_ids_d

In [None]:
# Per issue_id: sum of the issue_exist flags divided by the number of distinct
# pages in `d`, ordered from the largest share to the smallest.
sorted_d = (
    d[['issue_id', 'issue_exist']]
    .groupby('issue_id')
    .sum()
    .div(num_page_ids_d)
    .sort_values(by='issue_exist', ascending=False)
    .reset_index()
)
sorted_d

In [None]:
# Per issue_id: sum of the issue_exist flags divided by the number of distinct
# pages in `j`, ordered from the largest share to the smallest.
sorted_j = (
    j[['issue_id', 'issue_exist']]
    .groupby('issue_id')
    .sum()
    .div(num_page_ids_j)
    .sort_values(by='issue_exist', ascending=False)
    .reset_index()
)
sorted_j

In [None]:
from theme import apply_theme

# Bar colour per website group; keys match the table keys used with `df`.
COLORS = dict(d='#56B4E9', j='#CC79A7', g='#009E73')

# Both panels show the ten rows at the top of the pre-sorted tables and share
# the same x-axis title.
share_axis_title = 'Proportion of webpages with issues'
top_rows = 10

plot_d = (
    alt.Chart(sorted_d.head(top_rows))
    .mark_bar(color=COLORS['d'])
    .encode(
        alt.X('issue_exist:Q', title=share_axis_title).axis(format='%'),
        alt.Y('issue_id:N', sort='-x', title='Accessibility Issues IDs').axis(titlePadding=40),
    )
    .properties(title='Data Portals', height=300)
)

plot_j = (
    alt.Chart(sorted_j.head(top_rows))
    .mark_bar(color=COLORS['j'])
    .encode(
        alt.X('issue_exist:Q', title=share_axis_title).axis(format='%'),
        alt.Y('issue_id:N', sort='-x', title=None),
    )
    .properties(title='Journal Websites', height=300)
)

# Place the two panels side by side and hand the result to the project theme helper.
plot = plot_d | plot_j

apply_theme(plot)

In [None]:
# issue_id values that later cells label "Table-related".
# NOTE(review): these look like axe-core rule identifiers — confirm against the data source.
table_issues = [
    'table-duplicate-name',
    'td-headers-attr',
    'empty-table-header',
    'scope-attr-valid'
]

# issue_id values that later cells label "Image-related".
image_issues = [
    'svg-img-alt',
    'role-img-alt',
    'image-alt',
    'image-redundant-alt',
    'area-alt'
]

In [None]:
# Share of pages in `d` with at least one table-related or image-related violation.

# issue_id -> category label. Table entries are written last so they win if an
# id were ever present in both lists (the table check came first originally).
issue_category_d = {
    **dict.fromkeys(image_issues, "Image-related"),
    **dict.fromkeys(table_issues, "Table-related"),
}

temp = d[['page_id', 'issue_id', 'violations', 'issue_exist']].copy()
temp['issue_data_representation'] = temp.issue_id.apply(
    lambda issue: issue_category_d.get(issue, "None")
)

# Total violations per (page, category); flag the pairs with at least one.
per_page_d = (
    temp[['page_id', 'issue_data_representation', 'violations']]
    .groupby(['page_id', 'issue_data_representation'])
    .sum()
    .reset_index()
)
per_page_d['issue_exist'] = (per_page_d.violations > 0).astype(int)

# Flagged pages per category, divided by the number of distinct pages.
temp = (
    per_page_d[['issue_data_representation', 'issue_exist']]
    .groupby('issue_data_representation')
    .sum()
    .div(num_page_ids_d)
    .reset_index()
)
temp = temp[temp.issue_data_representation != 'None']  # drop the uncategorised bucket
dd = temp.copy()
dd

In [None]:
# Share of pages in `j` with at least one table-related or image-related violation.

# issue_id -> category label. Table entries are written last so they win if an
# id were ever present in both lists (the table check came first originally).
issue_category_j = {
    **dict.fromkeys(image_issues, "Image-related"),
    **dict.fromkeys(table_issues, "Table-related"),
}

temp = j[['page_id', 'issue_id', 'violations', 'issue_exist']].copy()
temp['issue_data_representation'] = temp.issue_id.apply(
    lambda issue: issue_category_j.get(issue, "None")
)

# Total violations per (page, category); flag the pairs with at least one.
per_page_j = (
    temp[['page_id', 'issue_data_representation', 'violations']]
    .groupby(['page_id', 'issue_data_representation'])
    .sum()
    .reset_index()
)
per_page_j['issue_exist'] = (per_page_j.violations > 0).astype(int)

# Flagged pages per category, divided by the number of distinct pages.
temp = (
    per_page_j[['issue_data_representation', 'issue_exist']]
    .groupby('issue_data_representation')
    .sum()
    .div(num_page_ids_j)
    .reset_index()
)
temp = temp[temp.issue_data_representation != 'None']  # drop the uncategorised bucket
jd = temp.copy()
jd

In [None]:
from theme import apply_theme

# Bar colour per website group; keys match the table keys used with `df`.
COLORS = dict(d='#56B4E9', j='#CC79A7', g='#009E73')

# Both panels use the same fixed y-range so the bar heights are directly comparable.
share_domain = [0, 0.5]
bar_width = 70

plot_d = (
    alt.Chart(dd)
    .mark_bar(color=COLORS['d'], size=bar_width)
    .encode(
        alt.Y('issue_exist:Q', title='Proportion of webpages with issues')
        .axis(format='%', tickCount=5)
        .scale(domain=share_domain),
        alt.X('issue_data_representation:N', sort='-y', title=None).axis(titlePadding=40),
    )
    .properties(title='Data Portals', width=300)
)

plot_j = (
    alt.Chart(jd)
    .mark_bar(color=COLORS['j'], size=bar_width)
    .encode(
        alt.Y('issue_exist:Q', title=None)
        .axis(format='%', tickCount=5)
        .scale(domain=share_domain),
        alt.X('issue_data_representation:N', sort='-y', title=None),
    )
    .properties(title='Journal Websites', width=300)
)

# Place the two panels side by side and hand the result to the project theme helper.
plot = plot_d | plot_j

apply_theme(plot)