In [1]:
import pandas as pd
import altair as alt
# alt.data_transformers.enable("vegafusion")
# Lift Altair's 5000-row cap so the full result set can be charted.
# The trailing `;` keeps the call's return value out of the cell output.
alt.data_transformers.disable_max_rows(); # Allow using more than 5000 rows

# Evaluate

In [19]:
# Load the aggregated per-page results. The CSV has no header row, so column
# names are supplied here. Judging by the file name, it has already been
# restricted to the `image-alt` rule, so no further filtering is applied.
df = pd.read_csv(
    '../javascript/JAN-10-2024/data-portal_aggregated_results_only_image-alt.csv',
    header=None,
    names=['page_id', 'issue_id', 'violations', 'passes'],
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,page_id,issue_id,violations,passes
0,1008_home,image-alt,17,19
1,1004_home,image-alt,3,4
2,1009_home,image-alt,1,6
3,1019_home,image-alt,24,1
4,1024_home,image-alt,2,0
5,1029_home,image-alt,1,2
6,1025_home,image-alt,5,0
7,1028_home,image-alt,1,6
8,102_home,image-alt,2,7
9,1031_home,image-alt,8,10


$$
A3 = 1 - \prod_b (1 - F_b)^{\frac{B_{pb}}{N_{pb}} + \frac{B_{pb}}{B_p}}
$$

> Equation 3 presents the formula for computing the A3 metric, where $B_{pb}$ is the total of actual points of failure of a checkpoint $b$ in page $p$, $b$ is the barrier (checkpoint violation), $N_{pb}$ is the total of potential points of failure of a checkpoint $b$ in page $p$, and $F_b$ identifies the severity of a certain barrier $b$ (this weight is calculated by simple heuristics, by combining the results of an automatic evaluation and manual testing or by disabled users feedback [22]). The authors of this metric performed an experimental study to compare the results between A3 and UWEM and understand the differences between them. A checkpoint weight of 0.05 was used for all checkpoints, assuming that all of them would have the same importance. This experiment was conducted with a group of six disabled users that evaluated six web pages. After applying both metrics, the authors concluded that A3 outperformed UWEM in the experiment [11].

In [20]:
# A3 accessibility metric per page:
#   A3 = 1 - prod_b (1 - Fb) ** (Bpb / Npb + Bpb / Bp)
# Bpb = violations of checkpoint b on the page,
# Npb = violations + passes of checkpoint b on the page,
# Bp  = total violations on the page.

# Severity weight Fb, applied uniformly to every checkpoint.
# NOTE(review): the study quoted above used 0.05 -- confirm 0.1 is intended.
BARRIER_WEIGHT = 0.1

_df = df.assign(
    # Per-page violation total, broadcast back onto every (page, checkpoint) row.
    Bp=df.groupby('page_id')['violations'].transform('sum'),
    Npb=df['violations'] + df['passes'],
)

# A 0/0 ratio (no checks at all, or no violations on the page) yields NaN.
# Treat it as "no contribution" (exponent 0 -> factor 1); this matches the
# result previously obtained implicitly because `prod` skips NaN factors.
_df['Bpb_over_Npb'] = (_df['violations'] / _df['Npb']).fillna(0)
_df['Bpb_over_Bp'] = (_df['violations'] / _df['Bp']).fillna(0)

# Per-checkpoint factor (1 - Fb) ** exponent, multiplied together per page.
_df['A3'] = (1 - BARRIER_WEIGHT) ** (_df['Bpb_over_Npb'] + _df['Bpb_over_Bp'])
_df = _df.groupby('page_id', as_index=False)['A3'].prod()
_df['A3'] = 1 - _df['A3']

_df

Unnamed: 0,page_id,A3
0,1004_home,0.139735
1,1008_home,0.143683
2,1009_home,0.113445
3,1019_home,0.186579
4,1024_home,0.190000
...,...,...
779,98_home,0.154086
780,990_home,0.168381
781,994_home,0.146185
782,997_home,0.190000


In [21]:
# Simpler alternative to A3: the plain share of failed checks per page.
_df = (
    df.drop(columns=['issue_id'])
    .groupby('page_id', as_index=False)
    .sum()
    .assign(
        # Np: every check performed on the page (failed + passed).
        Np=lambda pages: pages['violations'] + pages['passes'],
        violation_ratio=lambda pages: pages['violations'] / pages['Np'],
    )
)

_df

Unnamed: 0,page_id,violations,passes,Np,violation_ratio
0,1004_home,3,4,7,0.428571
1,1008_home,17,19,36,0.472222
2,1009_home,1,6,7,0.142857
3,1019_home,24,1,25,0.960000
4,1024_home,2,0,2,1.000000
...,...,...,...,...,...
779,98_home,10,7,17,0.588235
780,990_home,3,1,4,0.750000
781,994_home,2,2,4,0.500000
782,997_home,3,0,3,1.000000


In [22]:
# Attach the data-portal metadata to the per-page ratios.

# Columns produced by the ratio cell. Selecting them explicitly keeps this cell
# idempotent: re-running it starts from the same columns instead of merging the
# metadata into an already-merged frame (which would create `_x`/`_y` duplicates).
PAGE_COLUMNS = ['page_id', 'violations', 'passes', 'Np', 'violation_ratio']

# The join key is compared as text, so cast the metadata id to str.
df_meta = pd.read_csv('../output/Nov-21-2023/data-portal_metadata.csv').astype({'id': str})

_df = (
    _df[PAGE_COLUMNS]
    # page_id looks like "<id>_home"; the part before the first "_" is the portal id.
    .assign(id=lambda pages: pages['page_id'].str.split('_').str[0])
    # Left join keeps every evaluated page even when no metadata row matches.
    .merge(df_meta, on='id', how='left')
)
_df

Unnamed: 0,page_id,violations,passes,Np,violation_ratio,id,source_id,short_name,accession,full_name,...,category_list,keywords_list,data_object_list,organism_list,theme_list,zindex,first_publication_year,search_example,cited_date,ess
0,1004_home,3,4,7,0.428571,1004,dc_74,P2CS,DBC000074,Prokaryotic 2-Component Systems,...,Metadata,Two-component system,,,,9.00000,0,,2022-12-04 00:00:04,0
1,1008_home,17,19,36,0.472222,1008,dc_5726,Chromosome 22,DBC005726,,...,"Gene genome and annotation, Genotype phenotype...",Microarray; Chromosome 22; human genome; trans...,Animal,Homo sapiens,,8.90000,2003,,2022-12-04 00:02:13,1
2,1009_home,1,6,7,0.142857,1009,dc_1477,Telomerase Database,DBC001477,Telomerase Database,...,Gene genome and annotation,Telomerase,,,,8.86667,0,,2022-12-04 00:00:43,0
3,1019_home,24,1,25,0.960000,1019,dc_6828,EWAS Data Hub,DBC006828,Epigenome-Wide Association Study data hub,...,"Modification, Health and medicine","DNA methylation, methylation-based biomarkers",Animal,Homo sapiens,,8.66667,0,,2022-12-04 00:02:36,0
4,1024_home,2,0,2,1.000000,1024,dc_7454,Cellinker,DBC007454,,...,Interaction,"scRNA-seq, protein-protein interaction, cell-c...","Animal, Virus","Homo sapiens, Mus musculus",,8.50000,0,,2022-12-04 00:02:49,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,98_home,10,7,17,0.588235,98,dc_419,GDSC,DBC000419,Genomics of Drug Sensitivity in Cancer,...,Health and medicine,therapeutic biomarker,Animal,Homo sapiens,,116.30000,0,,2022-12-04 00:00:14,0
780,990_home,3,1,4,0.750000,990,dc_8012,gutMGene,DBC008012,,...,Health and medicine,"microbe, microbial metabolite",Bacteria,"Homo sapiens, Mus musculus",,9.00000,0,,2022-12-04 00:03:00,0
781,994_home,2,2,4,0.500000,994,dc_7617,TarDB,DBC007617,,...,"Interaction, Expression","phasiRNA, miRNA",Plant,"Chlamydomonas reinhardtii, Volvox carteri",,9.00000,0,,2022-12-04 00:02:53,0
782,997_home,3,0,3,1.000000,997,dc_6405,TASmania,DBC006405,,...,Interaction,Bacterial Toxin-Antitoxin systems,Bacteria,Mycobacterium tuberculosis,,9.00000,0,,2022-12-04 00:02:27,0


In [23]:
# List the columns of the merged frame to see which metadata fields can be grouped on.
_df.columns

Index(['page_id', 'violations', 'passes', 'Np', 'violation_ratio', 'id',
       'source_id', 'short_name', 'accession', 'full_name', 'description',
       'url', 'founded_year', 'host_institution', 'address', 'city',
       'province', 'country', 'current_version', 'contact_name',
       'contact_email', 'available_protocol', 'last_update', 'age', 'citation',
       'token', 'expire_date', 'ins', 'group_email', 'is_partner', 'funding',
       'submitter', 'is_new', 'bigsearch_id', 'record_created',
       'last_modified', 'data_type_list', 'category_list', 'keywords_list',
       'data_object_list', 'organism_list', 'theme_list', 'zindex',
       'first_publication_year', 'search_example', 'cited_date', 'ess'],
      dtype='object')

In [24]:
# Count sites per country and keep only countries with more than 10 sites,
# ordered from most to fewest sites.
df_country_count = (
    _df['country']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
    .loc[lambda counts: counts['count'] > 10]
)
COUNTRIES = df_country_count['country'].tolist()
COUNTRIES

['United States',
 'China',
 'United Kingdom',
 'Germany',
 'France',
 'Japan',
 'Canada',
 'Italy',
 'Korea, Republic of',
 'Spain',
 'Switzerland',
 'India',
 'Denmark']

In [25]:
# Mean Np (violations + passes) per site for each well-represented country.
# Bar colour and tooltip both show how many sites the country contributes.
(
    alt.Chart(_df.loc[_df['country'].isin(COUNTRIES)])
    .mark_bar()
    .encode(
        x=alt.X('mean(Np):Q'),
        y=alt.Y('country:N', sort='-x'),
        color=alt.Color('count()'),
        tooltip=alt.Tooltip(['count()']),
    )
    .properties(title='Countries with more than 10 sites', width=500)
)

In [26]:
# Mean number of violations per site for each well-represented country.
# Bar colour and tooltip both show how many sites the country contributes.
(
    alt.Chart(_df.loc[_df['country'].isin(COUNTRIES)])
    .mark_bar()
    .encode(
        x=alt.X('mean(violations):Q'),
        y=alt.Y('country:N', sort='-x'),
        color=alt.Color('count()'),
        tooltip=alt.Tooltip(['count()']),
    )
    .properties(title='Countries with more than 10 sites', width=500)
)

In [32]:
# Mean violation ratio per site for each well-represented country, shown as a
# percentage on a fixed 0-100% axis. Colour and tooltip show the site count.
(
    alt.Chart(_df.loc[_df['country'].isin(COUNTRIES)])
    .mark_bar()
    .encode(
        x=alt.X(
            'mean(violation_ratio):Q',
            title='The ratio of missing alt-texts of images among all images',
            axis=alt.Axis(format='.0%'),
            scale=alt.Scale(domain=[0, 1]),
        ),
        y=alt.Y('country:N', sort='-x'),
        color=alt.Color('count()'),
        tooltip=alt.Tooltip(['count()']),
    )
    .properties(title='Countries with more than 10 sites', width=500)
)

In [33]:
# Count sites per host institution and keep only institutions with more than
# 3 sites, ordered from most to fewest sites.
df_host_institution_count = (
    _df['host_institution']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
    .loc[lambda counts: counts['count'] > 3]
)
INSTS = df_host_institution_count['host_institution'].tolist()
INSTS

['European Bioinformatics Institute',
 'Beijing Institute of Genomics, Chinese Academy of Sciences',
 'Huazhong University of Science and Technology',
 'Harbin Medical University',
 'Wellcome Sanger Institute',
 'University of Copenhagen',
 'Peking University',
 'Iowa State University',
 'Yonsei University',
 'Stanford University',
 'University of Oxford',
 'University of California San Diego',
 'University of Liverpool',
 'Cornell University',
 'National Center for Biotechnology Information',
 'National Institute of Agrobiological Sciences',
 'University of Toronto',
 'Swiss Institute of Bioinformatics',
 'International Institute of Molecular and Cell Biology',
 'Scripps Research',
 'University of Pennsylvania',
 'University of Georgia',
 'Southern Medical University',
 'Jackson Laboratory',
 'RIKEN',
 'China Agricultural University',
 'University of Tokyo',
 'Shanghai Jiao Tong University',
 'University of California Irvine',
 'University of Alberta',
 'Yale University',
 'University

In [34]:
# Mean violation ratio per site for each host institution in INSTS.
# The title now states the threshold actually used to build INSTS (count > 3);
# it previously claimed "more than 10 sites". The x-axis uses the same
# percentage format, 0-100% domain and label as the per-country ratio chart so
# the two charts can be compared directly.
(
    alt.Chart(_df.loc[_df['host_institution'].isin(INSTS)])
    .mark_bar()
    .encode(
        x=alt.X(
            'mean(violation_ratio):Q',
            title='The ratio of missing alt-texts of images among all images',
            axis=alt.Axis(format='.0%'),
            scale=alt.Scale(domain=[0, 1]),
        ),
        y=alt.Y('host_institution:N', sort='-x'),
        # Colour and tooltip both show how many sites each institution hosts.
        color=alt.Color('count()'),
        tooltip=alt.Tooltip(['count()']),
    )
    .properties(title='Host institutions with more than 3 sites', width=500)
)