In [164]:
import pandas as pd
import altair as alt
# alt.data_transformers.enable("vegafusion")
alt.data_transformers.disable_max_rows(); # Lift Altair's 5000-row limit so larger frames can be charted; the trailing `;` hides the call's return value in the cell output

# Evaluate

In [165]:
# Load the aggregated results (one row per page_id / issue_id pair).
# The CSV carries no header row, so the column names are supplied here.
df = pd.read_csv(
    '../javascript/JAN-10-2024/data-portal_aggregated_results.csv',
    header=None,
    names=['page_id', 'issue_id', 'violations', 'passes'],
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,page_id,issue_id,violations,passes
0,1009_home,heading-order,2,1
1,1009_home,image-alt,1,6
2,1009_home,landmark-banner-is-top-level,1,1
3,1009_home,landmark-no-duplicate-banner,1,0
4,1009_home,landmark-unique,1,11
5,1009_home,region,1,174
6,1009_home,aria-allowed-attr,0,12
7,1009_home,aria-allowed-role,0,10
8,1009_home,aria-conditional-attr,0,12
9,1009_home,aria-deprecated-role,0,5


$$
A3 = 1 - \prod_b (1 - F_b)^{\frac{B_{pb}}{N_{pb}} + \frac{B_{pb}}{B_p}}
$$

> Equation 3 presents the formula for computing the A3 metric, where $B_{pb}$ is the total of actual points of failure of a checkpoint $b$ in page $p$, $b$ is the barrier (checkpoint violation), $N_{pb}$ is the total of potential points of failure of a checkpoint $b$ in page $p$, and $F_b$ identifies the severity of a certain barrier $b$ (this weight is calculated by simple heuristics, by combining the results of an automatic evaluation and manual testing, or by disabled users' feedback [22]). The authors of this metric performed an experimental study to compare the results between A3 and UWEM and understand the differences between them. A checkpoint weight of 0.05 was used for all checkpoints, assuming that all of them would have the same importance. This experiment was conducted with a group of six disabled users that evaluated six web pages. After applying both metrics, the authors concluded that A3 outperformed UWEM in the experiment [11].

In [166]:
def compute_a3(results, barrier_weight=0.1):
    """Compute the A3 accessibility metric for every page.

    A3 = 1 - prod_b (1 - F_b) ** (B_pb / N_pb + B_pb / B_p)

    Parameters
    ----------
    results : pandas.DataFrame
        One row per (page, checkpoint) with the columns ``page_id``,
        ``violations`` (B_pb) and ``passes``.
    barrier_weight : float, default 0.1
        Severity weight F_b, applied uniformly to every checkpoint.
        NOTE(review): the quoted paper reports 0.05; this notebook has been
        using 0.1 -- confirm which value is intended.

    Returns
    -------
    pandas.DataFrame
        Columns ``page_id`` and ``A3``, one row per page.
    """
    return (
        results.assign(
            # N_pb: potential points of failure = failed + passed checks.
            Npb=lambda d: d['violations'] + d['passes'],
            # B_p: all actual points of failure on the page.
            Bp=lambda d: d.groupby('page_id')['violations'].transform('sum'),
            # A 0/0 ratio (checkpoint never evaluated, or page without any
            # violation) contributes nothing to the exponent. Previously this
            # only worked because groupby().prod() silently skips NaN factors.
            Bpb_over_Npb=lambda d: d['violations'].div(d['Npb']).fillna(0),
            Bpb_over_Bp=lambda d: d['violations'].div(d['Bp']).fillna(0),
            A3=lambda d: (1 - barrier_weight) ** (d['Bpb_over_Npb'] + d['Bpb_over_Bp']),
        )
        .groupby('page_id')['A3']
        .prod()      # product over the checkpoints b of each page
        .rsub(1)     # A3 = 1 - product
        .reset_index()
    )


_df = compute_a3(df)

_df

Unnamed: 0,page_id,A3
0,1000_home,0.327676
1,1002_home,0.389882
2,1004_home,0.360616
3,1008_home,0.501268
4,1009_home,0.311422
...,...,...
1668,997_home,0.478410
1669,998_home,0.550478
1670,999_home,0.305311
1671,99_home,0.439233


In [173]:
# Simpler alternative to A3: the share of failed checks on each page.
_df = (
    df.drop(columns=['issue_id'])
    .groupby(['page_id'])
    .sum()
    .reset_index()
    .assign(
        # Np: every check evaluated on the page (failed + passed).
        Np=lambda page: page['violations'] + page['passes'],
        violation_ratio=lambda page: page['violations'] / page['Np'],
    )
)

_df

Unnamed: 0,page_id,violations,passes,Np,violation_ratio
0,1000_home,42,730,772,0.054404
1,1002_home,19,206,225,0.084444
2,1004_home,20,678,698,0.028653
3,1008_home,98,378,476,0.205882
4,1009_home,8,441,449,0.017817
...,...,...,...,...,...
1668,997_home,18,404,422,0.042654
1669,998_home,45,343,388,0.115979
1670,999_home,7,399,406,0.017241
1671,99_home,18,68,86,0.209302


In [174]:
# Attach the database metadata to every evaluated page.
# The join key is the prefix of ``page_id`` before the first underscore
# (e.g. '1009' in '1009_home'); the metadata ``id`` is cast to str so both key
# columns hold strings.
# NOTE: this cell rebinds ``_df``. Re-run the aggregation cell above before
# re-running this one, otherwise the metadata columns are merged in twice.
df_meta = pd.read_csv('../output/Nov-21-2023/data-portal_metadata.csv').astype({'id': str})

pages_with_meta = (
    _df.assign(id=_df['page_id'].str.split('_').str[0])
    .merge(df_meta, on='id', how='left')
)

# A left join must keep exactly one row per page. Extra rows would mean the
# metadata contains duplicate ids, which would skew every grouped mean that is
# computed from this frame.
assert len(pages_with_meta) == len(_df), 'metadata ids are not unique: the merge duplicated pages'

_df = pages_with_meta
_df

Unnamed: 0,page_id,violations,passes,Np,violation_ratio,id,source_id,short_name,accession,full_name,...,category_list,keywords_list,data_object_list,organism_list,theme_list,zindex,first_publication_year,search_example,cited_date,ess
0,1000_home,42,730,772,0.054404,1000,dc_7326,TREND-DB,DBC007326,A transcriptome-wide atlas of the dynamic land...,...,Structure,"alternative polyadenylation, RNAi phenotypes",Animal,Homo sapiens,,9.00000,0,,2022-12-04 00:02:47,0
1,1002_home,19,206,225,0.084444,1002,dc_691,PROMISCUOUS,DBC000691,,...,"Health and medicine, Interaction","drug-protein interaction, side-effect",,,,9.00000,0,,2022-12-04 00:00:23,0
2,1004_home,20,678,698,0.028653,1004,dc_74,P2CS,DBC000074,Prokaryotic 2-Component Systems,...,Metadata,Two-component system,,,,9.00000,0,,2022-12-04 00:00:04,0
3,1008_home,98,378,476,0.205882,1008,dc_5726,Chromosome 22,DBC005726,,...,"Gene genome and annotation, Genotype phenotype...",Microarray; Chromosome 22; human genome; trans...,Animal,Homo sapiens,,8.90000,2003,,2022-12-04 00:02:13,1
4,1009_home,8,441,449,0.017817,1009,dc_1477,Telomerase Database,DBC001477,Telomerase Database,...,Gene genome and annotation,Telomerase,,,,8.86667,0,,2022-12-04 00:00:43,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1668,997_home,18,404,422,0.042654,997,dc_6405,TASmania,DBC006405,,...,Interaction,Bacterial Toxin-Antitoxin systems,Bacteria,Mycobacterium tuberculosis,,9.00000,0,,2022-12-04 00:02:27,0
1669,998_home,45,343,388,0.115979,998,dc_7250,OpenProt,DBC007250,,...,Gene genome and annotation,"ORF, Coding potential","Animal, Fungi","Homo sapiens, Mus musculus, Drosophila melanog...",,9.00000,0,,2022-12-04 00:02:45,0
1670,999_home,7,399,406,0.017241,999,dc_4478,RFDB,DBC004478,Rice Functional Genomics and Breeding Database,...,"Gene genome and annotation, Genotype phenotype...","pan-genome, genomic sequences, gene annotation...",Plant,Oryza sativa,,9.00000,2017,,2022-12-04 00:01:39,1
1671,99_home,18,68,86,0.209302,99,dc_516,VFDB,DBC000516,Virulence Factor Database,...,"Gene genome and annotation, Health and medicin...","bacterial virulence factor, bacterial pathogen...",Bacteria,"Escherichia coli, Acinetobacter baumannii, Aer...",,113.77800,0,,2022-12-04 00:00:17,0


In [175]:
# List the columns currently present in `_df`.
_df.columns

Index(['page_id', 'violations', 'passes', 'Np', 'violation_ratio', 'id',
       'source_id', 'short_name', 'accession', 'full_name', 'description',
       'url', 'founded_year', 'host_institution', 'address', 'city',
       'province', 'country', 'current_version', 'contact_name',
       'contact_email', 'available_protocol', 'last_update', 'age', 'citation',
       'token', 'expire_date', 'ins', 'group_email', 'is_partner', 'funding',
       'submitter', 'is_new', 'bigsearch_id', 'record_created',
       'last_modified', 'data_type_list', 'category_list', 'keywords_list',
       'data_object_list', 'organism_list', 'theme_list', 'zindex',
       'first_publication_year', 'search_example', 'cited_date', 'ess'],
      dtype='object')

In [176]:
# Count rows per country and keep only the countries that occur more than
# 10 times, most frequent first.
df_country_count = (
    _df['country']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
    .loc[lambda counts: counts['count'] > 10]
)
COUNTRIES = df_country_count['country'].tolist()
COUNTRIES

['United States',
 'China',
 'United Kingdom',
 'Germany',
 'Canada',
 'Japan',
 'France',
 'Switzerland',
 'Italy',
 'Spain',
 'India',
 'Korea, Republic of',
 'Australia',
 'Netherlands',
 'Belgium',
 'Denmark',
 'Poland',
 'Israel']

In [181]:
# Mean number of evaluated checks (Np) per country, restricted to COUNTRIES.
# Bar colour and tooltip show how many rows feed each mean.
alt.Chart(_df.loc[_df['country'].isin(COUNTRIES)]).mark_bar().encode(
    x=alt.X('mean(Np):Q'),
    y=alt.Y('country:N', sort='-x'),
    color=alt.Color('count()'),
    tooltip=alt.Tooltip(['count()']),
).properties(title='Countries with more than 10 sites', width=500)

In [183]:
# Mean number of violations per country, restricted to COUNTRIES.
# Bar colour and tooltip show how many rows feed each mean.
alt.Chart(_df.loc[_df['country'].isin(COUNTRIES)]).mark_bar().encode(
    x=alt.X('mean(violations):Q'),
    y=alt.Y('country:N', sort='-x'),
    color=alt.Color('count()'),
    tooltip=alt.Tooltip(['count()']),
).properties(title='Countries with more than 10 sites', width=500)

In [182]:
# Mean violation ratio (violations / Np) per country, restricted to COUNTRIES.
# Bar colour and tooltip show how many rows feed each mean.
alt.Chart(_df.loc[_df['country'].isin(COUNTRIES)]).mark_bar().encode(
    x=alt.X('mean(violation_ratio):Q'),
    y=alt.Y('country:N', sort='-x'),
    color=alt.Color('count()'),
    tooltip=alt.Tooltip(['count()']),
).properties(title='Countries with more than 10 sites', width=500)

In [178]:
# Count rows per host institution and keep only the institutions that occur
# more than 10 times, most frequent first.
df_host_institution_count = (
    _df['host_institution']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
    .loc[lambda counts: counts['count'] > 10]
)
INSTS = df_host_institution_count['host_institution'].tolist()
INSTS

['European Bioinformatics Institute',
 'National Center for Biotechnology Information',
 'Huazhong University of Science and Technology',
 'Beijing Institute of Genomics, Chinese Academy of Sciences',
 'Harbin Medical University',
 'Swiss Institute of Bioinformatics',
 'Peking University',
 'Stanford University',
 'Wellcome Sanger Institute',
 'Zhejiang University',
 'Cornell University',
 'University of Oxford',
 'University of Alberta',
 'University of Toronto',
 'University of Washington',
 'University of Copenhagen',
 'University of Michigan']

In [179]:
# Mean violation ratio (violations / Np) per host institution, restricted to
# INSTS. Bar colour and tooltip show how many rows feed each mean.
alt.Chart(_df.loc[_df['host_institution'].isin(INSTS)]).mark_bar().encode(
    x=alt.X('mean(violation_ratio):Q'),
    y=alt.Y('host_institution:N', sort='-x'),
    color=alt.Color('count()'),
    tooltip=alt.Tooltip(['count()']),
).properties(title='host_institution with more than 10 sites', width=500)