# Analysis

In [None]:
%load_ext autoreload
%autoreload 2
%aimport theme
import pandas as pd
import altair as alt
import pygwalker as pyg
from altair import datum
from theme import apply_theme
alt.data_transformers.disable_max_rows(); # Allow using rows more than 5000

## Set Const Variables

In [None]:
TIME_STAMP_FOLDER = 'JAN-10-2024'
A11Y_COLORS = {
    'd': '#56B4E9',
    'j': '#CC79A7',
    'g': '#009E73'
}
A11Y_CATEGORIES = [
    ('d', 'data-portal', 'Data Portals', '#56B4E9'),
    ('j', 'journal-portal', 'Journal Websites',  '#CC79A7'),
    ('g', 'gov', "US Government Websites", '#009E73'),
]
AGG_CATEGORIES = [
    '', 
    '_agg', 
    '_agg_alt'
]
TOP_CNT = 10 # for printing top websites

## Accessibility Issues

In [None]:
# Collect the issue catalogue (id, impact, description) exported for every
# website category into one de-duplicated table.
# All files are read first and concatenated once, so the result is never
# seeded with an empty DataFrame or re-copied on each iteration.
issues_df = pd.concat([
    pd.read_csv(
        f'../javascript/{TIME_STAMP_FOLDER}/{category}_a11y_issues.csv',
        header=None,  # the files carry no header row
        names=['id', 'impact', 'description']
    )
    for (_short_id, category, _title, _color) in A11Y_CATEGORIES
])
issues_df.drop_duplicates(inplace=True)
# Persist the merged catalogue next to the per-category files.
issues_df.to_csv(f'../javascript/{TIME_STAMP_FOLDER}/all_a11y_issues.csv', index=False)
issues_df.id.unique().tolist()

In [None]:
issues_df[issues_df.impact == 'serious']

In [None]:
ISSUE_IDS = issues_df.id.tolist()

In [None]:
# Ids of the issues whose description mentions "alternate".
# na=False treats a missing description as "no match", so the boolean mask
# never contains NaN (which cannot be used for row selection).
ALT_ISSUE_IDS = issues_df[issues_df.description.str.contains('alternate', na=False)].id.tolist()
ALT_ISSUE_IDS

## Load Accessibility Evaluation Results

In [None]:
# One evaluation-result table per category, keyed by the short category id
# ('d', 'j', 'g'). The CSV files have no header row, so the column names are
# supplied here.
df = {
    short_id: pd.read_csv(
        f'../javascript/{TIME_STAMP_FOLDER}/{category}_a11y_results.csv',
        header=None,
        names=['page_id', 'issue_id', 'violations', 'passes']
    )
    for (short_id, category, _title, _color) in A11Y_CATEGORIES
}

df['d']

### Merge additional datasets we selectively collected

#### NEI

In [None]:
# Separately collected NEI data-portal results: a headerless CSV read with
# the four result columns page_id, issue_id, violations, passes.
nei = pd.read_csv(
    f'../javascript/{TIME_STAMP_FOLDER}/nei-data-portal_a11y_results.csv',
    header=None,
    names=['page_id', 'issue_id', 'violations', 'passes']
)
# Append the NEI rows to the data-portal table (row labels are not renumbered).
df['d'] = pd.concat([df['d'], nei])
df['d']

#### NIH

In [None]:
# Separately collected NIH data-portal results: a headerless CSV read with
# the four result columns page_id, issue_id, violations, passes.
nih = pd.read_csv(
    f'../javascript/{TIME_STAMP_FOLDER}/nih-data-portal_a11y_results.csv',
    header=None,
    names=['page_id', 'issue_id', 'violations', 'passes']
)
# Append the NIH rows to the data-portal table (row labels are not renumbered).
df['d'] = pd.concat([df['d'], nih])
df['d']

#### Harvard

In [None]:
# harvard = pd.read_csv(
#     f'../javascript/{TIME_STAMP_FOLDER}/harvard-data-portal_a11y_results.csv',
#     header=None,
#     names=['page_id', 'issue_id', 'violations', 'passes']
# )
# df['d'] = pd.concat([df['d'], harvard])
# df['d']

## Calculate Accessibility Measurement (Failure Rate or FR)

In [None]:
# Add two derived columns to every category table:
#   total_checks = violations + passes
#   failure_rate = violations / total_checks
for (short_id, _category, _title, _color) in A11Y_CATEGORIES:
    frame = df[short_id].copy()
    checks = frame['violations'] + frame['passes']
    frame['total_checks'] = checks
    frame['failure_rate'] = frame['violations'] / checks
    df[short_id] = frame
df['d'].head(5)

## Aggregation By Page ID

In [None]:
# Collapse the per-issue rows into one row per page and store the result
# under '<id>_agg'.
for (short_id, _category, _title, _color) in A11Y_CATEGORIES:
    per_page = (
        df[short_id]
        .drop(columns=['issue_id'])
        .groupby(['page_id'])
        .sum()
        .reset_index()
    )
    # The summed per-issue rates are meaningless, so the rate is recomputed
    # from the summed counts.
    per_page['failure_rate'] = per_page['violations'] / per_page['total_checks']
    df[short_id + '_agg'] = per_page
df['d_agg'].head(5)

## Aggregation w/Alt-related Issues Only

In [None]:
# Same per-page aggregation as '<id>_agg', restricted to the issues listed
# in ALT_ISSUE_IDS; stored under '<id>_agg_alt'.
for (short_id, _category, _title, _color) in A11Y_CATEGORIES:
    table = df[short_id]
    alt_rows = table[table['issue_id'].isin(ALT_ISSUE_IDS)]
    per_page = (
        alt_rows
        .drop(columns=['issue_id'])
        .groupby(['page_id'])
        .sum()
        .reset_index()
    )
    # Recompute the rate from the summed counts.
    per_page['failure_rate'] = per_page['violations'] / per_page['total_checks']
    df[short_id + '_agg_alt'] = per_page
df['d_agg_alt'].head(5)

## Merge Metadata

### For Data Portals and Journal Websites

In [None]:
# NIH institutes/centers that could be grouped under one host institution.
# Currently unused: the statement that rewrote `host_institution` for these
# names is disabled. Defined once here instead of on every loop iteration.
NIH_INSTS = [
    'National Center for Biotechnology Information',
    'National Cancer Institute',
    'National Heart, Lung, and Blood Institute',
    'National Center for Advancing Translational Sciences',
    'National Institutes of Health',
    'National Human Genome Research Institute',
    'National Institute of Environmental Health Sciences',
    'National Library of Medicine',
    'National Institute of Standards and Technology',
    'National Institute of Health',
    'National Institute on Aging',
    'National Institute of Neurological Disorders & Stroke',
    'National Institute of Child Health and Human Development',
    'National Eye Institute', # none found
    'National Institute of Allergy and Infectious Diseases',
    'National Institute of Arthritis and Musculoskeletal and Skin Diseases'
]

# Left-join the website metadata onto every table version of the data-portal
# and journal categories. The 'gov' category is skipped here.
for (short_id, category, _title, _color) in A11Y_CATEGORIES:
    if category == 'gov':
        continue

    # Loop-invariant: read and normalise the metadata once per category
    # rather than once per table version.
    # NOTE(review): the metadata folder is hard-coded to 'Nov-21-2023' and
    # does not follow TIME_STAMP_FOLDER - confirm this is intended.
    meta = pd.read_csv(f'../output/Nov-21-2023/{category}_metadata.csv')
    meta['id'] = meta['id'].astype(str)

    for ver in ['', '_agg', '_agg_alt']:
        merged = df[short_id + ver].copy()
        # The website id is the part of page_id before the first underscore;
        # both join keys are compared as strings.
        merged['id'] = merged['page_id'].apply(lambda x: x.split('_')[0]).astype(str)
        merged = merged.merge(meta, on='id', how='left')
        df[short_id + ver] = merged
df['j'].head(5)

### Add page types

In [None]:
def _page_type(page_id):
    """Return the second '_'-separated part of `page_id`, or 'home' if it has no underscore."""
    if '_' in str(page_id):
        return page_id.split('_')[1]
    return 'home'


# Tag every table version of every category with its page type.
for (short_id, _category, _title, _color) in A11Y_CATEGORIES:
    for ver in ['', '_agg', '_agg_alt']:
        table = df[short_id + ver]
        table['page_type'] = table.page_id.apply(_page_type)
df['d_agg'].head(5)

### Load Metadata for Gov

In [None]:
# Loop-invariant: read the government page list once. The file has no header
# row; after reset_index() the positional row number is available as the
# 'index' column, which is what page_id is joined against.
gov_meta = pd.read_csv(f'../javascript/{TIME_STAMP_FOLDER}/gov_pages.csv', sep=',', header=None, names=['name', 'type', 'inst', 'desc', 'city', 'state', 'blank'])
gov_meta.reset_index(inplace=True)

for var in AGG_CATEGORIES:
    merged = df['g' + var].merge(gov_meta, left_on='page_id', right_on='index', how='left')
    # Expose the 'name' column under 'url' as well.
    merged['url'] = merged['name']
    df['g' + var] = merged

### Load Metadata for NEI

In [None]:
# Loop-invariant: load and index the NEI page metadata once instead of once
# per table version.
_meta = pd.read_csv(f'../javascript/{TIME_STAMP_FOLDER}/nei-data-portal_pages.csv')
_meta.set_index('page_id', inplace=True)

for var in AGG_CATEGORIES:
    _ = df['d' + var].copy()
    _.set_index('page_id', inplace=True)
    # Overwrite only the cells that are currently null with values taken from
    # the NEI metadata (presumably aligned on page_id and column name -
    # verify against pandas' boolean-frame assignment semantics).
    _[_.isnull()] = _meta
    _.reset_index(inplace=True)
    df['d' + var] = _

### Load Metadata for NIH

In [None]:
# Loop-invariant: load, extend and index the NIH page metadata once instead
# of once per table version.
_meta = pd.read_csv(f'../javascript/{TIME_STAMP_FOLDER}/nih-data-portal_pages.csv')
_meta['short_name'] = _meta.Repository_Name
# Every row in this file is attributed to the same host institution/country.
_meta['host_institution'] = 'National Institutes of Health'
_meta['country'] = 'United States'
_meta.set_index('page_id', inplace=True)

for var in AGG_CATEGORIES:
    _ = df['d' + var].copy()
    _.set_index('page_id', inplace=True)
    # Overwrite only the cells that are currently null with values taken from
    # the NIH metadata (presumably aligned on page_id and column name -
    # verify against pandas' boolean-frame assignment semantics).
    _[_.isnull()] = _meta
    _.reset_index(inplace=True)
    df['d' + var] = _

### Load Metadata for Harvard

In [None]:
# for var in AGG_CATEGORIES:
#     _meta = pd.read_csv(f'../javascript/{TIME_STAMP_FOLDER}/harvard-data-portal_pages.csv')
#     _meta.set_index('page_id', inplace=True)
    
#     _ = df['d' + var].copy()
#     _.set_index('page_id', inplace=True)
#     _[_.isnull()] = _meta
#     _.reset_index(inplace=True)
#     df['d' + var] = _

In [None]:
# df['d_agg'][df['d_agg'].page_id.str.contains('harvard')]

## Data Cleaning

In [None]:
# df['j_agg'][df['j_agg'].title == 'Nature Communications']

### Column Names

In [None]:
# Add a `title` column to every data-portal and government table version:
# data portals take it from `short_name`, government sites from `name`.
for var in AGG_CATEGORIES:
    df['d' + var]['title'] = df['d' + var]['short_name']
    df['g' + var]['title'] = df['g' + var]['name']

### Country Names

In [None]:
def rename_countries(x: str):
    """Return the short display name for a few verbose country labels.

    Any value that is not one of the labels listed below is returned
    unchanged.
    """
    aliases = {
        'Korea Republic of': 'South Korea',
        'Korea, Republic of': 'South Korea',
        'Korea, Democratic People"S Republic of': 'North Korea',
        'Korea, Democratic People': 'North Korea',
        'Russian Federation': 'Russia',
        'Iran, Islamic Republic Of': 'Iran',
    }
    return aliases.get(x, x)
    
# Normalise the country labels in every data-portal and journal table version.
for var in AGG_CATEGORIES:
    for prefix in ('d', 'j'):
        table = df[prefix + var]
        table['country'] = table['country'].apply(rename_countries)

# Print
# countries = [x for x in list(set(df['d'].country.tolist() + df['j'].country.unique().tolist())) if str(x) != 'nan']
# countries.sort()
# countries

### Add Continent

In [None]:
continent_country_map = pd.read_csv('https://raw.githubusercontent.com/dbouquin/IS_608/master/NanosatDB_munging/Countries-Continents.csv')
continent_country_map = continent_country_map.rename(columns={
    'Country': 'country',
    'Continent': 'continent'
})

def clean_country_names(x):
    """Return the replacement spelling for a few country names.

    Any value that is not listed below is returned unchanged.
    """
    renames = {
        'US': 'United States',
        'Korea, South': 'South Korea',
        'Korea, North': 'North Korea',
        'Russian Federation': 'Russia',
        'Samoa': 'American Samoa',
        'Vietnam': 'Viet Nam',
        'Serbia': 'Serbia and Montenegro',
    }
    return renames.get(x, x)

continent_country_map.country = continent_country_map.country.apply(lambda x: clean_country_names(x))

# Countries without a match in the continent table: these two are assigned
# 'Europe'; every other unmatched row gets its country name as a placeholder.
unmatched_to_continent = {'Czech Republic': 'Europe', 'Guadeloupe': 'Europe'}
# Manual corrections applied to the continent column afterwards (this is
# where the placeholder country names above are resolved).
continent_corrections = {
    'Serbia': 'Europe',
    'Taiwan': 'Asia',
    'Hong Kong': 'Asia',
    'Brunei Darussalam': 'Asia',
    'Puerto Rico': 'North America',
}

for prefix in ['d', 'j']:
    for var in AGG_CATEGORIES:
        frame = df[prefix + var].merge(continent_country_map, on='country', how='left')

        missing = frame.continent.isnull()
        frame.loc[missing, 'continent'] = frame.loc[missing, 'country'].apply(
            lambda c: unmatched_to_continent.get(c, c)
        )
        frame.continent = frame.continent.apply(lambda c: continent_corrections.get(c, c))

        df[prefix + var] = frame
# frame[frame.continent.isnull()].country.unique().tolist()
frame.head(5)

### Journal Publisher Names

In [None]:
# Spreadsheet mapping each original publisher name to a cleaned-up name.
_ = pd.read_csv(f'../javascript/{TIME_STAMP_FOLDER}/Publishers of Journal Portals - Sheet1.csv')
# Where no cleaned name is given, fall back to the original one. The result
# is assigned back explicitly: an in-place fillna on `_.Cleaned` acts on an
# intermediate Series and is not guaranteed to update the DataFrame
# (it does not under pandas Copy-on-Write).
_['Cleaned'] = _['Cleaned'].fillna(_['Original'])
mapping = dict(zip(_.Original, _.Cleaned))

# Publishers that are absent from the spreadsheet are left unchanged.
for var in AGG_CATEGORIES:
    df['j' + var].publisher = df['j' + var].publisher.apply(lambda x: mapping.get(x, x))
df['j'].head(5)

### Journal Filtering

In [None]:
# Spreadsheet in which journals to be excluded are marked with a "v".
_ = pd.read_csv(f'../javascript/{TIME_STAMP_FOLDER}/Filtering of Journals - Sheet1.csv')
_.rename(columns={'If filter "v", otherwise empty': 'is_filter'}, inplace=True)
# Vectorised comparison; any cell other than exactly 'v' (including empty
# cells) becomes False.
_['is_filter'] = _['is_filter'] == 'v'
mapping = dict(zip(_.Title, _.is_filter))

for var in AGG_CATEGORIES:
    journals = df['j' + var]
    # Journals whose title is not in the spreadsheet are kept (flag False).
    journals['is_filter'] = journals.title.apply(lambda x: mapping.get(x, False))
    # .copy() stores an independent frame, so columns added to it later do
    # not trigger pandas' SettingWithCopyWarning.
    df['j' + var] = journals[journals.is_filter == False].copy()
df['j'].head(5)

### More Journal Filtering Based on "Areas"

In [None]:
# What are the unique areas?
# Every journal stores its areas as a single '; '-separated string.
combined_list = list(map(lambda area: area.split('; '), df['j'].areas.unique().tolist()))
flat_list = sorted({item for sublist in combined_list for item in sublist})
# flat_list

In [None]:
# Subject areas that qualify a journal for the analysis.
keep = [
    'Agricultural and Biological Sciences',
    'Biochemistry, Genetics and Molecular Biology',
    'Dentistry',
    'Health Professions',
    'Immunology and Microbiology',
    'Medicine',
    'Multidisciplinary',
    'Neuroscience',
    'Nursing',
    'Pharmacology, Toxicology and Pharmaceutics',
    'Psychology',
]


def isKeeping(areas: str) -> bool:
    """Return True if `areas` mentions at least one entry of `keep`.

    Matching is by substring, so an area name in `keep` counts wherever it
    occurs inside the `areas` string. 'Multidisciplinary' is itself listed
    in `keep`, so it needs no special case.

    Non-string values (e.g. NaN for a missing cell) are never kept instead
    of raising a TypeError.
    """
    if not isinstance(areas, str):
        return False
    return any(k in areas for k in keep)

# Keep only the journals for which isKeeping(areas) is true.
# The mask is applied directly, so no temporary 'is_keep' column has to be
# added and dropped again; .copy() stores an independent frame so later
# column assignments do not act on a slice.
for var in AGG_CATEGORIES:
    journals = df['j' + var]
    keep_mask = journals.areas.apply(isKeeping).astype(bool)
    df['j' + var] = journals[keep_mask].copy()

In [None]:
def remove_unkeep_areas(areas):
    """Return `areas` with every entry that is not listed in `keep` removed.

    `areas` is a '; '-separated string. The surviving entries keep their
    original order and are re-joined with the same separator; the result is
    an empty string when nothing survives.
    """
    return "; ".join(area for area in areas.split("; ") if area in keep)
    
# Store the reduced area string of every journal in `areas_filtered`.
for var in AGG_CATEGORIES:
    journals = df['j' + var]
    journals['areas_filtered'] = journals.areas.apply(remove_unkeep_areas)
    

# _['is_multi_area'] = _.area.apply(lambda x: len(list(x.split("; "))) > 1)
# test = "Agricultural and Biological Sciences; Business, Management and Accounting; Economics, Econometrics and Finance"
# "; ".join(list(filter(lambda x: x in keep, test.split("; "))))
# unique_area = _.area.unique().tolist()
# unique_area = filter(lambda x: '; ' in x, unique_area)
# list(unique_area)
# print(len(_[_.is_multi_area]))

In [None]:
# What are the unique categories?
# Every journal stores its categories as one '; '-separated string; each
# entry is cut at the first ' (' so only the leading name remains.
combined_list = [area.split('; ') for area in df['j'].categories.unique().tolist()]
# De-duplicate AFTER stripping the suffix and keep the result: previously the
# set was built and immediately discarded, leaving repeated names in the list.
flat_list = sorted({item.split(' (')[0] for sublist in combined_list for item in sublist})

categories_to_keep = [
 'Advanced and Specialized Nursing',
 'Aging',
 'Agricultural and Biological Sciences',
 'Agronomy and Crop Science',
 'Anatomy',
 'Anesthesiology and Pain Medicine',
 'Animal Science and Zoology',
 'Anthropology',
 'Applied Microbiology and Biotechnology',
 'Applied Psychology',
 'Assessment and Diagnosis',
 'Atmospheric Science',
 'Atomic and Molecular Physics, and Optics',
 'Behavioral Neuroscience',
 'Biochemistry',
 'Biochemistry, Genetics and Molecular Biology',
 'Bioengineering',
 'Biological Psychiatry',
 'Biomaterials',
 'Biomedical Engineering',
 'Biophysics',
 'Biotechnology',
 'Cancer Research',
 'Cardiology and Cardiovascular Medicine',
 'Catalysis',
 'Cell Biology',
 'Cellular and Molecular Neuroscience',
 'Chemical Health and Safety',
 'Chiropractics',
 'Clinical Biochemistry',
 'Clinical Psychology',
 'Cognitive Neuroscience',
 'Complementary and Alternative Medicine',
 'Complementary and Manual Therapy',
 'Critical Care Nursing',
 'Critical Care and Intensive Care Medicine',
 'Demography',
 'Dental Assisting',
 'Dental Hygiene',
 'Dentistry',
 'Dermatology',
 'Development',
 'Developmental Biology',
 'Developmental Neuroscience',
 'Developmental and Educational Psychology',
 'Drug Discovery',
 'Drug Guides',
 'Emergency Medical Services',
 'Emergency Medicine',
 'Emergency Nursing',
 'Endocrine and Autonomic Systems',
 'Endocrinology',
 'Endocrinology, Diabetes and Metabolism',
 'Epidemiology',
 'Experimental and Cognitive Psychology',
 'Food Animals',
 'Food Science',
 'Gastroenterology',
 'Gender Studies',
 'Genetics',
 'Health',
 'Health Informatics',
 'Health Information Management',
 'Health Policy',
 'Health Professions',
 'Health, Toxicology and Mutagenesis',
 'Hematology',
 'Hepatology',
 'Histology',
 'Horticulture',
 'Human Factors and Ergonomics',
 'Immunology',
 'Immunology and Allergy',
 'Immunology and Microbiology',
 'Infectious Diseases',
 'Insect Science',
 'Internal Medicine',
 'Life-span and Life-course Studies',
 'Linguistics and Language',
 'Maternity and Midwifery',
 'Medical Assisting and Transcription',
 'Medical Laboratory Technology',
 'Medical Terminology',
 'Medical and Surgical Nursing',
 'Medicine',
 'Microbiology',
 'Molecular Biology',
 'Molecular Medicine',
 'Multidisciplinary',
 'Nanoscience and Nanotechnology',
 'Nephrology',
 'Neurology',
 'Neuropsychology and Physiological Psychology',
 'Neuroscience',
 'Nurse Assisting',
 'Nursing',
 'Nutrition and Dietetics',
 'Obstetrics and Gynecology',
 'Occupational Therapy',
 'Oncology',
 'Ophthalmology',
 'Optometry',
 'Oral Surgery',
 'Organic Chemistry',
 'Orthodontics',
 'Orthopedics and Sports Medicine',
 'Otorhinolaryngology',
 'Paleontology',
 'Parasitology',
 'Pathology and Forensic Medicine',
 'Pediatrics',
 'Pediatrics, Perinatology and Child Health',
 'Periodontics',
 'Pharmaceutical Science',
 'Pharmacology',
 'Pharmacology, Toxicology and Pharmaceutics',
 'Pharmacy',
 'Physical Therapy, Sports Therapy and Rehabilitation',
 'Physiology',
 'Plant Science',
 'Podiatry',
 'Process Chemistry and Technology',
 'Psychiatry and Mental Health',
 'Psychology',
 'Public Health, Environmental and Occupational Health',
 'Pulmonary and Respiratory Medicine',
 'Radiation',
 'Radiological and Ultrasound Technology',
 'Radiology, Nuclear Medicine and Imaging',
 'Rehabilitation',
 'Reproductive Medicine',
 'Respiratory Care',
 'Rheumatology',
 'Sensory Systems',
 'Social Psychology',
 'Speech and Hearing',
 'Structural Biology',
 'Surgery',
 'Tourism, Leisure and Hospitality Management',
 'Toxicology',
 'Transplantation',
 'Urology',
 'Veterinary',
 'Virology'
]

In [None]:
def remove_unkeep_areas(areas):
    """Return the entries of `areas` whose name is in `categories_to_keep`.

    `areas` is a '; '-separated string. Each entry is first cut at the first
    ' (' so that only the leading name is compared and returned; the
    surviving names keep their order and are joined with '; '.

    NOTE: this redefines the function of the same name declared earlier in
    the notebook, which filters against `keep` instead.
    """
    names = (entry.split(' (')[0] for entry in areas.split("; "))
    return "; ".join(name for name in names if name in categories_to_keep)
    
# Store the reduced category string of every journal in `categories_filtered`.
for var in AGG_CATEGORIES:
    journals = df['j' + var]
    journals['categories_filtered'] = journals.categories.apply(remove_unkeep_areas)
    

# _['is_multi_area'] = _.area.apply(lambda x: len(list(x.split("; "))) > 1)
# test = "Agricultural and Biological Sciences; Business, Management and Accounting; Economics, Econometrics and Finance"
# "; ".join(list(filter(lambda x: x in keep, test.split("; "))))
# unique_area = _.area.unique().tolist()
# unique_area = filter(lambda x: '; ' in x, unique_area)
# list(unique_area)
# print(len(_[_.is_multi_area]))

In [None]:
# df['j_agg'][df['j_agg'].publisher.str.contains('Massachussetts Medical Society')]

## Export Data for Collaboration

In [None]:
from datetime import date
from pathlib import Path

# Resolve the date and create the folder once, so all three files land in the
# same dated folder even if the export runs across midnight.
today = date.today()
folder = f'../output/share/{today}'
Path(folder).mkdir(parents=True, exist_ok=True)

# Page-level table -> base name of the exported CSV.
# NOTE(review): 'us-goverment-websites' is misspelled; kept as-is because
# collaborators may already rely on the existing file name.
export_names = {
    'j_agg': 'journal-websites',
    'd_agg': 'data-portals',
    'g_agg': 'us-goverment-websites',
}
for key, file_name in export_names.items():
    df[key].to_csv(f'{folder}/{file_name}_{today}.csv', index=False)

## Descriptive Statistics

### # of Pages

In [None]:
# Number of distinct page ids per category (a missing id, if any, counts once).
GOV_NUM_PAGES = df['g'].page_id.nunique(dropna=False)
DP_NUM_PAGES = df['d'].page_id.nunique(dropna=False)
JW_NUM_PAGES = df['j'].page_id.nunique(dropna=False)

GOV_NUM_PAGES, DP_NUM_PAGES, JW_NUM_PAGES

### # of Unique Websites

In [None]:
def _website_id(page_id):
    """Return the part of `page_id` before the first '_', or `page_id` itself if it has none."""
    if '_' in str(page_id):
        return page_id.split('_')[0]
    return page_id


# Number of distinct websites per category.
GOV_NUM_WEBSITES, DP_NUM_WEBSITES, JW_NUM_WEBSITES = (
    len(df[key].page_id.apply(_website_id).unique().tolist())
    for key in ('g', 'd', 'j')
)

GOV_NUM_WEBSITES, DP_NUM_WEBSITES, JW_NUM_WEBSITES

In [None]:
# # Columns with list of values
# COLUMNS_WITH_LIST = list(filter(lambda x: 'list' in x, df['d_agg_alt'].columns))
# df['d_agg'][['title'] + COLUMNS_WITH_LIST].head(10)

In [None]:
# # Columns with list of values
# df['j_agg'][['title', 'categories', 'areas']].head(10)

### # of Unique Journal Areas

In [None]:
# Unique areas: each journal's '; '-separated `areas` string is split into
# its individual entries before de-duplicating.
df['j'].areas.str.split('; ', expand=True).stack().unique().tolist()

In [None]:
# Unique categories
# _ = df['d'].copy()
# _[_.host_institution == 'National Eye Institute']

### # of Unique Countries

In [None]:
# Number of distinct countries across data portals and journals, ignoring
# missing values (which stringify to 'nan').
countries = set(df['d'].country.tolist()) | set(df['j'].country.unique().tolist())
len([country for country in countries if str(country) != 'nan'])

### # of Journal Publishers

In [None]:
# Number of journal publishers. nunique() ignores missing values, so
# journals without a publisher are not counted as one extra publisher.
df['j'].publisher.nunique()

### # of Data Portals' Host Institution

In [None]:
# Number of data-portal host institutions. nunique() ignores missing values,
# so portals without a host institution are not counted as one extra
# institution.
df['d'].host_institution.nunique()

### Mean FRs

In [None]:
# NOTE(review): despite the *_MEAN name, this is the MEDIAN page-level
# failure rate of the US government websites.
US_GOV_FR_MEAN = df['g_agg'].failure_rate.median()
# Displayed as (median, min, max).
US_GOV_FR_MEAN, df['g_agg'].failure_rate.min(), df['g_agg'].failure_rate.max()

In [None]:
# NOTE(review): despite the *_MEAN name, this is the MEDIAN page-level
# failure rate of the data portals.
DP_FR_MEAN = df['d_agg'].failure_rate.median()
# Displayed as (median, min, max), the same order as the government and
# journal summaries (it was previously median, max, min).
DP_FR_MEAN, df['d_agg'].failure_rate.min(), df['d_agg'].failure_rate.max()

In [None]:
# NOTE(review): despite the *_MEAN name, this is the MEDIAN page-level
# failure rate of the journal websites.
JW_FR_MEAN = df['j_agg'].failure_rate.median()
# Displayed as (median, min, max).
JW_FR_MEAN, df['j_agg'].failure_rate.min(), df['j_agg'].failure_rate.max()

In [None]:
_ = pd.concat([df['g_agg'], df['d_agg'], df['j_agg']])
_.failure_rate.mean()

In [None]:
# Per category: mean, min and max number of violations per page.
for (short_id, _category, _title, _color) in A11Y_CATEGORIES:
    violations = df[short_id + '_agg'].violations
    print(short_id, violations.mean(), violations.min(), violations.max())

In [None]:
# Per category: mean, min and max number of checks per page.
for (short_id, _category, _title, _color) in A11Y_CATEGORIES:
    checks = df[short_id + '_agg'].total_checks
    print(short_id, checks.mean(), checks.min(), checks.max())

In [None]:
# Export the German journals, ordered from lowest to highest failure rate.
# The filtered and sorted result is assigned back before writing; previously
# it was discarded and the complete, unfiltered journal table was written to
# 'germany_journals.csv'.
temp = df['j_agg'].copy()
temp = temp[temp.country == 'Germany'].sort_values(by='failure_rate', ascending=True)
temp.to_csv('./germany_journals.csv')

# Visualize

In [None]:
walker = pyg.walk(df['j_agg'])

## What does the distribution look like?

In [None]:
plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    # bests = df[id + '_agg'].sort_values(by='failure_rate', ascending=True).head(TOP_CNT)[name_field].tolist()
    # worsts = df[id + '_agg'].sort_values(by='failure_rate', ascending=False).head(TOP_CNT)[name_field].tolist()
    # print(f"{title}' Best:")
    # print(f"\t{', '.join(bests)}")
    # print(f"{title}' Worst:")
    # print(f"\t{', '.join(worsts)}")
    _ = (
        alt.Chart(
            df[id + '_agg'][df[id + '_agg'].failure_rate > 0]
        ).mark_bar(
            # opacity=0.01
            color=color,
            stroke='white',
            strokeWidth=0.5
        ).encode(
            alt.X(f'failure_rate:Q', title='Failure rate').bin(extent=[0, 1], step=0.008).scale(domain=[0, 0.5], clamp=True).axis(format='%', zindex=10, tickCount=1),
            alt.Y('count()', title="The number of webpages").scale(type='linear').axis(tickCount=5),
        ).properties(
            title={
                "text": title,
                "fontWeight": 600,
                "color": "black"
            },
            height=300,
            width=400
        )
    )

    baseline = (
        _.mark_rule(
            color='black',
            size=2,
            # size=500 / len(COUNTRY_SORT),
            strokeDash=[4, 4]
        ).encode(
            alt.X(f'baseline:Q', title='Failure rate'),
            y=alt.Y()
        ).transform_calculate(
            baseline=f"{US_GOV_FR_MEAN}"
        )
    )
    
    _ = _ + baseline

    plot = _ if plot is None else plot | _

plot = plot.properties(
    # title={
    #     # 'text': 'The Distribution of Failure Rates',
    #     # 'subtitle': '* Dashed line represents the average failure rate of US government websites',
    #     'subtitleColor': 'grey'
    # }
)

plot = apply_theme(plot)
plot.save('../output/plots/ff-dist.png')
plot

## By Subpages

In [None]:
plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    if id == 'g':
        continue
    
    test = df[id + '_agg'].copy()
    ID_WITH_SUBPAGES = test[test.page_type != 'home'].id.unique().tolist()
    _ = test[test.id.isin(ID_WITH_SUBPAGES)]
    
    base = alt.Chart(
        _
    ).mark_circle(
        opacity=0.5,
        color=color
    ).encode(
        alt.X('failure_rate', title='Failure rate (median)').axis(tickCount=5, format='%').scale(domain=[0, 0.1], clamp=True),
        alt.Y('page_type:N', title='Page type', sort='x'),
        alt.Size('total_checks:Q', legend=None),
        alt.YOffset('jitter:Q'),
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    ).properties(
        title=title,
        height=300
    )

    median = base.mark_tick(
        color='black',
        size=30
    ).encode(
        alt.X(f'median(failure_rate):Q', title='Failure rate (median)'),
        alt.Size(),
        alt.YOffset()
    )

    baseline = (
        base.mark_rule(
            color='black',
            size=2,
            # size=500 / len(COUNTRY_SORT),
            strokeDash=[4, 4]
        ).encode(
            alt.Y(),
            alt.X(f'baseline:Q', title='Failure rate (median)'),
            alt.Size(),
            alt.YOffset()
        ).transform_calculate(
            baseline=f"{US_GOV_FR_MEAN}"
        )
    )

    base = base + baseline + median
    plot = base if plot is None else plot | base

plot
apply_theme(plot)

In [None]:
DATA_RELATED_ISSUES = [
   "aria-command-name",
    "aria-conditional-attr",
    "aria-input-field-name",
    "aria-roles",
    "aria-text",
    "aria-toggle-field-name",
    "aria-tooltip-name",
    "aria-treeitem-name",
    "aria-valid-attr",
    "button-name",
    "color-contrast",
    "dlitem",
    "empty-table-header",
    "form-field-multiple-labels",
    "image-alt",
    "image-redundant-alt",
    "input-button-name",
    "input-image-alt",
    "label-title-only",
    "label",
    "landmark-unique",
    "link-in-text-block",
    "link-name",
    "list",
    "listitem",
    "object-alt",
    "region",
    "role-img-alt",
    "scope-attr-valid",
    "select-name",
    "svg-img-alt",
    "table-duplicate-name",
    "td-headers-attr",
    "th-has-data-cells",
]

In [None]:
# 1 when the row recorded at least one violation, otherwise 0.
df['d']['violated'] = df['d']['violations'].ne(0).astype('int64')
# True when the row's issue is one of DATA_RELATED_ISSUES.
df['d']['data-rlated'] = df['d']['issue_id'].isin(DATA_RELATED_ISSUES)

In [None]:
alt.Chart(
    df['d']
).mark_bar(

).encode(
    alt.X('sum(violated):Q'),
    alt.Y('issue_id:N', sort='-x'),
    alt.Color('data-rlated:N').scale(domain=[True, False], range=['#D55D00', '#56B4E9'])
)

In [None]:
alt.Chart(
    df['d'][df['d'].page_type == 'search']
).mark_bar(

).encode(
    alt.X('sum(violated):Q'),
    alt.Y('issue_id:N', sort='-x'),
    alt.Color('data-rlated:N').scale(domain=[True, False], range=['#D55D00', '#56B4E9'])
).properties(
    title="A11y Issues in Data Portals' Search Pages"
)

In [None]:
test = df['d'].copy()
ID_WITH_SUBPAGES = test[test.page_type != 'home'].id.unique().tolist()
test = test[test.id.isin(ID_WITH_SUBPAGES)]

alt.Chart(
    test
).mark_bar(

).encode(
    alt.X('sum(violated):Q'),
    alt.Y('page_type:N'),
    alt.Color('data-rlated:N').scale(domain=[True, False], range=['#D55D00', '#56B4E9'])
)

In [None]:
plot = None
# for (id, category, title, color) in A11Y_CATEGORIES:
#     if id == 'g':
#         continue
    
test = df['d'].copy()
ID_WITH_SUBPAGES = test[test.page_type != 'home'].id.unique().tolist()
_ = test[test.id.isin(ID_WITH_SUBPAGES)]

base = alt.Chart(
    _
).mark_circle(
    opacity=0.5,
    color=A11Y_COLORS['d']
).encode(
    alt.X('failure_rate', title='Failure rate (median)').axis(tickCount=5, format='%').scale(domain=[0, 0.1], clamp=True),
    alt.Y('page_type:N', title='Page type', sort='x'),
    alt.Size('total_checks:Q', legend=None),
    alt.YOffset('jitter:Q'),
).transform_calculate(
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
).properties(
    title='Data Portals',
    height=300
)

median = base.mark_tick(
    color='black',
    size=30
).encode(
    alt.X(f'median(failure_rate):Q', title='Failure rate (median)'),
    alt.Size(),
    alt.YOffset()
)

baseline = (
    base.mark_rule(
        color='black',
        size=2,
        # size=500 / len(COUNTRY_SORT),
        strokeDash=[4, 4]
    ).encode(
        alt.Y(),
        alt.X(f'baseline:Q', title='Failure rate (median)'),
        alt.Size(),
        alt.YOffset()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

plot = base + baseline + median

plot
apply_theme(plot)

In [None]:
test

## Which are at the top and the bottom?

In [None]:
CUT = 10
# A11Y_CATEGORIES holds (id, category, title, color) 4-tuples; unpacking only
# three names raised "ValueError: too many values to unpack".
for (short_id, _category, title, _color) in A11Y_CATEGORIES:
    pages = df[short_id + '_agg']
    print(title)
    # Highest failure rates first.
    print('Bottom ' + str(CUT))
    print(pages.sort_values(by=['failure_rate'], ascending=False).head(CUT).url.tolist())
    # Lowest failure rates first.
    print('Top ' + str(CUT))
    print(pages.sort_values(by=['failure_rate'], ascending=True).head(CUT).url.tolist())

    # This shows that we really need to filter data properly!
    # There are many 404 pages in the data

## What about the # of DOM elements?

In [None]:
plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    # bests = df[id + '_agg'].sort_values(by='total_checks', ascending=True).head(TOP_CNT)[name_field].tolist()
    # worsts = df[id + '_agg'].sort_values(by='total_checks', ascending=False).head(TOP_CNT)[name_field].tolist()
    # print(f"{title}' Lowest:")
    # print(f"\t{', '.join(bests)}")
    # print(f"{title}' Highest:")
    # print(f"\t{', '.join(worsts)}")
    _ = (
        alt.Chart(
            df[id + '_agg']
        ).mark_bar(
            # opacity=0.3,
            color=color,
            # stroke='black'
        ).encode(
            alt.X(f'total_checks:Q', title='The number of DOM elements').bin(extent=[0, 10000], step=50).scale(padding=0, type='linear', domain=[0, 3200], clamp=True),
            alt.Y('count()', title='The number of webpages').scale(type='linear').axis(tickCount=4),
        ).properties(
            title={
                "text": title,
                # "color": "grey"
            },
            height=300,
            width=400
        )
    )
    
    plot = _ if plot is None else plot | _

plot = plot.properties(
    # title='The Number of DOM Elements in Webpages'
)

plot = apply_theme(plot)
plot.save('../output/plots/dom-dist.png')
plot

## Is there a correlation between the webpage complexity and failure rate?

In [None]:
# One scatter panel per category: number of checks (log scale) against
# failure rate, with NIH and NEI pages highlighted and a dashed rule at
# US_GOV_FR_MEAN. The panels are concatenated horizontally.
plot = None
# A11Y_CATEGORIES holds (id, category, title, color) 4-tuples; unpacking only
# three names raised "ValueError: too many values to unpack". The category
# color is not used here: every panel shares the same base point color.
for (short_id, _category, title, _color) in A11Y_CATEGORIES:
    scatter = (
        alt.Chart(
            df[short_id + '_agg']
        ).mark_point(
            filled=True,
            opacity=0.3,
            color='#56B4E9'
        ).encode(
            alt.X('total_checks:Q', title='The Number of DOM Elements').scale(type='log'),
            alt.Y('failure_rate:Q', title='Failure Rate').scale(domain=[0, 1]),
            alt.Tooltip(['title:N', 'url:N'])
        ).properties(
            title={
                'text': title,
                'color': 'grey'
            },
            width=400,
            height=300
        )
    )

    # Highlight layers are named *_points so they do not overwrite the
    # `nei` / `nih` result DataFrames loaded earlier in the notebook.
    nei_points = scatter.transform_filter(datum.host_institution == 'National Eye Institute').mark_point(
        opacity=1,
        filled=True,
        color='#D55D00'
    )

    nih_points = scatter.transform_filter(datum.host_institution == 'National Institutes of Health').mark_point(
        opacity=1,
        filled=True,
        color='#009E73'
    )

    # Horizontal dashed reference line at the US government failure rate.
    baseline = (
        scatter.mark_rule(
            color='black',
            size=2,
            # size=500 / len(COUNTRY_SORT),
            strokeDash=[4, 2]
        ).encode(
            alt.Y('baseline:Q', title='Failure Rate'),
            alt.X()
        ).transform_calculate(
            baseline=f"{US_GOV_FR_MEAN}"
        )
    )

    panel = scatter + nih_points + nei_points + baseline

    plot = panel if plot is None else plot | panel

plot = plot.properties(
    title={
        'text': 'The Number of Items Checked vs. Failure Rate',
        'subtitle': '* Dashed line represents the average failure rate of US government websites',
        'subtitleColor': 'grey'
    }
)

apply_theme(plot)

## How about the relation with impact score?

In [None]:
# One scatter panel per non-government category: impact score (`h_index` for
# journals, `citation` for data portals) against `failure_rate`, with
# US_GOV_FR_MEAN as a dashed horizontal rule.
plot = None
# A11Y_CATEGORIES entries are 4-tuples (id, category, title, color); unpacking
# them into only three names raises "ValueError: too many values to unpack".
for (id, category, title, color) in A11Y_CATEGORIES:
    if id == 'g':
        continue

    impact_field = 'h_index' if id == 'j' else 'citation'

    _ = (
        alt.Chart(
            df[id + '_agg']
        ).mark_point(
            # filled=True,
            opacity=0.3,
            color='#56B4E9'
        ).encode(
            alt.X(f'{impact_field}:Q', title=impact_field.capitalize()),#.scale(type='symlog'),
            alt.Y('failure_rate:Q', title='Failure Rate').scale(domain=[0, 1]),
            alt.Tooltip(['title:N', 'url:N'])
        ).properties(
            title={
                'text': title,
                'color': 'grey'
            },
            width=400,
            height=300
        )
    )

    # Text labels for the highest-impact rows only. The data-portal id is 'd'
    # (see A11Y_CATEGORIES); the former comparison against 'dp' never matched,
    # so the 10000 cut-off was never applied.
    label_threshold = 10000 if id == 'd' else 800
    # NOTE(review): `t` is built but not added to the layered chart below.
    t = (
        alt.Chart(
            df[id + '_agg'][df[id + '_agg'][impact_field] > label_threshold]
        ).mark_text(
            align='left',
            limit=200,
            opacity=0.5,
            fontSize=14,
            dx=10,
            baseline='bottom'
        ).encode(
            alt.X(f'{impact_field}:Q', title=impact_field.capitalize()),#.scale(type='log'),
            alt.Y('failure_rate:Q', title='Failure Rate').scale(domain=[0, 1]),
            alt.Text('title:N'),
            alt.Tooltip(['title:N', 'url:N'])
        )
    )

    # Horizontal reference rule at the constant US_GOV_FR_MEAN.
    baseline = (
        _.mark_rule(
            color='black',
            size=2,
            strokeDash=[4, 2]
        ).encode(
            alt.Y('baseline:Q', title='Failure Rate'),
            alt.X()
        ).transform_calculate(
            baseline=f"{US_GOV_FR_MEAN}"
        )
    )

    _ = _ + baseline

    plot = _ if plot is None else plot | _

plot = plot.properties(
    title='Failure Rate by Impact Score of Each Website'
)

apply_theme(plot)

## Impact

In [None]:
# Journal failure rates grouped by SJR best quartile (relabelled as impact
# groups): a black tick at the group median, one jittered circle per row, and a
# dashed vertical rule at US_GOV_FR_MEAN.
IMPACT_GROUPS = ['Highest', 'High', 'Low', 'Lowest']
# Any quartile value other than Q1-Q3 falls back to 'Lowest'.
QUARTILE_TO_GROUP = {'Q1': 'Highest', 'Q2': 'High', 'Q3': 'Low'}

# na=False keeps the mask strictly boolean: rows with a missing quartile are
# excluded instead of making the boolean indexing fail on NaN.
temp = df['j_agg'][df['j_agg'].sjr_best_quartile.str.contains('Q', na=False)].copy()
temp.sjr_best_quartile = temp.sjr_best_quartile.apply(lambda x: QUARTILE_TO_GROUP.get(x, 'Lowest'))

_ = (
    alt.Chart(
        temp
    ).mark_tick(
        color="black",
        thickness=2,
        size=60
    ).encode(
        alt.X('median(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.3], clamp=True).axis(format='%', tickCount=5),
        alt.Y('sjr_best_quartile:N', title='Impact groups', sort=IMPACT_GROUPS)
    ).properties(
        title={
            # Explicit text: this cell must not rely on a `title` variable
            # left behind by a loop in some earlier cell.
            "text": "Journal Websites",
            "color": "grey"
        },
        height=300,
        width=400
    )
)

# Individual rows, sized by `total_checks` and spread vertically by a random
# `jitter` offset so overlapping points stay visible.
dist = (
    _.mark_circle(
        color=A11Y_COLORS['j'],
        size=20,
        opacity=0.5
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .3], clamp=True).axis(format='%'),
        alt.Y('sjr_best_quartile:N', title='Impact groups', sort=IMPACT_GROUPS),
        alt.Size('total_checks:Q', legend=None),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N'])
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Vertical reference rule at the constant US_GOV_FR_MEAN.
baseline = (
    _.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 4]
    ).encode(
        alt.X('baseline:Q', title='Failure rate'),
        y=alt.Y()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

plot = dist + _ + baseline

plot = plot.properties(
    title="Journal Websites"
)

j = apply_theme(plot)
j

In [None]:
# Show the column names of df['d_agg'] (the 'd' = data-portal table stored
# under the '_agg' suffix) as the cell output.
df['d_agg'].columns

In [None]:
# Replace df['d_agg'] with the rows that have a z-index, cast that z-index to
# int, and add a quantile-based four-level group in the `qaurtile` column
# (the column name is spelled this way on purpose: the plotting cell reads it).
_ = df['d_agg'].copy()
_ = _[_.zindex.notnull()]
_.zindex = _.zindex.apply(int)
quartile_labels = ['Lowest', 'Low', 'High', 'Highest']
df['d_agg'] = _.assign(
    qaurtile=pd.qcut(_.zindex, q=4, labels=quartile_labels)
)

In [None]:
# Data-portal failure rates grouped by the `qaurtile` column: a black tick at
# the group mean, one jittered circle per row, and a dashed vertical rule at
# US_GOV_FR_MEAN.
_ = (
    alt.Chart(
        df['d_agg']
    ).mark_tick(
        color='black',
        thickness=2,
        size=60
    ).encode(
        alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.3], clamp=True).axis(format='%', tickCount=5),
        alt.Y('qaurtile:N', title='Impact Groups', sort=IMPACT_GROUPS)
    ).properties(
        title={
            # Explicit text: this cell must not rely on a `title` variable
            # left behind by a loop in some earlier cell.
            "text": "Data Portals",
            "color": "grey"
        },
        height=300,
        width=400
    )
)

# Individual rows; the size scale is clamped to [1, 50000] total checks.
dist = (
    _.mark_circle(
        color=A11Y_COLORS['d'],
        size=20,
        opacity=0.5
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .3], clamp=True).axis(format='%'),
        alt.Y('qaurtile:N', title='Impact Groups', sort=IMPACT_GROUPS),
        alt.Size('total_checks:Q', legend=None).scale(domain=[1, 50000], clamp=True),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N'])
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Vertical reference rule at the constant US_GOV_FR_MEAN.
baseline = (
    _.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 2]
    ).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

plot = dist + _ + baseline

plot = plot.properties(
    title="Data Portals"
)

apply_theme(plot)

In [None]:
# Compare rows whose `categories` mention "Health Informatics" with all other
# rows of df['j_agg'].
# NOTE: `temp` is the same object as df['j_agg'] (no copy is made), so the
# helper column below is added to that shared table.
temp = df['j_agg']


def _health_informatics_label(categories):
    """Return the y-axis group label for one `categories` value."""
    return 'Health Informatics' if 'Health Informatics' in categories else "Others"


temp['is_health_informatics'] = temp.categories.apply(_health_informatics_label)

# Red tick at the mean failure rate of each group.
_ = alt.Chart(temp).mark_tick(color='#D20000', thickness=2, size=40).encode(
    alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.3], clamp=True).axis(format='%'),
    alt.Y('is_health_informatics:N', title=None),
).properties(height=300, width=500)

# Grey circle per row, sized by `total_checks` and offset by random jitter.
dist = _.mark_circle(color='grey', size=20, opacity=0.5).encode(
    alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .3], clamp=True).axis(format='%'),
    alt.Y('is_health_informatics:N', title=None),
    alt.Size('total_checks:Q', legend=None),
    alt.YOffset('jitter:Q'),
    alt.Tooltip(['title:N', 'url:N']),
).transform_calculate(
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
)

# Dashed vertical rule at the constant US_GOV_FR_MEAN.
baseline = _.mark_rule(color='black', size=2, strokeDash=[4, 2]).encode(
    alt.X('baseline:Q', title='Failure Rate'),
    y=alt.Y(),
).transform_calculate(
    baseline=f"{US_GOV_FR_MEAN}"
)

plot = dist + _ + baseline
plot = plot.properties()  # no overall title is set for this figure

apply_theme(plot)

In [None]:
# Top panel: failure-rate distribution of every row in df['d_agg'] with its
# mean (red tick) and the US_GOV_FR_MEAN rule. Bottom panel: one bar per title
# for the rows whose `page_id` contains "harvard".
copy = df['d_agg'].copy()
copy['harvard'] = copy.page_id.apply(lambda x: 'Harvard' if 'harvard' in x else '')

_ = (
    alt.Chart(
        copy
    ).mark_tick(
        color='#D20000',
        thickness=2,
        size=40
    ).encode(
        alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.3], clamp=True).axis(format='%'),
    ).properties(
        title={
            # Explicit text: the chart is built from df['d_agg'], and the cell
            # must not rely on a `title` variable left behind by an earlier
            # loop (whose last value belongs to a different category).
            "text": "Data Portals",
            "color": "grey"
        },
        height=300,
        width=500
    )
)

# One circle per row, sized by `total_checks` and offset by random jitter.
dist = (
    _.mark_circle(
        color='grey',
        size=20,
        opacity=0.5
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .3], clamp=True).axis(format='%'),
        alt.Size('total_checks:Q'),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N'])
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Bars for the rows flagged as Harvard only.
text = (
    _.mark_bar(
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .3], clamp=True).axis(format='%'),
        alt.Y('title:N'),
    ).transform_filter(
        (datum.harvard == 'Harvard')
    )
)

# Dashed vertical rule at the constant US_GOV_FR_MEAN.
baseline = (
    _.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 2]
    ).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

# `+` binds tighter than `&`: the three layers are combined first and the bar
# chart is stacked underneath. The parentheses only make that explicit.
plot = (dist + _ + baseline) & text

plot = plot.properties(
    title="The Failure Rates of Our Websites"
)

apply_theme(plot)
# df['d_agg'][df['d_agg'].short_name.str.contains('4D')].short_name

In [None]:
# Failure-rate distribution of every row in df['j_agg'] (grey), with the rows
# whose `page_id` contains "informatics" (case-insensitive) redrawn in red,
# the overall mean as a red tick, and a dashed rule at US_GOV_FR_MEAN.
copy = df['j_agg'].copy()
copy['informatics'] = copy.page_id.apply(lambda x: 'Informatics' if 'informatics' in x.lower() else '')

_ = (
    alt.Chart(
        copy
    ).mark_tick(
        color='#D20000',
        thickness=2,
        size=40
    ).encode(
        alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.3], clamp=True).axis(format='%'),
    ).properties(
        title={
            # Explicit text: this cell must not rely on a `title` variable
            # left behind by a loop in some earlier cell.
            "text": "Journal Websites",
            "color": "grey"
        },
        height=300,
        width=500
    )
)

# One grey circle per row, sized by `total_checks` and offset by random jitter.
dist = (
    _.mark_circle(
        color='grey',
        size=20,
        opacity=0.5
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .3], clamp=True).axis(format='%'),
        alt.Size('total_checks:Q'),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N'])
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Same encoding in red, restricted to the rows flagged as Informatics.
informatics = (
    _.mark_circle(
        color='red',
        size=20,
        opacity=0.5
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .3], clamp=True).axis(format='%'),
        alt.Size('total_checks:Q'),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N'])
    ).transform_filter(
        (datum.informatics == 'Informatics')
    )
)

# Dashed vertical rule at the constant US_GOV_FR_MEAN.
baseline = (
    _.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 2]
    ).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

plot = dist + _ + baseline + informatics

plot = plot.properties(
    title="The Failure Rates of Our Websites"
)

apply_theme(plot)
# df['d_agg'][df['d_agg'].short_name.str.contains('4D')].short_name

## By Group?

In [None]:
# One panel per non-government category. Rows are grouped by publisher
# (journals) or host institution (data portals) through the color channel, and
# each group is drawn at its mean impact score (symlog x) and mean failure rate.
plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    if id == 'g':
        continue

    is_journal = id == 'j'
    impact_field = 'h_index' if is_journal else 'citation'
    group_field = 'publisher' if is_journal else 'host_institution'

    # The color scale has a single entry, so every group shares the same blue.
    _ = alt.Chart(df[id + '_agg']).mark_point(opacity=0.3, color='#56B4E9').encode(
        alt.X(f'mean({impact_field}):Q', title=impact_field.capitalize()).scale(type='symlog'),
        alt.Y('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 1]),
        alt.Color(f'{group_field}:N').scale(range=['#56B4E9']),
        alt.Tooltip([f'{group_field}:N']),
    ).properties(
        title={'text': title, 'color': 'grey'},
        width=400,
        height=300,
    )

    # Dashed horizontal rule at the constant US_GOV_FR_MEAN; the x and color
    # encodings are replaced by empty channels for this layer.
    baseline = _.mark_rule(color='black', size=2, strokeDash=[4, 2]).encode(
        alt.Y('baseline:Q', title='Failure Rate'),
        alt.X(),
        alt.Color(),
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )

    _ = _ + baseline
    plot = _ if plot is None else plot | _

plot = plot.properties(title='Failure Rate by Impact Score of Each Website')

apply_theme(plot)

## What are the Issues checked?

In [None]:
# plots = []
# for (id, category, title, color) in A11Y_CATEGORIES:
#     _ = (
#         alt.Chart(
#             df[id]#[df[id].issue_id != 'region']
#         ).mark_bar(
#             color='#56B4E9'
#         ).encode(
#             alt.Y('issue_id:N'),
#             alt.X(f'sum(total_checks):Q', title='The Number of Items Checked'),#.scale(type='log'),
#             alt.Tooltip([f'title:N', f'url:N'])
#         ).properties(
#             title={
#                 'text': title,
#                 'color': 'grey'
#             },
#             width=400,
#             height=1000
#         )
#     )
#     plots.append(_)

# # plot = plot.properties(
# #     title='The Number of Items Checked vs. Failure Rate'
# # )
    
# plot = alt.hconcat(*plots).resolve_scale(y='independent')

# apply_theme(plot)

In [None]:
# Stack the per-issue results of data portals and journals and flag, per row,
# whether at least one violation was recorded.
# The result tables are stored under the ids from A11Y_CATEGORIES ('d', 'j');
# the former keys 'dp' / 'jp' are not created by the loading cell.
merged = pd.concat([df['d'], df['j']])
# 1 when the row has any violation, 0 otherwise (missing counts give 0).
merged['issue_exist'] = (merged.violations > 0).astype(int)
merged

In [None]:
# One tick per `issue_id`, placed at the sum of `issue_exist` over the merged
# data-portal and journal results; issues are sorted by that sum.
_ = (
    alt.Chart(
        merged
    ).mark_tick(
        color='#D20000',
        thickness=2,
        # size=100
    ).encode(
        alt.X('sum(issue_exist):Q', title='The Number of Websites Having Issues'),
        alt.Y('issue_id:N', title=None, sort='x')
    ).properties(
        title={
            # Explicit text: this cell must not rely on a `title` variable
            # left behind by a loop in some earlier cell.
            "text": "Data Portals and Journal Websites",
            "color": "grey"
        },
        height=900,
        width=500
    )
)

# dist = (
#     _.mark_circle(
#         color='grey',
#         size=20,
#         opacity=0.5
#     ).encode(
#         alt.X(f'failure_rate:Q', title='Failure Rate').scale(domain=[0, .5], clamp=True).axis(format='%'),
#         alt.Y(f'issue_id:N', title=None),
#         # alt.Size('total_checks:Q'),
#         alt.YOffset('jitter:Q'),
#         # alt.Tooltip([f'{name_field}:N', f'{url_field}:N'])
#     ).transform_calculate(
#         jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
#     )
# )

# baseline = (
#     _.mark_rule(
#         color='black',
#         size=2,
#         strokeDash=[4, 2]
#     ).encode(
#         alt.X(f'baseline:Q', title='Failure Rate'),
#         y=alt.Y()
#     ).transform_calculate(
#         baseline=f"{US_GOV_FR_MEAN}"
#     )
# )

# plot = dist + _ + baseline
# plot = _ + baseline
plot = _

# The figure shows, per accessibility issue, how many websites have it, so the
# title names that quantity rather than a by-domain failure rate.
plot = plot.properties(
    title="The Number of Biomedical Websites Having Each Accessibility Issue"
)

apply_theme(plot)

## By Country

In [None]:
# Failure rate by country for data portals and journals (the government
# category is skipped). The ten countries with the most rows are shown
# individually, all remaining rows are pooled as "Other", and every panel gets
# a narrow companion column with the row count N.
plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    if id == 'g':
        continue

    # Page-count threshold; the active code below does not apply it.
    CUT = 100
    _ = df[id + '_agg'].copy()

    # Countries ordered by number of rows; keep the first ten.
    df_count = _.country.value_counts().reset_index().sort_values(by='count', ascending=False)
    COUNTRY_FILTER = df_count.country.tolist()[:10]

    # Axis order: median failure rate of the kept countries, highest first.
    in_top_countries = _.country.isin(COUNTRY_FILTER)
    df_fr = (
        _[in_top_countries][['country', 'failure_rate']]
        .groupby(['country'])
        .median()
        .reset_index()
        .sort_values(by='failure_rate', ascending=False)
    )
    COUNTRY_SORT = df_fr.country.tolist()

    # Every country outside the kept list is relabelled "Other".
    _['country_with_other'] = _.country.apply(lambda c: c if c in COUNTRY_FILTER else 'Other')

    # Black tick at the median failure rate of each country.
    base = alt.Chart(_).mark_tick(color='black', thickness=2, size=20).encode(
        alt.X('median(failure_rate):Q', title='Failure rate').scale(domain=[0, 0.2], clamp=True).axis(format='%'),
        alt.Y('country_with_other:N', title=None, sort=COUNTRY_SORT),
    ).properties(
        title={"text": title},
        height=400,
        width=400,
    )

    # Companion column: number of rows per country, without a y axis.
    n = base.mark_text(size=18).encode(
        alt.X(),
        alt.Y('country_with_other:N', sort=COUNTRY_SORT).axis(None),
        alt.Text('count()').format(','),
    ).properties(title='N', width=50)

    # One circle per row, sized by `total_checks` and offset by random jitter.
    dist = base.mark_circle(color=A11Y_COLORS[id], opacity=0.5).encode(
        alt.X('failure_rate:Q', title='Failure rate').scale(domain=[0, 0.2], clamp=True).axis(format='%'),
        alt.Y('country_with_other:N', title=None, sort=COUNTRY_SORT),
        alt.Size('total_checks:Q', legend=None).scale(type='linear', range=[4, 300]),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N']),
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )

    # Dashed vertical rule at the constant US_GOV_FR_MEAN.
    baseline = base.mark_rule(color='black', size=2, strokeDash=[4, 4]).encode(
        alt.X('baseline:Q', title='Failure rate'),
        y=alt.Y(),
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )

    base = (dist + base + baseline).resolve_axis(y='shared') | n
    plot = base if plot is None else plot | base

plot = plot.resolve_axis(y='independent')
plot = plot.properties()  # no overall title is set for this figure

plot = apply_theme(plot)
plot.save('../output/plots/country.png')
plot

## By Continent

In [None]:
# Failure rate by continent for data portals and journals (the government
# category is skipped). Every panel gets a narrow companion column with the
# row count N.
plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    if id == 'g':
        continue

    _ = df[id + '_agg'].copy()

    # Axis order: median failure rate per continent, highest first.
    df_fr = (
        _[['continent', 'failure_rate']]
        .groupby(['continent'])
        .median()
        .reset_index()
        .sort_values(by='failure_rate', ascending=False)
    )
    SORT = df_fr.continent.tolist()

    # Black tick at the median failure rate of each continent.
    base = alt.Chart(_).mark_tick(color='black', thickness=2, size=20).encode(
        alt.X('median(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.2], clamp=True).axis(format='%'),
        alt.Y('continent:N', title=None, sort=SORT),
    ).properties(
        title={"text": title},
        height=400,
        width=400,
    )

    # Companion column: number of rows per continent, without a y axis.
    n = base.mark_text(size=18).encode(
        alt.X(),
        alt.Y('continent:N', sort=SORT).axis(None),
        alt.Text('count()').format(','),
    ).properties(title='N', width=50)

    # One circle per row, sized by `total_checks` and offset by random jitter.
    dist = base.mark_circle(color=color, opacity=0.5).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, 0.2], clamp=True).axis(format='%'),
        alt.Y('continent:N', title=None, sort=SORT),
        alt.Size('total_checks:Q').scale(type='linear', range=[4, 300]).legend(None),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N']),
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )

    # Dashed vertical rule at the constant US_GOV_FR_MEAN.
    baseline = base.mark_rule(color='black', size=2, strokeDash=[4, 4]).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y(),
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )

    base = (dist + base + baseline).resolve_axis(y='shared') | n
    plot = base if plot is None else plot | base

plot = plot.resolve_axis(y='independent')
plot = plot.properties()  # no overall title is set for this figure

plot = apply_theme(plot)
plot.save('../output/plots/continent.png')
plot

## By Organizations

In [None]:
# Failure rate by organization: publishers for journals, host institutions for
# data portals (the government category is skipped). Only the ten
# organizations with the most rows are shown; panels are stacked vertically and
# each gets a narrow companion column with the row count N.
plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    if id == 'g':
        continue

    category_field = 'publisher' if id == 'j' else 'host_institution'
    _ = df[id + '_agg'].copy()

    # Page-count threshold; the active code below does not apply it.
    CUT = 10

    # Organizations ordered by number of rows; keep the first ten.
    df_count = _[category_field].value_counts().reset_index().sort_values(by='count', ascending=False)
    FILTER = df_count[category_field].tolist()[:10]

    # Axis order: median failure rate of the kept organizations, highest first.
    df_fr = (
        _[_[category_field].isin(FILTER)][[category_field, 'failure_rate']]
        .groupby([category_field])
        .median()
        .reset_index()
        .sort_values(by='failure_rate', ascending=False)
    )
    SORT = df_fr[category_field].tolist()

    # Black tick at the median failure rate of each kept organization.
    base = alt.Chart(_[_[category_field].isin(FILTER)]).mark_tick(color='black', thickness=2, size=25).encode(
        alt.X('median(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.2], clamp=True).axis(format='%'),
        alt.Y(f'{category_field}:N', sort=SORT, title='Host Institutions' if id == 'd' else 'Publishers'),
    ).properties(
        title={"text": title},
        height=400,
        width=400,
    )

    # Companion column: number of rows per organization, without a y axis.
    n = base.mark_text(size=18).encode(
        alt.X(),
        alt.Y(f'{category_field}:N', sort=SORT).axis(None),
        alt.Text('count()').format(','),
    ).properties(title='N', width=50)

    # One circle per row, sized by `total_checks` and offset by random jitter.
    dist = base.mark_circle(color=color, size=20, opacity=0.5).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, 0.2], clamp=True).axis(format='%'),
        alt.Y(f'{category_field}:N', sort=SORT, title=None),
        alt.Size('total_checks:Q').scale(type='linear', range=[40, 300]).legend(None),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N']),
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )

    # Dashed vertical rule at the constant US_GOV_FR_MEAN.
    baseline = base.mark_rule(color='black', size=2, strokeDash=[4, 2]).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y(),
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )

    # Layers first, then the N column beside them (`+` binds tighter than `|`).
    base = (dist + base + baseline) | n
    plot = base if plot is None else plot & base

plot = plot.resolve_axis(y='independent')
plot = plot.properties()  # no overall title is set for this figure

apply_theme(plot)

In [None]:
# Bar chart of the median failure rate per title for the rows of df['j_agg']
# published by Nature Publishing Group, with titles sorted by bar length.
_ = df['j_agg'].copy()
nature_rows = _[_.publisher == 'Nature Publishing Group']
plot = alt.Chart(nature_rows).mark_bar().encode(
    alt.X('median(failure_rate):Q'),
    alt.Y('title:N', sort='x'),
)
apply_theme(plot)
# _[_.title == 'Nature Communications']

In [None]:
# List the data portals hosted by Zhejiang University, worst failure rate first.
# The table is read from 'd_agg' (the key the other cells use for this data);
# the former key 'dpg' is not created anywhere visible in this notebook.
_ = df['d_agg']
_[_.host_institution == 'Zhejiang University'].sort_values(by='failure_rate', ascending=False)[['short_name', 'url']]
# _[_.host_institution == 'European Bioinformatics Institute'].sort_values(by='failure_rate', ascending=True).short_name
# _[_.host_institution == 'National Institutes of Health'].sort_values(by='failure_rate', ascending=True).short_name

In [None]:
# List the 40 Nature Publishing Group journals with the lowest failure rate.
# The table is read from 'j_agg' (the key the other cells use for this data);
# the former key 'jpg' is not created anywhere visible in this notebook.
_ = df['j_agg']
_[_.publisher == 'Nature Publishing Group'].sort_values(by='failure_rate', ascending=True)[['title', 'url']].head(40)
# _[_.publisher == 'Springer'].sort_values(by='failure_rate', ascending=True)[['title', 'url']].head(50)
# _[_.publisher == 'BioMed Central Ltd.'].sort_values(by='failure_rate', ascending=True)[['title', 'url']].head(50)
# _[_.title == 'Nature Communications']

## How does it look with NIH websites?

In [None]:
# Failure rate of the data portals hosted by the institutions in NIH_INSTS:
# a red tick at each institution's mean, one jittered circle per row, and a
# dashed vertical rule at US_GOV_FR_MEAN.
name_field = 'title'
category_field = 'host_institution'

_ = df['d_agg'].copy()

_ = _[_.host_institution.isin(NIH_INSTS)]

# Keep the institutions that have more than CUT rows.
CUT = 1
df_count = _[category_field].value_counts().reset_index().sort_values(by='count', ascending=False)
df_count = df_count[df_count['count'] > CUT]
FILTER = df_count[category_field].tolist()

# Axis order: mean failure rate per institution, highest first.
df_fr = _[_[category_field].isin(FILTER)][[category_field, 'failure_rate']].groupby([category_field]).mean().reset_index().sort_values(by='failure_rate', ascending=False)
SORT = df_fr[category_field].tolist()

_ = (
    alt.Chart(
        _[_[category_field].isin(FILTER)]
    ).mark_tick(
        color='#D20000',
        thickness=2,
    ).encode(
        alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 1]).axis(format='%'),
        alt.Y(f'{category_field}:N', sort=SORT, title=None)
    ).properties(
        title={
            # Explicit text: this cell must not rely on a `title` variable
            # left behind by a loop in some earlier cell.
            "text": "NIH Data Portals",
            "color": "grey"
        },
        # height=500,
        width=500
    )
)

# One grey circle per row, offset vertically by random jitter.
dist = (
    _.mark_circle(
        color='grey',
        size=20,
        opacity=0.5
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, 1]).axis(format='%'),
        alt.Y(f'{category_field}:N', sort=SORT, title=None),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N'])
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Dashed vertical rule at the constant US_GOV_FR_MEAN.
baseline = (
    _.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 2]
    ).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

_ = dist + _ + baseline

plot = _

plot = plot.properties(
    title={
        'text': 'The Failure Rate of NIH Data Portals Compared to Baseline',
        'subtitle': [
            '* US Government websites are used as the baseline (Dashed Line)'
        ],
        'subtitleColor': 'grey'
    }
)

apply_theme(plot)

# What are common issues found from NIH data portals?

In [None]:
# Mean failure rate per accessibility issue for the data-portal rows hosted by
# the National Institutes of Health.
# The per-issue table is read from 'd' (the data-portal id in A11Y_CATEGORIES);
# the former key 'dp' is not created by the loading cell.
# NOTE(review): assumes df['d'] carries `host_institution` and `failure_rate`
# columns by this point - confirm against the merging cells.
alt.Chart(
    df['d'][df['d'].host_institution == 'National Institutes of Health']
).mark_bar(
    color='#56B4E9'
).encode(
    alt.Y('issue_id:N'),
    # The bar length is mean(failure_rate), so the axis is labelled as such.
    alt.X('mean(failure_rate):Q', title='Failure Rate (mean)'),
).properties(
    title={
        # Explicit text: this cell must not rely on a `title` variable left
        # behind by a loop in some earlier cell.
        'text': 'NIH Data Portals',
        'color': 'grey'
    },
    width=400,
    height=1000
)

# .gov Data Portals vs. Other Data Portals

In [None]:
# Histogram of data-portal failure rates (rows with a failure rate above zero
# only). Rows whose URL contains ".gov" are drawn on top in dark blue, and
# US_GOV_FR_MEAN is shown as a dashed vertical rule.
id = 'd'
_df = df[id + '_agg'][df[id + '_agg'].failure_rate > 0].copy()
# regex=False matches the literal substring ".gov". With the default
# regex=True the "." is a wildcard, so any URL containing "gov" preceded by
# some character (e.g. "mygov") would be counted as a .gov website.
dotgov = _df[_df.url.str.contains('.gov', regex=False)]

# Named `all_portals` rather than `all` so the builtin all() stays usable in
# the rest of the notebook.
all_portals = (
    alt.Chart(
        _df
    ).mark_bar(
        color='#56B4E9'
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').bin(extent=[0, 1], step=0.01).scale(domain=[0, 0.5], clamp=True).axis(format='%'),
        alt.Y('count()', title="The Number of Websites").scale(type='linear'),
    ).properties(
        title={
            # Explicit text: this cell must not rely on a `title` variable
            # left behind by a loop in some earlier cell.
            "text": "Data Portals",
            "color": "grey"
        },
        height=300,
        width=500
    )
)

onlydotgov = (
    alt.Chart(
        dotgov
    ).mark_bar(
        color='#0072B2'
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').bin(extent=[0, 1], step=0.01).scale(domain=[0, 0.5], clamp=True).axis(format='%'),
        alt.Y('count()', title="The Number of Websites").scale(type='linear'),
    ).properties(
        title={
            "text": "Data Portals",
            "color": "grey"
        },
        height=300,
        width=500
    )
)

# Dashed vertical rule at the constant US_GOV_FR_MEAN.
baseline = (
    all_portals.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 2]
    ).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

_ = all_portals + onlydotgov + baseline
_ = _.properties(
    title={
        'text': 'The Distribution of Failure Rate of Data Portals',
        'subtitle': [
            '* Dashed line represents the average failure rate of US government websites',
            '* Dark blue bars represent .gov websites'
        ],
        'subtitleColor': 'grey'
    }
)
apply_theme(_)

# .gov vs. All Others

In [None]:
# Tag each aggregated table with its resource type (written into the shared
# df[...] tables in place), then build `merged`: data portals and journals
# labelled by whether their URL contains ".gov", followed by the government
# table under its own label.
for table_key, resource_label in (
    ('d_agg', 'Data Portals'),
    ('j_agg', 'Journal Portals'),
    ('g_agg', 'Government Websites'),
):
    df[table_key]['resource_type'] = resource_label


def _biomedical_domain_label(url):
    """Return the domain group of one biomedical website URL."""
    if '.gov' in url.lower():
        return 'Government Biomedical Websites'
    return 'Non-government Biomedical Websites'


merged = pd.concat([df['d_agg'], df['j_agg']])
merged['is_dot_gov'] = merged['url'].apply(_biomedical_domain_label)

df['g_agg']['is_dot_gov'] = 'Government Non-biomedical Websites'
merged = pd.concat([merged, df['g_agg']])

In [None]:
# Failure rate per `is_dot_gov` group of the merged table: a red tick at the
# group mean, one jittered circle per row, and a dashed vertical rule at
# US_GOV_FR_MEAN.
_ = (
    alt.Chart(
        merged
    ).mark_tick(
        color='#D20000',
        thickness=2,
        size=100
    ).encode(
        alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.5], clamp=True).axis(format='%'),
        alt.Y('is_dot_gov:N', title=None)
    ).properties(
        title={
            # Explicit text: this cell must not rely on a `title` variable
            # left behind by a loop in some earlier cell.
            "text": "Biomedical Websites By Domain",
            "color": "grey"
        },
        height=300,
        width=500
    )
)

# One grey circle per row, offset vertically by random jitter.
dist = (
    _.mark_circle(
        color='grey',
        size=20,
        opacity=0.5
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .5], clamp=True).axis(format='%'),
        alt.Y('is_dot_gov:N', title=None),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N'])
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Dashed vertical rule at the constant US_GOV_FR_MEAN.
baseline = (
    _.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 2]
    ).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

plot = dist + _ + baseline

plot = plot.properties(
    title="The Failure Rate of Biomedical Websites By Domain"
)

apply_theme(plot)

In [None]:
# Same merge as for the by-domain figure, with the shorter group labels used
# by the saved plot. SORT fixes the display order of the three groups.
for table_key, resource_label in (
    ('d_agg', 'Data Portals'),
    ('j_agg', 'Journal Portals'),
    ('g_agg', 'Government Websites'),
):
    df[table_key]['resource_type'] = resource_label


def _dot_gov_label(url):
    """Return the group label of one biomedical website URL."""
    if '.gov' in url.lower():
        return 'Biomedical websites w/ .gov'
    return 'Biomedical websites w/o .gov'


merged = pd.concat([df['d_agg'], df['j_agg']])
merged['is_dot_gov'] = merged['url'].apply(_dot_gov_label)

df['g_agg']['is_dot_gov'] = 'US government websites'
merged = pd.concat([merged, df['g_agg']])

SORT = [
    'US government websites',
    'Biomedical websites w/ .gov',
    'Biomedical websites w/o .gov',
]

In [None]:
# Summary layer: one red tick per domain group at its median failure rate.
base = (
    alt.Chart(merged)
    .mark_tick(color='#D20000', thickness=2, size=60)
    .encode(
        x=alt.X(
            'median(failure_rate):Q',
            title='Failure rate (median)',
            scale=alt.Scale(domain=[0, 0.2], clamp=True),
            axis=alt.Axis(format='%'),
        ),
        y=alt.Y('is_dot_gov:N', title=None, sort=SORT),
    )
    .properties(
        title={"text": 'Biomedical Websites By Domain'},
        height=300,
        width=400,
    )
)

# Distribution layer: one dot per row, sized by total_checks and offset
# vertically by a random jitter value (Box-Muller transform).
dist = (
    base.mark_circle(color='grey', size=20, opacity=0.5)
    .encode(
        x=alt.X(
            'failure_rate:Q',
            title='Failure rate (median)',
            scale=alt.Scale(domain=[0, .2], clamp=True),
            axis=alt.Axis(format='%'),
        ),
        y=alt.Y('is_dot_gov:N', title=None, sort=SORT),
        size=alt.Size('total_checks:Q', legend=None),
        yOffset=alt.YOffset('jitter:Q'),
        tooltip=alt.Tooltip(['title:N', 'url:N']),
    )
    .transform_calculate(jitter="sqrt(-2*log(random()))*cos(2*PI*random())")
)

# Side panel: row count of each group rendered as text, with no y axis.
n = (
    base.mark_text(size=18)
    .encode(
        x=alt.X(),
        y=alt.Y('is_dot_gov:N', sort=SORT, axis=None),
        text=alt.Text('count()', format=','),
    )
    .properties(title='N', width=50)
)

# Reference layer: dashed rule at US_GOV_FR_MEAN.
baseline = (
    base.mark_rule(color='black', size=2, strokeDash=[4, 2])
    .encode(
        x=alt.X('baseline:Q', title='Failure rate (median)'),
        y=alt.Y(),
    )
    .transform_calculate(baseline=f"{US_GOV_FR_MEAN}")
)

# `+` binds tighter than `|`: the three layers are stacked first, then the
# count panel is placed to their right.
plot = (dist + baseline + base) | n

plot = apply_theme(plot)
plot.save('../output/plots/domain.png')
plot

In [None]:
# Failure rate of data portals only, split by domain group.
# Summary layer: one red tick per group at its mean failure rate.
# NOTE(review): `title` is not assigned in this cell; it holds whatever an
# earlier cell left bound to that name.
_ = (
    alt.Chart(
        merged[merged.resource_type == 'Data Portals']
    ).mark_tick(
        color='#D20000',
        thickness=2,
        size=100
    ).encode(
        alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.5], clamp=True).axis(format='%'),
        alt.Y('is_dot_gov:N', title=None)
    ).properties(
        title={
            "text": title,
            "color": "grey"
        },
        height=300,
        width=500
    )
)

# Distribution layer: one grey dot per row with random vertical jitter.
# The tooltip uses the `title` and `url` columns, as the other charts built on
# `merged` do; the previous `name_field` / `url_field` variables are not
# defined by any cell that runs with the current A11Y_CATEGORIES tuples.
dist = (
    _.mark_circle(
        color='grey',
        size=20,
        opacity=0.5
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .5], clamp=True).axis(format='%'),
        alt.Y('is_dot_gov:N', title=None),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N'])
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Reference layer: dashed rule at US_GOV_FR_MEAN.
baseline = (
    _.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 2]
    ).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

plot = dist + _ + baseline

plot = plot.properties(
    title="The Failure Rate of Data Portals By Domain"
)

apply_theme(plot)

# Within .gov, Biomedical vs. Others

In [None]:
# Failure rate of .gov websites, split by resource type.
# `is_dot_gov` holds descriptive labels (never the literal '.gov'), so the
# .gov rows are selected by the two labels that denote a .gov site.
# NOTE(review): `title` is not assigned in this cell; it holds whatever an
# earlier cell left bound to that name.
_ = (
    alt.Chart(
        merged[merged.is_dot_gov.isin(['US government websites', 'Biomedical websites w/ .gov'])]
    ).mark_tick(
        color='#D20000',
        thickness=2,
        size=100
    ).encode(
        alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 0.5], clamp=True).axis(format='%'),
        alt.Y('resource_type:N', title=None)
    ).properties(
        title={
            "text": title,
            "color": "grey"
        },
        height=300,
        width=500
    )
)

# Distribution layer: one grey dot per row with random vertical jitter.
dist = (
    _.mark_circle(
        color='grey',
        size=20,
        opacity=0.5
    ).encode(
        alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, .5], clamp=True).axis(format='%'),
        alt.Y('resource_type:N', title=None),
        alt.YOffset('jitter:Q'),
        alt.Tooltip(['short_name:N', 'url:N'])
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Reference layer: dashed rule at US_GOV_FR_MEAN.
baseline = (
    _.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 2]
    ).encode(
        alt.X('baseline:Q', title='Failure Rate'),
        y=alt.Y()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

plot = dist + baseline + _

plot = plot.properties(
    title="The Failure Rate of .gov Websites"
)

apply_theme(plot)

# Curious about the relation with founded_year for data portals

In [None]:
# Failure rate against founded year, using the rows of `merged` that have a
# founded_year value.
# Summary layer: black line (with points) through the median of each year.
# NOTE(review): `title` is not assigned in this cell; it holds whatever an
# earlier cell left bound to that name.
# NOTE(review): `thickness` is documented by Vega-Lite as a tick-mark property;
# it presumably has no effect on a line mark — confirm whether strokeWidth was meant.
_ = (
    alt.Chart(
        merged[~merged.founded_year.isnull()]
    ).mark_line(
        color='black',
        thickness=2,
        point=True,
    ).encode(
        alt.Y('median(failure_rate):Q', title='Failure rate (median)').scale(domain=[0, 0.12], clamp=True).axis(format='%'),
        alt.X('founded_year:O', title=None).axis(labelAngle=-90),
    ).properties(
        title={
            "text": title,
        },
        height=400,
        width=500
    )
)

# Distribution layer: one dot per row, sized by total_checks and offset
# horizontally by a random jitter value.
dist = (
    _.mark_circle(
        color=A11Y_COLORS['d'],
        size=20,
        opacity=0.5
    ).encode(
        alt.Y('failure_rate:Q', title='Failure rate (median)').scale(domain=[0, .12], clamp=True).axis(format='%'),
        alt.X('founded_year:O', title=None).axis(labelAngle=-90),
        alt.Size('total_checks:Q', legend=None),
        alt.XOffset('jitter:Q'),
        alt.Tooltip(['title:N', 'url:N'])
    ).transform_calculate(
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Reference layer: horizontal dashed rule at US_GOV_FR_MEAN.
baseline = (
    _.mark_rule(
        color='black',
        size=2,
        strokeDash=[4, 2]
    ).encode(
        alt.Y('baseline:Q', title='Failure rate (median)'),
        x=alt.X()
    ).transform_calculate(
        baseline=f"{US_GOV_FR_MEAN}"
    )
)

plot = _ + dist + baseline

plot = plot.properties(
    title="Founded Year of Data Portals"
)

plot = apply_theme(plot)
plot.save('../output/plots/year.png')
plot

# How about the alt texts?

In [None]:
# Mean failure rate of the US-government frame, used as the baseline in this section.
# Frames in `df` are keyed by category id plus an AGG_CATEGORIES suffix.
# NOTE(review): 'g_agg_alt' is presumably the alt-text aggregate — confirm.
US_GOV_ALT_FR_MEAN = df['g_agg_alt'].failure_rate.mean()
US_GOV_ALT_FR_MEAN

In [None]:
# Histogram of failure rates for every website category, one panel per
# category placed side by side, each with a dashed rule at US_GOV_ALT_FR_MEAN.
plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    # NOTE(review): assumes df[id + '_agg_alt'] is the alt-text aggregate of
    # this category and has a `failure_rate` column — confirm.
    pages = df[id + '_agg_alt']

    # 1%-wide bins over [0, 1]; counts are shown on a log scale.
    # Rows whose failure_rate is negative or missing are left out.
    hist = (
        alt.Chart(
            pages[pages.failure_rate >= 0]
        ).mark_bar(
            color='#56B4E9'
        ).encode(
            alt.X('failure_rate:Q', title='Failure Rate').bin(extent=[0, 1], step=0.01).scale(domain=[0, 1]).axis(format='%'),
            alt.Y('count()').scale(type='log'),
        ).properties(
            title={
                "text": title,
                "color": "grey"
            },
            height=300,
            width=500
        )
    )

    # Dashed reference rule; the inherited y and color encodings are replaced
    # by empty ones.
    baseline = (
        hist.mark_rule(
            color='black',
            size=2,
            strokeDash=[4, 2]
        ).encode(
            alt.X('baseline:Q', title='Failure Rate'),
            alt.Y(),
            alt.Color()
        ).transform_calculate(
            baseline=f"{US_GOV_ALT_FR_MEAN}"
        )
    )

    panel = hist + baseline

    plot = panel if plot is None else plot | panel

plot = plot.properties(
    title='The Distribution of Failure Rate of Webpages'
)

apply_theme(plot)

In [None]:
# Failure rate by organization (publisher for journals, host institution
# otherwise), one panel per non-government category, compared to a dashed
# baseline rule.

# An organization is shown only when it has strictly more than CUT pages.
CUT = 10

plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    # The government category is left out of this per-organization view.
    if id == 'g':
        continue

    category_field = 'publisher' if id == 'j' else 'host_institution'

    # NOTE(review): assumes df[id + '_agg_alt'] is the alt-text aggregate of
    # this category and carries `title` / `url` columns for the tooltip — confirm.
    pages = df[id + '_agg_alt']

    # Keep the organizations that have more than CUT pages.
    counts = pages[category_field].value_counts()
    shown = pages[pages[category_field].isin(counts[counts > CUT].index)]

    # Order organizations from the highest to the lowest mean failure rate.
    # A local name is used so the global SORT list of domain labels stays intact.
    order = (
        shown.groupby(category_field)['failure_rate']
        .mean()
        .sort_values(ascending=False)
        .index.tolist()
    )

    # Summary layer: one red tick per organization at its mean failure rate.
    ticks = (
        alt.Chart(
            shown
        ).mark_tick(
            color='#D20000',
            thickness=2,
        ).encode(
            alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 1]).axis(format='%'),
            alt.Y(f'{category_field}:N', sort=order, title=None)
        ).properties(
            title={
                "text": title,
                "color": "grey"
            },
            width=500
        )
    )

    # Distribution layer: one grey dot per row with random vertical jitter.
    dist = (
        ticks.mark_circle(
            color='grey',
            size=20,
            opacity=0.5
        ).encode(
            alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, 1]).axis(format='%'),
            alt.Y(f'{category_field}:N', sort=order, title=None),
            alt.YOffset('jitter:Q'),
            alt.Tooltip(['title:N', 'url:N'])
        ).transform_calculate(
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
    )

    # Reference layer: dashed rule at US_GOV_FR_MEAN.
    # NOTE(review): the histogram in this section uses US_GOV_ALT_FR_MEAN;
    # confirm which baseline is intended here.
    baseline = (
        ticks.mark_rule(
            color='black',
            size=2,
            strokeDash=[4, 2]
        ).encode(
            alt.X('baseline:Q', title='Failure Rate'),
            y=alt.Y()
        ).transform_calculate(
            baseline=f"{US_GOV_FR_MEAN}"
        )
    )

    panel = dist + ticks + baseline

    plot = panel if plot is None else plot | panel

# Each panel lists different organizations, so the y axes must not be shared.
plot = plot.resolve_axis(y='independent')

plot = plot.properties(
    title={
        'text': 'The Failure Rate of Webpages by Organization Compared to Baseline',
        'subtitle': [
            f'* Only publishers/institutions with more than {CUT} pages are shown',
            '* US Government websites are used as the baseline (Dashed Line)'
        ],
        'subtitleColor': 'grey'
    }
)

apply_theme(plot)

In [None]:
# Failure rate by country, one panel per non-government category, compared to
# a dashed baseline rule.

# A country is shown only when it has strictly more than CUT pages.
CUT = 10

plot = None
for (id, category, title, color) in A11Y_CATEGORIES:
    # The government category is left out of this per-country view.
    if id == 'g':
        continue

    # NOTE(review): assumes df[id + '_agg_alt'] is the alt-text aggregate of
    # this category and has a `country` column — confirm.
    pages = df[id + '_agg_alt']

    # Keep the countries that have more than CUT pages.
    counts = pages['country'].value_counts()
    COUNTRY_FILTER = counts[counts > CUT].index.tolist()
    shown = pages[pages['country'].isin(COUNTRY_FILTER)]

    # Order countries from the highest to the lowest mean failure rate.
    COUNTRY_SORT = (
        shown.groupby('country')['failure_rate']
        .mean()
        .sort_values(ascending=False)
        .index.tolist()
    )

    # Summary layer: one red tick per country at its mean failure rate.
    ticks = (
        alt.Chart(
            shown
        ).mark_tick(
            color='#D20000',
            thickness=2,
            size=22
        ).encode(
            alt.X('mean(failure_rate):Q', title='Failure Rate').scale(domain=[0, 1]).axis(format='%'),
            alt.Y('country:N', sort=COUNTRY_SORT)
        ).properties(
            title={
                "text": title,
                "color": "grey"
            },
            height=500,
            width=500
        )
    )

    # Distribution layer: one grey dot per row with random vertical jitter.
    dist = (
        ticks.mark_circle(
            color='grey',
            size=10,
            opacity=0.5
        ).encode(
            alt.X('failure_rate:Q', title='Failure Rate').scale(domain=[0, 1]).axis(format='%'),
            alt.Y('country:N', sort=COUNTRY_SORT),
            alt.YOffset('jitter:Q')
        ).transform_calculate(
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
    )

    # Reference layer: dashed rule at US_GOV_FR_MEAN.
    # NOTE(review): the histogram in this section uses US_GOV_ALT_FR_MEAN;
    # confirm which baseline is intended here.
    baseline = (
        ticks.mark_rule(
            color='black',
            size=2,
            strokeDash=[4, 2]
        ).encode(
            alt.X('baseline:Q', title='Failure Rate'),
            y=alt.Y()
        ).transform_calculate(
            baseline=f"{US_GOV_FR_MEAN}"
        )
    )

    panel = dist + ticks + baseline

    plot = panel if plot is None else plot | panel

# Each panel lists different countries, so the y axes must not be shared.
plot = plot.resolve_axis(y='independent')

plot = plot.properties(
    title={
        'text': 'The Failure Rate of Webpages by Country Compared to Baseline',
        'subtitle': [
            f'* Only countries with more than {CUT} pages are shown',
            '* US Government websites are used as the baseline (dashed line)'
        ],
        'subtitleColor': 'grey'
    }
)

apply_theme(plot)

In [None]:
# Mean number of checks per country, one bar chart per website category,
# stacked vertically. Bars are colored by the number of rows per country.
plots = []
for (id, category, title, color) in A11Y_CATEGORIES:
    # NOTE(review): assumes the per-site aggregate df[id + '_agg'] is the frame
    # intended here and that it has `country` and `total_checks` — confirm.
    _ = (
        alt.Chart(
            df[id + '_agg']
        ).mark_bar(
        ).encode(
            alt.X('mean(total_checks):Q'),
            alt.Y('country:N', sort='-x'),
            alt.Color('count()'),
            alt.Tooltip(['count()'])
        ).properties(
            title='Countries with more than 10 sites',
            width=500
        )
    )
    plots.append(_)
apply_theme(alt.vconcat(*plots))

In [None]:
# Bar chart of the mean of column `A3` per country, restricted to the countries
# listed in COUNTRIES; bars are sorted by value and colored by row count.
(
    alt.Chart(_df[_df['country'].isin(COUNTRIES)])
    .mark_bar()
    .encode(
        x=alt.X('mean(A3):Q'),
        y=alt.Y('country:N', sort='-x'),
        color=alt.Color('count()'),
        tooltip=alt.Tooltip(['count()']),
    )
    .properties(title='Countries with more than 10 sites', width=500)
)

In [None]:
# Bar chart of the mean `violation_ratio` per country on a fixed 0-1 axis,
# restricted to the countries listed in COUNTRIES; bars are sorted by value
# and colored by row count.
(
    alt.Chart(_df[_df['country'].isin(COUNTRIES)])
    .mark_bar()
    .encode(
        x=alt.X(
            'mean(violation_ratio):Q',
            title='The ratio of violations among all potential violations',
            axis=alt.Axis(format='.1%'),
            scale=alt.Scale(domain=(0, 1)),
        ),
        y=alt.Y('country:N', sort='-x'),
        color=alt.Color('count()'),
        tooltip=alt.Tooltip(['count()']),
    )
    .properties(title='Countries with more than 10 sites', width=500)
)

In [None]:
# Count rows per host institution, sort by count (descending) and keep the
# institutions that occur more than 10 times; INSTS is the resulting name list.
df_host_institution_count = (
    _df['host_institution']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
    .loc[lambda counts: counts['count'] > 10]
)
INSTS = df_host_institution_count['host_institution'].tolist()
INSTS

In [None]:
# Bar chart of the mean of column `Np` per host institution, restricted to the
# institutions listed in INSTS; bars are sorted by value and colored by row count.
(
    alt.Chart(_df[_df['host_institution'].isin(INSTS)])
    .mark_bar()
    .encode(
        x=alt.X('mean(Np):Q'),
        y=alt.Y('host_institution:N', sort='-x'),
        color=alt.Color('count()'),
        tooltip=alt.Tooltip(['count()']),
    )
    .properties(title='host_institution with more than 10 sites', width=500)
)