In [8]:
import json
import os
import urllib
import urllib.parse
import urllib.request

import pandas as pd
import requests

# Evaluate Biomedical Resources

In [9]:
# Load the WAVE API key from a local file so the secret never lives in the notebook.
with open('../input/api.lab.key', 'r') as f:
    # Strip surrounding whitespace: key files usually end with a newline, and
    # API_KEY is later embedded verbatim in the request URL.
    API_KEY = f.read().strip()

In [10]:
# One entry per resource category to evaluate. Each tuple is
#   (category, input file, output file, sample size)
# where both files live under ../output/ and a sample size of None means
# "no limit" (otherwise only the top N rows of the input are evaluated).
RESOURCES = [
    ('portal', 'portals-metadata.json', 'portals-reports.json', None),
    ('visualization', 'visualizations-metadata.json', 'visualizations-reports.json', None),
    ('journal', 'journals-metadata.json', 'journals-reports.json', None),
]

In [11]:
def get_sampled_resources_metadata(category, input_file, output_file, sample_size):
    """Load the metadata table for one resource category and optionally truncate it.

    Parameters
    ----------
    category : str
        Resource category. For ``'portal'`` only rows whose ``response``
        column equals 200 are kept.
    input_file : str
        Name of the JSON metadata file, read from ``../output/``.
    output_file : str
        Unused here; accepted so a ``RESOURCES`` entry can be unpacked
        straight into this function.
    sample_size : int or None
        Keep only the first ``sample_size`` rows, or every row when ``None``.
        The original note says the rows are already sorted by impact score.

    Returns
    -------
    pandas.DataFrame
        The (possibly filtered and truncated) metadata.
    """
    metadata = pd.read_json(f'../output/{input_file}')

    if category == 'portal':
        is_reachable = metadata.response == 200
        metadata = metadata[is_reachable]

    if sample_size is None:
        return metadata

    # Top N rows only.
    return metadata[:sample_size]

In [12]:
def _load_existing_reports(output_path):
    """Return the reports already saved at ``output_path``, or ``[]`` if the file is absent."""
    if not os.path.isfile(output_path):
        return []
    with open(output_path, 'r') as f:
        return json.load(f)


def _build_wave_request_url(api_key, url):
    """Build the WAVE request URL for ``url`` with every query value percent-encoded."""
    query = urllib.parse.urlencode({
        'key': api_key.strip(),  # a trailing newline from the key file would break the request
        'reporttype': 2,
        'url': url,              # encoded so '?', '&' and '#' in the target stay part of it
    })
    return f'https://wave.webaim.org/api/request?{query}'


def _save_reports(reports, output_path):
    """Write ``reports`` as JSON, swapping the file in only after a complete write."""
    tmp_path = f'{output_path}.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(reports, f)
    os.replace(tmp_path, output_path)


def collect_reports_and_save(
    df,
    category,
    input_file,
    output_file,
    sample_size
):
    """Fetch a WAVE report for every URL in ``df`` that has no saved report yet.

    Reports already stored in ``../output/{output_file}`` are loaded first, so
    an interrupted run can be resumed without re-requesting them. The output
    file is rewritten after each new report, which keeps progress on disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Resources to evaluate; only the ``url`` column is read.
    category, input_file, sample_size
        Unused here; accepted so a ``RESOURCES`` entry can be unpacked
        straight into this function.
    output_file : str
        Name of the JSON report file under ``../output/``.

    Returns
    -------
    None
        Results are written to the output file as a list of
        ``{'url': ..., 'report': ...}`` objects.

    Notes
    -----
    A failure for one URL (network error, invalid JSON, failed write) is
    printed and the loop moves on to the next URL. ``KeyboardInterrupt`` is not
    caught, so the run can still be stopped by hand.
    """
    output_path = f'../output/{output_file}'

    reports = _load_existing_reports(output_path)
    # Set of URLs that already have a report, for O(1) membership checks.
    known_urls = {report['url'] for report in reports}

    for _, row in df.iterrows():
        url = row.url
        if url in known_urls:
            continue

        print('Loading... ', url)
        api_url = _build_wave_request_url(API_KEY, url)

        try:
            with urllib.request.urlopen(api_url) as response:
                new_report = json.load(response)

            reports.append({'url': url, 'report': new_report})
            known_urls.add(url)

            # Checkpoint after every report so a later crash loses nothing.
            _save_reports(reports, output_path)
        except Exception as exc:
            # Best effort: report the failure and carry on with the next URL.
            print(f'failed: {url} ({exc!r})')

In [13]:
# Evaluate every configured category: load (and optionally sample) its
# metadata, then fetch and save whichever reports are still missing.
for resource in RESOURCES:
    sampled_metadata = get_sampled_resources_metadata(*resource)
    collect_reports_and_save(sampled_metadata, *resource)

Loading...  https://github.com/ihh/abrowse
Loading...  https://alignmentviewer.org/
Loading...  https://msa.biojs.net/
Loading...  http://espript.ibcp.fr/ESPript/ESPript/
Loading...  http://www.bioinf.org.uk/software/jsav
Loading...  https://logojs.wenglab.org/app/gallery/
Loading...  https://thekaplanlab.github.io/
Loading...  https://www.ebi.ac.uk/Tools/msa/mview/
Loading...  https://www.ncbi.nlm.nih.gov/projects/msaviewer/
Loading...  https://github.com/plotly/react-msa-viewer
Loading...  http://www.bioinformatics.org/strap/
Loading...  http://wasabiapp.org/
Loading...  https://github.com/calipho-sib/feature-viewer
Loading...  http://www.yeastrc.org/mason/
Loading...  https://github.com/chfi/purescript-genetics-browser
Loading...  https://tnt.marlin.pub/articles/introduction
Loading...  https://github.com/TGAC/Aequatus
Loading...  https://alitvteam.github.io/AliTV/d3/AliTV.html
Loading...  http://biodalliance.org/dev/test-comparative.html
Loading...  https://github.com/mjsull/chroma

# Merge Data (deprecated)

In [None]:
# Issues by site: flatten the saved reports into one row per
# (site, issue type, issue id), carrying that issue's count.
with open('../output/accessibility-reports.json', 'r') as f:
    reports = json.load(f)

issue_types = ['error', 'contrast', 'alert']
issue_rows = []

for entry in reports:
    wave_report = entry['report']

    # Skip reports flagged as unsuccessful (comparison kept as `== False`
    # so the skip condition is exactly the original one).
    if wave_report['status']['success'] == False:
        continue

    for issue_type in issue_types:
        items = wave_report['categories'][issue_type]['items']

        # Fields shared by every issue of this type on this site.
        site_fields = {
            'dbId': entry['dbId'],
            'shortName': entry['shortName'],
            'url': entry['url'],
            'type': issue_type,
        }

        for item in items.values():
            issue_rows.append({
                **site_fields,
                'name': item['id'],
                'count': item['count'],
            })

issues = pd.DataFrame.from_records(issue_rows)
issues.head()

In [None]:
# Site metadata: load the per-site records that are later joined onto the
# issue rows via their `dbId` column.
sites = pd.read_json('../input/database-commons-with-status.json')
sites.head(5)

In [None]:
# Merge: attach the site metadata to every issue row, keyed on `dbId`.
issues_by_db = issues.set_index("dbId")

# `shortName` and `url` already exist on the issue rows, so drop the
# metadata copies to avoid overlapping column names in the join.
site_metadata = sites.set_index("dbId").drop(columns=['shortName', 'url'])

df = issues_by_db.join(site_metadata).reset_index()
df.to_json('../output/accessibility-reports-with-metadata.json', orient='records')
df.tail()

In [None]:
# Data cleaning: work on a separate copy so the merged frame `df` stays untouched.
import math  # NOTE(review): nothing in the visible part of this cell uses `math`

cdf = df.copy(deep=True)

def json_to_str(x, k):
    """Collapse a list of records into one sorted, comma-separated string.

    Parameters
    ----------
    x : float or iterable of mappings
        A float is returned unchanged (presumably the NaN pandas uses for a
        missing cell -- confirm against the data). Anything else is iterated
        and each element is indexed with ``k``.
    k : hashable
        Key to pull out of every element of ``x``.

    Returns
    -------
    float or str
        ``x`` itself for floats, otherwise the extracted values sorted and
        joined with ``', '``.
    """
    if isinstance(x, float):
        return x

    labels = sorted(record[k] for record in x)
    return ', '.join(labels)
    
# (column, key) pairs: every cell of `column` is collapsed into a single
# string built from the `key` field of its records (see `json_to_str`).
# `ratingList` is deliberately not listed; it is dropped below instead.
multi_label_columns = [
    ('dataTypeList', 'datatypeName'),
    ('categoryList', 'name'),
    ('keywordsList', 'name'),
    ('dataObjectList', 'name'),
    ('organismList', 'organismName'),
]

for column, key in multi_label_columns:
    # Bind `key` as a default so each lambda keeps its own value.
    flattened = cdf[column].apply(lambda cell, key=key: json_to_str(cell, key))
    cdf[column] = flattened

cdf = cdf.drop(columns=['ratingList', 'biodbRanks'])

cdf.to_json('../output/a11y-reports-with-metadata.json', orient='records')
cdf