In [None]:
import json
import urllib
import urllib.parse
import urllib.request

import pandas as pd
import requests

# Evaluate Data Portals

In [None]:
# Presumably the number of rows in manually-added-data-portals.json -- TODO confirm.
# Only referenced by the commented-out FILTER_TOP expression below.
NUM_OF_MANUALLY_ADDED = 3
# Upper bound of the row slice applied to the portal table (`dbs[0:FILTER_TOP]`);
# None leaves the table uncapped.
FILTER_TOP = None # 1000 - NUM_OF_MANUALLY_ADDED # select the top N portals

In [None]:
# Read the WAVE API key from a local file so it never appears in the notebook.
with open('../input/api.lab.key', 'r') as f:
    # Strip surrounding whitespace: key files normally end with a newline, and a
    # newline inside the request URL makes every API call fail.
    apiKey = f.read().strip() # WAVE API key

In [None]:
dbs = pd.read_json('../input/database-commons-with-status.json') # already sorted by impact scores

# Uncomment the following line to load original data (i.e., database-commons.json)
# dbs = pd.DataFrame.from_dict(dbs.data.to_dict(), orient='index')

# Keep only portals whose recorded response code is 200.
dbs = dbs[dbs.response == 200]
# The cap is optional, so guard on FILTER_TOP rather than on dbs (the frame is
# never None at this point). iloc makes the positional row slice explicit.
if FILTER_TOP is not None:
    dbs = dbs.iloc[:FILTER_TOP]
dbs

In [None]:
# Hand-curated portals to evaluate in addition to the filtered list.
manually_picked = pd.read_json('../input/manually-added-data-portals.json')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps each frame's original index labels.
dbs = pd.concat([dbs, manually_picked])
dbs

In [None]:
"""
Load existing reports
"""
reportsPath = '../output/a11y-reports.json'
try:
    with open(reportsPath, 'r') as f:
        reports = json.load(f)
except FileNotFoundError:
    # First run: nothing has been fetched yet, so start from an empty list.
    reports = []

# dbIds that already have a stored report; a set keeps the lookup below O(1)
# instead of rescanning the whole report list for every portal.
knownDbIds = {report['dbId'] for report in reports}

"""
Iterate to add missing reports
"""
for _, row in dbs.iterrows():
    dbId = row.dbId
    shortName = row.shortName
    url = row.url

    if dbId in knownDbIds:
        continue

    print('Loading... ', dbId, shortName, url)

    # Percent-encode the parameters: a portal URL containing '?', '&' or '#'
    # would otherwise be split into extra query parameters of the API call.
    query = urllib.parse.urlencode({'key': apiKey.strip(), 'reporttype': 2, 'url': url})
    apiUrl = f'https://wave.webaim.org/api/request?{query}'
    try:
        with urllib.request.urlopen(apiUrl) as response:
            newReport = json.load(response)

        reports.append({
            'dbId': dbId,
            'shortName': shortName,
            'url': url,
            'report': newReport,
        })
        knownDbIds.add(dbId)

        # Checkpoint after every successful fetch so an interrupted run can resume.
        with open(reportsPath, 'w') as f:
            json.dump(reports, f)
    except Exception as err:
        # Best effort: report the failure and move on to the next portal.
        # Catching Exception (not a bare except) lets Ctrl-C stop the loop.
        print('failed', dbId, err)

# Merge Data

In [None]:
"""
Issues By Site
"""
issues = []
with open('../output/a11y-reports.json', 'r') as f:
    reports = json.load(f)

# One output record per (site, issue category, issue id).
for report in reports:
    wave = report['report']
    # Reports flagged as unsuccessful contribute no rows.
    if wave['status']['success'] == False:
        continue

    for category in ('error', 'contrast', 'alert'):
        items = wave['categories'][category]['items']

        # Fields shared by every issue of this site and category.
        base = {
            'dbId': report['dbId'],
            'shortName': report['shortName'],
            'url': report['url'],
            'type': category,
        }
        issues.extend(
            {**base, 'name': item['id'], 'count': item['count']}
            for item in items.values()
        )

issues = pd.DataFrame.from_records(issues)
issues.head()

In [None]:
"""
Site Metadata
"""
# Re-read the same input file that the portal list was loaded from; this copy
# is not filtered by response status.
sites = pd.read_json('../input/database-commons-with-status.json')
sites.head()

In [None]:
"""
Merge
"""
# Attach site metadata to every issue row, matching on dbId. shortName and url
# are dropped from the metadata side because the issue table already has them.
df = (
    issues
    .set_index("dbId")
    .join(sites.drop(columns=['shortName', 'url']).set_index("dbId"))
    .reset_index()
)
df.to_json('../output/a11y-reports-with-metadata.json', orient='records')
df.tail()

In [None]:
"""
Data Cleaning
"""
import math  # NOTE(review): not referenced in the visible cells -- confirm before removing

# Clean a copy so the merged frame `df` is left unchanged.
cdf = df.copy()

def json_to_str(x, k):
    """Flatten a list of records into a sorted, comma-separated string.

    Parameters
    ----------
    x : list of dict, float, or None
        Cell value. A float (such as the NaN that marks a missing cell) or
        None is treated as "no data" and returned unchanged.
    k : str
        Key to read from each record.

    Returns
    -------
    str, float, or None
        The sorted ``record[k]`` values joined with ``', '``, or ``x`` itself
        when it is a float or None.
    """
    # None shows up when the JSON held an explicit null; iterating it would raise.
    if x is None or isinstance(x, float):
        return x
    return ', '.join(sorted(record[k] for record in x))
    
# (column, key) pairs: each listed column is flattened by json_to_str, which
# reads the given key from every record in the cell.
multi_label_columns = [
    ('dataTypeList', 'datatypeName'),
    ('categoryList', 'name'),
    ('keywordsList', 'name'),
    ('dataObjectList', 'name'),
    # ('ratingList', 'name'),  # not flattened; the column is dropped below
    ('organismList', 'organismName'),
]

for column, key in multi_label_columns:
    cdf[column] = cdf[column].apply(json_to_str, args=(key,))

cdf = cdf.drop(columns=['ratingList', 'biodbRanks'])

# Overwrites the file with the cleaned version of the merged table.
cdf.to_json('../output/a11y-reports-with-metadata.json', orient='records')
cdf