In [None]:
import json
import os
import urllib
import urllib.parse
import urllib.request
from pathlib import Path

import pandas as pd

from constants import EVALUATION_DATE_FOLDER

# Evaluate

In [None]:
# Load a WAVE API key stored in a local file.
# Surrounding whitespace is stripped: key files usually end with a newline,
# which would otherwise become part of the request URL built from this key.
with open('../input/api.lab.key', 'r') as f:
    API_KEY = f.read().strip()

## Data Portals

In [None]:
"""
Get filtered resources' page URLs and page IDs
"""
df_pages = pd.read_csv(
    os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'data-portal_pages.csv')
)
df_map = pd.read_csv(os.path.join('..', 'output', 'data-portal_id_map.csv'))

# Only the manually collected subpages are used to pick the IDs for now.
# TODO: use the filtered data from `02-Filter.ipynb`!
df_subpages = pd.read_csv(
    os.path.join('..', 'input', EVALUATION_DATE_FOLDER, 'URL Collection for Subpages - Data Portals.csv')
)

# Prefix the collected source ids with 'dc_' before looking them up in the id map
df_subpages['source_id'] = 'dc_' + df_subpages['source_id'].astype(str)
df_subpages = pd.merge(df_subpages, df_map[['source_id', 'id']], on='source_id', how='left')
FILTER_IDS = list(set(df_subpages['id'].values.tolist()))

# Keep only the pages of the selected IDs that actually have a URL.
df_pages = df_pages[df_pages['id'].isin(FILTER_IDS) & df_pages['url'].notnull()]

# df_pages = df_pages.head(1) # for debugging purposes
df_pages

In [None]:
"""
Collect raw reports all together first using API calls
"""
def collect_raw_reports_and_save(df_pages):
    """
    Download the WAVE report of every page in `df_pages` and store it on disk.

    Each report is written to
    `../output/<EVALUATION_DATE_FOLDER>/raw-reports/<page_id>.json`. Pages whose
    report file already exists are skipped, so the function can be re-run to
    retry only the pages that are still missing.

    Parameters
    ----------
    df_pages : pandas.DataFrame
        One row per page; the `page_id` and `url` columns are read.

    Returns
    -------
    None
        Reports are written to disk. A failure for one page is printed with
        its reason and the loop moves on to the next page.
    """
    RAW_REPORTS_FOLDER = os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'raw-reports')

    # Create a folder to store raw reports, if missing (including missing parents)
    Path(RAW_REPORTS_FOLDER).mkdir(parents=True, exist_ok=True)

    # Collect missing reports one by one, and save them as a file
    for _, row in df_pages.iterrows():
        page_id = row.page_id
        url = row.url

        # Skip if the report already exists
        PAGE_REPORT_PATH = os.path.join(RAW_REPORTS_FOLDER, f'{page_id}.json')
        if os.path.isfile(PAGE_REPORT_PATH):
            print(f'Report for {url} already exists. Skipping ...')
            continue

        # Refer to https://wave.webaim.org/api/docs#!/request/getRequest for the API documentation.
        # The query string is URL-encoded so that a page URL carrying its own
        # `?`, `&` or `#` is sent as one `url` value instead of being split
        # into extra API parameters.
        query = urllib.parse.urlencode({'key': API_KEY, 'reporttype': 2, 'url': url})
        API_URL = f'https://wave.webaim.org/api/request?{query}'

        print(f'Retrieving {url} ...')

        try:
            with urllib.request.urlopen(API_URL) as response:
                # Refer to `../output/raw-reports-examples` to understand the structure of the report
                new_report = json.load(response)

            # Save the raw report
            with open(PAGE_REPORT_PATH, 'w') as report_file:
                json.dump(new_report, report_file)
        except Exception as e:
            # Best effort: report why this page failed and carry on with the
            # next one. `Exception` (not a bare `except`) lets Ctrl-C through.
            print(f'Failed! ({type(e).__name__}: {e})')

In [None]:
# Fetch and store the raw reports for the filtered pages in `df_pages`
# (pages whose report file already exists are skipped by the function).
collect_raw_reports_and_save(df_pages)

In [None]:
# NOTE(review): `RESOURCES` and `get_sampled_resources_metadata` are not defined
# in the visible part of this notebook, and `collect_raw_reports_and_save` as
# defined above takes a single `df_pages` argument, so forwarding `*args` would
# raise a TypeError with that definition — TODO confirm whether this cell is stale.
for args in RESOURCES:
    collect_raw_reports_and_save(
        get_sampled_resources_metadata(*args), 
        *args
    )

In [None]:
"""
Issues By Site
"""
issues = []
metrics = ['error', 'contrast', 'alert']

with open('../output/accessibility-reports.json', 'r') as f:
    reports = json.load(f)

for report in reports:
    # Skip reports whose evaluation was not successful.
    # (`== False` is kept on purpose: only an explicit false value skips.)
    if report['report']['status']['success'] == False:
        continue

    # One output row per (site, metric type, issue id)
    for metric in metrics:
        items = report['report']['categories'][metric]['items']
        issues.extend(
            {
                'dbId': report['dbId'],
                'shortName': report['shortName'],
                'url': report['url'],
                'type': metric,
                'name': item['id'],
                'count': item['count'],
            }
            for item in items.values()
        )

issues = pd.DataFrame.from_records(issues)
issues.head()

In [None]:
"""
Site Metadata
"""
# Load the per-site metadata; it is joined to the issue rows on `dbId` in the Merge cell
sites = pd.read_json('../input/database-commons-with-status.json')
sites.head()

In [None]:
"""
Merge
"""
# Attach the site metadata to every issue row, matching on `dbId`.
# `shortName` and `url` are dropped from `sites` first because `issues`
# already carries columns with those names.
# Note that this rebinds `df_pages` to the merged issue table.
df_pages = (
    issues
    .set_index("dbId")
    .join(
        sites
        .set_index("dbId")
        .drop(columns=['shortName', 'url'])
    )
    .reset_index()
)
df_pages.to_json('../output/accessibility-reports-with-metadata.json', orient='records')
df_pages.tail()

In [None]:
"""
Data Cleaning
"""
# NOTE(review): `math` is not used in the visible cells — confirm before removing
import math

# Work on a copy so the cleaning steps below do not modify `df_pages` itself
cdf = df_pages.copy()

def json_to_str(x, k):
    """
    Flatten a list of records into a sorted, comma-separated string.

    Parameters
    ----------
    x : iterable of dict, float or None
        Cell value. A float (treated as a missing value such as NaN) or None
        is returned unchanged.
    k : str
        Key to read from every record in `x`.

    Returns
    -------
    str, float or None
        The `k` values of all records, sorted and joined with ', ';
        or `x` itself when it is a float or None.
    """
    # Pass missing cells through untouched instead of failing on iteration
    if x is None or isinstance(x, float):
        return x
    return ', '.join(sorted(record[k] for record in x))
    
# (column, key) pairs: every listed column is flattened with `json_to_str`,
# i.e. reduced to the sorted, comma-separated values found under `key`.
multi_label_columns = [
    ('dataTypeList', 'datatypeName'),
    ('categoryList', 'name'),
    ('keywordsList', 'name'),
    ('dataObjectList', 'name'),
    ('organismList', 'organismName'),
]

for column, key in multi_label_columns:
    cdf[column] = cdf[column].apply(json_to_str, args=(key,))

# `ratingList` is not flattened; it is dropped together with `biodbRanks`
cdf = cdf.drop(columns=['ratingList', 'biodbRanks'])

cdf.to_json('../output/a11y-reports-with-metadata.json', orient='records')
cdf