In [1]:
import json
import os
import sys
import urllib
import urllib.parse
import urllib.request
from pathlib import Path

import pandas as pd

from constants import EVALUATION_DATE_FOLDER

# Evaluate

In [2]:
# Load the WAVE API key stored in a local file.
# Surrounding whitespace (e.g., the trailing newline most editors append) is
# stripped because the key is later embedded in a request URL.
with open('../input/api.lab.key', 'r') as f:
    API_KEY = f.read().strip()

## Data Portals

In [3]:
"""
Load the data-portal pages (page IDs and URLs) and keep only the filtered resources
"""
df_pages = pd.read_csv(
    os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'data-portal_pages.csv')
)
df_map = pd.read_csv(
    os.path.join('..', 'output', 'data-portal_id_map.csv')
)

# IDs of the resources to evaluate, de-duplicated.
# For now these are only the manually collected ones.
df_filtered = pd.read_csv(
    os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'data-portal_filtered_ids.csv')
)
FILTER_IDS = list(set(df_filtered['id'].tolist()))

# Keep a page only when its resource ID was selected and its URL is present.
df_pages = df_pages.loc[df_pages['id'].isin(FILTER_IDS) & df_pages['url'].notna()]

# df_pages = df_pages.head(1)  # for debugging purposes
df_pages

Unnamed: 0,id,page_type,page_id,url
0,1,home,1_home,https://david.ncifcrf.gov
1,2,home,2_home,https://www.kegg.jp
2,3,home,3_home,https://www.cbioportal.org/
3,4,home,4_home,https://string-db.org/
4,5,home,5_home,https://www.encodeproject.org/
...,...,...,...,...
6609,58,documentation,58_documentation,https://docs.gsea-msigdb.org/#GSEA/GSEA_FAQ/
6610,59,search,59_search,http://www.cellchat.org/cellchatdb/
6615,60,search_result,60_search_result,http://smart.embl.de/smart/search.cgi?keywords...
6616,60,data_entity,60_data_entity,http://smart.embl.de/smart/show_motifs.pl?ID=F...


In [4]:
"""
Collect raw reports all together first using API calls
"""
def _build_wave_api_url(page_url, api_key):
    """Return the WAVE API request URL for `page_url`, with every query parameter percent-encoded."""
    # Page URLs routinely contain `?`, `&` and `#`. Without encoding, those
    # characters would be parsed as part of the API request itself instead of
    # being passed through as the value of the `url` parameter.
    query = urllib.parse.urlencode({
        'key': api_key.strip(),
        'reporttype': 2,
        'url': page_url,
    })
    return f'https://wave.webaim.org/api/request?{query}'


def collect_raw_reports_and_save(df_pages, RAW_REPORTS_FOLDER):
    """
    Request a WAVE report for every page that does not have one yet and save it as JSON.

    Parameters
    ----------
    df_pages : pandas.DataFrame
        One row per page. The `page_id` column names the report file and the
        `url` column is the page sent to the WAVE API.
    RAW_REPORTS_FOLDER : str
        Folder in which `<page_id>.json` files are stored. It is created,
        including missing parent folders, when it does not exist.

    Notes
    -----
    Pages whose report file already exists are skipped, so the function can be
    re-run to resume an interrupted collection. A failure on one page is
    printed and the loop moves on to the next page. The API key is read from
    the module-level `API_KEY`.
    """
    # The target is nested (e.g., `.../raw-reports/data-portal`), so the
    # intermediate folders may be missing on a fresh run.
    Path(RAW_REPORTS_FOLDER).mkdir(parents=True, exist_ok=True)

    # Collect missing reports one by one, and save each of them as a file
    for _, row in df_pages.iterrows():
        page_id = row.page_id
        url = row.url

        # Skip if the report already exists
        PAGE_REPORT_PATH = os.path.join(RAW_REPORTS_FOLDER, f'{page_id}.json')
        if os.path.isfile(PAGE_REPORT_PATH):
            print(f'Report for {url} already exists. Skipping ...')
            continue

        # Refer to https://wave.webaim.org/api/docs#!/request/getRequest for the API documentation
        API_URL = _build_wave_api_url(url, API_KEY)

        print(f'Retrieving {url} ...')

        try:
            with urllib.request.urlopen(API_URL) as response:
                # Refer to `../output/raw-reports-examples` to understand the structure of the report
                new_report = json.load(response)

            # Save the raw report
            with open(PAGE_REPORT_PATH, 'w') as report_file:
                json.dump(new_report, report_file)
        except Exception as err:
            # `Exception` (rather than a bare `except`) keeps the best-effort
            # behavior while still letting Ctrl-C / kernel interrupts stop the loop.
            print(f'Failed! ({err})')

In [5]:
# Folder holding the raw report files (`<page_id>.json`) of the data portals
RAW_REPORTS_FOLDER = os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'raw-reports', 'data-portal')

In [None]:
# Request and save a raw report for every data-portal page that does not have
# one yet; pages with an existing report file are skipped.
collect_raw_reports_and_save(
    df_pages,
    RAW_REPORTS_FOLDER
)

In [6]:
def create_dataframe_from_raw_reports(df_pages, RAW_REPORTS_FOLDER):
    """
    Flatten the saved raw reports into a dataframe with one row per page.

    The raw reports are assumed to be already collected; a page without a
    report file is reported on stdout and left out of the result.

    Parameters
    ----------
    df_pages : pandas.DataFrame
        One row per page; only the `page_id` column is read.
    RAW_REPORTS_FOLDER : str
        Folder that contains the `<page_id>.json` report files.

    Returns
    -------
    pandas.DataFrame
        Columns `page_id` and `is_success` for every report found. For
        successful reports also `error_count`, `alert_count`, and one
        `error_<id>` / `alert_<id>` column per reported item, holding its count.
        Items a page does not have are left missing (NaN) by pandas.
    """
    # All reports to be saved as a dataframe
    reports = []
    for _, row in df_pages.iterrows():
        # The primary key column we use to identify each page
        page_id = row.page_id

        PAGE_REPORT_PATH = os.path.join(RAW_REPORTS_FOLDER, f'{page_id}.json')

        if not os.path.isfile(PAGE_REPORT_PATH):
            print(f'Odd. Report for {page_id} does not exist. Skipping ...')
            continue

        with open(PAGE_REPORT_PATH, 'r') as f:
            page_report = json.load(f)

        # `or {}` guards against a report without a `status` entry
        is_success = (page_report.get('status') or {}).get('success')

        # The dictionary to be saved as a row in the dataframe, starting with basic information
        cleaned_report = {
            'page_id': page_id,
            'is_success': is_success
        }

        # Reset for every page. Otherwise a failed report would either raise a
        # NameError (first row) or silently inherit the items of the previous page.
        error_items = {}
        alert_items = {}

        if is_success:
            categories = page_report.get('categories') or {}
            errors = categories.get('error') or {}
            alerts = categories.get('alert') or {}

            error_items = errors.get('items') or {}
            alert_items = alerts.get('items') or {}

            cleaned_report |= {
                'error_count': errors.get('count'),
                'alert_count': alerts.get('count'),
            }

        # Example of `items`:
        # {
        #   'label_missing': {'id': 'label_missing', 'description': 'Missing form label', 'count': 1},
        #   'language_missing': {'id': 'language_missing', 'description': 'Language missing or invalid', 'count': 1},
        #   'button_empty': {'id': 'button_empty', 'description': 'Empty button', 'count': 1}
        # }
        for item in error_items.values():
            cleaned_report[f"error_{item.get('id')}"] = item.get('count')
        for item in alert_items.values():
            cleaned_report[f"alert_{item.get('id')}"] = item.get('count')

        reports.append(cleaned_report)

    return pd.DataFrame.from_records(reports)

In [None]:
# Flatten the collected raw reports into one row per page, then save the table
# as a CSV file (without the dataframe index) in this evaluation date's folder.
df_reports = create_dataframe_from_raw_reports(df_pages, RAW_REPORTS_FOLDER)
df_reports.to_csv(os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'data-portal_evaluation.csv'), index=False)

## Journals

In [7]:
"""
Load the journal pages (page IDs and URLs) and keep only the filtered resources
"""
df_pages = pd.read_csv(
    os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'journal-portal_pages.csv')
)
df_map = pd.read_csv(
    os.path.join('..', 'output', 'journal-portal_id_map.csv')
)

# IDs of the resources to evaluate, de-duplicated.
# For now these are only the manually collected ones.
# TODO: use the filtered data from `02-Filter.ipynb`!
df_filtered = pd.read_csv(
    os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'journal-portal_filtered_ids.csv')
)
FILTER_IDS = list(set(df_filtered['id'].tolist()))

# Keep a page only when its resource ID was selected and its URL is present.
df_pages = df_pages.loc[df_pages['id'].isin(FILTER_IDS) & df_pages['url'].notna()]

# df_pages = df_pages.head(1)  # for debugging purposes
df_pages

Unnamed: 0,id,page_type,page_id,url
0,27956,home,27956_home,https://onlinelibrary.wiley.com/journal/15424863
2,27958,home,27958_home,https://www.nature.com/nrm/
3,27959,home,27959_home,https://www.cell.com/cell/home
4,27960,home,27960_home,https://www.nejm.org/
5,27961,home,27961_home,https://www.nature.com/nm/
...,...,...,...,...
27927,55883,home,55883_home,https://rojournal.elpub.ru/jour
27938,55894,home,55894_home,https://www.apa.org/pubs/journals/tps
27943,55899,home,55899_home,http://online.eastview.com/projects/voprosy_is...
27944,55900,home,55900_home,https://www.keaipublishing.com/en/journals/wat...


In [8]:
# Folder holding the raw report files (`<page_id>.json`) of the journals.
# NOTE: this rebinds the same global name used for the data portals above.
RAW_REPORTS_FOLDER = os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'raw-reports', 'journal-portal')

In [9]:
# Request and save a raw report for every journal page that does not have
# one yet; pages with an existing report file are skipped.
collect_raw_reports_and_save(
    df_pages,
    RAW_REPORTS_FOLDER
)

Report for https://onlinelibrary.wiley.com/journal/15424863 already exists. Skipping ...
Report for https://www.nature.com/nrm/ already exists. Skipping ...
Report for https://www.cell.com/cell/home already exists. Skipping ...
Report for https://www.nejm.org/ already exists. Skipping ...
Report for https://www.nature.com/nm/ already exists. Skipping ...
Report for https://www.cdc.gov/mmwr/indrr_2020.html already exists. Skipping ...
Report for https://www.nature.com/nbt/ already exists. Skipping ...
Report for https://www.nature.com/nrc/ already exists. Skipping ...
Report for http://www.nature.com/nature/index.html already exists. Skipping ...
Report for https://www.nature.com/nrg/ already exists. Skipping ...
Report for https://www.nature.com/nrd/ already exists. Skipping ...
Report for http://www.nature.com/nri/index.html already exists. Skipping ...
Report for http://www.cdc.gov/mmwr/ already exists. Skipping ...
Report for https://www.nature.com/ng/ already exists. Skipping ...
R

In [None]:
# Flatten the collected raw reports into one row per page, then save the table
# as a CSV file (without the dataframe index) in this evaluation date's folder.
df_reports = create_dataframe_from_raw_reports(df_pages, RAW_REPORTS_FOLDER)
df_reports.to_csv(os.path.join('..', 'output', EVALUATION_DATE_FOLDER, 'journal-portal_evaluation.csv'), index=False)