In [None]:
import pandas as pd
import urllib
import requests
import json
import os

# Data Cleaning

In [None]:
# Name of the column that carries each resource's unique identifier.
ID_COLUMN = 'BA_ID'

# One entry per resource category. Every entry is a 5-tuple laid out as
#   (ID prefix, category, metadata file, report file, merged file)
# and is unpacked in exactly that order by the cells below.
RESOURCES = [
    ('P', 'portal', 'portals-metadata.json', 'portals-reports.json', 'portals-merged.json'),
    ('V', 'visualization', 'visualizations-metadata.json', 'visualizations-reports.json', 'visualizations-merged.json'),
    ('J', 'journal', 'journals-metadata.json', 'journals-reports.json', 'journals-merged.json'),
]

## Add `BA_ID` If Missing
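We want to give every individual resource a unique ID. The ID column is named `BA_ID`, after "**B**iomedical **A**ccessibility". Each ID is the category's one-letter prefix followed by a zero-padded six-digit number (e.g., `P000001`).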

In [None]:
# Give every record in each category's metadata and report files a `BA_ID`,
# rewriting both files in place. An ID is the category prefix followed by the
# record's (index + 1) zero-padded to six digits, e.g. `P000001`.
#
# NOTE(review): the metadata row and the report row of a resource receive the
# same ID purely because they sit at the same position in their files. This
# presumably relies on both files listing the same resources in the same
# order -- confirm against whatever produces them.
for (
    id_prefix,
    category,
    metadata_file,
    report_file,
    merged_file
) in RESOURCES:

    metadata_path = f'../output/{metadata_file}'
    report_path = f'../output/{report_file}'

    meta_df = pd.read_json(metadata_path)
    repo_df = pd.read_json(report_path)

    meta_has_id = ID_COLUMN in meta_df
    repo_has_id = ID_COLUMN in repo_df

    if meta_has_id and repo_has_id:
        # Nothing to do: keeps the cell safe to re-run.
        print(f'The {category} files already have column IDs.')
        continue

    if meta_has_id or repo_has_id:
        # Exactly one file carries IDs. Assigning fresh positional IDs to the
        # other could silently disagree with the existing ones, so leave both
        # files untouched (as before) but report the inconsistency instead of
        # claiming that IDs are already present.
        tagged_file = metadata_file if meta_has_id else report_file
        print(
            f'WARNING: only {tagged_file} has the {ID_COLUMN} column; '
            f'the {category} files were left unchanged.'
        )
        continue

    if len(meta_df) != len(repo_df):
        # Positional IDs cannot pair up files of different lengths.
        print(
            f'WARNING: the {category} metadata has {len(meta_df)} records but '
            f'the report file has {len(repo_df)}; positional IDs may not match.'
        )

    for frame in (meta_df, repo_df):
        sequence = pd.Series(frame.index + 1, index=frame.index)
        frame[ID_COLUMN] = id_prefix + sequence.astype(str).str.zfill(6)

    meta_df.to_json(metadata_path, orient="records")
    repo_df.to_json(report_path, orient="records")

## Merge Reports and Metadata
Combine each category's `*-reports.json` and `*-metadata.json` files and generate the corresponding `*-merged.json` file, with one row per resource and issue.

In [None]:
# Flatten every category's reports into one row per (resource, issue), attach
# the resource's metadata to each row, and write the result to the merged file.

# Report categories that are read from each report; each issue row is tagged
# with the category it came from via `issue_type`.
METRICS = ['error', 'contrast', 'alert']

for (
    id_prefix,
    category,
    metadata_file,
    report_file,
    merged_file
) in RESOURCES:

    metadata_path = f'../output/{metadata_file}'
    report_path = f'../output/{report_file}'
    merged_path = f'../output/{merged_file}'

    # `with` closes the files even if parsing fails.
    with open(metadata_path, 'r') as f:
        meta = json.load(f)

    with open(report_path, 'r') as f:
        reports = json.load(f)

    # Index the metadata by ID once instead of scanning the whole list for
    # every report. Records sharing an ID are folded together in file order
    # (later values win), which is what the per-report scan used to produce.
    meta_by_id = {}
    for m in meta:
        meta_by_id.setdefault(m[ID_COLUMN], {}).update(m)

    issues = []

    for report in reports:

        BA_ID = report[ID_COLUMN]

        # Reports whose status is not successful contribute no rows.
        if not report['report']['status']['success']:
            continue

        # Columns shared by every issue row of this resource: the ID first,
        # then all metadata fields (none if the ID has no metadata record).
        row = {ID_COLUMN: BA_ID}
        row.update(meta_by_id.get(BA_ID, {}))

        for metric in METRICS:
            scores = report['report']['categories'][metric]['items']

            row['issue_type'] = metric

            if not scores:
                # Add an explicit zero-issue row so the resource still shows
                # up under this issue type.
                issues.append({
                    **row,
                    'issue_id': f'{metric}None',
                    'issue_count': 0,
                    'issue_desc': 'No Issues Found',
                })
            else:
                for score in scores.values():
                    issues.append({
                        **row,
                        'issue_id': score['id'],
                        'issue_count': score['count'],
                        'issue_desc': score['description'],
                    })

    merged = pd.DataFrame.from_records(issues)
    merged.to_json(merged_path, orient='records')

# Make Columns and Values Readable

In [None]:
# Rename the columns of every merged file to camelCase and rewrite the file in
# place. The mapping is shared by all categories; entries naming a column that
# a given file does not contain are skipped by `DataFrame.rename`, whose
# default is errors='ignore'.
columns = {
    # identifier and issue columns
    'BA_ID': 'baId',
    'issue_type': 'issueType',
    'issue_id': 'issueId',
    'issue_count': 'issueCount',
    'issue_desc': 'issueDesc',

    # capitalized metadata columns
    'Sourceid': 'sourceId',
    'Rank': 'rank',
    'Title': 'title',
    'Type': 'type',
    'Issn': 'issn',
    'SJR': 'sjr',
    'SJR Best Quartile': 'sjrBestQuartile',
    'H index': 'hIndex',
    'Total Docs. (2021)': 'totalDocs2021',
    'Total Docs. (3years)': 'totalDocs3Years',
    'Total Refs.': 'totalRefs',
    'Total Cites (3years)': 'totalCites3Years',
    'Citable Docs. (3years)': 'citableDocs3Years',
    'Cites / Doc. (2years)': 'citesOverDoc2Years',
    'Ref. / Doc.': 'refOverDoc',
    'Country': 'country',
    'Region': 'region',
    'Publisher': 'publisher',
    'Coverage': 'coverage',
    'Categories': 'categories',
    'Areas': 'areas',

    # snake_case metadata columns
    'github_stars': 'githubStars',
    'alt_url': 'altUrl',
}

for id_prefix, category, metadata_file, report_file, merged_file in RESOURCES:
    path = f'../output/{merged_file}'

    # Not ideal, but all column names are changed here in a single pass.
    df = pd.read_json(path).rename(columns=columns)
    df.to_json(path, orient='records')