# Aggregation

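This notebook combines website metadata with the accessibility issue reports, then saves the merged data along with per-page and per-issue aggregates.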

In [None]:
import pandas as pd

In [None]:
"""
The name of the folder you created under `../data/`
"""
# Every `../data/...` path below is built from this folder name.
TIME_STAMP_FOLDER_NAME = "2024-08-01"

In [None]:
# Keys used to join the page metadata tables onto the reports.
PAGE_COLUMNS = ["resource_category", "website_id", "page_id", "page_type"]

# Keys used to join the labelled issue table onto the reports.
ISSUE_ORIGINAL_COLUMNS = [
    "issue_id",
    "issue_desc",
    "issue_impact",
    "issue_help",
    "issue_url",
]

# All issue-level columns: the original issue fields followed by the
# additional label columns. Used as the group-by key for per-issue aggregates.
ISSUE_COLUMNS = ISSUE_ORIGINAL_COLUMNS + [
    "issue_name",
    "issue_filter",
    "issue_overall_impact",
    "issue_note_overall_impact_hdv",
    "issue_severity",
    "issue_data_related",
    "issue_data_related_rule",
    "issue_pour_category",
    "issue_wcag_level",
    "issue_difficulty_to_fix",
    "issue_missing_label_related",
]

In [None]:
"""
Merge Accessibility Status, Metadata, and Detailed Issues
"""
# page_id is read as a string in every table that declares it; it is one of
# the merge keys in PAGE_COLUMNS used below.
data_portal_metadata = pd.read_csv(
    f'../data/{TIME_STAMP_FOLDER_NAME}/input/data-portal/database-commons.csv',
    dtype={"page_id": "string"},
)
# The journal metadata is read exactly once (the file was previously loaded
# twice into the same variable).
journal_metadata = pd.read_csv(
    f'../data/{TIME_STAMP_FOLDER_NAME}/input/journal/sjr2022.csv',
    dtype={"page_id": "string"},
)
reports = pd.read_csv(
    f"../data/{TIME_STAMP_FOLDER_NAME}/results/accessibility-status.csv",
    dtype={"page_id": "string"},
)
issue = pd.read_csv(
    f'../data/{TIME_STAMP_FOLDER_NAME}/results/unique-issues-additional-labels-aug-9-2024.csv'
)

# Tag each metadata table with its category so that `resource_category`
# can take part in the merge keys.
data_portal_metadata['resource_category'] = 'data-portal'
journal_metadata['resource_category'] = 'journal'

# Left joins keep every report row even when no metadata row matches.
# Non-key columns present in both metadata tables come out of the second
# merge with pandas' default `_x` / `_y` suffixes.
reports = reports.merge(data_portal_metadata, how='left', on=PAGE_COLUMNS)
reports = reports.merge(journal_metadata, how='left', on=PAGE_COLUMNS)

# Attach the additional issue labels to every reported issue.
reports = reports.merge(issue, how='left', on=ISSUE_ORIGINAL_COLUMNS)

In [None]:
"""
Filter out reports with errors
"""
# Keep only rows whose page_url is not Chrome's error-page URL.
is_not_chrome_error_page = reports['page_url'] != 'chrome-error://chromewebdata/'
reports = reports[is_not_chrome_error_page]

In [None]:
"""
Merge columns with the same names across resource types
"""
# For each duplicated column, take the `_x` value and fall back to the `_y`
# value where `_x` is missing, then drop the suffixed originals.
for merged_name, left_name, right_name in (
    ('country', 'country_x', 'country_y'),
    ('url', 'url_x', 'url_y'),
):
    reports[merged_name] = reports[left_name].fillna(reports[right_name])
reports.drop(columns=['country_x', 'country_y', 'url_x', 'url_y'], inplace=True)

In [None]:
"""
Clean up regional columns
"""
def rename_countries(x: str):
    """Return a short country name for a handful of long-form spellings.

    Any value that is not one of the spellings listed below (including
    missing values) is returned unchanged.
    """
    if x in ('Korea Republic of', 'Korea, Republic of'):
        return 'South Korea'
    if x in ('Korea, Democratic People"S Republic of', 'Korea, Democratic People'):
        return 'North Korea'
    if x == 'Russian Federation':
        return 'Russia'
    if x == 'Iran, Islamic Republic Of':
        return 'Iran'
    return x
    
# Normalise every value in the country column, one element at a time.
reports['country'] = reports['country'].apply(rename_countries)

In [None]:
"""
TODO: move this to the Data-Wrangling process
Add missing country
"""
# Rows matching either (column, value) pair get 'United States' as country.
for column, value in (('resource_category', 'government'), ('website_id', 'hubmap')):
    reports.loc[reports[column] == value, 'country'] = 'United States'

In [None]:
"""
Add a "continent" column
"""
# Download the country-to-continent lookup and lower-case its column names
# so that `country` can be used as the merge key.
continent_country_map = pd.read_csv(
    'https://raw.githubusercontent.com/dbouquin/IS_608/master/NanosatDB_munging/Countries-Continents.csv'
).rename(columns={'Country': 'country', 'Continent': 'continent'})

def clean_country_names(x):
    """Rewrite selected country spellings from the continent lookup table.

    The replacement names are presumably the spellings used in
    `reports.country` (verify against the data). Values that match none of
    the entries are returned unchanged.
    """
    # (spelling in the lookup table, replacement), checked in this order.
    replacements = (
        ('US', 'United States'),
        ('Korea, South', 'South Korea'),
        ('Korea, North', 'North Korea'),
        ('Russian Federation', 'Russia'),
        ('Samoa', 'American Samoa'),
        ('Vietnam', 'Viet Nam'),
        ('Serbia', 'Serbia and Montenegro'),
    )
    for lookup_name, replacement in replacements:
        if x == lookup_name:
            return replacement
    return x

continent_country_map['country'] = continent_country_map['country'].apply(clean_country_names)

# Left join: reports whose country is absent from the lookup keep their row
# and get a missing continent.
_ = reports.merge(continent_country_map, on='country', how='left')

# Where no continent was found, fall back to the country name itself, except
# for two countries that are assigned to Europe directly.
continent_missing = _['continent'].isnull()
_.loc[continent_missing, 'continent'] = _.loc[continent_missing, 'country'].apply(
    lambda name: 'Europe' if name in ('Czech Republic', 'Guadeloupe') else name
)

# Some manual correction: country names left in the continent column by the
# fallback above are replaced with their continent.
_['continent'] = _['continent'].apply(
    lambda value: (
        'Europe' if value == 'Serbia'
        else 'Asia' if value in ('Taiwan', 'Hong Kong', 'Brunei Darussalam')
        else 'North America' if value == 'Puerto Rico'
        else value
    )
)

reports = _

In [None]:
"""
Group NIH institutions
"""
# Host institution names that are collapsed into the single label
# 'National Institutes of Health' below.
NIH_INSTS = [
    'National Center for Biotechnology Information',
    'National Cancer Institute',
    'National Heart, Lung, and Blood Institute',
    'National Center for Advancing Translational Sciences',
    'National Institutes of Health',
    'National Human Genome Research Institute',
    'National Institute of Environmental Health Sciences',
    'National Library of Medicine',
    # NOTE(review): NIST is an agency of the U.S. Department of Commerce, not
    # part of NIH - confirm that grouping it under NIH is intended.
    'National Institute of Standards and Technology',
    'National Institute of Health',
    'National Institute on Aging',
    'National Institute of Neurological Disorders & Stroke',
    'National Institute of Child Health and Human Development',
    'National Eye Institute', # none found
    'National Institute of Allergy and Infectious Diseases',
    'National Institute of Arthritis and Musculoskeletal and Skin Diseases'
]
# Build the row mask from `reports` itself. The scratch name `_` is assigned
# in a different cell and is also IPython's last-output variable, so it is
# not guaranteed to hold this table when the cell is run on its own.
reports.loc[reports.host_institution.isin(NIH_INSTS), 'host_institution'] = 'National Institutes of Health'

In [None]:
"""
Group Universities?
"""
# NOTE(review): disabled experiment kept for reference. It targets
# `reports_aggregated`, which is not defined in the visible notebook, and
# would need updating before being re-enabled.
# reports_aggregated.loc[_.host_institution.astype(str).str.contains('University', case=False, regex=False), 'host_institution'] = 'Educational Institution'
# reports_aggregated.loc[_.host_institution.astype(str).str.contains('School', case=False, regex=False), 'host_institution'] = 'Educational Institution'
# reports_aggregated.loc[_.host_institution.astype(str).str.contains('College', case=False, regex=False), 'host_institution'] = 'Educational Institution'

## Save Individual Issues

In [None]:
# One row per reported issue; the DataFrame index is not written.
reports_csv_path = f"../data/{TIME_STAMP_FOLDER_NAME}/results/reports.csv"
reports.to_csv(reports_csv_path, index=False)

## Save Data Aggregated By Page

In [None]:
"""
Store the column names for metadata of pages
"""
# Everything that is neither an issue-level column nor a count/rate column
# is treated as page metadata; the original column order is preserved.
non_metadata_columns = set(ISSUE_COLUMNS) | {'violations', 'passes', 'total_checks', 'failure_rate'}
PAGE_METADATA_COLUMNS = [
    column
    for column in reports.columns.tolist()
    if column not in non_metadata_columns
]

In [None]:
"""
Group by Page
"""
# dropna=False keeps groups whose metadata contains missing values.
grouped_by_page = reports.groupby(PAGE_METADATA_COLUMNS, dropna=False)

# Sum the three count columns within each page, then turn the group keys
# back into ordinary columns.
page_totals = grouped_by_page.agg({
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
})
reports_by_page = page_totals.reset_index()

In [None]:
"""
Calculate the failure rate
"""
# Share of checks that were violations, computed per page.
reports_by_page['failure_rate'] = (
    reports_by_page['violations'] / reports_by_page['total_checks']
)

In [None]:
# One row per page; the DataFrame index is not written.
reports_by_page_csv_path = f"../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-page.csv"
reports_by_page.to_csv(reports_by_page_csv_path, index=False)

## Save Data Aggregated By Issue and Resource Type

In [None]:
"""
Group by Issue and Resource Category
"""
# Despite the variable name, the grouping key is issue columns plus
# resource category (not page). dropna=False keeps groups with missing labels.
issue_and_resource_keys = ISSUE_COLUMNS + ['resource_category']
grouped_by_issue_and_resource = reports.groupby(issue_and_resource_keys, dropna=False)

# Sum the three count columns within each group, then restore the keys as columns.
issue_and_resource_totals = grouped_by_issue_and_resource.agg({
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
})
reports_by_page_and_resource = issue_and_resource_totals.reset_index()

In [None]:
# One row per (issue, resource category); the DataFrame index is not written.
reports_by_issue_and_resource_csv_path = (
    f"../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-issue-and-resource.csv"
)
reports_by_page_and_resource.to_csv(reports_by_issue_and_resource_csv_path, index=False)

## Save Data Aggregated By Issue

In [None]:
"""
Group by Issue
"""
# dropna=False keeps issues whose label columns contain missing values.
grouped_by_issue = reports.groupby(ISSUE_COLUMNS, dropna=False)

# Sum the three count columns within each issue, then restore the keys as columns.
issue_totals = grouped_by_issue.agg({
    'violations': 'sum',
    'passes': 'sum',
    'total_checks': 'sum',
})
reports_by_issue = issue_totals.reset_index()

In [None]:
# One row per issue; the DataFrame index is not written.
reports_by_issue_csv_path = f"../data/{TIME_STAMP_FOLDER_NAME}/results/reports-by-issue.csv"
reports_by_issue.to_csv(reports_by_issue_csv_path, index=False)

## Save Data For Statistical Analysis (Optional)

This data is for sharing with a collaborator.

In [None]:
from datetime import date
from pathlib import Path

# Write one CSV per resource category into the
# `share-for-statistical-analysis` folder. The folder is created here if it
# does not exist, so no manual setup is needed before running this cell.
share_dir = Path(f"../data/{TIME_STAMP_FOLDER_NAME}/share-for-statistical-analysis")
share_dir.mkdir(parents=True, exist_ok=True)

# Resolve the date once so every file written by this run carries the same stamp.
today = date.today()

for c in reports_by_page.resource_category.unique().tolist():
    # Columns that are entirely empty for this category are left out of its file.
    # NOTE(review): unlike the other exports in this notebook, the DataFrame
    # index is written here (no index=False) - confirm that is intended.
    reports_by_page[reports_by_page.resource_category == c].dropna(axis=1, how='all').to_csv(
        share_dir / f"{c}_{today}.csv",
    )