# Playground
For the complete visualizations, see `./04-Visualize_Data-Portals.ipynb` and `./04-Visualize_Journal-Portals.ipynb`.

## Visualize
### Import Libraries

In [1]:
import pandas as pd
import altair as alt
import os
# Altair refuses to render datasets with more than 5000 rows by default;
# lift that limit so the tables loaded below can be plotted directly.
# The trailing `;` keeps the call's return value out of the cell output.
alt.data_transformers.disable_max_rows();

### Data Portals
#### Basic Data Wrangling

In [3]:
"""
Input files for the data-portal analysis, hosted in a GitHub Gist.
The same files are also available in the GitHub repository.
"""
# All three CSVs are read from the same pinned Gist revision. Keeping the
# shared prefix in one constant means the revision is changed in one place.
GIST_RAW_BASE = 'https://gist.githubusercontent.com/sehilyi/45ec1eb9115f969279d30fd1734b50e8/raw/68e91ff90b66df05ce4396acefb664c0c1272916'

# Contains the accessibility issues for each page
URL_EVALUATION_RESULTS = f'{GIST_RAW_BASE}/data-portal_evaluation.csv'

# Contains the page types and URLs for each resource
URL_PAGES = f'{GIST_RAW_BASE}/data-portal_pages.csv'

# Contains the metadata for each resource
URL_METADATA = f'{GIST_RAW_BASE}/data-portal_metadata.csv'

In [8]:
"""
Merge the evaluation results with page and resource metadata for visualization
"""
# Raw evaluation results, keyed by `page_id`
df_eval = pd.read_csv(URL_EVALUATION_RESULTS)

# Can be useful for visualization: the evaluation columns that remain after
# excluding the bookkeeping ones (e.g. `label_missing`, `alt_missing`).
ERROR_TYPES = [
    col for col in df_eval.columns
    if col not in ('page_id', 'is_success', 'error_count')
]

# Pages table: supplies the resource `id` and the `page_type` of each page
df_pages = pd.read_csv(URL_PAGES)

# Resource metadata; its `url` column is not carried into the merged table
df_meta = pd.read_csv(URL_METADATA).drop(columns=['url'])

# Left joins keep every evaluated page, even one without a matching
# page or metadata row (its added columns are then NaN).
df = (
    df_eval
    .merge(df_pages[['id', 'page_id', 'page_type']], on='page_id', how='left')
    .merge(df_meta, on='id', how='left')
)

df.head(3)

Unnamed: 0,page_id,is_success,error_count,label_missing,language_missing,button_empty,alt_missing,link_empty,aria_menu_broken,alt_link_missing,...,category_list,keywords_list,data_object_list,organism_list,theme_list,zindex,first_publication_year,search_example,cited_date,ess
0,1_home,True,3.0,1.0,1.0,1.0,,,,,...,"Pathway, Standard ontology and nomenclature",functional annotation,"Animal, Plant, Fungi, Bacteria, Archaea","Homo sapiens, Mus musculus",,1847.8,2007,,2022-12-04 00:01:00,0
1,2_home,True,2.0,2.0,,,,,,,...,"Pathway, Standard ontology and nomenclature",functional annotation,"Animal, Plant, Fungi, Bacteria, Archaea, Virus","Escherichia coli, Homo sapiens",,1419.12,0,,2022-12-04 00:00:07,0
2,3_home,True,835.0,1.0,1.0,1.0,39.0,792.0,1.0,,...,"Genotype phenotype and variation, Expression, ...","tumor, cancer genome, TCGA",Animal,Homo sapiens,,1320.18,0,,2022-12-04 00:02:30,0


In [5]:
# Let's add some useful columns for visualization.
# `has_error` is True when a page reported at least one error. Rows whose
# `error_count` is missing (NaN) compare as False, so they are not flagged.
df['has_error'] = df['error_count'] > 0

In [9]:
# Available columns
df.columns

Index(['page_id', 'is_success', 'error_count', 'label_missing',
       'language_missing', 'button_empty', 'alt_missing', 'link_empty',
       'aria_menu_broken', 'alt_link_missing', 'th_empty', 'heading_empty',
       'link_skip_broken', 'alt_spacer_missing', 'title_invalid',
       'aria_reference_broken', 'alt_input_missing', 'alt_area_missing',
       'alt_map_missing', 'label_empty', 'label_multiple', 'id', 'page_type',
       'source_id', 'short_name', 'accession', 'full_name', 'description',
       'founded_year', 'host_institution', 'address', 'city', 'province',
       'country', 'current_version', 'contact_name', 'contact_email',
       'available_protocol', 'last_update', 'age', 'citation', 'token',
       'expire_date', 'ins', 'group_email', 'is_partner', 'funding',
       'submitter', 'is_new', 'bigsearch_id', 'record_created',
       'last_modified', 'data_type_list', 'category_list', 'keywords_list',
       'data_object_list', 'organism_list', 'theme_list', 'zindex',
   

#### Create Altair Plots!
Reference: https://altair-viz.github.io/

In [27]:
# All panels draw from the same merged table
base = alt.Chart(df)

# Total error count per page type
errors_by_page_type = base.mark_bar().encode(
    alt.Y('sum(error_count):Q', title='A Total of Error Counts'),
    alt.X('page_type:N', title='Page Type'),
)

# Error count of each page against the resource's citation value
errors_vs_citation = base.mark_point().encode(
    alt.Y('error_count'),
    alt.X('citation:Q'),
)

# Mean error count per country
mean_errors_by_country = base.mark_bar().encode(
    alt.Y('mean(error_count):Q', title='The Average of Error Counts'),
    alt.X('country:N'),
)

# Row count per country
count_by_country = base.mark_bar().encode(
    alt.Y('count(error_count):Q', title='The Number of Data Portals'),
    alt.X('country:N'),
)

# Mean error count per founding year (year treated as an ordinal axis)
mean_errors_by_founded_year = base.mark_bar().encode(
    alt.Y('mean(error_count):Q', title='The Average of Error Counts'),
    alt.X('founded_year:O'),
)

# Lay the panels out side by side, in the same left-to-right order
alt.hconcat(
    errors_by_page_type,
    errors_vs_citation,
    mean_errors_by_country,
    count_by_country,
    mean_errors_by_founded_year,
)

### Journals
#### Basic Data Wrangling

In [28]:
"""
Input files for the journal-portal analysis, hosted in a GitHub Gist.
The same files are also available in the GitHub repository.

NOTE: these constants reuse the names defined in the data-portal section,
so from here on they point at the journal files. Run the cells in order.
"""
# All three CSVs are read from the same pinned Gist revision. Keeping the
# shared prefix in one constant means the revision is changed in one place.
GIST_RAW_BASE = 'https://gist.githubusercontent.com/sehilyi/45ec1eb9115f969279d30fd1734b50e8/raw/68e91ff90b66df05ce4396acefb664c0c1272916'

# Contains the accessibility issues for each page
URL_EVALUATION_RESULTS = f'{GIST_RAW_BASE}/journal-portal_evaluation.csv'

# Contains the page types and URLs for each resource
URL_PAGES = f'{GIST_RAW_BASE}/journal-portal_pages.csv'

# Contains the metadata for each resource
URL_METADATA = f'{GIST_RAW_BASE}/journal-portal_metadata.csv'

In [29]:
"""
Merge the evaluation results with page and journal metadata for visualization
"""
# Raw evaluation results, keyed by `page_id`
df_eval = pd.read_csv(URL_EVALUATION_RESULTS)

# Can be useful for visualization: the evaluation columns that remain after
# excluding the bookkeeping ones (e.g. `alt_missing`, `label_multiple`).
ERROR_TYPES = [
    col for col in df_eval.columns
    if col not in ('page_id', 'is_success', 'error_count')
]

# Pages table: supplies the resource `id` and the `page_type` of each page
df_pages = pd.read_csv(URL_PAGES)

# Resource metadata; its `url` column is not carried into the merged table
df_meta = pd.read_csv(URL_METADATA).drop(columns=['url'])

# Left joins keep every evaluated page, even one without a matching
# page or metadata row (its added columns are then NaN).
df = (
    df_eval
    .merge(df_pages[['id', 'page_id', 'page_type']], on='page_id', how='left')
    .merge(df_meta, on='id', how='left')
)

df.head(3)

Unnamed: 0,page_id,is_success,error_count,aria_reference_broken,label_multiple,alt_missing,alt_link_missing,label_empty,button_empty,link_empty,...,total_cites_3years,citable_docs_3years,cites_per_doc_2years,ref_per_doc,country,region,publisher,coverage,categories,areas
0,12_home,True,0.0,,,,,,,,...,168543,3565,2395,2159,United Kingdom,Western Europe,Nature Publishing Group,1869-2022,Multidisciplinary (Q1),Multidisciplinary
1,12_research_article,True,1.0,1.0,,,,,,,...,168543,3565,2395,2159,United Kingdom,Western Europe,Nature Publishing Group,1869-2022,Multidisciplinary (Q1),Multidisciplinary
2,12_none_research_article,True,2.0,,2.0,,,,,,...,168543,3565,2395,2159,United Kingdom,Western Europe,Nature Publishing Group,1869-2022,Multidisciplinary (Q1),Multidisciplinary


In [30]:
# Let's add some useful columns for visualization.
# `has_error` is True when a page reported at least one error. Rows whose
# `error_count` is missing (NaN) compare as False, so they are not flagged.
df['has_error'] = df['error_count'] > 0

In [31]:
# Available columns
df.columns

Index(['page_id', 'is_success', 'error_count', 'aria_reference_broken',
       'label_multiple', 'alt_missing', 'alt_link_missing', 'label_empty',
       'button_empty', 'link_empty', 'heading_empty', 'alt_input_missing',
       'label_missing', 'aria_menu_broken', 'link_skip_broken',
       'alt_spacer_missing', 'th_empty', 'title_invalid', 'id', 'page_type',
       'source_id', 'rank', 'title', 'type', 'issn', 'sjr',
       'sjr_best_quartile', 'h_index', 'total_docs_2022', 'total_docs_3years',
       'total_refs', 'total_cites_3years', 'citable_docs_3years',
       'cites_per_doc_2years', 'ref_per_doc', 'country', 'region', 'publisher',
       'coverage', 'categories', 'areas', 'has_error'],
      dtype='object')

#### Create Altair Plots!
Reference: https://altair-viz.github.io/

In [33]:
# All panels draw from the same merged table, which has one row per
# evaluated page (a journal contributes several pages, e.g. `home` and
# `research_article`).
base = alt.Chart(df)

# Total error count per page type
errors_by_page_type = base.mark_bar().encode(
    alt.Y('sum(error_count):Q', title='A Total of Error Counts'),
    alt.X('page_type:N', title='Page Type'),
)

# Error count of each page against the journal's 3-year citation total
errors_vs_cites = base.mark_point().encode(
    alt.Y('error_count'),
    alt.X('total_cites_3years:Q'),
)

# Mean error count per country
mean_errors_by_country = base.mark_bar().encode(
    alt.Y('mean(error_count):Q', title='The Average of Error Counts'),
    alt.X('country:N'),
)

# Row count per subject area. The rows are journal pages, so the axis is
# labelled as pages rather than with the "Data Portals" title that had been
# carried over from the data-portal section.
pages_by_area = base.mark_bar().encode(
    alt.Y('count(error_count):Q', title='The Number of Journal Pages'),
    alt.X('areas:N'),
)

# Mean number of missing alt-text errors per publisher
mean_alt_missing_by_publisher = base.mark_bar().encode(
    alt.Y('mean(alt_missing):Q', title='The Average of Alt-text Missing'),
    alt.X('publisher:N'),
)

# Lay the panels out side by side, in the same left-to-right order
alt.hconcat(
    errors_by_page_type,
    errors_vs_cites,
    mean_errors_by_country,
    pages_by_area,
    mean_alt_missing_by_publisher,
)