In [None]:
%load_ext autoreload
%autoreload 2
%aimport theme
import pandas as pd
import altair as alt
import os
from theme import apply_theme
from constants import EVALUATION_DATE_FOLDER
alt.data_transformers.disable_max_rows(); # Lift Altair's default 5000-row limit so larger DataFrames can be charted

# Visualize
For the previous version of this code, refer to `./deprecated/04-Visualize.ipynb`.

## Journal Portals

In [None]:
"""
Merge data for visualization
"""
# All journal-portal inputs are read from the same dated output folder.
JOURNAL_PORTAL_DIR = os.path.join('..', 'output', EVALUATION_DATE_FOLDER)

df = pd.read_csv(os.path.join(JOURNAL_PORTAL_DIR, 'journal-portal_evaluation.csv'))

# Can be useful for visualization.
# Every evaluation column except the three bookkeeping ones is treated as an
# error type. This is captured *before* the merges below so that the page and
# metadata columns added later are not included. `Index.drop` raises if one of
# the expected columns is missing, so a malformed CSV still fails here.
ERROR_TYPES = df.columns.drop(['page_id', 'is_success', 'error_count']).tolist()

# Add `id` of resources and `page_type` of pages
df_pages = pd.read_csv(os.path.join(JOURNAL_PORTAL_DIR, 'journal-portal_pages.csv'))
df = df.merge(df_pages[['id', 'page_id', 'page_type']], on='page_id', how='left')

# Add metadata of resources (the `url` column is dropped before merging)
df_meta = pd.read_csv(os.path.join(JOURNAL_PORTAL_DIR, 'journal-portal_metadata.csv')).drop(columns=['url'])
df = df.merge(df_meta, on='id', how='left')

# Show only a preview; displaying the full frame bloats the notebook output.
df.head(3)

In [None]:
# Let's add some useful columns for visualization
# `has_error` is True for pages with at least one error (vectorized comparison
# instead of a per-row Python lambda).
df['has_error'] = df['error_count'] > 0

In [None]:
# Accent color (hex) passed as the mark color to the charts below.
THEME_COLOR = '#CC7DAA'

In [None]:
def mean_errors_in_page_by_page_type(df):
    """
    Plot the mean `error_count` of a page for every `page_type`.

    Two layers are built from one base chart: an error bar computed from the
    raw `error_count` values, and a filled point at `mean(error_count)` that
    is drawn on top of it.

    Expects `df` to provide `error_count` and `page_type` columns.
    Returns the layered chart after it has been passed through `apply_theme`.
    """
    base = alt.Chart(df)

    # Bottom layer: spread of the raw error counts per page type.
    spread = base.mark_errorbar(color=THEME_COLOR, thickness=2).encode(
        alt.X('error_count:Q', title='Mean of Error Count'),
        alt.Y('page_type:N', title='Page Type'),
    )
    # Top layer: the mean itself.
    center = base.mark_point(filled=True, color=THEME_COLOR, size=100).encode(
        alt.X('mean(error_count):Q', title='Mean of Error Count'),
        alt.Y('page_type:N', title='Page Type'),
    )

    layered = spread + center
    sized = layered.properties(
        title='Average # of Errors in a Page by Page Type',
        height=300,
        width=400,
    )
    return apply_theme(sized)
mean_errors_in_page_by_page_type(df)

In [None]:
def error_pages_by_page_type(df):
    """
    Bar chart of the share of pages that have an error, per page type.

    The bar height is `mean(has_error)`, shown as a percentage on a fixed
    0-100% axis; bars are sorted by descending height.

    Expects `df` to provide `has_error` and `page_type` columns.
    Returns the chart after it has been passed through `apply_theme`.

    NOTE(review): a later cell defines another function with this exact name
    (taking the long-format frame), which replaces this one once it runs.
    """
    share_of_pages = (
        alt.Y('mean(has_error):Q', title='Percentage of Pages')
        .axis(format='%', tickCount=10)
        .scale(domain=[0, 1.0])
    )
    page_type = alt.X('page_type:N', title='Page Type', sort='-y')

    bars = alt.Chart(df).mark_bar(color=THEME_COLOR).encode(share_of_pages, page_type)
    sized = bars.properties(
        title='Proportion of Error Pages by Page Type',
        width=700,
    )
    return apply_theme(sized)
error_pages_by_page_type(df)

In [None]:
def num_of_errors_by_resources(df):
    """
    Bar chart of the total number of errors per resource, split by page type.

    Each bar is `sum(error_count)` for one value of the `title` column
    (presumably the resource title coming from the metadata merge — confirm),
    with one column of bars per `page_type`. Bars are sorted by descending
    total.

    Expects `df` to provide `error_count`, `title` and `page_type` columns.
    Returns the chart after it has been passed through `apply_theme`.
    """
    bar = alt.Chart(df).mark_bar(color=THEME_COLOR).encode(
        # symlog keeps zero totals plottable while compressing very large ones.
        alt.X('sum(error_count):Q', title='The Number of Errors').scale(type='symlog').axis(tickCount=20, orient='top'),
        alt.Y('title:N', title=None, sort='-x'),
        alt.Column('page_type:N', title='Page Type'),
    )
    return apply_theme(
        bar.properties(
            # The previous title was copied from the mean-per-page chart and did
            # not describe this chart (totals per resource).
            title='Total # of Errors by Resource and Page Type',
            width=200
        )
    )
num_of_errors_by_resources(df)

In [None]:
# Reshape to long format: one row per page and error type, carrying that
# error type's count in `error_count`.
id_vars = ['page_id', 'page_type']
df_long = pd.melt(
    df[ERROR_TYPES + id_vars],
    id_vars=id_vars,
    value_vars=ERROR_TYPES,
    var_name='error_type',
    value_name='error_count',
)
df_long.head(3)

In [None]:
def average_errors_by_error_type_and_page_type(df_long):
    """
    Plot the mean error count per error type, faceted by page type.

    A filled point at `mean(error_count)` is layered with an error bar built
    from the raw `error_count` values, and the layered chart is repeated in
    one column per `page_type`.

    Expects the long-format frame with `error_type`, `error_count` and
    `page_type` columns.
    Returns the faceted chart after it has been passed through `apply_theme`.
    """
    point = alt.Chart(df_long).mark_point(filled=True, color=THEME_COLOR).encode(
        # The axis is fixed to 0-200; larger values are clamped to the edge
        # instead of stretching the scale.
        alt.X('mean(error_count):Q', title='Average Number of Errors').scale(domain=[0, 200], clamp=True),
        alt.Y('error_type:N', title='Error Type')
    ).properties(
        width=200
    )
    error_bar = alt.Chart(df_long).mark_errorbar(color=THEME_COLOR, thickness=2).encode(
        alt.X('error_count:Q', title='Average Number of Errors'),
        alt.Y('error_type:N', title='Error Type').axis(grid=True)
    )
    return apply_theme(
        (point + error_bar).facet(
            alt.Column('page_type:N', title='Page Type'),
        ).properties(
            # The previous title was identical to the per-page-type chart and
            # left out the error-type breakdown this chart actually shows.
            title='Average # of Errors in a Page by Error Type and Page Type'
        )
    )
average_errors_by_error_type_and_page_type(df_long)

In [None]:
# Let's add some useful columns for visualization
# `has_error` is True for rows whose error type occurred at least once
# (vectorized comparison instead of a per-row Python lambda).
df_long['has_error'] = df_long['error_count'] > 0

In [None]:
def error_pages_by_page_type(df_long):
    """
    Bar chart of the share of pages affected by each error type, per page type.

    Each bar is `mean(has_error)` for one `error_type`, shown as a percentage
    on a fixed 0-100% axis, with one column of bars per `page_type`.

    Expects the long-format frame with `has_error`, `error_type` and
    `page_type` columns.
    Returns the faceted chart after it has been passed through `apply_theme`.

    NOTE(review): an earlier cell defines a function with this exact name for
    the wide-format frame; running this cell replaces it. The name is kept so
    existing calls keep working — consider giving the two distinct names.
    """
    bar = alt.Chart(df_long).mark_bar(color=THEME_COLOR).encode(
        alt.X('mean(has_error):Q', title='Percentage of Pages').scale(domain=[0, 1]).axis(format='%'),
        alt.Y('error_type:N', title='Error Type')
    ).properties(
        width=200
    )
    return apply_theme(
        bar.facet(
            alt.Column('page_type:N', title='Page Type'),
        ).properties(
            # The previous title was identical to the per-page-type chart and
            # left out the error-type breakdown this chart actually shows.
            title='Proportion of Error Pages by Error Type and Page Type'
        )
    )
error_pages_by_page_type(df_long)