In [None]:
%run _prepare.ipynb

# Gain insights (Notebook 5)

After checking the data for exceptions and distributions, checking key columns as well as time and geographic information, it is time to combine these skills and try to get some interesting insights. 

Topics of this notebook: 

* Which interesting features other than CO2 does the data contain?
* How can we quickly iterate over different data points and selections?
* How can we deal with the different magnitudes of countries' emissions?

In [None]:
# Add up co2 and co2_per_capita over all rows of each (country, continent)
# pair and turn both grouping keys back into ordinary columns.
# NOTE(review): co2_per_capita is a ratio; summing it over a country's rows
# may not be a meaningful quantity — confirm this is intended.
plot_df = (
    countries
    .groupby(["country", "continent"])[["co2", "co2_per_capita"]]
    .sum()
    .reset_index()
)

# One marker per country, coloured by continent, with box plots on both margins.
px.scatter(
    plot_df,
    x="co2",
    y="co2_per_capita",
    color="continent",
    hover_name="country",
    marginal_x="box",
    marginal_y="box",
    width=1000,
    height=600,
)

# Focus on the relevant

Too much information can distract attention from what is important. It is therefore perfectly valid, and actually beneficial, to create more balanced groups.

In [None]:
def construct_groups(single_important_countries=("China", "United States"), data=None):
    """Aggregate the country data into continent groups plus selected single countries.

    Rows of the listed countries are kept as they are (labelled with the
    country name); all other rows are summed per continent and year (labelled
    with the continent).

    Parameters
    ----------
    single_important_countries : iterable of str
        Countries that are reported on their own instead of being folded into
        their continent.
    data : pandas.DataFrame, optional
        Frame to aggregate. It must contain the columns ``country``,
        ``continent``, ``year`` and every summable metric listed below.
        Defaults to the module-level ``countries`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per (name, year) with the summed metrics, a ``name`` column
        and a recomputed ``co2_per_capita`` column, sorted by ``year``. The
        index is not reset, so index labels are not guaranteed to be unique.
    """
    if data is None:
        data = countries

    # Metrics that stay meaningful when added up across countries.
    summable_metrics = ['co2', 'trade_co2', 'cement_co2','coal_co2', 'flaring_co2', 'gas_co2', 'oil_co2', 'other_industry_co2','consumption_co2', 'total_ghg', 'total_ghg_excluding_lucf', 'methane','nitrous_oxide', 'population', 'gdp', 'primary_energy_consumption']
    aggregations = dict.fromkeys(summable_metrics, "sum")

    # Split the rows into "single country" rows and "rest of the continent" rows.
    is_important = data.country.isin(list(single_important_countries))

    # fillna(0) runs before grouping, so a missing continent value becomes the
    # key 0 and those rows form their own group instead of being dropped.
    groups = (
        data.loc[~is_important]
        .fillna(0)
        .groupby(["continent", "year"])
        .agg(aggregations)
        .reset_index()
    )

    groups_important_countries = (
        data.loc[is_important, ["country"] + list(groups.columns)]
        .rename(columns={"country": "name"})
    )

    groups["name"] = groups.continent
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    # and keeps the original index labels just like append did.
    groups = pd.concat([groups, groups_important_countries])

    # Recompute the ratio from the summed totals (a sum of ratios would be wrong).
    # The factor 1e6 equals the original 10e5.
    # NOTE(review): presumably co2 is in million tonnes and population in
    # persons, giving tonnes per person — confirm against the data source.
    groups = groups.assign(co2_per_capita=groups.co2 / groups.population * 1e6).sort_values("year")
    return groups

## Showing the same using another visualization

In [None]:
# Group with the default single countries, then count the rows per group name.
plot_df = construct_groups()
plot_df["name"].value_counts()

In [None]:
# Report six countries on their own and keep only the rows after 1900.
plot_df = construct_groups(
    ["China", "United States", "Russia", "Germany", "India", "Japan"]
).query("year > 1900")

In [None]:
# Add a datetime column (1 January of each year) for the time axis.
# .assign returns a new frame, which avoids a SettingWithCopyWarning on the
# filtered frame produced by .query() and keeps the cell safe to re-run.
plot_df = plot_df.assign(dt=pd.to_datetime(plot_df.year.astype(str) + "-01-01"))

# Base line chart: one line per group name over time.
line_chart = alt.Chart(
    plot_df,
    width=1000
).mark_line().encode(
    x="dt",
    y="co2_per_capita",
    color="name:N",
    tooltip=["name","dt"]
)

# Stack four variants of the base chart vertically, changing only the y field.
line_chart & line_chart.encode(y="co2") & line_chart.encode(y="gdp") & line_chart.encode(y="population")

# Connected Scatter Plot

Connected scatter plots are a less popular chart type that enables the visualization of timelines in two dimensions. However, they can get messy quickly.

In [None]:
# Circles for every (group, year) after 1900; the order channel tells the
# trail layer below in which sequence to connect the points.
chart = (
    alt.Chart(construct_groups().query("year>1900"), height=600, width=1000)
    .mark_circle()
    .encode(
        x="co2:Q",
        y="co2_per_capita:Q",
        color="name:N",
        order="year",
        tooltip=["name", "year"],
    )
)

# Overlay the same encoding drawn as a trail on top of the circles.
alt.layer(chart, chart.mark_trail())

A very similar plot in plotly-express

In [None]:
# Plain scatter of every (group, year) row using the default grouping.
px.scatter(
    construct_groups(),
    x="co2",
    y="co2_per_capita",
    color="name",
    hover_data=["name", "year", "co2", "co2_per_capita"],
    width=1000,
    height=600,
)

In [None]:
# Animated bubble chart: one frame per year, one bubble per group,
# bubble size taken from the population column.
px.scatter(
    construct_groups(
        ["China", "United States", "Russia", "Germany", "India", "Japan"]
    ).query("year > 1900"),
    x="co2",
    y="co2_per_capita",
    size="population",
    color="name",
    hover_data=["name", "year", "co2", "co2_per_capita"],
    animation_frame="year",
    animation_group="name",
    # Fixed axis ranges so the axes do not rescale between frames.
    range_x=[0,12000],
    range_y=[0,25],
    width=1000,
    height=600,
)

# 📝 Turn Chart into an interactive dashboard and find other interesting visualizations

In [None]:
def create_scatter_plot(x="co2", y="co2_per_capita", countries=()):
    """Build a connected scatter plot of two columns for the grouped data.

    Parameters
    ----------
    x, y : str
        Column names (Altair shorthand is accepted) for the two axes.
    countries : iterable of str
        Countries to show individually; all other countries are folded into
        their continent by ``construct_groups``. Inside this function the name
        shadows the module-level ``countries`` frame.

    Returns
    -------
    A layered Altair chart: circles per (group, year) after 1900 with a trail
    connecting them in year order.
    """
    # An immutable default replaces the former mutable `[]`; list() also
    # accepts the tuple handed over by a SelectMultiple widget.
    grouped = construct_groups(list(countries)).query("year>1900")
    chart = alt.Chart(
        grouped,
        height=600,
        width=1000,
    ).mark_circle().encode(
        x=x,
        y=y,
        color="name:N",
        order="year",
        tooltip=["name", "year"]
    )
    return chart + chart.mark_trail()

# Wire the plot to widgets: two dropdowns pick the axis columns and a
# multi-select picks the countries that are shown on their own.
interact(
    create_scatter_plot,
    x=widgets.Dropdown(options=countries.columns, value="co2"),
    y=widgets.Dropdown(options=countries.columns, value="co2_per_capita"),
    countries=widgets.SelectMultiple(options=countries.country.unique().tolist()),
)

# Parallel Coordinates 

In [None]:
# Keep the group name plus the columns listed in cols_co2_sources for the rows
# selected by filter_most_recent (both names are defined outside the visible cells).
# NOTE(review): if filter_most_recent is a boolean Series built from `countries`,
# .loc aligns it by index label with the frame returned by construct_groups(),
# whose index is different — confirm that the intended rows are selected.
df = construct_groups().loc[filter_most_recent,["name", *cols_co2_sources]]
# One horizontal bar per name, split by variable through the colour channel.
alt.Chart(df.melt("name")).mark_bar().encode(y="name", color="variable", x="value", tooltip=["name", "variable", "value"])

In [None]:
# Share of each numeric column in the row total (missing values count as 0 in
# the numerator and are skipped in the total). Rows whose total is 0 yield NaN.
relative_fraction = (
    df.fillna(0)
    .select_dtypes(np.number)
    .div(df.drop(columns=["name"]).sum(axis=1), axis=0)
)
# Put the name back as the first column by position rather than with an index
# join: construct_groups() concatenates frames without resetting the index, so
# labels may repeat, and a join on repeated labels would multiply rows.
relative_fraction.insert(0, "name", df["name"].to_numpy())

In [None]:
# Radar-style chart: one closed line per name, one spoke per variable.
fig = px.line_polar(
    relative_fraction.melt("name"),
    r="value",
    theta="variable",
    color="name",
    line_close=True,
    color_discrete_sequence=px.colors.sequential.Plasma_r,
)
fig.show()

In [None]:
# Long-format frame (continent, country, variable, value) of the columns in
# cols_co2_sources for the rows selected by filter_most_recent; missing values become 0.
# NOTE(review): no visible cell reads this plot_df before it is reassigned
# further down — confirm whether this cell is still needed.
plot_df = countries.loc[filter_most_recent, ["continent", "country"] + cols_co2_sources].melt(["continent", "country"]).fillna(0)

In [None]:
# Parallel Coordinates

In [None]:
# `labels` has to be a dict mapping column names to the text shown on the
# axes; the former list ["name"] is not a valid value for it.
# Here every column name is shown with spaces instead of underscores.
# NOTE(review): the string column "name" is presumably not drawn as an axis,
# since parallel coordinates plot numeric columns — confirm.
px.parallel_coordinates(
    relative_fraction,
    labels={column: column.replace("_", " ") for column in relative_fraction.columns},
)

In [None]:
# Adapted from https://altair-viz.github.io/gallery/parallel_coordinates.html
# One semi-transparent line per name across the variables (long format).
alt.Chart(
    relative_fraction.fillna(0).melt(id_vars=["name"]),
    width=500,
).mark_line().encode(
    x=alt.X('variable:N'),
    y=alt.Y('value:Q'),
    color=alt.Color('name:N'),
    detail=alt.Detail('name:N'),
    opacity=alt.value(0.5),
)

# 📝 Make the time configurable within the dashboard

## Option 1: Using the dashboard

In [None]:
def create_scatter_plot(x, y, date):
    """Scatter two columns of ``raw_data`` for a single day.

    Redefines the ``create_scatter_plot`` declared earlier in this notebook.

    Parameters
    ----------
    x, y : str
        Columns of ``raw_data`` to sum per location and plot.
    date : object with ``strftime``
        Day to show; it is compared with ``raw_data.date`` as a
        ``YYYY-MM-DD`` string.

    Returns
    -------
    A plotly express scatter figure coloured by continent.
    """
    selected_day = date.strftime("%Y-%m-%d")
    day_rows = raw_data.loc[raw_data.date.astype(str) == selected_day]
    # Sum both columns per (location, continent) and flatten the index.
    plot_df = day_rows.groupby(["location", "continent"])[[x, y]].sum().reset_index()
    return px.scatter(
        plot_df,
        x=x,
        y=y,
        color="continent",
        hover_name="location",
        height=600,
        width=1000,
    )

In [None]:
from datetime import datetime
# Despite its name, `today` is the latest date found in raw_data, not the
# current day. The parse expects raw_data.date to hold "YYYY-MM-DD" strings.
today = datetime.strptime(raw_data.date.max(), "%Y-%m-%d").date()

In [None]:
# Offer only numeric columns as axis choices.
numerical_columns = raw_data.select_dtypes(np.number).columns.tolist()

# Widgets: two axis dropdowns and a date picker that starts at the latest date.
x = widgets.Dropdown(options=numerical_columns, value="new_cases")
y = widgets.Dropdown(options=numerical_columns, value="new_deaths")
date = widgets.DatePicker(value=today)

interact(create_scatter_plot, x=x, y=y, date=date)

## Option 2: Using plotly

In [None]:
def create_scatter_plot(x, y):
    """Scatter two columns summed per (date, location, continent).

    Redefines the ``create_scatter_plot`` declared earlier in this notebook.

    Parameters
    ----------
    x, y : str
        Columns to sum and plot.

    Returns
    -------
    A plotly express scatter figure coloured by continent.
    """
    # NOTE(review): `most_recent` is read as a global here, but the visible
    # cells only define it as a local variable of the previous
    # create_scatter_plot — confirm which frame is meant, otherwise calling
    # this function raises NameError.
    plot_df = most_recent.groupby(["date","location","continent"])[[x,y]].sum().reset_index(level=1).reset_index()
    return px.scatter(plot_df, x=x,y=y,color="continent", hover_name="location", height=600, width=1000)

In [None]:
# Same import and computation as in the "Option 1" section; repeated so this
# section can be run without it.
from datetime import datetime
# `today` is the latest date found in raw_data, parsed from a "YYYY-MM-DD" string.
today = datetime.strptime(raw_data.date.max(), "%Y-%m-%d").date()

In [None]:

# Columns shown on the two axes of the animated scatter below.
x = "new_cases_per_million"
y = "new_deaths_per_million"

# NOTE(review): earlier cells read `country` and `year` from `countries`,
# while this cell needs `date` and `location` — confirm which dataset
# `countries` refers to at this point.
plot_df = (
    countries
    .groupby(["date", "location", "continent"])[[x, y]]
    .sum()
    .reset_index(level=1)
    .reset_index()
)

# Day offset relative to `today`, used as the animation frame. Both sides are
# converted to pandas datetimes so the subtraction works whether the date
# column holds strings or datetimes; .assign keeps the cell safe to re-run.
plot_df = plot_df.assign(
    days_past=(pd.to_datetime(plot_df.date) - pd.Timestamp(today)).dt.days
)

In [None]:
# Animated scatter: one frame per day offset, one marker per location.
fig = px.scatter(
    plot_df,
    x=x,
    y=y,
    color="continent",
    hover_name="location",
    animation_frame="days_past",
    # Fixed axis ranges so the axes do not rescale between frames.
    range_x=[0,250000],
    range_y=[0,10000],
)
fig.show()

# Small Multiples

In [None]:
# Lift Altair's row limit so the charts below accept the full frame.
# This is a global setting and stays in effect for the rest of the session.
alt.data_transformers.disable_max_rows()

In [None]:
# Display the frame used by the small-multiples charts below.
plot_df

In [None]:
# Small multiples: one row of panels per month, one column per continent.
# No visible cell creates a `year_month` column on plot_df, so it is derived
# from `date` (as "YYYY-MM") when it is missing; an existing column is kept.
px.scatter(
    plot_df
    if "year_month" in plot_df.columns
    else plot_df.assign(year_month=pd.to_datetime(plot_df["date"]).dt.strftime("%Y-%m")),
    x="new_cases_per_million",
    y="new_deaths_per_million",
    facet_row="year_month",
    facet_col="continent",
)

In [None]:
# Inspect the rows of a single location.
countries.query("location=='Italy'")

In [None]:
# Small multiples: a 100x100 panel per (year_month, continent) showing mean
# new cases against mean new deaths.
# NOTE(review): no visible cell adds a `year_month` column to plot_df —
# confirm it exists before this cell runs.
alt.Chart(plot_df, height=100, width=100).mark_circle().encode(
    x=alt.X(field='new_cases_per_million', aggregate='mean', type='quantitative'),
    # The y channel is built with alt.Y (the original used alt.X here).
    y=alt.Y(field="new_deaths_per_million", aggregate='mean', type='quantitative'),
    color="continent:N", 
    tooltip=["location","new_cases_per_million", "new_deaths_per_million"], 
    #size=alt.Size("life_expectancy", aggregate='mean', type='quantitative'),
    column="continent:N",
    row="year_month:O"
)

In [None]:
# Same small multiples, with the marker size encoding mean life expectancy.
# NOTE(review): no visible cell adds `year_month` or `life_expectancy` to
# plot_df — confirm both columns exist before this cell runs.
alt.Chart(plot_df, height=100, width=100).mark_circle().encode(
    x=alt.X(field='new_cases_per_million', aggregate='mean', type='quantitative'),
    # The y channel is built with alt.Y (the original used alt.X here).
    y=alt.Y(field="new_deaths_per_million", aggregate='mean', type='quantitative'),
    color="continent:N", 
    tooltip=["location","new_cases_per_million", "new_deaths_per_million", "life_expectancy"], 
    size=alt.Size("life_expectancy", aggregate='mean', type='quantitative'),
    column="continent:N",
    row="year_month:O"
)

In [None]:
# NOTE(review): this cell repeats the previous chart unchanged — confirm
# whether it is still needed.
# Small multiples per (year_month, continent); marker size encodes mean life
# expectancy. No visible cell adds `year_month` or `life_expectancy` to
# plot_df — confirm both columns exist before this cell runs.
alt.Chart(plot_df, height=100, width=100).mark_circle().encode(
    x=alt.X(field='new_cases_per_million', aggregate='mean', type='quantitative'),
    # The y channel is built with alt.Y (the original used alt.X here).
    y=alt.Y(field="new_deaths_per_million", aggregate='mean', type='quantitative'),
    color="continent:N", 
    tooltip=["location","new_cases_per_million", "new_deaths_per_million", "life_expectancy"], 
    size=alt.Size("life_expectancy", aggregate='mean', type='quantitative'),
    column="continent:N",
    row="year_month:O"
)