In [None]:
%run _prepare.ipynb

# Gain insights (Notebook 5)

After checking the data for exceptions and distributions, checking key columns as well as time and geographic information, it is time to combine these skills and try to get some interesting insights. 

Topics of this notebook: 

* Which other interesting features other than co2 does the data contain?
* How can we quickly iterate over different datapoints and selections?
* How can we deal with the different magnitudes of countries' emissions?

# How to deal with the different magnitudes of countries?

The extreme difference of values makes it difficult to visualize points within the same chart. In the following, we discover some ways to deal with this

## Combine single-instance and aggregated plots

Pros:

* The aggregated sideplots can help understanding the messy parts of the scatterplot 

Cons:

* Some points might still be too close together to tell apart

In [None]:
# Sum both metrics over all rows of each country; "continent" is part of the
# grouping key so that it is available as a colour dimension afterwards.
plot_df = (
    countries
    .groupby(["country", "continent"])[["co2", "co2_per_capita"]]
    .sum()
    .reset_index()
)

# Box plots on both margins summarise each axis next to the per-country points.
marginal_scatter_options = dict(
    x="co2",
    y="co2_per_capita",
    color="continent",
    hover_name="country",
    height=600,
    width=1000,
    marginal_x="box",
    marginal_y="box",
)
px.scatter(plot_df, **marginal_scatter_options)

## Use logarithmic axes

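Pros:

* Logarithmic axes spread out values that span several orders of magnitude, so small and large emitters stay visible in the same chart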

In [None]:
# Same per-country totals as for the linear chart: sum over all rows of a country,
# with "continent" kept as a grouping key so it can be used for colouring.
plot_df = (
    countries
    .groupby(["country", "continent"])[["co2", "co2_per_capita"]]
    .sum()
    .reset_index()
)

# Both axes are logarithmic; the marginal box plots are kept.
log_scatter_options = dict(
    x="co2",
    y="co2_per_capita",
    log_x=True,
    log_y=True,
    color="continent",
    hover_name="country",
    height=600,
    width=1000,
    marginal_x="box",
    marginal_y="box",
)
px.scatter(plot_df, **log_scatter_options)

## Visualize Changes

If the entities have extremely different magnitudes, one option is to analyze changes instead of absolute values.

In [None]:
aggregation_var = "co2_per_capita"
other_cols = ["population"]
base_year = 1990
# Year compared against `base_year`; named once so the row filter, the column
# prefix and the chart encodings cannot drift apart.
target_year = 2020
# Upper bound shared by both axes and by the reference diagonal.
axis_max = 40

value_cols = [aggregation_var, *other_cols]
# One frame per year, indexed by (country, continent), columns prefixed "<year>_".
target_snapshot, base_snapshot = (
    countries.loc[countries.year == year]
    .groupby(["country", "continent"])[value_cols]
    .mean()
    .add_prefix(f"{year}_")
    for year in (target_year, base_year)
)
# Left join on the target year. Missing values become 0 through fillna(0), so
# entries without a base-year value are drawn on the y-axis.
plot_df = target_snapshot.join(base_snapshot).reset_index().fillna(0)

chart = alt.Chart(
    plot_df, height=300, width=800
).mark_circle().encode(
    x=alt.X(f"{base_year}_{aggregation_var}:Q", scale=alt.Scale(domain=[0, axis_max])),
    y=alt.Y(f"{target_year}_{aggregation_var}:Q", scale=alt.Scale(domain=[0, axis_max])),
    color="continent",
    size=f"{target_year}_population:Q",
    tooltip=["country", f"{base_year}_{aggregation_var}", f"{target_year}_{aggregation_var}"],
)

# NOTE(review): `min_max` is not used by the chart below; it is kept only in case
# a later cell reads it.
min_max = plot_df.filter(like=aggregation_var).max().min()

# Points on this line have the same value in both years.
diagonal = alt.Chart(
    pd.DataFrame({"x": [0, axis_max], "y": [0, axis_max]})
).mark_line(color="red").encode(
    x="x",
    y="y",
)
chart + diagonal

## Create balanced subgroups

Too much information can distract attention from the important parts. Therefore it is totally valid, and actually beneficial, to create more balanced groups.

In [None]:
def construct_groups(single_important_countries = ["China", "United States"]):
    """Aggregate `countries` per continent and year, keeping selected countries separate.

    Rows of the module-level `countries` frame whose `country` is listed in
    `single_important_countries` are kept as individual rows; all remaining rows
    are summed per (continent, year).

    Parameters
    ----------
    single_important_countries : list-like of str
        Country names to keep as their own group. The default list is only read,
        never mutated.

    Returns
    -------
    pandas.DataFrame
        Sorted by `year`, with the columns `continent`, `year`, the summed
        metrics, `name` (continent name for aggregated rows, country name for
        the individually kept rows) and a recomputed `co2_per_capita`.
        The index is not reset and may contain duplicate labels.
    """
    # Metrics that stay meaningful when added up over several countries.
    summable_metrics = ['co2', 'trade_co2', 'cement_co2','coal_co2', 'flaring_co2', 'gas_co2', 'oil_co2', 'other_industry_co2','consumption_co2', 'total_ghg', 'total_ghg_excluding_lucf', 'methane','nitrous_oxide', 'population', 'gdp', 'primary_energy_consumption']
    aggregations = dict.fromkeys(summable_metrics, "sum")

    # Split the rows into "everything else" (aggregated) and the highlighted countries.
    filter_term = countries.country.isin(single_important_countries)
    # NOTE(review): fillna(0) is applied to every column before grouping, so a
    # missing `continent` would become the group key 0 — confirm that cannot occur.
    groups = (
        countries.loc[~filter_term]
        .fillna(0)
        .groupby(["continent", "year"])
        .agg(aggregations)
        .reset_index()
    )

    # Must be built before `name` is added to `groups`: the column list is taken
    # from `groups` and `countries` has no `name` column to select.
    groups_important_countries = (
        countries.loc[filter_term, ["country"] + list(groups.columns)]
        .rename(columns={"country": "name"})
    )

    groups["name"] = groups.continent
    # DataFrame.append was removed in pandas 2.0; pd.concat with its defaults
    # (original index labels kept, columns in order of first appearance) is the
    # equivalent replacement.
    groups = pd.concat([groups, groups_important_countries])
    # 1e6 is the same factor as the former literal 10e5.
    # Presumably converts million tonnes per person into tonnes per person — TODO confirm units.
    groups = groups.assign(co2_per_capita = groups.co2 / groups.population * 1e6).sort_values("year")
    return groups

In [None]:
# Countries shown on their own; every other country is folded into its continent.
highlighted_countries = ["China", "United States", "Russia", "Germany", "India", "Japan"]
groups_after_1900 = construct_groups(highlighted_countries).query("year > 1900")

# One animation frame per year; bubbles are matched across frames by group name.
px.scatter(
    groups_after_1900,
    x="co2",
    y="co2_per_capita",
    size="population",
    color="name",
    hover_data=["name", "year", "co2", "co2_per_capita"],
    animation_frame="year",
    animation_group="name",
    range_x=[0,12000],
    range_y=[0,25],
    height=600,
    width=1000,
)

# More interesting visualizations

## A tribute to Hans Rosling's Gapminder

Hans Rosling was not only the author of the great book "Factfulness", but also researched global development for many years. As part of his work he developed a tool named Gapminder or Trendalyzer. This tool is basically a bubble chart of GDP per capita (x-axis), life expectancy (y-axis) and population size (bubble size). The special thing about it is the time slider, which shows how those key characteristics develop over time.

Inspired by this, we can do something similar regarding CO2 consumption:

In [None]:
# Rows after 1900; missing values are replaced by 0 before plotting.
countries_after_1900 = countries.query("year > 1900").fillna(0)

# One bubble per country and one animation frame per year.
px.scatter(
    countries_after_1900,
    x="consumption_co2_per_gdp",
    y="co2_per_capita",
    size="population",
    color="country",
    hover_data=["country", "year", "co2", "co2_per_capita"],
    animation_frame="year",
    animation_group="country",
    height=600,
    width=1000,
)

## Connected Scatter Plot

Connected scatter plots are a less popular chart type that enables the visualization of timelines in two dimensions. However, they can get messy quickly.

In [None]:
# Default grouping of construct_groups(), restricted to years after 1900.
trail_source = construct_groups().query("year>1900")

# Points for every (group, year); `order` makes the trail follow the years.
chart = (
    alt.Chart(trail_source, height=600, width=1000)
    .mark_circle()
    .encode(
        x="co2:Q",
        y="co2_per_capita:Q",
        color="name:N",
        order="year",
        tooltip=["name", "year"],
    )
)
# Same encodings drawn as a trail and layered on top of the points.
trail = chart.mark_trail()
chart + trail

A very similar plot in plotly-express

In [None]:
# Plotly-express counterpart: all years of the default grouping, points only.
group_scatter_options = dict(
    x="co2",
    y="co2_per_capita",
    color="name",
    hover_data=["name", "year", "co2", "co2_per_capita"],
    height=600,
    width=1000,
)
px.scatter(construct_groups(), **group_scatter_options)

# 📝 Turn Chart into an interactive dashboard and find other interesting visualizations

In [None]:
def create_scatter_plot(x="co2",y="co2_per_capita", countries=[]):
    """Build a connected scatter plot of the grouped data for years after 1900.

    Parameters
    ----------
    x, y : str
        Altair encodings (column names) for the horizontal and vertical axis.
    countries : list-like of str
        Passed on to `construct_groups`; inside this function the name refers to
        this argument.

    Returns
    -------
    A layered altair chart: circles plus a trail ordered by year.
    """
    plot_data = construct_groups(countries).query("year>1900")
    points = (
        alt.Chart(plot_data, height=600, width=1000)
        .mark_circle()
        .encode(
            x=x,
            y=y,
            color="name:N",
            order="year",
            tooltip=["name", "year"],
        )
    )
    # The trail reuses every encoding of the point layer.
    trail = points.mark_trail()
    return points + trail

# Offer only columns that exist in the frame create_scatter_plot actually draws.
# construct_groups() returns just a subset of the columns of `countries`, so any
# other choice could not be plotted. Non-numeric columns are left out because both
# dropdowns select an axis.
plottable_columns = construct_groups().select_dtypes("number").columns.tolist()

interact(
    create_scatter_plot,
    x=widgets.Dropdown(options=plottable_columns, value="co2"),
    y=widgets.Dropdown(options=plottable_columns, value="co2_per_capita"),
    countries=widgets.SelectMultiple(options=countries.country.unique().tolist()),
)

# Analyzing CO2 sources

## Stacked Bar Chart

In [None]:
# NOTE(review): `filter_most_recent` and `cols_co2_sources` are defined outside this
# cell. The same selector is also applied to `countries` in this notebook; confirm
# that it selects the intended rows of the construct_groups() result as well.
source_columns = ["name", *cols_co2_sources]
df = construct_groups().loc[filter_most_recent, source_columns]

# Long format: one row per (name, source) so the bars can be stacked by source.
sources_long = df.melt("name")
alt.Chart(sources_long).mark_bar().encode(
    x="value",
    y="name",
    color="variable",
    tooltip=["name", "variable", "value"],
)

In [None]:
# Row total over every column except "name" (missing values are skipped by sum).
row_totals = df.drop(columns=["name"]).sum(axis=1)
# Numeric columns with missing values replaced by 0, divided row-wise by the total.
numeric_shares = df.fillna(0).select_dtypes(np.number).div(row_totals, axis=0)
# Re-attach the group name to the shares via the index.
relative_fraction = df[["name"]].join(numeric_shares)

## Line Polar

In [None]:
# Long format: one row per (name, variable) pair, as line_polar expects.
polar_long = relative_fraction.melt("name")
fig = px.line_polar(
    polar_long,
    r="value",
    theta="variable",
    color="name",
    line_close=True,
    color_discrete_sequence=px.colors.sequential.Plasma_r,
)
fig.show()

In [None]:
# Long format of the CO2-source columns for the rows selected by
# `filter_most_recent`; missing values are replaced by 0 after melting.
id_columns = ["continent", "country"]
plot_df = (
    countries.loc[filter_most_recent, id_columns + cols_co2_sources]
    .melt(id_columns)
    .fillna(0)
)

## Parallel Coordinates

In [None]:
# Parallel-coordinates view of the shares stored in `relative_fraction`.
# NOTE(review): plotly express documents `labels` as a dict mapping column names
# to display labels; a list is passed here — confirm what was intended.
px.parallel_coordinates(relative_fraction, labels=["name"])

In [None]:
# Inspired from https://altair-viz.github.io/gallery/parallel_coordinates.html

# Long format: one row per (name, variable); missing shares are drawn as 0.
shares_long = relative_fraction.fillna(0).melt(["name"])

# One half-transparent line per name across the variables.
parallel_lines = alt.Chart(shares_long).mark_line().encode(
    x='variable:N',
    y='value:Q',
    color='name:N',
    detail='name:N',
    opacity=alt.value(0.5),
)
parallel_lines.properties(width=500)

#  📝Make the time configurable within the dashboard