# VA Project Template

This template just loads and uses a few of the discussed libraries. Please follow the instruction in Moodle and feel free to remove/update any cells below.

In [90]:
#disable some annoying warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#plots the figures in place instead of a new window
%matplotlib inline

import pandas as pd
import numpy as np

import altair as alt

import ipywidgets as widgets

# Load Data

In [91]:
# helper functions for data loading 

def getYearsOfInterest(fromYear, toYear):
    return [str(x) for x in range(fromYear, toYear+1)]

#def filterData(valueColumns, metaDataColumns, data):
#    columns = list(set(metaDataColumns)|(set(valueColumns) & set(data.columns))) 
#    return data[columns]

def filterData(valueColumns, metaDataColumns, data):
    missingColumns = list(set(valueColumns) - set(data.columns))
    for c in missingColumns:
        data[c] = None #data.assign(missingColumns)
    return data[list(set(metaDataColumns) | set(valueColumns))]

def unpivot(data, key_columns, data_column, value_column):
    return pd.melt(data, id_vars=key_columns, var_name=data_column, value_name=value_column)

def loadSingleDataset(path, from_year, to_year, key_columns, data_column, value_column):
    data = pd.read_csv(path) 
    data = filterData(getYearsOfInterest(from_year, to_year), key_columns, data)
    return unpivot(data, key_columns, data_column, value_column)

def mergeDatasets(datasets, keys):
    data = datasets[0]
    
    for i in range(1, len(datasets)):
        data = data.merge(datasets[i], how='outer', left_on=keys, right_on=keys)
        
    return data

In [92]:
# global report params 
FROM_YEAR = 2000
TO_YEAR   = 2020

In [93]:
gdp_growth = loadSingleDataset('data/gdp_total_yearly_growth.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'gdp_growth')
gdp_growth.head()


Unnamed: 0,country,year,gdp_growth
0,Afghanistan,2007,13.6
1,Albania,2007,5.85
2,Algeria,2007,3.42
3,Andorra,2007,0.161
4,Angola,2007,23.2


In [94]:
children_per_woman_total_fertility = loadSingleDataset('data/children_per_woman_total_fertility.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'children_per_woman_total_fertility')
children_per_woman_total_fertility.head()

Unnamed: 0,country,year,children_per_woman_total_fertility
0,Afghanistan,2007,6.46
1,Albania,2007,1.67
2,Algeria,2007,2.66
3,Angola,2007,6.37
4,Antigua and Barbuda,2007,2.18


In [95]:
co2_emissions_tonnes_per_person = loadSingleDataset('data/co2_emissions_tonnes_per_person.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'co2_emissions_tonnes_per_person')
co2_emissions_tonnes_per_person.head()


Unnamed: 0,country,year,co2_emissions_tonnes_per_person
0,Afghanistan,2007,0.0838
1,Albania,2007,1.29
2,Algeria,2007,3.18
3,Andorra,2007,6.52
4,Angola,2007,1.2


In [96]:
mean_years_in_school_women_percent_men_25_to_34_years = loadSingleDataset('data/mean_years_in_school_women_percent_men_25_to_34_years.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'mean_years_in_school_women_percent_men_25_to_34_years')
mean_years_in_school_women_percent_men_25_to_34_years.head()

Unnamed: 0,country,year,mean_years_in_school_women_percent_men_25_to_34_years
0,Afghanistan,2007,0.219
1,Albania,2007,1.01
2,Algeria,2007,0.88
3,Andorra,2007,1.05
4,Angola,2007,0.689


In [110]:
# merge the datasets in one that contains all the data
data = mergeDatasets([
    gdp_growth, 
    children_per_woman_total_fertility,
    co2_emissions_tonnes_per_person,
    mean_years_in_school_women_percent_men_25_to_34_years
], ['country', 'year'])

data.sort_values(by=['country', 'year'], inplace=True, ignore_index=True)

data.head(40)
#data.to_csv("data/tmp.csv")

Unnamed: 0,country,year,gdp_growth,children_per_woman_total_fertility,co2_emissions_tonnes_per_person,mean_years_in_school_women_percent_men_25_to_34_years
0,Afghanistan,2000,1.74,7.49,0.037,0.203
1,Afghanistan,2001,-7.15,7.39,0.0376,0.205
2,Afghanistan,2002,27.1,7.27,0.0471,0.207
3,Afghanistan,2003,12.6,7.14,0.0509,0.209
4,Afghanistan,2004,6.55,6.99,0.0368,0.213
5,Afghanistan,2005,12.4,6.83,0.0515,0.214
6,Afghanistan,2006,4.56,6.65,0.0622,0.215
7,Afghanistan,2007,13.6,6.46,0.0838,0.219
8,Afghanistan,2008,2.5,6.25,0.152,0.222
9,Afghanistan,2009,20.2,6.04,0.238,0.223


# Show Data

In [None]:
# simple plots


In [120]:
#simple 
base_chart = alt.Chart(data).mark_point().encode(
    x='year:Q',
    y='children_per_woman_total_fertility:Q'
)

# https://ipywidgets.readthedocs.io/en/latest/examples/Widget%20List.html#Selection-widgets
@widgets.interact(x = ['year', 'children_per_woman_total_fertility', 'mean_years_in_school_women_percent_men_25_to_34_years'])
def show_plot(x):
    # You have to return the chart to make it visible.
    return base_chart.encode(
    x=x,
)

interactive(children=(Dropdown(description='x', options=('year', 'children_per_woman_total_fertility', 'mean_y…