# VA Project Template

This template just loads and uses a few of the discussed libraries. Please follow the instruction in Moodle and feel free to remove/update any cells below.

In [83]:
#disable some annoying warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#plots the figures in place instead of a new window
%matplotlib inline

import pandas as pd
import numpy as np

import altair as alt

import ipywidgets as widgets

# Load Data

In [84]:
# helper functions for data loading 

def getYearsOfInterest(fromYear, toYear):
    return [str(x) for x in range(fromYear, toYear+1)]

def filterData(valueColumns, metaDataColumns, data):
    missingColumns = list(set(valueColumns) - set(data.columns))
    for c in missingColumns:
        data[c] = None 
    return data[list(set(metaDataColumns) | set(valueColumns))]

def unpivot(data, key_columns, data_column, value_column):
    return pd.melt(data, id_vars=key_columns, var_name=data_column, value_name=value_column)

def loadSingleDataset(path, from_year, to_year, key_columns, data_column, value_column):
    data = pd.read_csv(path) 
    data = filterData(getYearsOfInterest(from_year, to_year), key_columns, data)
    return unpivot(data, key_columns, data_column, value_column)

def mergeDatasets(datasets, keys):
    data = datasets[0]
    
    for i in range(1, len(datasets)):
        data = data.merge(datasets[i], how='outer', left_on=keys, right_on=keys)
        
    return data

In [86]:
# global report params 
FROM_YEAR = 1900
TO_YEAR   = 2020

In [87]:
gdp_growth = loadSingleDataset('data/gdp_total_yearly_growth.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'gdp_growth')
gdp_growth.head()


Unnamed: 0,country,year,gdp_growth
0,Afghanistan,1943,1.42
1,Albania,1943,0.551
2,Algeria,1943,1.7
3,Andorra,1943,4.47
4,Angola,1943,4.04


In [88]:
children_per_woman_total_fertility = loadSingleDataset('data/children_per_woman_total_fertility.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'children_per_woman_total_fertility')
children_per_woman_total_fertility.head()

Unnamed: 0,country,year,children_per_woman_total_fertility
0,Afghanistan,1943,7.44
1,Albania,1943,4.61
2,Algeria,1943,7.4
3,Angola,1943,7.03
4,Antigua and Barbuda,1943,4.47


In [89]:
co2_emissions_tonnes_per_person = loadSingleDataset('data/co2_emissions_tonnes_per_person.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'co2_emissions_tonnes_per_person')
co2_emissions_tonnes_per_person.head()


Unnamed: 0,country,year,co2_emissions_tonnes_per_person
0,Afghanistan,1943,
1,Albania,1943,0.402
2,Algeria,1943,0.0589
3,Andorra,1943,
4,Angola,1943,


In [90]:
mean_years_in_school_women_percent_men_25_to_34_years = loadSingleDataset('data/mean_years_in_school_women_percent_men_25_to_34_years.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'mean_years_in_school_women_percent_men_25_to_34_years')
mean_years_in_school_women_percent_men_25_to_34_years.head()

Unnamed: 0,country,year,mean_years_in_school_women_percent_men_25_to_34_years
0,Afghanistan,1943,
1,Albania,1943,
2,Algeria,1943,
3,Andorra,1943,
4,Angola,1943,


## TODO 
* Add more datasets and then merge them in the following cell  

## The final merged dataset

* Call mergeDatasets function to form the final dataset
* Augment data with additional attributes (e.g. continent and region data for _'country'_ and decade for _'year'_)

In [91]:
# merge the datasets in one that contains all the data
data = mergeDatasets([
    gdp_growth, 
    children_per_woman_total_fertility,
    co2_emissions_tonnes_per_person,
    mean_years_in_school_women_percent_men_25_to_34_years
], ['country', 'year'])

data.sort_values(by=['country', 'year'], inplace=True, ignore_index=True)


countries = pd.read_csv('data/countryContinent.csv')

data = data.merge(countries, how='left', left_on=['country'], right_on=['country'])
data = data.convert_dtypes()

#add 'decade' computed column 
data['decade'] = data['year'].str.slice(0, 3)  + '0'

#check for missing countries (they have to be corrected in countryContinent.csv)
missing_countries = data[data["region_code"].isnull()]['country'].unique()

if (len(missing_countries) == 0):
    print("Country mapping is OK")
else:
    print(missing_countries)

Country mapping is OK


In [92]:
# basic statistics of the loaded data 
data.count()
data.head(50)

Unnamed: 0,country,year,gdp_growth,children_per_woman_total_fertility,co2_emissions_tonnes_per_person,mean_years_in_school_women_percent_men_25_to_34_years,code_2,code_3,country_code,iso_3166_2,continent,sub_region,region_code,sub_region_code,decade
0,Afghanistan,1900,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
1,Afghanistan,1901,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
2,Afghanistan,1902,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
3,Afghanistan,1903,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
4,Afghanistan,1904,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
5,Afghanistan,1905,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
6,Afghanistan,1906,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
7,Afghanistan,1907,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
8,Afghanistan,1908,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
9,Afghanistan,1909,1.05,7.0,,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900


# Show Data

In [93]:
# simple plots


In [95]:
#NB! the code will be fixed

base_chart = alt.Chart(data).mark_point().encode(
    x='decade:Q',
    y='children_per_woman_total_fertility:Q'
)

# https://ipywidgets.readthedocs.io/en/latest/examples/Widget%20List.html#Selection-widgets
@widgets.interact(x = ['decade', 'children_per_woman_total_fertility', 'mean_years_in_school_women_percent_men_25_to_34_years'])
def show_plot(x):
    # You have to return the chart to make it visible.
    return base_chart.encode(
    x=x,
)

interactive(children=(Dropdown(description='x', options=('decade', 'children_per_woman_total_fertility', 'mean…