# VA Project Template

This template just loads and uses a few of the discussed libraries. Please follow the instruction in Moodle and feel free to remove/update any cells below.

In [44]:
#disable some annoying warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#plots the figures in place instead of a new window
%matplotlib inline

import pandas as pd
import numpy as np

import altair as alt

import ipywidgets as widgets

# Load Data

In [45]:
# helper functions for data loading 

def getYearsOfInterest(fromYear, toYear):
    return [str(x) for x in range(fromYear, toYear+1)]

def filterData(valueColumns, metaDataColumns, data):
    missingColumns = list(set(valueColumns) - set(data.columns))
    for c in missingColumns:
        data[c] = None 
    return data[list(set(metaDataColumns) | set(valueColumns))]

def unpivot(data, key_columns, data_column, value_column):
    return pd.melt(data, id_vars=key_columns, var_name=data_column, value_name=value_column)

def loadSingleDataset(path, from_year, to_year, key_columns, data_column, value_column):
    data = pd.read_csv(path) 
    data = filterData(getYearsOfInterest(from_year, to_year), key_columns, data)
    return unpivot(data, key_columns, data_column, value_column)

def mergeDatasets(datasets, keys):
    data = datasets[0]
    
    for i in range(1, len(datasets)):
        data = data.merge(datasets[i], how='outer', left_on=keys, right_on=keys)
        
    return data

In [46]:
# global report params 
FROM_YEAR = 1900
TO_YEAR   = 2020

In [47]:
gdp_growth = loadSingleDataset('data/gdp_total_yearly_growth.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'gdp_growth')
gdp_growth.head()


Unnamed: 0,country,year,gdp_growth
0,Afghanistan,1980,-0.28
1,Albania,1980,2.81
2,Algeria,1980,2.02
3,Andorra,1980,2.21
4,Angola,1980,2.73


In [48]:
children_per_woman_total_fertility = loadSingleDataset('data/children_per_woman_total_fertility.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'children_per_woman_total_fertility')
children_per_woman_total_fertility.head()

Unnamed: 0,country,year,children_per_woman_total_fertility
0,Afghanistan,1980,7.45
1,Albania,1980,3.62
2,Algeria,1980,6.79
3,Angola,1980,7.5
4,Antigua and Barbuda,1980,2.12


In [49]:
co2_emissions_tonnes_per_person = loadSingleDataset('data/co2_emissions_tonnes_per_person.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'co2_emissions_tonnes_per_person')
co2_emissions_tonnes_per_person.head()


Unnamed: 0,country,year,co2_emissions_tonnes_per_person
0,Afghanistan,1980,0.132
1,Albania,1980,1.93
2,Algeria,1980,3.46
3,Andorra,1980,
4,Angola,1980,0.64


In [50]:
mean_years_in_school_women_percent_men_25_to_34_years = loadSingleDataset('data/mean_years_in_school_women_percent_men_25_to_34_years.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'mean_years_in_school_women_percent_men_25_to_34_years')
mean_years_in_school_women_percent_men_25_to_34_years.head()

Unnamed: 0,country,year,mean_years_in_school_women_percent_men_25_to_34_years
0,Afghanistan,1980,0.165
1,Albania,1980,0.92
2,Algeria,1980,0.906
3,Andorra,1980,1.0
4,Angola,1980,0.553


In [51]:
average_age_of_dollar_billionaires_years = loadSingleDataset('data/average_age_of_dollar_billionaires_years.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'average_age_of_dollar_billionaires_years')
mean_years_in_school_women_percent_men_25_to_34_years.head()

Unnamed: 0,country,year,mean_years_in_school_women_percent_men_25_to_34_years
0,Afghanistan,1980,0.165
1,Albania,1980,0.92
2,Algeria,1980,0.906
3,Andorra,1980,1.0
4,Angola,1980,0.553


In [52]:
food_supply= loadSingleDataset('data/food_supply.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'food_supply')
food_supply.head()

Unnamed: 0,country,year,food_supply
0,Afghanistan,1980,2480
1,Albania,1980,2600
2,Algeria,1980,2570
3,Angola,1980,1970
4,Antigua and Barbuda,1980,1970


## TODO 
* Add more datasets and then merge them in the following cell  

In [53]:
hourly_compensation = loadSingleDataset('data/hourly_compensation.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'hourly_compensation')
hourly_compensation.head()

Unnamed: 0,country,year,hourly_compensation
0,Argentina,1980,
1,Armenia,1980,
2,Australia,1980,
3,Austria,1980,
4,Azerbaijan,1980,


In [54]:
income_per_person= loadSingleDataset('data/income_per_person.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'income_per_person')
income_per_person.head()

Unnamed: 0,country,year,income_per_person
0,Afghanistan,1980,2020
1,Albania,1980,4180
2,Algeria,1980,11200
3,Andorra,1980,32100
4,Angola,1980,5110


In [55]:
suicide_per_100000_people = loadSingleDataset('data/suicide_per_100000_people.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'suicide_per_100000_people')
suicide_per_100000_people.head()

Unnamed: 0,country,year,suicide_per_100000_people
0,Albania,1980,
1,Antigua and Barbuda,1980,
2,Argentina,1980,7.36
3,Armenia,1980,
4,Australia,1980,10.9


In [56]:
total_number_of_dollar_billionaires = loadSingleDataset('data/total_number_of_dollar_billionaires.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'total_number_of_dollar_billionaires')
total_number_of_dollar_billionaires.head()

Unnamed: 0,country,year,total_number_of_dollar_billionaires
0,Afghanistan,1980,
1,Albania,1980,
2,Algeria,1980,
3,Andorra,1980,
4,Angola,1980,


In [57]:
working_hours_per_week = loadSingleDataset('data/working_hours_per_week.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'working_hours_per_week')
working_hours_per_week.head()

Unnamed: 0,country,year,working_hours_per_week
0,Albania,1980,
1,Algeria,1980,
2,Argentina,1980,
3,Armenia,1980,
4,Australia,1980,


## The final merged dataset

* Call mergeDatasets function to form the final dataset
* Augment data with additional attributes (e.g. continent and region data for _'country'_ and decade for _'year'_)

In [58]:
# merge the datasets in one that contains all the data
data = mergeDatasets([
    gdp_growth, 
    children_per_woman_total_fertility,
    co2_emissions_tonnes_per_person,
    mean_years_in_school_women_percent_men_25_to_34_years,
    average_age_of_dollar_billionaires_years,
    food_supply,
    hourly_compensation,
    income_per_person,
    suicide_per_100000_people,
    total_number_of_dollar_billionaires,
    working_hours_per_week
], ['country', 'year'])

data.sort_values(by=['country', 'year'], inplace=True, ignore_index=True)


countries = pd.read_csv('data/countryContinent.csv')

data = data.merge(countries, how='left', left_on=['country'], right_on=['country'])
data = data.convert_dtypes()

#add 'decade' computed column 
data['decade'] = data['year'].str.slice(0, 3)  + '0'

#check for missing countries (they have to be corrected in countryContinent.csv)
missing_countries = data[data["region_code"].isnull()]['country'].unique()

if (len(missing_countries) == 0):
    print("Country mapping is OK")
else:
    print(missing_countries)

Country mapping is OK


In [60]:
# basic statistics of the loaded data 
print(data.count())
data.head(50)

country                                                  23595
year                                                     23595
gdp_growth                                               22094
children_per_woman_total_fertility                       22264
co2_emissions_tonnes_per_person                          15722
mean_years_in_school_women_percent_men_25_to_34_years     8602
average_age_of_dollar_billionaires_years                   776
food_supply                                               8022
hourly_compensation                                        483
income_per_person                                        23353
suicide_per_100000_people                                 2992
total_number_of_dollar_billionaires                        776
working_hours_per_week                                    1643
code_2                                                   23474
code_3                                                   23595
country_code                                           

Unnamed: 0,country,year,gdp_growth,children_per_woman_total_fertility,co2_emissions_tonnes_per_person,mean_years_in_school_women_percent_men_25_to_34_years,average_age_of_dollar_billionaires_years,food_supply,hourly_compensation,income_per_person,...,working_hours_per_week,code_2,code_3,country_code,iso_3166_2,continent,sub_region,region_code,sub_region_code,decade
0,Afghanistan,1900,1.05,7.0,,,,,,1090,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
1,Afghanistan,1901,1.05,7.0,,,,,,1110,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
2,Afghanistan,1902,1.05,7.0,,,,,,1120,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
3,Afghanistan,1903,1.05,7.0,,,,,,1140,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
4,Afghanistan,1904,1.05,7.0,,,,,,1160,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
5,Afghanistan,1905,1.05,7.0,,,,,,1180,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
6,Afghanistan,1906,1.05,7.0,,,,,,1200,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
7,Afghanistan,1907,1.05,7.0,,,,,,1220,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
8,Afghanistan,1908,1.05,7.0,,,,,,1240,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
9,Afghanistan,1909,1.05,7.0,,,,,,1260,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900


# Show Data

In [93]:
# simple plots


In [95]:
#NB! the code will be fixed

base_chart = alt.Chart(data).mark_point().encode(
    x='decade:Q',
    y='children_per_woman_total_fertility:Q'
)

# https://ipywidgets.readthedocs.io/en/latest/examples/Widget%20List.html#Selection-widgets
@widgets.interact(x = ['decade', 'children_per_woman_total_fertility', 'mean_years_in_school_women_percent_men_25_to_34_years'])
def show_plot(x):
    # You have to return the chart to make it visible.
    return base_chart.encode(
    x=x,
)

interactive(children=(Dropdown(description='x', options=('decade', 'children_per_woman_total_fertility', 'mean…