# VA Project Template

This template just loads and uses a few of the discussed libraries. Please follow the instruction in Moodle and feel free to remove/update any cells below.

In [1]:
#disable some annoying warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#plots the figures in place instead of a new window
%matplotlib inline

import pandas as pd
import numpy as np

import altair as alt

import ipywidgets as widgets

# Load Data

In [2]:
# helper functions for data loading 

def getYearsOfInterest(fromYear, toYear):
    return [str(x) for x in range(fromYear, toYear+1)]

def filterData(valueColumns, metaDataColumns, data):
    missingColumns = list(set(valueColumns) - set(data.columns))
    for c in missingColumns:
        data[c] = None 
    return data[list(set(metaDataColumns) | set(valueColumns))]

def unpivot(data, key_columns, data_column, value_column):
    return pd.melt(data, id_vars=key_columns, var_name=data_column, value_name=value_column)

def loadSingleDataset(path, from_year, to_year, key_columns, data_column, value_column):
    data = pd.read_csv(path) 
    data = filterData(getYearsOfInterest(from_year, to_year), key_columns, data)
    return unpivot(data, key_columns, data_column, value_column)

def mergeDatasets(datasets, keys):
    data = datasets[0]
    
    for i in range(1, len(datasets)):
        data = data.merge(datasets[i], how='outer', left_on=keys, right_on=keys)
        
    return data

In [3]:
# global report params 
FROM_YEAR = 1900
TO_YEAR   = 2020

In [4]:
gdp_growth = loadSingleDataset('data/gdp_total_yearly_growth.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'gdp_growth')
gdp_growth.head()


Unnamed: 0,country,year,gdp_growth
0,Afghanistan,1973,11.4
1,Albania,1973,7.46
2,Algeria,1973,3.39
3,Andorra,1973,7.81
4,Angola,1973,8.11


In [5]:
children_per_woman_total_fertility = loadSingleDataset('data/children_per_woman_total_fertility.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'children_per_woman_total_fertility')
children_per_woman_total_fertility.head()

Unnamed: 0,country,year,children_per_woman_total_fertility
0,Afghanistan,1973,7.45
1,Albania,1973,4.51
2,Algeria,1973,7.56
3,Angola,1973,7.61
4,Antigua and Barbuda,1973,3.12


In [6]:
co2_emissions_tonnes_per_person = loadSingleDataset('data/co2_emissions_tonnes_per_person.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'co2_emissions_tonnes_per_person')
co2_emissions_tonnes_per_person.head()


Unnamed: 0,country,year,co2_emissions_tonnes_per_person
0,Afghanistan,1973,0.135
1,Albania,1973,2.29
2,Algeria,1973,2.44
3,Andorra,1973,
4,Angola,1973,0.751


In [7]:
mean_years_in_school_women_percent_men_25_to_34_years = loadSingleDataset('data/mean_years_in_school_women_percent_men_25_to_34_years.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'mean_years_in_school_women_percent_men_25_to_34_years')
mean_years_in_school_women_percent_men_25_to_34_years.head()

Unnamed: 0,country,year,mean_years_in_school_women_percent_men_25_to_34_years
0,Afghanistan,1973,0.156
1,Albania,1973,0.889
2,Algeria,1973,0.905
3,Andorra,1973,0.981
4,Angola,1973,0.523


In [8]:
average_age_of_dollar_billionaires_years = loadSingleDataset('data/average_age_of_dollar_billionaires_years.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'average_age_of_dollar_billionaires_years')
average_age_of_dollar_billionaires_years.head()

Unnamed: 0,country,year,average_age_of_dollar_billionaires_years
0,Afghanistan,1973,
1,Albania,1973,
2,Algeria,1973,
3,Andorra,1973,
4,Angola,1973,


In [9]:
food_supply= loadSingleDataset('data/food_supply.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'food_supply')
food_supply.head()

Unnamed: 0,country,year,food_supply
0,Afghanistan,1973,2720
1,Albania,1973,2430
2,Algeria,1973,1850
3,Angola,1973,1920
4,Antigua and Barbuda,1973,1860


## TODO 
* Add more datasets and then merge them in the following cell  

In [10]:
hourly_compensation = loadSingleDataset('data/hourly_compensation.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'hourly_compensation')
hourly_compensation.head()

Unnamed: 0,country,year,hourly_compensation
0,Argentina,1973,
1,Armenia,1973,
2,Australia,1973,
3,Austria,1973,
4,Azerbaijan,1973,


In [11]:
income_per_person= loadSingleDataset('data/income_per_person.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'income_per_person')
income_per_person.head()

Unnamed: 0,country,year,income_per_person
0,Afghanistan,1973,2030
1,Albania,1973,4050
2,Algeria,1973,8420
3,Andorra,1973,34900
4,Angola,1973,6010


In [12]:
suicide_per_100000_people = loadSingleDataset('data/suicide_per_100000_people.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'suicide_per_100000_people')
suicide_per_100000_people.head()

Unnamed: 0,country,year,suicide_per_100000_people
0,Albania,1973,
1,Antigua and Barbuda,1973,
2,Argentina,1973,
3,Armenia,1973,
4,Australia,1973,12.0


In [13]:
total_number_of_dollar_billionaires = loadSingleDataset('data/total_number_of_dollar_billionaires.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'total_number_of_dollar_billionaires')
total_number_of_dollar_billionaires.head()

Unnamed: 0,country,year,total_number_of_dollar_billionaires
0,Afghanistan,1973,
1,Albania,1973,
2,Algeria,1973,
3,Andorra,1973,
4,Angola,1973,


In [14]:
working_hours_per_week = loadSingleDataset('data/working_hours_per_week.csv', 
                               FROM_YEAR, TO_YEAR, 
                               ['country'], 
                               'year', 
                               'working_hours_per_week')
working_hours_per_week.head()

Unnamed: 0,country,year,working_hours_per_week
0,Albania,1973,
1,Algeria,1973,
2,Argentina,1973,
3,Armenia,1973,
4,Australia,1973,


## The final merged dataset

* Call mergeDatasets function to form the final dataset
* Augment data with additional attributes (e.g. continent and region data for _'country'_ and decade for _'year'_)

In [15]:
# merge the datasets in one that contains all the data
data = mergeDatasets([
    gdp_growth, 
    children_per_woman_total_fertility,
    co2_emissions_tonnes_per_person,
    mean_years_in_school_women_percent_men_25_to_34_years,
    average_age_of_dollar_billionaires_years,
    food_supply,
    hourly_compensation,
    income_per_person,
    suicide_per_100000_people,
    total_number_of_dollar_billionaires,
    working_hours_per_week
], ['country', 'year'])

data.sort_values(by=['country', 'year'], inplace=True, ignore_index=True)


countries = pd.read_csv('data/countryContinent.csv')

data = data.merge(countries, how='left', left_on=['country'], right_on=['country'])
data = data.convert_dtypes()

#add 'decade' computed column 
data['decade'] = data['year'].str.slice(0, 3)  + '0'

#check for missing countries (they have to be corrected in countryContinent.csv)
missing_countries = data[data["region_code"].isnull()]['country'].unique()

if (len(missing_countries) == 0):
    print("Country mapping is OK")
else:
    print(missing_countries)
    
data.to_csv('data/data.csv')

Country mapping is OK


In [16]:
# basic statistics of the loaded data 
print(data.count())
data.head(50)

country                                                  23595
year                                                     23595
gdp_growth                                               22094
children_per_woman_total_fertility                       22264
co2_emissions_tonnes_per_person                          15722
mean_years_in_school_women_percent_men_25_to_34_years     8602
average_age_of_dollar_billionaires_years                   776
food_supply                                               8022
hourly_compensation                                        483
income_per_person                                        23353
suicide_per_100000_people                                 2992
total_number_of_dollar_billionaires                        776
working_hours_per_week                                    1643
code_2                                                   23474
code_3                                                   23595
country_code                                           

Unnamed: 0,country,year,gdp_growth,children_per_woman_total_fertility,co2_emissions_tonnes_per_person,mean_years_in_school_women_percent_men_25_to_34_years,average_age_of_dollar_billionaires_years,food_supply,hourly_compensation,income_per_person,...,working_hours_per_week,code_2,code_3,country_code,iso_3166_2,continent,sub_region,region_code,sub_region_code,decade
0,Afghanistan,1900,1.05,7.0,,,,,,1090,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
1,Afghanistan,1901,1.05,7.0,,,,,,1110,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
2,Afghanistan,1902,1.05,7.0,,,,,,1120,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
3,Afghanistan,1903,1.05,7.0,,,,,,1140,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
4,Afghanistan,1904,1.05,7.0,,,,,,1160,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
5,Afghanistan,1905,1.05,7.0,,,,,,1180,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
6,Afghanistan,1906,1.05,7.0,,,,,,1200,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
7,Afghanistan,1907,1.05,7.0,,,,,,1220,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
8,Afghanistan,1908,1.05,7.0,,,,,,1240,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900
9,Afghanistan,1909,1.05,7.0,,,,,,1260,...,,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,142,34,1900


## Helper variables for different set of columns in the dataset

In [17]:
# change all _ with line breaks - in this case it's easier to display column names in the plots
mapping = {}
for col in data:
    mapping[col] = col.replace('_', "\n")
    
data = data.rename(columns=mapping)

key_columns = ['country', 'year']

measure_columns = [
        "gdp\ngrowth",
        "children\nper\nwoman\ntotal\nfertility",
        "co2\nemissions\ntonnes\nper\nperson",
        "mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears"
        #"average\nage\nof\ndollar\nbillionaires\nyears",
        #'food_supply',
        #'hourly_compensation',
        #'income_per_person',
        #'suicide_per_100000_people',
        #'total_number_of_dollar_billionaires',
        #'working_hours_per_week'
    ]

all_columns = key_columns + measure_columns

# Show Data

## Step 1 - Data Completeness
 > In the data quality framework, data completeness refers to the degree to which all data in a data set is available. A measure of data completeness is the percentage of missing data entries [[1]](https://dataladder.com/missing-data-and-data-completeness/)

In [18]:
t1 = pd.melt(data[all_columns], id_vars=['country', 'year'], var_name=['measure'], value_name='val')
t1['Countries Count'] = t1['val'].isnull()

t1 = t1.groupby(['year', 'measure'])['Countries Count'].sum().reset_index()

alt.Chart(t1).mark_rect().encode(
    x='year:O',
    y='measure:O',
    color='Countries Count:Q'
).properties(
    width=800,
    height=300,
    title='Data Completeness'
)


## Step 2 - Simple Plots

### 2.1. Interactive Scatter Plot Matrix.

The goal of this visualization is to explore the correlations between data columns.
The data is filtered by country.

In [19]:
@widgets.interact(country=data['country'].unique())
def basic_plot(country):
    filtered_df = data.loc[(data['country'] == country)]
    #print(filtered_df[measure_columns].describe())
    pd.plotting.scatter_matrix(filtered_df[measure_columns], alpha=0.5, figsize=(9,9))

interactive(children=(Dropdown(description='country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…

In [26]:
#add geopandas to environment; geofiles CNTR_RG_60M_2020_4326 added to data

import geopandas as gpd
import json
import altair as alt
import pandas as pd

gdf = gpd.read_file('data/CNTR_RG_60M_2020_4326.shp')
gdf2=gdf[gdf.NAME_ENGL!='Antarctica']
gdf2


@widgets.interact(slider=(1900,2020),indicator1=['gdp\ngrowth','children\nper\nwoman\ntotal\nfertility','co2\nemissions\ntonnes\nper\nperson','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears','average\nage\nof\ndollar\nbillionaires\nyears','food\nsupply','hourly\ncompensation','income\nper\nperson','suicide\nper\n100000\npeople','total\nnumber\nof\ndollar\nbillionaires','working\nhours\nper\nweek'])
def react(indicator1,slider):
    
        ee =data.loc[data['year']==slider]
        dd = ee.rename(columns = {"code\n3": "ISO3_CODE"})
        multi = alt.selection_multi(fields=['ISO3_CODE'])
        color = alt.condition(multi,
                          alt.Color(indicator1+':Q', 
                          scale=alt.Scale(scheme='teals')),
                          alt.value('lightgray'))

        hover = alt.selection_single( on='mouseover',fields=['ISO3_CODE'])
        return    alt.Chart(gdf2).mark_geoshape(
                        stroke='white'
                    ).encode( 
                       color=color,
                       tooltip=['NAME_ENGL',indicator1+':Q']

                    ).transform_lookup( # your code here

                         lookup='ISO3_CODE',
                        from_=alt.LookupData(dd, 'ISO3_CODE', [indicator1])

                    ).add_selection(
                            multi
                        ).properties( 
                        width=650,
                        height=400
                    ).properties(title='Overview') & alt.Chart(dd).mark_bar().encode( y='country', x=indicator1).add_selection(
                            multi
                        ).transform_filter(
                        multi
                    ).properties(title='Countries')
        
           

interactive(children=(Dropdown(description='indicator1', options=('gdp\ngrowth', 'children\nper\nwoman\ntotal\…

In [21]:
data['year'] = data['year'].astype(int)
@widgets.interact(slider=(1900,2020),country=data['country'].unique(), indicator1=['gdp\ngrowth','children\nper\nwoman\ntotal\nfertility','co2\nemissions\ntonnes\nper\nperson','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears','average\nage\nof\ndollar\nbillionaires\nyears','food\nsupply','hourly\ncompensation','income\nper\nperson','suicide\nper\n100000\npeople','total\nnumber\nof\ndollar\nbillionaires','working\nhours\nper\nweek'], indicator2=['gdp\ngrowth','children\nper\nwoman\ntotal\nfertility','co2\nemissions\ntonnes\nper\nperson','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears','average\nage\nof\ndollar\nbillionaires\nyears','food\nsupply','hourly\ncompensation','income\nper\nperson','suicide\nper\n100000\npeople','total\nnumber\nof\ndollar\nbillionaires','working\nhours\nper\nweek'], indicator3=['gdp\ngrowth','children\nper\nwoman\ntotal\nfertility','co2\nemissions\ntonnes\nper\nperson','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears','average\nage\nof\ndollar\nbillionaires\nyears','food\nsupply','hourly\ncompensation','income\nper\nperson','suicide\nper\n100000\npeople','total\nnumber\nof\ndollar\nbillionaires','working\nhours\nper\nweek'])
def react(country,indicator1,indicator2,indicator3, slider):
    con11=data.loc[data['country']==country]
    con22=con11.loc[data['year'].between(slider-10, slider+10)]
    con=data
    con2=con.loc[data['year']==slider]
    ind=indicator1
    ind2=indicator2
    ind3=indicator3
    return  alt.Chart(con22[{'year',ind}]).mark_bar().encode(x='year',y=ind,tooltip=ind).properties(title='Development of indicator1 in selected country') | alt.Chart(con2).mark_circle().encode(
                x=alt.X(ind),
                y=ind2,
                #y=alt.Y('lifeExp', scale=alt.Scale(zero=False)),
                color='continent',
                size=ind3,
                tooltip='country'
            ).properties(title='Indicators 1-3 (3: Bubble)').interactive() 

interactive(children=(Dropdown(description='country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…

In [23]:
import matplotlib.pyplot as plt
import seaborn as sns

@widgets.interact(slider=(1900,2020), indicator=['gdp\ngrowth','children\nper\nwoman\ntotal\nfertility','co2\nemissions\ntonnes\nper\nperson','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears','average\nage\nof\ndollar\nbillionaires\nyears','food\nsupply','hourly\ncompensation','income\nper\nperson','suicide\nper\n100000\npeople','total\nnumber\nof\ndollar\nbillionaires','working\nhours\nper\nweek'], bins=(5, 25, 5))
def react(indicator,slider,bins):
    con=data
    con2=con.loc[data['year']==slider]
    ind=indicator

    plt.figure(figsize=(20,10))
    plt.subplot(121)
    sns.boxplot(x='continent',y=ind,data=con2) 
    plt.subplot(122)
    sns.distplot(con2[ind],bins=bins)
    
 

interactive(children=(Dropdown(description='indicator', options=('gdp\ngrowth', 'children\nper\nwoman\ntotal\n…

In [24]:
@widgets.interact(slider=(1900,2020), indicator=['gdp\ngrowth','children\nper\nwoman\ntotal\nfertility','co2\nemissions\ntonnes\nper\nperson','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears','average\nage\nof\ndollar\nbillionaires\nyears','food\nsupply','hourly\ncompensation','income\nper\nperson','suicide\nper\n100000\npeople','total\nnumber\nof\ndollar\nbillionaires','working\nhours\nper\nweek'], indicator2=['gdp\ngrowth','children\nper\nwoman\ntotal\nfertility','co2\nemissions\ntonnes\nper\nperson','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears','average\nage\nof\ndollar\nbillionaires\nyears','food\nsupply','hourly\ncompensation','income\nper\nperson','suicide\nper\n100000\npeople','total\nnumber\nof\ndollar\nbillionaires','working\nhours\nper\nweek'])
def react(indicator,indicator2, slider):    
    con=data
    con2=con.loc[data['year']==slider]
    ind=indicator
    ind2=indicator2
    back = alt.Chart(con2).mark_point().encode(
        x=alt.X(ind),
        y=alt.Y(ind2),
        color='continent:N',
        tooltip='country'
        )
    return alt.Chart(con2).mark_bar().encode(
        x = alt.X(ind, bin=True),
        y='count()',
        color='continent',
        tooltip='continent'
    ).properties(title='Histrogram') & back + back.transform_regression(ind, ind2, 
        groupby=['continent'], method='poly').mark_line(size=4).properties(title='Regression per continent').interactive()

interactive(children=(Dropdown(description='indicator', options=('gdp\ngrowth', 'children\nper\nwoman\ntotal\n…

In [114]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [25]:
@widgets.interact(slider=(1970,2020), clusters=(2,10))
def react(slider, clusters):
    con=data
    con2=con.loc[data['year']==slider]
    con3=con2[{'country','gdp\ngrowth','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears','continent'}]

    con4=con3[{'gdp\ngrowth','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears'}]
    con4=con4.dropna()
    con4.select_dtypes(exclude=['number']).columns
    con4=pd.DataFrame(con4)
    con4=pd.get_dummies(con4)
    con4
    from sklearn.cluster import KMeans
    pred = KMeans(n_clusters=clusters, random_state = 102).fit_predict(con4)
    e=pd.DataFrame(pred)
    x1 = con3.iloc[:, [0,1,2,3]].values
    x2=pd.DataFrame(x1)
    cla = pd.concat([e,x2],axis=1,ignore_index=True)
    cla.columns=['cluster','gdp\ngrowth','mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears','continent','country']
    brush = alt.selection_interval()
    return alt.Chart(cla).mark_point().encode(
            x='gdp\ngrowth',
            y='mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears',
            color=alt.condition(brush, 'cluster:N', alt.value('lightgray')),
            tooltip=['country']
            ).add_selection(
            brush 
        ).properties(title='KMeans clusters')  & alt.Chart(con2).mark_bar().encode(
            x='country',
             y='mean\nyears\nin\nschool\nwomen\npercent\nmen\n25\nto\n34\nyears',
            color='continent:N'
            ).transform_filter(
            brush
        ).properties(title='Per country')


interactive(children=(IntSlider(value=1995, description='slider', max=2020, min=1970), IntSlider(value=6, desc…