# VA Project

In [35]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#plots the figures in place instead of a new window
%matplotlib inline

import matplotlib.pyplot as plt
from matplotlib import figure
import seaborn as sns
import altair as alt
from sklearn import decomposition
from sklearn import manifold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

from ipywidgets import interact, interact_manual

### Load Data

In [2]:
# Load 'average_age_of_dollar_billionaires_years.csv' from the data directory into a DataFrame.
# NOTE(review): the path contains a doubled slash ('data//'); presumably 'data/' was intended - confirm.
average_age_of_dollar_billionaires_years = pd.read_csv('data//average_age_of_dollar_billionaires_years.csv')
# Preview (disabled): average_age_of_dollar_billionaires_years.head()

In [3]:
# Load 'children_per_woman_total_fertility.csv' from the data directory into a DataFrame.
children_per_woman_total_fertility = pd.read_csv('data//children_per_woman_total_fertility.csv')
# Preview (disabled): children_per_woman_total_fertility.head()

In [4]:
# Load 'co2_emissions_tonnes_per_person.csv' from the data directory into a DataFrame.
co2_emissions_tonnes_per_person = pd.read_csv('data//co2_emissions_tonnes_per_person.csv')
# Preview (disabled): co2_emissions_tonnes_per_person.head()

In [5]:
# Load 'countryContinent.csv' from the data directory into a DataFrame.
# Presumably the country-to-continent lookup table - verify against the file.
countryContinent = pd.read_csv('data//countryContinent.csv')
# Preview (disabled): countryContinent.head()

In [6]:
# Load 'data.csv' from the data directory into a DataFrame, keeping every column
# and the default integer row index.
# The same file is read a second time (with index_col=0) by the cell that builds
# `data_condensed`.
data = pd.read_csv('data//data.csv')
# Inspection helpers (disabled):
#print(data.head(120))
#print(data.columns)

In [7]:
# Build the working table used by the interactive plots: read 'data.csv' with its
# first column as the row index and discard the columns the plots never use.
data_condensed = (
    pd.read_csv('data//data.csv', index_col=0)
    .drop(columns=[
        'average_age_of_dollar_billionaires_years',
        'hourly_compensation',
        'working_hours_per_week',
        'total_number_of_dollar_billionaires',
        'code_2',
        'code_3',
        'country_code',
        'iso_3166_2',
        'region_code',
        'sub_region_code',
    ])
)
# Show the first 200 rows as the cell output.
data_condensed.head(200)

Unnamed: 0,country,year,gdp_growth,children_per_woman_total_fertility,co2_emissions_tonnes_per_person,mean_years_in_school_women_percent_men_25_to_34_years,food_supply,income_per_person,suicide_per_100000_people,continent,sub_region,decade
0,Afghanistan,1900,1.05,7.00,,,,1090.0,,Asia,Southern Asia,1900
1,Afghanistan,1901,1.05,7.00,,,,1110.0,,Asia,Southern Asia,1900
2,Afghanistan,1902,1.05,7.00,,,,1120.0,,Asia,Southern Asia,1900
3,Afghanistan,1903,1.05,7.00,,,,1140.0,,Asia,Southern Asia,1900
4,Afghanistan,1904,1.05,7.00,,,,1160.0,,Asia,Southern Asia,1900
...,...,...,...,...,...,...,...,...,...,...,...,...
195,Albania,1974,2.69,4.37,1.84,0.892,2490.0,4070.0,,Europe,Southern Europe,1970
196,Albania,1975,2.62,4.24,1.91,0.897,2490.0,4080.0,,Europe,Southern Europe,1970
197,Albania,1976,2.64,4.10,2.01,0.902,2680.0,4100.0,,Europe,Southern Europe,1970
198,Albania,1977,2.64,3.97,2.27,0.906,2780.0,4120.0,,Europe,Southern Europe,1970


In [8]:
# Load 'food_supply.csv' from the data directory into a DataFrame.
food_supply = pd.read_csv('data//food_supply.csv')
# Preview (disabled): food_supply.head()

In [9]:
# Load 'gdp_total_yearly_growth.csv' from the data directory into a DataFrame.
gdp_total_yearly_growth = pd.read_csv('data//gdp_total_yearly_growth.csv')
# Preview (disabled): gdp_total_yearly_growth.head()

In [10]:
# Load 'hourly_compensation.csv' from the data directory into a DataFrame.
hourly_compensation = pd.read_csv('data//hourly_compensation.csv')
# Preview (disabled): hourly_compensation.head()

In [11]:
# Load 'income_per_person.csv' from the data directory into a DataFrame.
income_per_person = pd.read_csv('data//income_per_person.csv')
# Preview (disabled): income_per_person.head()

In [12]:
# Load 'mean_years_in_school_women_percent_men_25_to_34_years.csv' from the data
# directory into a DataFrame. The descriptive-statistics cell later selects
# columns of this frame by year string (e.g. '1992').
mean_years_in_school_women_percent_men_25_to_34_years = pd.read_csv('data//mean_years_in_school_women_percent_men_25_to_34_years.csv')
# Disabled alternative: use the 'country' column as the row index.
#mean_years_in_school_women_percent_men_25_to_34_years.index = mean_years_in_school_women_percent_men_25_to_34_years.pop('country')
# Preview (disabled):
#mean_years_in_school_women_percent_men_25_to_34_years.head()#['country']

In [13]:
# Load 'suicide_per_100000_people.csv' from the data directory into a DataFrame.
suicide_per_100000_people = pd.read_csv('data//suicide_per_100000_people.csv')
# Preview (disabled): suicide_per_100000_people.head()

In [14]:
# Load 'total_number_of_dollar_billionaires.csv' from the data directory into a DataFrame.
total_number_of_dollar_billionaires = pd.read_csv('data//total_number_of_dollar_billionaires.csv')
# Preview (disabled): total_number_of_dollar_billionaires.head()

In [15]:
# Load 'working_hours_per_week.csv' from the data directory into a DataFrame.
# The following cell moves its 'country' column into the row index.
working_hours_per_week = pd.read_csv('data//working_hours_per_week.csv')
# Preview (disabled): working_hours_per_week.head()

In [16]:
# Use the country names as the row index of the working-hours table.
# The guard keeps this cell idempotent: once 'country' has been moved into the
# index it is no longer a column, so re-running the cell is a no-op instead of
# raising KeyError.
if 'country' in working_hours_per_week.columns:
    working_hours_per_week = working_hours_per_week.set_index('country')
# Preview (disabled): working_hours_per_week.head()

### Descriptive Statistics

In [17]:
@interact(year=(1970, 2015))
def plot_education_gender_ratio(year):
    """Draw a box plot and a histogram of the schooling ratio for one year.

    Parameters
    ----------
    year : int
        Year chosen with the slider (1970-2015). It is converted to a string
        to select the matching column of
        ``mean_years_in_school_women_percent_men_25_to_34_years``.
    """
    year_column = str(year)
    schooling = mean_years_in_school_women_percent_men_25_to_34_years
    schooling.boxplot(column=year_column)
    schooling.hist(column=year_column)

interactive(children=(IntSlider(value=1992, description='year', max=2015, min=1970), Output()), _dom_classes=(…

### Correlations between attributes

In [73]:
@interact(year = (1970,2013), cluster_by_continent=True, continent=['Asia','Europe','Africa', 'Oceania', 'Americas'],
          children_per_woman_total_fertility=True, gdp_growth=True, co2_emissions_tonnes_per_person=True,
          income_per_person=True, food_supply=True, mean_years_in_school_women_percent_men_25_to_34_years=True)
def plot_education_gender_ratio(year,cluster_by_continent, continent,
                                children_per_woman_total_fertility,gdp_growth,co2_emissions_tonnes_per_person,
                                income_per_person,food_supply,
                                mean_years_in_school_women_percent_men_25_to_34_years):
    """Draw a scatter matrix of the selected attributes for one year.

    Parameters
    ----------
    year : int
        Year whose rows of ``data_condensed`` are plotted.
    cluster_by_continent : bool
        When true, only rows whose ``continent`` equals ``continent`` are used.
    continent : str
        Continent to restrict to; ignored when ``cluster_by_continent`` is false.
    children_per_woman_total_fertility, gdp_growth,
    co2_emissions_tonnes_per_person, income_per_person, food_supply,
    mean_years_in_school_women_percent_men_25_to_34_years : bool
        Checkbox flags; each true flag adds the column of the same name to
        the scatter matrix.

    Notes
    -----
    The function name is also used by the notebook's other interactive
    cells; ``interact`` runs the function at definition time, so the plots
    do not depend on the name staying bound to this definition.
    """
    # Map every checkbox to the column it controls, preserving plot order.
    flags = {
        'children_per_woman_total_fertility': children_per_woman_total_fertility,
        'gdp_growth': gdp_growth,
        'co2_emissions_tonnes_per_person': co2_emissions_tonnes_per_person,
        'income_per_person': income_per_person,
        'food_supply': food_supply,
        'mean_years_in_school_women_percent_men_25_to_34_years':
            mean_years_in_school_women_percent_men_25_to_34_years,
    }
    checked_data = [column for column, is_checked in flags.items() if is_checked]

    # Nothing can be plotted when every checkbox is cleared.
    if not checked_data:
        print('select at least one attribute to plot')
        return

    rows = data_condensed[data_condensed.year == year]
    if cluster_by_continent:
        rows = rows[rows.continent == continent]

    print(f'current year is {year}')

    # A scatter matrix needs at least one row to derive its axis limits.
    if rows.empty:
        print('no data available for this selection')
        return

    pd.plotting.scatter_matrix(rows[checked_data], figsize=(15,10))

interactive(children=(IntSlider(value=1991, description='year', max=2013, min=1970), Checkbox(value=True, desc…

### Cluster similar items

In [115]:
@interact(year = (1970,2013), children_per_woman_total_fertility=True,
          gdp_growth=True, co2_emissions_tonnes_per_person=True,
          income_per_person=True, food_supply=True, mean_years_in_school_women_percent_men_25_to_34_years=True,
          clustering_algorithm=['pca','mds','tsne'])
def plot_education_gender_ratio(year, children_per_woman_total_fertility,
                                gdp_growth,co2_emissions_tonnes_per_person,
                                income_per_person,food_supply,
                                mean_years_in_school_women_percent_men_25_to_34_years, clustering_algorithm):
    """Project the rows of one year onto two principal components.

    Rows of ``data_condensed`` for ``year`` are reduced to the selected
    attributes, standardised, projected with a 2-component PCA and drawn as
    a scatter plot coloured by continent.

    Parameters
    ----------
    year : int
        Year whose rows are projected.
    children_per_woman_total_fertility, gdp_growth,
    co2_emissions_tonnes_per_person, income_per_person, food_supply,
    mean_years_in_school_women_percent_men_25_to_34_years : bool
        Checkbox flags; each true flag adds the column of the same name as a
        PCA input feature.
    clustering_algorithm : str
        One of ``'pca'``, ``'mds'``, ``'tsne'``. Only ``'pca'`` is
        implemented; the others print a placeholder message.

    Notes
    -----
    Only the selected attributes are passed to the scaler and PCA. The
    constant ``year`` column and any row-number helper are not features and
    must not influence the projection.
    """
    # Map every checkbox to the column it controls, preserving feature order.
    flags = {
        'children_per_woman_total_fertility': children_per_woman_total_fertility,
        'gdp_growth': gdp_growth,
        'co2_emissions_tonnes_per_person': co2_emissions_tonnes_per_person,
        'income_per_person': income_per_person,
        'food_supply': food_supply,
        'mean_years_in_school_women_percent_men_25_to_34_years':
            mean_years_in_school_women_percent_men_25_to_34_years,
    }
    checked_data = [column for column, is_checked in flags.items() if is_checked]

    if clustering_algorithm != 'pca':
        print('not done yet')
        return

    # PCA(n_components=2) needs at least two input features.
    if len(checked_data) < 2:
        print('select at least two attributes for a 2D PCA projection')
        return

    rows_of_year = data_condensed[data_condensed.year == year]

    # Keep only rows where every selected attribute is a finite number.
    candidate_features = rows_of_year[checked_data].replace([np.inf, -np.inf], np.nan)
    is_complete = candidate_features.notnull().all(axis=1).to_numpy()
    features = candidate_features[is_complete]

    # PCA(n_components=2) also needs at least two samples.
    if len(features) < 2:
        print(f'not enough complete rows for year {year}')
        return

    # One colour per continent; the same positional mask as the features keeps
    # colours and points aligned. Unknown or missing continents fall back to grey.
    continent_colors = {
        'Asia': 'yellow',
        'Europe': 'red',
        'Africa': 'black',
        'Oceania': 'blue',
        'Americas': 'green',
    }
    point_colors = (
        rows_of_year.loc[is_complete, 'continent']
        .map(continent_colors)
        .fillna('grey')
        .tolist()
    )

    # Standardise the features so that no attribute dominates through its scale.
    scaled_data = StandardScaler().fit_transform(features)

    # Project onto the first two principal components.
    x_pca = PCA(n_components=2).fit_transform(scaled_data)

    # Plot the projection in 2D.
    plt.figure(figsize=(8,6))
    plt.scatter(x_pca[:,0],x_pca[:,1],c=point_colors)
    plt.xlabel('First principle component')
    plt.ylabel('Second principle component')

interactive(children=(IntSlider(value=1991, description='year', max=2013, min=1970), Checkbox(value=True, desc…