In [1]:
# Dependencies
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress

from numpy import nan
from numpy import isnan
from sklearn.impute import SimpleImputer

# World Happiness Report for the Years 2005-2019

Define Countries of Interest

In [9]:
# Countries of interest: the G20 members (the E.U. is left out) together with a few guest countries.
countries = [
    'Argentina',
    'Australia',
    'Bangladesh',
    'Brazil',
    'Canada',
    'China',
    'Denmark',
    'Finland',
    'France',
    'Germany',
    'India',
    'Indonesia',
    'Iran',
    'Italy',
    'Japan',
    'Malaysia',
    'Mexico',
    'Philippines',
    'Russia',
    'Saudi Arabia',
    'Singapore',
    'South Africa',
    'South Korea',
    'Turkey',
    'United Kingdom',
    'United States',
]
# Years covered by the report, as strings: '2005' through '2019' inclusive.
years = [str(year) for year in range(2005, 2020)]

### Independent Variable - Happiness Score per Country

In [10]:
# Read the World Happiness CSV and use the 'Country name' column as the row index.
Happiness_2005_2019 = "Data sets/Raw Data/Happiness Score/Happiness_2005_2019.csv"
world_happiness = pd.read_csv(Happiness_2005_2019).set_index('Country name')

#### Sampling, Imputation, and Display Dataset without NaN Values

In [11]:
# Keep only the rows whose index label appears in `countries`.
sampling_happiness = world_happiness.loc[world_happiness.index.isin(countries)]

In [12]:
# Fill NaN happiness scores with sklearn's SimpleImputer using the 'mean' strategy.
imputer = SimpleImputer(strategy='mean', missing_values=nan)
values = sampling_happiness.to_numpy()
transform_sampling_happiness = imputer.fit_transform(values)

In [13]:
# Copy the imputed matrix and wrap it in a DataFrame labelled with `countries` (rows) and `years` (columns).
# NOTE(review): labels are applied by position, so this assumes the rows of
# sampling_happiness come in the same order as `countries` -- confirm against the CSV.
happiness_score = np.array(transform_sampling_happiness)
transformed_happiness = pd.DataFrame(happiness_score, columns=[years], index=[countries])
transformed_happiness

Unnamed: 0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Argentina,6.705385,6.31,6.07,5.96,6.42,6.44,6.78,6.47,6.58,6.67,6.7,6.43,6.04,5.79,6.09
Australia,7.34,5.673571,7.29,7.25,6.00087,7.45,7.41,7.2,7.36,7.29,7.31,7.25,7.26,7.18,7.23
Bangladesh,6.705385,4.32,4.61,5.05,5.08,4.86,4.99,4.72,4.66,4.64,4.63,4.56,4.31,4.5,5.11
Brazil,6.64,5.673571,6.32,6.69,7.0,6.84,7.04,6.66,7.14,6.98,6.55,6.37,6.33,6.19,6.45
Canada,7.42,5.673571,7.48,7.49,7.49,7.65,7.43,7.42,7.59,7.3,7.41,7.24,7.41,7.18,7.11
China,6.705385,4.56,4.86,4.85,4.45,4.65,5.04,5.09,5.24,5.2,5.3,5.32,5.1,5.13,5.14
Denmark,8.02,5.673571,7.83,7.97,7.68,7.77,7.79,7.52,7.59,7.51,7.51,7.56,7.59,7.65,7.69
Finland,6.705385,7.67,6.134167,7.67,6.00087,7.39,7.35,7.42,7.44,7.38,7.45,7.66,7.79,7.86,7.78
France,7.09,6.58,6.134167,7.01,6.28,6.8,6.96,6.65,6.67,6.47,6.36,6.48,6.64,6.67,6.69
Germany,6.62,5.673571,6.42,6.52,6.64,6.72,6.62,6.7,6.97,6.98,7.04,6.87,7.07,7.12,7.04


In [14]:
# Display Statistics Overview
# Per-year summary of the imputed happiness scores: count, mean, std, min, quartiles and max.
transformed_happiness.describe()

Unnamed: 0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
count,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0
mean,6.705385,5.673571,6.134167,6.18,6.00087,6.2168,6.222308,6.1184,6.135,6.173846,6.106154,6.076,6.074,6.04,6.1308
std,0.599224,0.745668,0.910534,1.004036,0.880872,0.94947,0.990391,0.9292,1.107112,0.935269,0.913039,0.946281,1.022182,1.051886,1.025967
min,4.72,4.32,4.61,4.59,4.45,4.65,4.63,4.61,3.66,4.42,4.34,4.18,4.05,3.82,3.25
25%,6.705385,5.335,5.25,5.2,5.26,5.4675,5.195,5.325,5.2525,5.585,5.52,5.355,5.5825,5.34,5.37
50%,6.705385,5.673571,6.187083,6.24,6.00087,6.33,6.59,6.0592,6.255,6.16,6.28,6.053,6.137,6.03,6.325
75%,6.813846,5.673571,6.7425,6.95,6.585,6.83,6.9575,6.835,7.0975,6.98,6.68,6.815,6.9025,6.8275,6.8775
max,8.02,7.67,7.83,7.97,7.68,7.77,7.79,7.52,7.59,7.51,7.51,7.66,7.79,7.86,7.78


# Dependent Variables
### Sampling, Imputation, and Display

### GDP per capita

In [18]:
# Read the log GDP per capita CSV and use the 'Country name' column as the row index.
GDP_vivi = 'Data sets/From Vivi/HI_Log GDPpCap.csv'
GDP = pd.read_csv(GDP_vivi).set_index('Country name')

In [20]:
# Keep only the rows whose index label appears in `countries`.
sampling_GDP = GDP.loc[GDP.index.isin(countries)]

In [22]:
# Fill NaN GDP values with sklearn's SimpleImputer using the 'mean' strategy.
imputer = SimpleImputer(strategy='mean', missing_values=nan)
values = sampling_GDP.to_numpy()
transform_GDP = imputer.fit_transform(values)
# Copy the imputed matrix and label it with `countries` (rows) and `years` (columns).
# NOTE(review): labels are applied by position, so this assumes the rows of
# sampling_GDP come in the same order as `countries` -- confirm against the CSV.
GDP_score = np.array(transform_GDP)
clean_GDP = pd.DataFrame(GDP_score, columns=[years], index=[countries])

### Social Support

In [48]:
# Read the social support CSV and use the 'Country name' column as the row index.
Social_Support_vivi = 'Data sets/From Vivi/HI_Social support.csv'
Social_Support = pd.read_csv(Social_Support_vivi).set_index('Country name')

In [49]:
# Keep only the rows whose index label appears in `countries`.
sampling_Social_Support = Social_Support.loc[Social_Support.index.isin(countries)]

In [50]:
# Fill NaN social support values with sklearn's SimpleImputer using the 'mean' strategy.
imputer = SimpleImputer(strategy='mean', missing_values=nan)
values = sampling_Social_Support.to_numpy()
transform_Social_Support = imputer.fit_transform(values)
# Copy the imputed matrix and label it with `countries` (rows) and `years` (columns).
# NOTE(review): labels are applied by position, so this assumes the rows of
# sampling_Social_Support come in the same order as `countries` -- confirm against the CSV.
Social_Support_score = np.array(transform_Social_Support)
clean_Social_Support = pd.DataFrame(Social_Support_score, columns=[years], index=[countries])

### Freedom

In [58]:
# Read the freedom CSV and use the 'Country name' column as the row index.
Freedom_vivi = 'Data sets/From Vivi/HI_freedom.csv'
Freedom = pd.read_csv(Freedom_vivi).set_index('Country name')

In [59]:
# Keep only the rows whose index label appears in `countries`.
sampling_Freedom = Freedom.loc[Freedom.index.isin(countries)]

In [60]:
# Fill NaN freedom values with sklearn's SimpleImputer using the 'mean' strategy.
imputer = SimpleImputer(strategy='mean', missing_values=nan)
values = sampling_Freedom.to_numpy()
transform_Freedom = imputer.fit_transform(values)
# Copy the imputed matrix and label it with `countries` (rows) and `years` (columns).
# NOTE(review): labels are applied by position, so this assumes the rows of
# sampling_Freedom come in the same order as `countries` -- confirm against the CSV.
Freedom_score = np.array(transform_Freedom)
clean_Freedom = pd.DataFrame(Freedom_score, columns=[years], index=[countries])

### Health (Life Expectancy) 

In [30]:
# Read the healthy life expectancy CSV and use the 'Country name' column as the row index.
Health_life_expectancy_vivi = 'Data sets/From Vivi/HI_life expectancy.csv'
Health_life_expectancy = pd.read_csv(Health_life_expectancy_vivi).set_index('Country name')

In [31]:
# Keep only the rows whose index label appears in `countries`.
sampling_Health_life_expectancy = Health_life_expectancy.loc[
    Health_life_expectancy.index.isin(countries)
]

In [43]:
# Fill NaN life expectancy values with sklearn's SimpleImputer using the 'mean' strategy.
imputer = SimpleImputer(strategy='mean', missing_values=nan)
values = sampling_Health_life_expectancy.to_numpy()
transform_Health_life_expectancy = imputer.fit_transform(values)
# Copy the imputed matrix and label it with `countries` (rows) and `years` (columns).
# NOTE(review): labels are applied by position, so this assumes the rows of
# sampling_Health_life_expectancy come in the same order as `countries` -- confirm against the CSV.
Health_life_expectancy_score = np.array(transform_Health_life_expectancy)
clean_Health_life_expectancy = pd.DataFrame(
    Health_life_expectancy_score, columns=[years], index=[countries]
)

### Generosity

In [62]:
# Read the generosity CSV and use the 'Country name' column as the row index.
Generosity_vivi = 'Data sets/From Vivi/HI_Generosity.csv'
Generosity = pd.read_csv(Generosity_vivi).set_index('Country name')

In [63]:
# Keep only the rows whose index label appears in `countries`.
sampling_Generosity = Generosity.loc[Generosity.index.isin(countries)]

In [65]:
# Fill NaN generosity values with sklearn's SimpleImputer using the 'mean' strategy.
imputer = SimpleImputer(strategy='mean', missing_values=nan)
values = sampling_Generosity.to_numpy()
transform_Generosity = imputer.fit_transform(values)
# Copy the imputed matrix and label it with `countries` (rows) and `years` (columns).
# NOTE(review): labels are applied by position, so this assumes the rows of
# sampling_Generosity come in the same order as `countries` -- confirm against the CSV.
Generosity_score = np.array(transform_Generosity)
clean_Generosity = pd.DataFrame(Generosity_score, columns=[years], index=[countries])

### Trust (Perception of Corruption)

In [67]:
# Load/Read Trust (Perception of Corruption) Data
# Reads the corruption CSV named by Trust_vivi (not the Generosity file) and
# uses the 'Country name' column as the row index.
Trust_vivi = 'Data sets/From Vivi/HI_corruption.csv'
Trust = pd.read_csv(Trust_vivi).set_index('Country name')

In [69]:
# Keep only the rows whose index label appears in `countries`.
sampling_Trust = Trust.loc[Trust.index.isin(countries)]

In [72]:
# Fill NaN trust values with sklearn's SimpleImputer using the 'mean' strategy.
imputer = SimpleImputer(strategy='mean', missing_values=nan)
values = sampling_Trust.to_numpy()
transform_Trust = imputer.fit_transform(values)
# Copy the imputed matrix and label it with `countries` (rows) and `years` (columns).
# NOTE(review): labels are applied by position, so this assumes the rows of
# sampling_Trust come in the same order as `countries` -- confirm against the CSV.
Trust_score = np.array(transform_Trust)
clean_Trust = pd.DataFrame(Trust_score, columns=[years], index=[countries])