In [21]:
# Dependencies
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import seaborn as sns
from numpy import nan
from numpy import isnan
from sklearn.impute import SimpleImputer

In [22]:
# Set function for scatter plot, correlation analysis, regression coefficient, linear equation, & saving figure
def linregres_func(x_values, y_values, x_label, y_label, ann_horizontal, ann_vertical, savefile): 
                                                                            
    (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
    
    regress_values = x_values * slope + intercept
    eq_line = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
    
    plt.scatter(x_values, y_values, c='lightskyblue', edgecolor='maroon')
    plt.plot(x_values, regress_values, 'r-')
    plt.annotate(eq_line, (ann_horizontal, ann_vertical), fontsize=11, color='red')
    plt.title(f" {x_label} vs {y_label} \n")
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.tight_layout
    print(f"The r-squared is: {rvalue}")
    plt.savefig(f'../figures/{savefile}')
    plt.show()

## Correlations Summary
Based on Normalized Data


In [24]:
# Load/Read Normalized Data
pd.set_option('display.max_columns', None)
path_Normalized_v2 = "../Outputs/Normalized Table 2017_df.csv"
df_normalized = pd.read_csv(path_Normalized_v2)
df_normalized.rename(columns={"Healthy life expectancy at birth":"Healthy life exp","Life Ladder":"Happiness Index",
                             "Freedom to make life choices":"Freedom", "Log GDP per capita":"Log GDP"},inplace=True)                             
df_normalized.head(3)

Unnamed: 0,Country name,Code,Happiness Index,Log GDP,Social support,Healthy life exp,Freedom,Generosity,Perceptions of corruption,Suicide,Depression & Anxiety,Substance Abuse,Physician perpop,Average age,Sleep (min),Gini index,Pct Poverty,Pct spent ed,Literacy rate,Tourist arrivals,Rev tourism,Fragile State index,Security index,Brain drain,Unemployment rate,Blue Index
0,Afghanistan,AFG,0.0,0.138829,0.096154,0.177083,0.0,0.233766,1.0,0.071535,0.391471,0.19681,,0.106583,,,,0.375431,,,0.0,0.958874,1.0,0.92,0.402182,0.219939
1,Albania,ALB,0.385965,0.553145,0.384615,0.71875,0.571429,0.337662,0.911392,0.089793,0.258033,0.174371,,0.548589,,,,0.234483,,0.053397,0.008087,0.452381,0.411111,0.84,0.498871,0.174066
2,Argentina,ARG,0.658869,0.655098,0.903846,0.725694,0.714286,0.142857,0.860759,0.254116,0.351928,0.399367,0.648456,0.510972,0.506173,0.544803,0.344675,0.271983,,0.077236,0.023135,0.319264,0.366667,0.226667,0.295711,0.335137


In [26]:
# Re-arrange variable columns
correlation_overview = df_normalized.loc[:, ['Brain drain', 'Security index',  'Fragile State index ', 'Pct Poverty','Perceptions of corruption',
                                      'Unemployment rate', 'Physician perpop', 'Substance Abuse', 'Depression & Anxiety',
                                      'Pct spent ed', 'Suicide', 'Blue Index', 'Tourist arrivals', 'Average age', 
                                      'Rev tourism', 'Literacy rate', 'Sleep (min)', 'Generosity', 'Healthy life exp',
                                      'Freedom', 'Social support', 'Log GDP', 'Happiness Index' ]]

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'

In [6]:
# Dispaly correlation table
overview = correlation_overview.corr(method='pearson').round(decimals=2)
overview

NameError: name 'correlation_overview' is not defined

In [None]:
# Plot and display correlation summary heatmap
%matplotlib notebook
sns.heatmap(overview, 
            xticklabels=overview.columns,
            yticklabels=overview.columns,
            cmap='RdPu',#bwr
            annot=False,
            linewidth=.5)
plt.title('Correlations Summary \n Heat Map \n ', fontsize= 10)
plt.tight_layout()
plt.yticks(fontsize=8)
plt.xticks(fontsize=8)
plt.savefig('../Figures/Correlation Summary Heatmap')
plt.show()

##  Focus: Impact of Economic Health to Happiness 
- Log GDP and Unemployment Rate data are correlated with Happiness Index

In [None]:
# Select Variables Pertaining to Economic Health vs Happiness Index
Economic_health_Correlation = df_normalized.loc[:, ['Happiness Index', 'Log GDP', 'Unemployment rate',]]

In [None]:
# Display correlation table
Economic_health_Correlation = Economic_health_Correlation.corr(method='pearson').round(decimals=2)
Economic_health_Correlation

In [None]:
# Plot and display heat map of economic health vs happiness correlations
%matplotlib notebook
sns.heatmap(Economic_health_Correlation, 
            xticklabels=Economic_health_Correlation.columns, 
            yticklabels=Economic_health_Correlation.columns,
            cmap='RdPu',#bwr
            annot=True,
            linewidth=.5)
plt.title('Economic Health and Happiness \n Correlation Heat Map \n ', fontsize= 10)
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('Figures/Economic Health and Happiness Heatmap')
plt.show()

### Sample Y2017 Data Analysis

In [None]:
# Load/Read Data
pd.set_option('display.max_columns', None)
path = "../Outputs/df_2017.csv"
df_2017 = pd.read_csv(path)
# Clean csv
del df_2017['Unnamed: 0']
df_2017.head(1)

In [None]:
# Sort and determine Top Countries with Highest Log GDP 
Top_logGDP = df_2017.loc[:, ['Country name', 'Log GDP', 'Happiness Index']]
Top_logGDP.sort_values(['Log GDP'], ascending=False, inplace=True) 
Top_logGDP.head()

In [None]:
# Inspect countries with a  High Log GDP but with Low Happiness Index based on a set criteria
HighGDP_LowHappy = Top_logGDP[(Top_logGDP["Happiness Index"]<=5.5) 
                              & (Top_logGDP["Log GDP"]>=10)]
HighGDP_LowHappy.sort_values(['Log GDP'], ascending=False, inplace=True)
HighGDP_LowHappy.head(10)

In [None]:
# Sort and determine Top Countries with Highest Unemployment in Y2017
Top_Unemployment = df_2017.loc[:, ['Country name', 'Unemployment rate', 'Happiness Index']]
Top_Unemployment.sort_values(['Unemployment rate'], ascending=False, inplace=True) 
Top_Unemployment.head()

In [None]:
# Inspect Unemployment but with the High Happiness Index based on a set criteria
HighUnemployment_HighHappy = Top_Unemployment[(Top_Unemployment["Happiness Index"]>=6) 
                                              & (Top_Unemployment["Unemployment rate"]>=17)] 
HighUnemployment_HighHappy.sort_values(['Unemployment rate'], ascending=False, inplace=True)
HighUnemployment_HighHappy.head(10)

---
## Focus: Factors not included in World Happiness Index Report

Selected Variables for Analysis, Correlation, and Regression based on Normalized Data:
- Tourism Revenue
- Average Age
- Brain Drain
- Security Threat

### Tourism Revenue vs Happiness

In [None]:
# Select variables and check NaN values
rev_tourism = df_normalized.loc[:, ['Country name', 'Happiness Index', 'Rev tourism',]]
rev_tourism.info()

In [None]:
# Calculate mean to replace NaN
rev_tourism.mean()

In [None]:
# Fill in mean values and re-check info
fill_rev_tourism = rev_tourism.fillna(rev_tourism.mean())
fill_rev_tourism.info()

In [None]:
# Display scatter plot, correlation & linear equation
linregres_func(fill_rev_tourism["Rev tourism"], fill_rev_tourism["Happiness Index"], 'Rev tourism', 'Happiness Index', 0.4, 0.2, 'RevTourism vs Happiness.png')

---
### Average Age  vs Happiness

In [None]:
# Select variables and check NaN values
Average_Age = df_normalized.loc[:, ['Country name', 'Happiness Index', 'Average Age']]
Average_Age.info()

In [None]:
# Calculate mean to replace NaN
Average_Age.mean()

In [None]:
# Fill in mean values and re-check info
fill_Average_Age = Average_Age.fillna(Average_Age.mean())
fill_Average_Age.info()

In [None]:
# Display scatter plot, correlation & linear equation
linregres_func(fill_Average_Age["Average Age"], fill_Average_Age["Happiness Index"], 'Average Age', 'Happiness Index', 0.6, 0.19, 'Average Age vs Happiness.png')

---
### Brain Drain vs Happiness

In [None]:
# Select variables and check NaN values
brain_drain = df_normalized.loc[:, ['Country name', 'Happiness Index', 'brain drain',]]
brain_drain.info()

In [None]:
# Calculate mean to replace NaN
brain_drain.mean()

In [None]:
# Fill in mean values and re-check info
fill_brain_drain = brain_drain.fillna(brain_drain.mean())
fill_brain_drain.info()

In [None]:
# Display scatter plot, correlation & linear equation
linregres_func(fill_brain_drain["brain drain"], fill_brain_drain["Happiness Index"], 'Brain Drain', 'Happiness Index', 0.1, 0.3, 'Brain drain vs Happiness.png')

### Security Threat vs Happiness

In [None]:
# Select variables and check NaN values
Security_index = df_normalized.loc[:, ['Country name', 'Happiness Index', 'Security index',]]
Security_index.info()

In [None]:
# Calculate mean to replace NaN
Security_index.mean()

In [None]:
# Fill in mean values and re-check info
fill_Security_index = Security_index.fillna(Security_index.mean())
fill_Security_index.info()

In [None]:
# Display scatter plot, correlation & linear equation
linregres_func(fill_Security_index["Security index"], fill_brain_drain["Happiness Index"], 'Security Threat', 'Happiness Index', 0.1, 0.35, 'Security Threat vs Happiness.png')