In [3]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib widget
import seaborn as sns
import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.offline import iplot, init_notebook_mode
from utils import load_data_frame_from_strings

### Gather data from different sources:

The main sources for this analysis are the two sheets from the [World Happiness Report 2023 (official website)](https://worldhappiness.report/ed/2023/#appendices-and-data):
- DataForFigure2.1WHR2023.csv
- DataForTable2.1WHR2023.csv

However, they do not contain information about the region of each country. Therefore I also imported the [World Happiness Report 2021 dataset from Kaggle](https://www.kaggle.com/datasets/ajaypalsinghlo/world-happiness-report-2021), which does contain this information. Apart from the region mapping, this dataset is not used in any other step of the analysis.

In [4]:
# Load the first WHR 2023 sheet (the data behind Figure 2.1) through the
# project helper imported from `utils`.
# NOTE(review): the helper's parsing behaviour is not visible in this notebook;
# the path is relative, so it depends on where the notebook is run from.
df1 = load_data_frame_from_strings("../data/DataForFigure2.1WHR2023.csv")

In [5]:
# Preview the loaded sheet; a bare last expression is rendered as a table.
df1

Unnamed: 0,Country name,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,7.804,0.036,7.875,7.733,10.792,0.969,71.150,0.961,-0.019,0.182,1.778,1.888,1.585,0.535,0.772,0.126,0.535,2.363
1,Denmark,7.586,0.041,7.667,7.506,10.962,0.954,71.250,0.934,0.134,0.196,1.778,1.949,1.548,0.537,0.734,0.208,0.525,2.084
2,Iceland,7.530,0.049,7.625,7.434,10.896,0.983,72.050,0.936,0.211,0.668,1.778,1.926,1.620,0.559,0.738,0.250,0.187,2.250
3,Israel,7.473,0.032,7.535,7.411,10.639,0.943,72.697,0.809,-0.023,0.708,1.778,1.833,1.521,0.577,0.569,0.124,0.158,2.691
4,Netherlands,7.403,0.029,7.460,7.346,10.942,0.930,71.550,0.887,0.213,0.379,1.778,1.942,1.488,0.545,0.672,0.251,0.394,2.110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,Congo (Kinshasa),3.207,0.095,3.394,3.020,7.007,0.652,55.375,0.664,0.086,0.834,1.778,0.531,0.784,0.105,0.375,0.183,0.068,1.162
133,Zimbabwe,3.204,0.061,3.323,3.084,7.641,0.690,54.050,0.654,-0.046,0.766,1.778,0.758,0.881,0.069,0.363,0.112,0.117,0.905
134,Sierra Leone,3.138,0.082,3.299,2.976,7.394,0.555,54.900,0.660,0.105,0.858,1.778,0.670,0.540,0.092,0.371,0.193,0.051,1.221
135,Lebanon,2.392,0.044,2.479,2.305,9.478,0.530,66.149,0.474,-0.141,0.891,1.778,1.417,0.476,0.398,0.123,0.061,0.027,-0.110


In [6]:
# Load the second WHR 2023 sheet (the data behind Table 2.1), which holds one
# row per country and year, through the same project helper.
# NOTE(review): relative path — depends on where the notebook is run from.
df2 = load_data_frame_from_strings("../data/DataForTable2.1WHR2023.csv")

In [7]:
# Preview the yearly panel data; a bare last expression is rendered as a table.
df2

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect
0,Afghanistan,2008,3.724,7.350,0.451,50.500,0.718,0.168,0.882,0.414,0.258
1,Afghanistan,2009,4.402,7.509,0.552,50.800,0.679,0.191,0.850,0.481,0.237
2,Afghanistan,2010,4.758,7.614,0.539,51.100,0.600,0.121,0.707,0.517,0.275
3,Afghanistan,2011,3.832,7.581,0.521,51.400,0.496,0.164,0.731,0.480,0.267
4,Afghanistan,2012,3.783,7.661,0.521,51.700,0.531,0.238,0.776,0.614,0.268
...,...,...,...,...,...,...,...,...,...,...,...
2194,Zimbabwe,2018,3.616,7.783,0.775,52.625,0.763,-0.051,0.844,0.658,0.212
2195,Zimbabwe,2019,2.694,7.698,0.759,53.100,0.632,-0.047,0.831,0.658,0.235
2196,Zimbabwe,2020,3.160,7.596,0.717,53.575,0.643,0.006,0.789,0.661,0.346
2197,Zimbabwe,2021,3.155,7.657,0.685,54.050,0.668,-0.076,0.757,0.610,0.242


Country - researched country.

Happiness score - an index that reflects the well-being of people and the state of the environment in different countries of the world, which was proposed by the New Economics Foundation in July 2006.

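Whisker-high and Whisker-low (`upperwhisker`, `lowerwhisker`) - the upper and lower bounds of the 95% confidence interval around the happiness (ladder) score, i.e. approximately the score ± 1.96 × its standard error. For example, for Finland: 7.804 + 1.96 × 0.036 ≈ 7.875 and 7.804 − 1.96 × 0.036 ≈ 7.733.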

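Dystopia + residual - the ladder score of Dystopia, a hypothetical country with the lowest observed value for each of the six factors (1.778 in this dataset, see the `Ladder score in Dystopia` column; earlier editions of the report used other values such as 1.83), plus the residual, i.e. the part of a country's score that the six "Explained by" components do not account for. For every country, the six "Explained by" values and `Dystopia + residual` add up to the ladder score.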

Explained by: GDP per capita is the ratio of GDP to the population of the country, which shows how much of the gross product produced in the country in a year and expressed in value terms, falls on one inhabitant of this country.

Explained by: Social support is a form of expression of the social policy of the state, aimed at providing social assistance to needy citizens.

Explained by: Healthy life expectancy - Healthy life expectancy is an indicator for calculating which life expectancy must be adjusted for the health of the individual. This indicator represents the period of time (years, months, days) during which an individual is expected to have a certain state of health, with a constant level of mortality and morbidity.

Explained by: Freedom to make life choices - the right of a person to freely express his opinion. This right includes freedom to hold opinions and freedom to receive and impart information and ideas without any interference from public authorities and regardless of frontiers.

Explained by: Generosity - The main feature of charity is the voluntary choice of the type, time and place, as well as the content of (targeted) assistance. Charity is distinguished as a manifestation of compassion for one's neighbor and the moral duty of the possessor to rush to the aid of the have-not.

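Explained by: Perceptions of corruption - the part of the happiness score attributed to how widespread people perceive corruption to be in their country. Note that the underlying `Perceptions of corruption` column in this dataset is a share between 0 and 1 (in the summary statistics below it ranges from 0.146 to 0.929). It should not be confused with the Corruption Perceptions Index, which the international non-governmental organization Transparency International has compiled annually since 1995 on a 100-point scale from the assessments of analysts and entrepreneurs.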

In [8]:
# Load the 2021 report (Kaggle export). It is only needed for its
# country -> region mapping, which the 2023 sheets lack.
# NOTE(review): relative path — depends on where the notebook is run from.
df3 = load_data_frame_from_strings("../data/world-happiness-report-2021.csv")

In [9]:
# Reduce the 2021 data to a lookup table: one column with the country name,
# one with the region it belongs to.
country_region_map = df3.loc[:, ["Country name", "Regional indicator"]]
country_region_map

Unnamed: 0,Country name,Regional indicator
0,Finland,Western Europe
1,Denmark,Western Europe
2,Switzerland,Western Europe
3,Iceland,Western Europe
4,Netherlands,Western Europe
...,...,...
144,Lesotho,Sub-Saharan Africa
145,Botswana,Sub-Saharan Africa
146,Rwanda,Sub-Saharan Africa
147,Zimbabwe,Sub-Saharan Africa


In [10]:
# Attach each country's region to the 2023 figure data.
#
# `merge` performs an inner join by default, so a country that is missing from
# the 2021 region mapping is dropped from the result instead of being kept with
# a null region. A null check on the merged frame alone can therefore never
# reveal an incomplete mapping, so the unmatched countries are looked up
# explicitly and reported.
#
# validate="many_to_one" makes pandas raise if a country name occurs more than
# once in the mapping, which would otherwise silently duplicate rows.
exteneded_df1 = df1.merge(
    country_region_map, on="Country name", how="inner", validate="many_to_one"
)

# Countries present in the 2023 data but absent from the region mapping.
has_region = df1["Country name"].isin(country_region_map["Country name"])
countries_without_region = df1.loc[~has_region, "Country name"].tolist()
if countries_without_region:
    print(
        f"{len(countries_without_region)} countries have no region in the mapping "
        f"and are not part of the merged frame: {countries_without_region}"
    )

# Sanity check on the rows that were kept; expected to be False.
exteneded_df1["Regional indicator"].isnull().values.any()

False

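Good! `False` means that every row kept by the merge has a region. Note, however, that `merge` uses an inner join by default, so a country that is missing from the region mapping is dropped from the result rather than kept with a null region. This is why the merged frame has only 133 rows (see the `info()` output below), fewer than the original 2023 sheet.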

In [11]:
# Same region lookup for the yearly panel data. With pandas' default inner
# join only the rows whose country appears in the mapping are kept.
exteneded_df2 = pd.merge(df2, country_region_map, on="Country name")
exteneded_df2

Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Regional indicator
0,Afghanistan,2008,3.724,7.350,0.451,50.500,0.718,0.168,0.882,0.414,0.258,South Asia
1,Afghanistan,2009,4.402,7.509,0.552,50.800,0.679,0.191,0.850,0.481,0.237,South Asia
2,Afghanistan,2010,4.758,7.614,0.539,51.100,0.600,0.121,0.707,0.517,0.275,South Asia
3,Afghanistan,2011,3.832,7.581,0.521,51.400,0.496,0.164,0.731,0.480,0.267,South Asia
4,Afghanistan,2012,3.783,7.661,0.521,51.700,0.531,0.238,0.776,0.614,0.268,South Asia
...,...,...,...,...,...,...,...,...,...,...,...,...
2082,Zimbabwe,2018,3.616,7.783,0.775,52.625,0.763,-0.051,0.844,0.658,0.212,Sub-Saharan Africa
2083,Zimbabwe,2019,2.694,7.698,0.759,53.100,0.632,-0.047,0.831,0.658,0.235,Sub-Saharan Africa
2084,Zimbabwe,2020,3.160,7.596,0.717,53.575,0.643,0.006,0.789,0.661,0.346,Sub-Saharan Africa
2085,Zimbabwe,2021,3.155,7.657,0.685,54.050,0.668,-0.076,0.757,0.610,0.242,Sub-Saharan Africa


In [12]:
# Column dtypes and non-null counts of the merged figure data.
exteneded_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133 entries, 0 to 132
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                133 non-null    object 
 1   Ladder score                                133 non-null    float64
 2   Standard error of ladder score              133 non-null    float64
 3   upperwhisker                                133 non-null    float64
 4   lowerwhisker                                133 non-null    float64
 5   Logged GDP per capita                       133 non-null    float64
 6   Social support                              133 non-null    float64
 7   Healthy life expectancy                     133 non-null    float64
 8   Freedom to make life choices                133 non-null    float64
 9   Generosity                                  133 non-null    float64
 10  Perceptions of

In [13]:
# Column dtypes and non-null counts of the merged yearly data; several of the
# numeric columns contain missing values.
exteneded_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2087 entries, 0 to 2086
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Country name                      2087 non-null   object 
 1   year                              2087 non-null   int64  
 2   Life Ladder                       2087 non-null   float64
 3   Log GDP per capita                2077 non-null   float64
 4   Social support                    2078 non-null   float64
 5   Healthy life expectancy at birth  2047 non-null   float64
 6   Freedom to make life choices      2056 non-null   float64
 7   Generosity                        2028 non-null   float64
 8   Perceptions of corruption         1977 non-null   float64
 9   Positive affect                   2067 non-null   float64
 10  Negative affect                   2075 non-null   float64
 11  Regional indicator                2087 non-null   object 
dtypes: flo

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
exteneded_df1.describe()

Unnamed: 0,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,5.559233,0.064301,5.685135,5.433241,9.458504,0.798594,64.981278,0.790504,0.023729,0.722226,1.778,1.410113,1.155008,0.366549,0.544143,0.149173,0.148173,1.786135
std,1.129063,0.022977,1.107411,1.152033,1.198584,0.129744,5.735112,0.109445,0.142825,0.178601,2.674609e-15,0.429843,0.327636,0.156279,0.145622,0.076647,0.1279,0.501509
min,1.859,0.029,1.923,1.795,5.527,0.341,51.53,0.382,-0.254,0.146,1.778,0.0,0.0,0.0,0.0,0.0,0.0,-0.11
25%,4.855,0.047,4.983,4.727,8.591,0.722,60.698,0.73,-0.07,0.66,1.778,1.099,0.962,0.25,0.464,0.099,0.06,1.572
50%,5.703,0.06,5.851,5.572,9.567,0.827,65.825,0.801,0.001,0.771,1.778,1.449,1.227,0.389,0.558,0.137,0.113,1.852
75%,6.334,0.075,6.441,6.243,10.54,0.896,69.6,0.874,0.119,0.846,1.778,1.798,1.401,0.492,0.656,0.2,0.192,2.078
max,7.804,0.147,7.875,7.733,11.66,0.983,77.28,0.961,0.531,0.929,1.778,2.2,1.62,0.702,0.772,0.422,0.561,2.955


In [15]:
# Apply seaborn's default theme.
sns.set()
# Disabled pairplot, kept for reference only. It refers to `data`, a name that
# is not defined in the visible cells, so it would fail if re-enabled as is.
#sns.pairplot(data,height = 5 ,kind ='scatter',diag_kind='kde')
#plt.show()

In [16]:
# List the available column names before choosing which ones to plot.
exteneded_df1.columns

Index(['Country name', 'Ladder score', 'Standard error of ladder score',
       'upperwhisker', 'lowerwhisker', 'Logged GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Ladder score in Dystopia',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Dystopia + residual', 'Regional indicator'],
      dtype='object')

First, I want to have a look at the distribution of the numeric data in the dataset.

In [17]:
# Columns explored in the plots below, in display order.
numeric_features = [
    # the score itself and the bounds reported alongside it
    "Ladder score",
    "upperwhisker",
    "lowerwhisker",
    # the six "Explained by" components of the score
    "Explained by: Log GDP per capita",
    "Explained by: Social support",
    "Explained by: Healthy life expectancy",
    "Explained by: Freedom to make life choices",
    "Explained by: Generosity",
    "Explained by: Perceptions of corruption",
    # the remainder of the score
    "Dystopia + residual",
]

In [18]:
# Distribution overview: one subplot row per feature, showing the same column
# as a histogram (col 1), a box plot (col 2) and a violin plot (col 3).
n_features = len(numeric_features)
fig = make_subplots(rows=n_features, cols=3)
# enumerate(start=1) because plotly subplot rows are 1-based.
for row, feature in enumerate(numeric_features, start=1):
    feature_values = exteneded_df1[feature]
    fig.add_trace(go.Histogram(x=feature_values, name=feature), row=row, col=1)
    fig.add_trace(go.Box(x=feature_values, name=feature), row=row, col=2)
    fig.add_trace(go.Violin(x=feature_values, name=feature), row=row, col=3)
# 240 px per row keeps the figure at 2400 px for the current ten features and
# lets it scale when the feature list changes.
fig.update_layout(
    height=240 * n_features,
    width=1800,
    title_text='<b>Continuous distributions</b>',
    title_x=0.5,
)
fig.show()

In [19]:
# One bar chart per feature with the five countries that have the largest values.
for feature in numeric_features:
    # nlargest returns the rows in descending order; sorting again makes the
    # bars rise from left to right.
    top5 = exteneded_df1.nlargest(5, feature).sort_values(by=feature)
    # The plotly update_* methods return the figure, so they can be chained.
    fig_bar = (
        px.bar(x=top5['Country name'], y=top5[feature])
        .update_layout(width=1000, height=500, title_text=f"TOP 5 country by {feature}")
        .update_xaxes(title='Country')
        .update_yaxes(title=f'{feature}')
    )
    fig_bar.show()

In [20]:
# Scatter each feature against the ladder score, one figure per feature.
# The feature list starts with 'Ladder score' itself, so the first figure
# plots the score against itself.
for feature in numeric_features:
    fig = px.scatter(exteneded_df1, x='Ladder score', y=feature).update_layout(
        title=f"Scatter plot Happiness score with {feature}"
    )
    fig.show()

In [21]:
# Colour stops shared by every map. Most of the stops sit in the lowest fifth
# of the scale; everything from 0.20 up to 1 is a single yellow-to-red ramp.
map_colorscale = [
    [0, 'rgb(224,255,255)'],
    [0.01, 'rgb(166,206,227)'],
    [0.02, 'rgb(31,120,180)'],
    [0.03, 'rgb(178,223,138)'],
    [0.05, 'rgb(51,160,44)'],
    [0.10, 'rgb(251,154,153)'],
    [0.20, 'rgb(255,255,0)'],
    [1, 'rgb(227,26,28)'],
]

# Draw one world map per feature, colouring each country by its value.
for feature in numeric_features:
    choropleth_trace = {
        'type': 'choropleth',
        'locations': exteneded_df1['Country name'],
        # countries are matched by their names rather than by ISO codes
        'locationmode': 'country names',
        'z': exteneded_df1[feature],
        'text': exteneded_df1['Country name'],
        'colorbar': {'title': f'{feature}'},
        'colorscale': map_colorscale,
        'reversescale': False,
    }
    layout = {
        'title': f'Visualization of the parameter {feature} on the map',
        'geo': {'showframe': True},
    }
    choromap = go.Figure(data=[choropleth_trace], layout=layout)
    iplot(choromap, validate=False)

In [22]:
# Pairwise correlations between the selected features, drawn as a heatmap
# with the coefficient written into every cell.
feature_correlations = exteneded_df1[numeric_features].corr()
fig = px.imshow(
    feature_correlations,
    text_auto=True,
    color_continuous_scale='Viridis',
    aspect='auto',
    title='<b>Correlation matrix',
).update_layout(title_x=0.5)
fig.show()