In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Show NumPy arrays with 4 decimal places and without scientific notation.
np.set_printoptions(suppress=True, precision=4)

In [32]:
# Original CSV header -> short column name. The key order is also the column
# order of the resulting frame ('Generosity' keeps its name).
short_names = {
    'Country name': 'Country',
    'Ladder score': 'Happiness',
    'Logged GDP per capita': 'GDP',
    'Social support': 'Social',
    'Healthy life expectancy': 'Life',
    'Freedom to make life choices': 'Freedom',
    'Generosity': 'Generosity',
    'Perceptions of corruption': 'Corruption',
}

# Read the report, keep only the columns above, shorten their names and
# index the rows by country.
df = (
    pd.read_csv('world-happiness-report-2021.csv')
    .loc[:, list(short_names)]
    .rename(columns=short_names)
    .set_index('Country')
)
df

Unnamed: 0_level_0,Happiness,GDP,Social,Life,Freedom,Generosity,Corruption
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Finland,7.842,10.775,0.954,72.000,0.949,-0.098,0.186
Denmark,7.620,10.933,0.954,72.700,0.946,0.030,0.179
Switzerland,7.571,11.117,0.942,74.400,0.919,0.025,0.292
Iceland,7.554,10.878,0.983,73.000,0.955,0.160,0.673
Netherlands,7.464,10.932,0.942,72.400,0.913,0.175,0.338
...,...,...,...,...,...,...,...
Lesotho,3.512,7.926,0.787,48.700,0.715,-0.131,0.915
Botswana,3.467,9.782,0.784,59.269,0.824,-0.246,0.801
Rwanda,3.415,7.676,0.552,61.400,0.897,0.061,0.167
Zimbabwe,3.145,7.943,0.750,56.201,0.677,-0.047,0.821


In [33]:
# Center every column on its own mean. Columns are only shifted, not
# rescaled, so each one keeps its original units and spread.
column_means = df.mean()
df = df - column_means
df

Unnamed: 0_level_0,Happiness,GDP,Social,Life,Freedom,Generosity,Corruption
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Finland,2.309161,1.342792,0.139255,7.007201,0.157403,-0.082866,-0.54145
Denmark,2.087161,1.500792,0.139255,7.707201,0.154403,0.045134,-0.54845
Switzerland,2.038161,1.684792,0.127255,9.407201,0.127403,0.040134,-0.43545
Iceland,2.021161,1.445792,0.168255,8.007201,0.163403,0.175134,-0.05445
Netherlands,1.931161,1.499792,0.127255,7.407201,0.121403,0.190134,-0.38945
...,...,...,...,...,...,...,...
Lesotho,-2.020839,-1.506208,-0.027745,-16.292799,-0.076597,-0.115866,0.18755
Botswana,-2.065839,0.349792,-0.030745,-5.723799,0.032403,-0.230866,0.07355
Rwanda,-2.117839,-1.756208,-0.262745,-3.592799,0.105403,0.076134,-0.56045
Zimbabwe,-2.387839,-1.489208,-0.064745,-8.791799,-0.114597,-0.031866,0.09355


# PCA

In [34]:
# Data matrix with one row per feature and one column per country (d x n).
X = df.to_numpy().T
X.shape

(7, 149)

In [35]:
# Covariance matrix of the features: X X^T divided by the number of
# samples n (the columns of X).
n = X.shape[1]
gram = X @ X.T
C = gram / n
C.shape

(7, 7)

In [36]:
# C = X X^T / n is symmetric, so use the symmetric solver: it always returns
# real eigenvalues/eigenvectors. np.linalg.eig makes no promise about the
# order of the eigenvalues, while the dimensionality-reduction step takes the
# first rows of `eigenvectors` as the leading principal components, so the
# eigenpairs are sorted explicitly by decreasing eigenvalue here.
# NOTE: the sign of each eigenvector is arbitrary and may differ from what
# np.linalg.eig returned; projections onto a flipped axis change sign too.
eigenvalues, eigenvectors = np.linalg.eigh(C)  # ascending; eigenvectors in columns
order = np.argsort(eigenvalues)[::-1]          # positions of eigenvalues, largest first
eigenvalues = eigenvalues[order]
eigenvectors = eigenvectors[:, order].T        # one principal direction per row

eigenvalues
eigenvectors

array([47.1136,  0.5655,  0.2401,  0.0294,  0.0178,  0.0074,  0.0038])

array([[-0.1215, -0.1461, -0.0122, -0.9816, -0.0077,  0.0035,  0.0096],
       [ 0.817 ,  0.5398,  0.0547, -0.1829,  0.0488,  0.0169, -0.0457],
       [-0.5503,  0.825 ,  0.0174, -0.0542, -0.0594, -0.085 ,  0.051 ],
       [ 0.096 , -0.0514,  0.0855,  0.0025, -0.1617, -0.4809,  0.8505],
       [-0.0288,  0.0429,  0.0896,  0.0033,  0.0891,  0.8564,  0.498 ],
       [ 0.0651,  0.0017, -0.2433,  0.0021, -0.9508,  0.1667, -0.0693],
       [-0.0275, -0.0466,  0.9602, -0.0011, -0.2366,  0.0058, -0.1379]])

In [37]:
# Cumulative share of the total variance captured by the first k components.
# The last running total equals the sum of all eigenvalues.
cumulative = np.cumsum(eigenvalues)
cumulative / cumulative[-1]

array([0.982 , 0.9938, 0.9988, 0.9994, 0.9998, 0.9999, 1.    ])

# Dimensionality reduction using PCA

In [38]:
# Projection matrix: the first two rows of `eigenvectors`, one direction per row.
W = eigenvectors[:2, :]

In [39]:
# Project every sample onto the retained directions: each column of
# `coefficients` holds one sample's coordinates along the rows of W.
(W.shape, X.shape)
coefficients = W @ X
coefficients.shape

((2, 7), (7, 149))

(2, 149)

In [40]:
# Map the low-dimensional coordinates back to feature space to obtain the
# reconstruction of X from the retained directions only.
(W.T.shape, coefficients.shape)
X_new = W.T @ coefficients
X_new.shape

((7, 2), (2, 149))

(7, 149)

In [41]:
# Mean squared reconstruction error, averaged over every entry of X.
residual = X - X_new
np.mean(residual ** 2)

np.float64(0.042643510026191274)

In [42]:
# Put the reconstruction back into a frame with the same row and column
# labels as `df`, then show both frames for comparison.
df_new = pd.DataFrame(data=X_new.T, index=df.index, columns=df.columns)
df
df_new

Unnamed: 0_level_0,Happiness,GDP,Social,Life,Freedom,Generosity,Corruption
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Finland,2.309161,1.342792,0.139255,7.007201,0.157403,-0.082866,-0.54145
Denmark,2.087161,1.500792,0.139255,7.707201,0.154403,0.045134,-0.54845
Switzerland,2.038161,1.684792,0.127255,9.407201,0.127403,0.040134,-0.43545
Iceland,2.021161,1.445792,0.168255,8.007201,0.163403,0.175134,-0.05445
Netherlands,1.931161,1.499792,0.127255,7.407201,0.121403,0.190134,-0.38945
...,...,...,...,...,...,...,...
Lesotho,-2.020839,-1.506208,-0.027745,-16.292799,-0.076597,-0.115866,0.18755
Botswana,-2.065839,0.349792,-0.030745,-5.723799,0.032403,-0.230866,0.07355
Rwanda,-2.117839,-1.756208,-0.262745,-3.592799,0.105403,0.076134,-0.56045
Zimbabwe,-2.387839,-1.489208,-0.064745,-8.791799,-0.114597,-0.031866,0.09355


Unnamed: 0_level_0,Happiness,GDP,Social,Life,Freedom,Generosity,Corruption
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Finland,2.012591,1.814292,0.164718,6.977966,0.123442,-0.002833,-0.133047
Denmark,1.914383,1.794330,0.160907,7.688816,0.117865,-0.008987,-0.129451
Switzerland,1.908038,1.901219,0.167330,9.393404,0.118211,-0.019312,-0.133789
Iceland,1.819512,1.749652,0.155661,7.986138,0.112313,-0.012602,-0.124905
Netherlands,1.810466,1.704875,0.152663,7.393269,0.111519,-0.009228,-0.122760
...,...,...,...,...,...,...,...
Lesotho,-1.591508,-2.134661,-0.173406,-16.250268,-0.102180,0.066476,0.134778
Botswana,-1.082264,-1.097933,-0.096116,-5.627605,-0.067178,0.012747,0.076712
Rwanda,-2.127309,-1.671279,-0.158915,-3.597326,-0.128870,-0.019620,0.130221
Zimbabwe,-2.058729,-1.961685,-0.175016,-8.760139,-0.126962,0.012607,0.140566


In [43]:
# Countries to pick out of the data, looked up by their index label.
countries = ['Finland', 'Denmark', 'Switzerland', 'Iceland', 'Netherlands',
             'Sweden', 'New Zealand', 'Australia', 'Canada', 'Germany',
             'United Kingdom', 'United States', 'India', 'Chile', 'South Korea',
             'Afghanistan', 'Zimbabwe', 'Rwanda', 'Haiti', 'Botswana']

# Index.get_indexer marks labels it cannot find with -1. Used as a positional
# index, -1 would silently select the last row instead of failing, so a
# misspelled or absent country is reported explicitly.
indices = df.index.get_indexer(countries)
missing = [name for name, pos in zip(countries, indices) if pos == -1]
if missing:
    raise KeyError(f'Countries not found in the data: {missing}')
indices

array([  0,   1,   2,   3,   4,   6,   8,  10,  13,  12,  16,  18, 138,
        42,  61, 148, 147, 146, 142, 145])

In [44]:
# Keep only the columns of `coefficients` at the selected positions.
coefficients_ = coefficients[..., indices]
coefficients_

array([[-7.3635, -8.0463, -9.7346, -8.32  , -7.7303, -8.0045, -8.6483,
        -9.1478, -9.0374, -7.781 , -7.7452, -3.5544,  4.5894, -5.0857,
        -8.9587, 12.8985,  9.1403,  4.0379,  9.647 ,  5.8185],
       [ 1.3687,  1.1469,  0.8881,  0.9901,  1.0667,  0.8985,  0.5799,
         0.4844,  0.4258,  0.7518,  0.5926,  1.4421, -0.9755, -0.0525,
        -0.7236, -1.1607, -1.161 , -2.0036, -0.94  , -0.4596]])

In [45]:
# Scatter the selected countries using the first row of `coefficients_` as
# the x axis (PC1) and the second row as the y axis (PC2), labelling each
# point with its country name.
pc1 = coefficients_[0]
pc2 = coefficients_[1]
fig = px.scatter(
    x=pc1,
    y=pc2,
    text=countries,
    title='PCA of World Happiness Report 2021',
    labels={'x': 'PC1', 'y': 'PC2'},
)

fig.show()