In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

seed = 42
np.random.seed(seed)

In [17]:
df = pd.read_csv('cleaned_whr.csv')
print(df.shape)
df.head(3)

(1810, 9)


Unnamed: 0,Country name,year,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption
0,Afghanistan,2008,3.724,7.37,0.451,50.8,0.718,0.168,0.882
1,Afghanistan,2009,4.402,7.54,0.552,51.2,0.679,0.19,0.85
2,Afghanistan,2010,4.758,7.647,0.539,51.6,0.6,0.121,0.707


In [5]:
df.columns

Index(['Country name', 'year', 'Life Ladder', 'Log GDP per capita',
       'Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption'],
      dtype='object')

## Preprocessing

In [24]:
features = ['year', 'Log GDP per capita',
       'Social support', 'Healthy life expectancy at birth',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption']

feat_vals = df[features]

labels = df[['Life Ladder']] # Life Ladder depicts how happy a country's citizens are

In [25]:
# min-max scale
scaler = MinMaxScaler()

scaled_feats = scaler.fit_transform(feat_vals)

In [26]:
train_feats, val_feats, y_train, y_val  = train_test_split(scaled_feats, labels, test_size=0.2, random_state=seed)

# Linear Regression

Use Linear Regression to understand how each feature contribute to the happiness score of a country

In [27]:
# Evaluate how well linear regression is able to predict Happiness

linreg_train = LinearRegression().fit(train_feats, y_train)

r_squared = linreg_train.score(val_feats, y_val)

print(f'Linear Regression achieves {r_squared} r-squared on the validation set')

Linear Regression achieves 0.7549953751964085 r-squared on the validation set


In [47]:
print(f'Coefficients for each feature (train): \n')
for feat, coef in zip(features, linreg_train.coef_[0]):
    feature_weights[feat] = coef
    print(f'{feat}: {coef}')

Coefficients for each feature (train): 

year: -0.21363345781294302
Log GDP per capita: 1.823592394378828
Social support: 1.5589416068680688
Healthy life expectancy at birth: 1.4104160284166016
Freedom to make life choices: 0.8364363039406275
Generosity: 0.732481474023862
Perceptions of corruption: -0.5828302586085676


In [41]:
linreg = LinearRegression().fit(scaled_feats, labels)

feature_weights = {}

print(f'Coefficients for each feature: \n')
for feat, coef in zip(features, linreg.coef_[0]):
    feature_weights[feat] = coef
    print(f'{feat}: {coef}')

Coefficients for each feature: 

year: -0.21495044765877314
Log GDP per capita: 1.7427729120531477
Social support: 1.5559589175083068
Healthy life expectancy at birth: 1.484179657883182
Freedom to make life choices: 0.8337622070699899
Generosity: 0.6861463938437915
Perceptions of corruption: -0.6399692613657582


# Random Forest

Use Random Forest to understand the relative importance of each feature

In [35]:
# Evaluate how well Random Forest is able to predict Happiness

rf_train = RandomForestRegressor().fit(train_feats, y_train)

r_squared = rf_train.score(val_feats, y_val)

print(f'Random Forest achieves {r_squared} r-squared on the validation set')

  rf_train = RandomForestRegressor().fit(train_feats, y_train)


Random Forest achieves 0.8626489704981295 r-squared on the validation set


In [48]:
print(f'Coefficients for each feature (train): \n')
for feat, imp in zip(features, rf_train.feature_importances_):
    print(f'{feat}: {imp}')

Coefficients for each feature (train): 

year: 0.018925953565194446
Log GDP per capita: 0.20257081070069663
Social support: 0.0870565685054824
Healthy life expectancy at birth: 0.5706153468914741
Freedom to make life choices: 0.04141871419085741
Generosity: 0.04056564564825611
Perceptions of corruption: 0.038846960498038846


In [51]:
rf = RandomForestRegressor().fit(scaled_feats, labels)

feature_importances = {}

print(f'Coefficients for each feature: \n')
for feat, imp in zip(features, rf.feature_importances_):
    feature_importances[feat] = imp
    print(f'{feat}: {imp}')

  rf = RandomForestRegressor().fit(scaled_feats, labels)


Coefficients for each feature: 

year: 0.01906982582864117
Log GDP per capita: 0.16275280318586433
Social support: 0.08546427251057095
Healthy life expectancy at birth: 0.6168428604785029
Freedom to make life choices: 0.04157087453213017
Generosity: 0.0388079522717796
Perceptions of corruption: 0.03549141119251098
