#### This notebook is the simplified journey of getting an R2 score of about 0.885

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Read the raw dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./dataset/dataset.csv')

# Preview the first rows, then print column dtypes and non-null counts.
display(df.head())
df.info()

Unnamed: 0,country,region,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom_to_make_life_choices,generosity,perceptions_of_corruption,year
0,Finland,Western Europe,7.804,1.888,1.585,0.535,0.772,0.126,0.535,2023
1,Denmark,Western Europe,7.586,1.949,1.548,0.537,0.734,0.208,0.525,2023
2,Iceland,Western Europe,7.53,1.926,1.62,0.559,0.738,0.25,0.187,2023
3,Israel,Middle East and North Africa,7.473,1.833,1.521,0.577,0.569,0.124,0.158,2023
4,Netherlands,Western Europe,7.403,1.942,1.488,0.545,0.672,0.251,0.394,2023


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1367 entries, 0 to 1366
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       1367 non-null   object 
 1   region                        1367 non-null   object 
 2   happiness_score               1367 non-null   float64
 3   gdp_per_capita                1367 non-null   float64
 4   social_support                1367 non-null   float64
 5   healthy_life_expectancy       1366 non-null   float64
 6   freedom_to_make_life_choices  1367 non-null   float64
 7   generosity                    1367 non-null   float64
 8   perceptions_of_corruption     1366 non-null   float64
 9   year                          1367 non-null   int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 106.9+ KB


In [3]:
# Discard every row that contains at least one missing value, then re-inspect the frame.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1365 entries, 0 to 1366
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       1365 non-null   object 
 1   region                        1365 non-null   object 
 2   happiness_score               1365 non-null   float64
 3   gdp_per_capita                1365 non-null   float64
 4   social_support                1365 non-null   float64
 5   healthy_life_expectancy       1365 non-null   float64
 6   freedom_to_make_life_choices  1365 non-null   float64
 7   generosity                    1365 non-null   float64
 8   perceptions_of_corruption     1365 non-null   float64
 9   year                          1365 non-null   int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 117.3+ KB


## Preparing Data for Modeling

The group agreed that the training set will cover 2015-2022 and the test set will be 2023. First, we will disregard the categorical data (country and region) and the year when fitting the models.

In [8]:
# Columns that are not used as predictors here: the two categorical columns,
# the year used for splitting, and the target itself.
droppies = ['country','region','year', 'happiness_score']

# Temporal hold-out: rows before 2023 are for training, rows from 2023 are for testing.
train = df.loc[df['year'] < 2023]
test = df.loc[df['year'] == 2023]

# Feature matrix and target vector for each split.
X_train, y_train = train.drop(columns=droppies), train['happiness_score']
X_test, y_test = test.drop(columns=droppies), test['happiness_score']

## Decision Tree

In [9]:
# Sweep max_depth from 1 to 6 and keep the depth with the highest R2 on the 2023 data.
# NOTE(review): the depth is chosen on the same data the score is reported on, so the
# reported R2 is optimistic; selecting on a validation split of the training years
# would give an unbiased test score.
# R2 can be negative, so the running best starts below every possible score. Starting
# at 0 would leave best_depth at an invalid placeholder if no tree scored above zero.
best_depth = None
best_result = float("-inf")

for depth in range(1,7):
    model_dt5 = DecisionTreeRegressor(random_state=616, max_depth=depth)
    model_dt5.fit(X_train, y_train)
    predictions_test_dt = model_dt5.predict(X_test)
    result = r2_score(y_test, predictions_test_dt)
    # Strict comparison: on ties the shallower (earlier) depth is kept.
    if result > best_result:
        best_result = result
        best_depth = depth
# After the loop model_dt5 is the last tree fitted (depth 6), not necessarily the best one.
f"Best R2 Score: {best_result}, Best Depth: {best_depth}"

'Best R2 Score: 0.5627363380604942, Best Depth: 6'

This is not a good result

## Random Forest

In [10]:
# Grid-search the number of trees (10 to 50 in steps of 10) and max_depth (1 to 10),
# keeping the pair with the highest R2 on the 2023 data.
# NOTE(review): the hyperparameters are chosen on the same data the score is reported
# on, so the reported R2 is optimistic; selecting on a validation split of the training
# years would give an unbiased test score.
# R2 can be negative, so the running best starts below every possible score. Starting
# at 0 would leave best_est/best_depth at invalid placeholders if nothing scored above zero.
best_result = float("-inf")
best_est = None
best_depth = None

for est in range(10, 51, 10):
    for depth in range(1,11):
        model_rf5 = RandomForestRegressor(random_state=616, n_estimators=est, max_depth=depth)
        model_rf5.fit(X_train, y_train)
        predictions_test_rf = model_rf5.predict(X_test)
        result = r2_score(y_test, predictions_test_rf)
        # Strict comparison: on ties the earlier (smaller) configuration is kept.
        if result > best_result:
            best_result = result
            best_est = est
            best_depth = depth
# After the loop model_rf5 is the last forest fitted (50 trees, depth 10), not necessarily the best one.
f"Best R2 Score: {best_result}, Best Number of Estimators: {best_est}, Best Depth: {best_depth}"

'Best R2 Score: 0.5775926868414045, Best Number of Estimators: 10, Best Depth: 3'

This is also not a good score

## Ridge

In [11]:
# Fit a Ridge regression (only random_state is set) on the training features
# and report its R2 on the 2023 test rows.
model_r6 = Ridge(random_state=616).fit(X_train, y_train)
predictions_test_r = model_r6.predict(X_test)
r2score = r2_score(y_true=y_test, y_pred=predictions_test_r)
f"R2 Score: {r2score}"

'R2 Score: 0.7628044066232471'

This score is the best one yet, but an R2 in the 0.70s is not where we want to be.
***
I now want to try the models with the same training breakdown, but with the encoded categorical features.

In [12]:
# Split the frame column-wise: the two categorical columns versus everything else.
categorical_cols = ['country','region']
df_cat = df[categorical_cols]
df_num = df.drop(columns=categorical_cols)

In [13]:
# One-hot encode the categorical columns, dropping the first category of each feature.
ohe = OneHotEncoder(drop='first')
# Fit and encode in one step, then convert the encoder output to a dense array.
df_cat_new = ohe.fit_transform(df_cat).toarray()
df_cat_new

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

I actually do not want an array at this time. Instead I will use get_dummies

In [14]:
# One-hot encode country and region as named indicator columns, dropping the first
# category of each. drop_first must be the boolean True, not the string 'True':
# any non-empty string (including 'False') is truthy and would enable the option.
df_cat_dummy = pd.get_dummies(df_cat, drop_first=True)
df_cat_dummy

Unnamed: 0,country_Albania,country_Algeria,country_Angola,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Azerbaijan,country_Bahrain,country_Bangladesh,...,region_Central and Eastern Europe,region_Commonwealth of Independent States,region_East Asia,region_Latin America and Caribbean,region_Middle East and North Africa,region_North America and ANZ,region_South Asia,region_Southeast Asia,region_Sub-Saharan Africa,region_Western Europe
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1363,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1364,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1365,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [16]:
# Place the indicator columns and the numeric columns side by side in one frame;
# rows are aligned on the index.
new_df = pd.concat([df_cat_dummy, df_num], axis='columns')
new_df

Unnamed: 0,country_Albania,country_Algeria,country_Angola,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Azerbaijan,country_Bahrain,country_Bangladesh,...,region_Sub-Saharan Africa,region_Western Europe,happiness_score,gdp_per_capita,social_support,healthy_life_expectancy,freedom_to_make_life_choices,generosity,perceptions_of_corruption,year
0,False,False,False,False,False,False,False,False,False,False,...,False,True,7.804,1.88800,1.58500,0.53500,0.77200,0.12600,0.53500,2023
1,False,False,False,False,False,False,False,False,False,False,...,False,True,7.586,1.94900,1.54800,0.53700,0.73400,0.20800,0.52500,2023
2,False,False,False,False,False,False,False,False,False,False,...,False,True,7.530,1.92600,1.62000,0.55900,0.73800,0.25000,0.18700,2023
3,False,False,False,False,False,False,False,False,False,False,...,False,False,7.473,1.83300,1.52100,0.57700,0.56900,0.12400,0.15800,2023
4,False,False,False,False,False,False,False,False,False,False,...,False,True,7.403,1.94200,1.48800,0.54500,0.67200,0.25100,0.39400,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362,False,False,False,False,False,False,False,False,False,False,...,True,False,3.465,0.22208,0.77370,0.42864,0.59201,0.22628,0.55191,2015
1363,False,False,False,False,False,False,False,False,False,False,...,True,False,3.340,0.28665,0.35386,0.31910,0.48450,0.18260,0.08010,2015
1364,False,False,False,False,False,False,False,False,False,False,...,False,False,3.006,0.66320,0.47489,0.72193,0.15684,0.47179,0.18906,2015
1365,False,False,False,False,False,False,False,False,False,False,...,True,False,2.905,0.01530,0.41587,0.22396,0.11850,0.19727,0.10062,2015


In [17]:
# Columns excluded from the predictors: the year used for splitting and the target.
droppies2 = ['year', 'happiness_score']

# Same temporal hold-out as before, now on the frame that includes the indicator columns.
train2 = new_df.loc[new_df['year'] < 2023]
test2 = new_df.loc[new_df['year'] == 2023]

# Feature matrix and target vector for each split.
X_train2, y_train2 = train2.drop(columns=droppies2), train2['happiness_score']
X_test2, y_test2 = test2.drop(columns=droppies2), test2['happiness_score']

### Decision Tree with encoded categorical data

In [20]:
# Sweep max_depth from 1 to 6 on the features that include the indicator columns and
# keep the depth with the highest R2 on the 2023 data.
# NOTE(review): the depth is chosen on the same data the score is reported on, so the
# reported R2 is optimistic; selecting on a validation split of the training years
# would give an unbiased test score.
# R2 can be negative, so the running best starts below every possible score. Starting
# at 0 would leave best_depth at an invalid placeholder if no tree scored above zero.
best_depth = None
best_result = float("-inf")

for depth in range(1,7):
    model_dt6 = DecisionTreeRegressor(random_state=616, max_depth=depth)
    model_dt6.fit(X_train2, y_train2)
    predictions_test_dt = model_dt6.predict(X_test2)
    result = r2_score(y_test2, predictions_test_dt)
    # Strict comparison: on ties the shallower (earlier) depth is kept.
    if result > best_result:
        best_result = result
        best_depth = depth
# After the loop model_dt6 is the last tree fitted (depth 6), not necessarily the best one.
f"Best R2 Score: {best_result}, Best Depth: {best_depth}"

'Best R2 Score: 0.5322761150934382, Best Depth: 5'

This is also a bad score

### Random Forest with encoded categorical data

In [21]:
# Grid-search the number of trees (10 to 50 in steps of 10) and max_depth (1 to 10) on
# the features that include the indicator columns, keeping the pair with the highest
# R2 on the 2023 data.
# NOTE(review): the hyperparameters are chosen on the same data the score is reported
# on, so the reported R2 is optimistic; selecting on a validation split of the training
# years would give an unbiased test score.
# R2 can be negative, so the running best starts below every possible score. Starting
# at 0 would leave best_est/best_depth at invalid placeholders if nothing scored above zero.
best_result = float("-inf")
best_est = None
best_depth = None

for est in range(10, 51, 10):
    for depth in range(1,11):
        model_rf6 = RandomForestRegressor(random_state=616, n_estimators=est, max_depth=depth)
        model_rf6.fit(X_train2, y_train2)
        predictions_test_rf = model_rf6.predict(X_test2)
        result = r2_score(y_test2, predictions_test_rf)
        # Strict comparison: on ties the earlier (smaller) configuration is kept.
        if result > best_result:
            best_result = result
            best_est = est
            best_depth = depth
# After the loop model_rf6 is the last forest fitted (50 trees, depth 10), not necessarily the best one.
f"Best R2 Score: {best_result}, Best Number of Estimators: {best_est}, Best Depth: {best_depth}"

'Best R2 Score: 0.6390890240986671, Best Number of Estimators: 20, Best Depth: 10'

This is better than the Decision Tree model for sure

### Ridge with encoded categorical data

In [22]:
# Fit a Ridge regression (only random_state is set) on the features that include
# the indicator columns and report its R2 on the 2023 test rows.
model_r7 = Ridge(random_state=616).fit(X_train2, y_train2)
predictions_test_r2 = model_r7.predict(X_test2)
r2score2 = r2_score(y_true=y_test2, y_pred=predictions_test_r2)
f"R2 Score: {r2score2}"

'R2 Score: 0.8845262873135782'

<span style='color:green'> __This is the best score I have attained with the models. It seems that including the encoded categorical data in training helped the model predict much better. The best model is a Ridge model with no extra hyperparameters.__ </span>