In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set()

In [2]:
tw22 = pd.read_csv('clnd_v1_2022.csv')

In [3]:
tw22.head()

Unnamed: 0,RANK,Country,Happiness score,Whisker-high,Whisker-low,Dystopia score,Dystopia + residual,Explained by: GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Regional indicator
0,1,Finland,7.821,7.886,7.756,1.83,2.518,1.892,1.258,0.775,0.736,0.109,0.534,Western Europe
1,2,Denmark,7.636,7.71,7.563,1.83,2.226,1.953,1.243,0.777,0.719,0.188,0.532,Western Europe
2,3,Iceland,7.557,7.651,7.464,1.83,2.32,1.936,1.32,0.803,0.718,0.27,0.191,Western Europe
3,4,Switzerland,7.512,7.586,7.437,1.83,2.153,2.026,1.226,0.822,0.677,0.147,0.461,Western Europe
4,5,Netherlands,7.415,7.471,7.359,1.83,2.137,1.945,1.206,0.787,0.651,0.271,0.419,Western Europe


In [4]:
response = tw22[["Happiness score"]]
predictor = tw22[['Regional indicator', 'Explained by: GDP per capita']]

In [5]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error

sc_resp = StandardScaler()
ct = ColumnTransformer([('one_hot_encoder', OneHotEncoder(categories = 'auto'), [0]), ('pred_scaler', StandardScaler(), [1])], remainder = 'passthrough') #to transform each column of the set using different methods
ct.fit(predictor) #to ensure all regions are fit
sc_resp.fit(response)

StandardScaler()

#### Training the models using simple Train/Test split, may not be appropriate due to small sample size

In [22]:
pred_train, pred_Test, resp_train, resp_Test = train_test_split(predictor, response, test_size = 0.25)

pred_train = ct.transform(pred_train)
resp_train = sc_resp.transform(resp_train)

In [23]:
regressor = SVR(kernel = 'linear')
regressor.fit(pred_train, resp_train.ravel())

test_predicted_SVR = sc_resp.inverse_transform(regressor.predict(ct.transform(pred_Test)))
train_predicted_SVR = sc_resp.inverse_transform(regressor.predict(pred_train))

print('SVR (Linear)')
print()

print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", r2_score(sc_resp.inverse_transform(resp_train), train_predicted_SVR))
print("Mean Squared Error (MSE) \t:", mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_SVR))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_SVR)))
print()

print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", r2_score(resp_Test, test_predicted_SVR))
print("Mean Squared Error (MSE) \t:", mean_squared_error(resp_Test, test_predicted_SVR))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(resp_Test, test_predicted_SVR)))
print()

#test model against test set and against train set
#check MSE and RMSE

SVR (Linear)

Goodness of Fit of Model 	Train Dataset
Explained Variance (R^2) 	: 0.7389247117658565
Mean Squared Error (MSE) 	: 0.32310141245323754
Root Mean Squared Error (RMSE) 	: 0.5684201020840463

Goodness of Fit of Model 	Test Dataset
Explained Variance (R^2) 	: 0.6208296525556998
Mean Squared Error (MSE) 	: 0.3795529291545928
Root Mean Squared Error (RMSE) 	: 0.616078671238173



In [24]:
regressor = SVR(kernel = 'poly')
regressor.fit(pred_train, resp_train.ravel())

test_predicted_SVR = sc_resp.inverse_transform(regressor.predict(ct.transform(pred_Test)))
train_predicted_SVR = sc_resp.inverse_transform(regressor.predict(pred_train))

print('SVR (Poly)')
print()

print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", r2_score(sc_resp.inverse_transform(resp_train), train_predicted_SVR))
print("Mean Squared Error (MSE) \t:", mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_SVR))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_SVR)))
print()

print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", r2_score(resp_Test, test_predicted_SVR))
print("Mean Squared Error (MSE) \t:", mean_squared_error(resp_Test, test_predicted_SVR))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(resp_Test, test_predicted_SVR)))
print()

#test model against test set and against train set
#check MSE and RMSE

SVR (Poly)

Goodness of Fit of Model 	Train Dataset
Explained Variance (R^2) 	: 0.7649792001898814
Mean Squared Error (MSE) 	: 0.29085691291638693
Root Mean Squared Error (RMSE) 	: 0.539311517507634

Goodness of Fit of Model 	Test Dataset
Explained Variance (R^2) 	: 0.6954311683613879
Mean Squared Error (MSE) 	: 0.30487614064970825
Root Mean Squared Error (RMSE) 	: 0.5521559024856189



In [25]:
regressor = SVR(kernel = 'rbf')
regressor.fit(pred_train, resp_train.ravel())

test_predicted_SVR = sc_resp.inverse_transform(regressor.predict(ct.transform(pred_Test)))
train_predicted_SVR = sc_resp.inverse_transform(regressor.predict(pred_train))

print('SVR (RBF)')

print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", r2_score(sc_resp.inverse_transform(resp_train), train_predicted_SVR))
print("Mean Squared Error (MSE) \t:", mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_SVR))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_SVR)))
print()

print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", r2_score(resp_Test, test_predicted_SVR))
print("Mean Squared Error (MSE) \t:", mean_squared_error(resp_Test, test_predicted_SVR))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(resp_Test, test_predicted_SVR)))
print()

#test model against test set and against train set
#check MSE and RMSE

SVR (RBF)
Goodness of Fit of Model 	Train Dataset
Explained Variance (R^2) 	: 0.791911265036288
Mean Squared Error (MSE) 	: 0.2575263428305962
Root Mean Squared Error (RMSE) 	: 0.5074705339530525

Goodness of Fit of Model 	Test Dataset
Explained Variance (R^2) 	: 0.6993609692536562
Mean Squared Error (MSE) 	: 0.3009423746004687
Root Mean Squared Error (RMSE) 	: 0.5485821493636743



In [26]:
linreg = LinearRegression()
linreg.fit(pred_train, resp_train.ravel())

test_predicted_linreg = sc_resp.inverse_transform(linreg.predict(ct.transform(pred_Test)))
train_predicted_linreg = sc_resp.inverse_transform(linreg.predict(pred_train))

print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", r2_score(sc_resp.inverse_transform(resp_train), train_predicted_linreg))
print("Mean Squared Error (MSE) \t:", mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_linreg))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_linreg)))
print()

print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", r2_score(resp_Test, test_predicted_linreg))
print("Mean Squared Error (MSE) \t:", mean_squared_error(resp_Test, test_predicted_linreg))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(resp_Test, test_predicted_linreg)))
print()

Goodness of Fit of Model 	Train Dataset
Explained Variance (R^2) 	: 0.7435672410185352
Mean Squared Error (MSE) 	: 0.3173559136393066
Root Mean Squared Error (RMSE) 	: 0.5633435130001113

Goodness of Fit of Model 	Test Dataset
Explained Variance (R^2) 	: 0.5773375659345598
Mean Squared Error (MSE) 	: 0.4230888991568987
Root Mean Squared Error (RMSE) 	: 0.6504528416087508



#### Split data using K-Folds, more appropriate since it is a small sample size

In [39]:
k = 10
kf = KFold(n_splits = k)

def kflds(kf, method, k):
    
    r2_train, mse_train, rmse_train = [], [], []
    r2_test, mse_test, rmse_test = [], [], []

    for train_ind, test_ind in kf.split(predictor):
        pred_train, pred_test = predictor.iloc[train_ind, :], predictor.iloc[test_ind, :]
        resp_train, resp_test = response.iloc[train_ind, :], response.iloc[test_ind, :]

        pred_train = ct.transform(pred_train)
        resp_train = sc_resp.transform(resp_train)

        regressor = SVR(kernel = method)
        regressor.fit(pred_train, resp_train.ravel())

        test_predicted_SVR = sc_resp.inverse_transform(regressor.predict(ct.transform(pred_Test)))
        train_predicted_SVR = sc_resp.inverse_transform(regressor.predict(pred_train))

        mse = mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_SVR)
        r2_train.append(r2_score(sc_resp.inverse_transform(resp_train), train_predicted_SVR))
        mse_train.append(mse)
        rmse_train.append(np.sqrt(mse))

        mse = mean_squared_error(resp_Test, test_predicted_SVR)
        r2_test.append(r2_score(resp_Test, test_predicted_SVR))
        mse_test.append(mse)
        rmse_test.append(np.sqrt(mse))
        
    return (sum(r2_train)/k, sum(mse_train)/k, sum(rmse_train)/k, sum(r2_test)/k, sum(mse_test)/k, sum(rmse_test)/k)
    

In [42]:

lin = kflds(kf, 'linear', k)
poly = kflds(kf, 'poly', k)
rbf = kflds(kf, 'rbf', k)

print("SVR (Linear)")
print("Goodness of Fit of Model \tTrain Dataset")
print("Average Explained Variance (R^2) \t:", lin[0])
print("Average Mean Squared Error (MSE) \t:", lin[1])
print("Average Root Mean Squared Error (RMSE) \t:", lin[2])

print("Goodness of Fit of Model \tTest Dataset")
print("Average Explained Variance (R^2) \t:", lin[3])
print("Average Mean Squared Error (MSE) \t:", lin[4])
print("Average Root Mean Squared Error (RMSE) \t:", lin[5])
print()

print("SVR (Poly)")
print("Goodness of Fit of Model \tTrain Dataset")
print("Average Explained Variance (R^2) \t:", poly[0])
print("Average Mean Squared Error (MSE) \t:", poly[1])
print("Average Root Mean Squared Error (RMSE) \t:", poly[2])

print("Goodness of Fit of Model \tTest Dataset")
print("Average Explained Variance (R^2) \t:", poly[3])
print("Average Mean Squared Error (MSE) \t:", poly[4])
print("Average Root Mean Squared Error (RMSE) \t:", poly[5])
print()

print("SVR (RBF)")
print("Goodness of Fit of Model \tTrain Dataset")
print("Average Explained Variance (R^2) \t:", rbf[0])
print("Average Mean Squared Error (MSE) \t:", rbf[1])
print("Average Root Mean Squared Error (RMSE) \t:", rbf[2])

print("Goodness of Fit of Model \tTest Dataset")
print("Average Explained Variance (R^2) \t:", rbf[3])
print("Average Mean Squared Error (MSE) \t:", rbf[4])
print("Average Root Mean Squared Error (RMSE) \t:", rbf[5])
print()

SVR (Linear)
Goodness of Fit of Model 	Train Dataset
Average Explained Variance (R^2) 	: 0.71462562604326
Average Mean Squared Error (MSE) 	: 0.3319699844213754
Average Root Mean Squared Error (RMSE) 	: 0.5746947329953376
Goodness of Fit of Model 	Test Dataset
Average Explained Variance (R^2) 	: 0.6641796344494081
Average Mean Squared Error (MSE) 	: 0.33615920726295
Average Root Mean Squared Error (RMSE) 	: 0.5796706138116905

SVR (Poly)
Goodness of Fit of Model 	Train Dataset
Average Explained Variance (R^2) 	: 0.7538116119646467
Average Mean Squared Error (MSE) 	: 0.28771525262033804
Average Root Mean Squared Error (RMSE) 	: 0.5344004266329015
Goodness of Fit of Model 	Test Dataset
Average Explained Variance (R^2) 	: 0.7322173397782301
Average Mean Squared Error (MSE) 	: 0.268052852099444
Average Root Mean Squared Error (RMSE) 	: 0.5174587709235823

SVR (RBF)
Goodness of Fit of Model 	Train Dataset
Average Explained Variance (R^2) 	: 0.7763338273837376
Average Mean Squared Error (MSE

In [45]:
k = 10
kf = KFold(n_splits = k)

r2_train, mse_train, rmse_train = [], [], []
r2_test, mse_test, rmse_test = [], [], []

for train_ind, test_ind in kf.split(predictor):
    pred_train, pred_test = predictor.iloc[train_ind, :], predictor.iloc[test_ind, :]
    resp_train, resp_test = response.iloc[train_ind, :], response.iloc[test_ind, :]

    pred_train = ct.transform(pred_train)
    resp_train = sc_resp.transform(resp_train)

    linreg = LinearRegression()
    linreg.fit(pred_train, resp_train.ravel())

    test_predicted_linreg = sc_resp.inverse_transform(linreg.predict(ct.transform(pred_Test)))
    train_predicted_linreg = sc_resp.inverse_transform(linreg.predict(pred_train))

    mse = mean_squared_error(sc_resp.inverse_transform(resp_train), train_predicted_linreg)
    r2_train.append(r2_score(sc_resp.inverse_transform(resp_train), train_predicted_linreg))
    mse_train.append(mse)
    rmse_train.append(np.sqrt(mse))

    mse = mean_squared_error(resp_Test, test_predicted_linreg)
    r2_test.append(r2_score(resp_Test, test_predicted_linreg))
    mse_test.append(mse)
    rmse_test.append(np.sqrt(mse))
    
print("Linear Regression")
print("Goodness of Fit of Model \tTrain Dataset")
print("Average Explained Variance (R^2) \t:", sum(r2_train)/k)
print("Average Mean Squared Error (MSE) \t:", sum(mse_train)/k)
print("Average Root Mean Squared Error (RMSE) \t:", sum(rmse_train)/k)
print()

print("Goodness of Fit of Model \tTest Dataset")
print("Average Explained Variance (R^2) \t:", sum(r2_test)/k)
print("Average Mean Squared Error (MSE) \t:", sum(mse_test)/k)
print("Average Root Mean Squared Error (RMSE) \t:", sum(rmse_test)/k)
print()

Linear Regression
Goodness of Fit of Model 	Train Dataset
Average Explained Variance (R^2) 	: 0.7228056673877779
Average Mean Squared Error (MSE) 	: 0.3225755734592751
Average Root Mean Squared Error (RMSE) 	: 0.5665381029050434

Goodness of Fit of Model 	Test Dataset
Average Explained Variance (R^2) 	: 0.6499732308466196
Average Mean Squared Error (MSE) 	: 0.35037994508312653
Average Root Mean Squared Error (RMSE) 	: 0.5917694560771974

