In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, KernelPCA
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from utils.paths import DATA_RAW_DIR

In [2]:
# Build the location of the raw happiness CSV and print whether the file is
# present, so a missing dataset is visible before the load in the next cell.
# NOTE(review): DATA_RAW_DIR comes from utils.paths (not shown here);
# presumably a pathlib.Path, since it is combined with "/" — confirm.
path_happiness = str(DATA_RAW_DIR / "happiness.csv")
print(Path(path_happiness).exists())

True


In [3]:
# Read the comma-separated happiness dataset (comma is read_csv's default
# delimiter) and preview the first five rows.
df_happiness = pd.read_csv(
    path_happiness,
    encoding="utf-8",
)
df_happiness.head(5)

Unnamed: 0,country,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,Iceland,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_happiness.describe()

Unnamed: 0,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,78.0,5.354019,5.452326,5.255713,0.984718,1.188898,0.551341,0.408786,0.246883,0.12312,1.850238
std,44.888751,1.13123,1.118542,1.14503,0.420793,0.287263,0.237073,0.149997,0.13478,0.101661,0.500028
min,1.0,2.693,2.864884,2.521116,0.0,0.0,0.0,0.0,0.0,0.0,0.377914
25%,39.5,4.5055,4.608172,4.374955,0.663371,1.042635,0.369866,0.303677,0.154106,0.057271,1.591291
50%,78.0,5.279,5.370032,5.193152,1.064578,1.253918,0.606042,0.437454,0.231538,0.089848,1.83291
75%,116.5,6.1015,6.1946,6.006527,1.318027,1.414316,0.723008,0.516561,0.323762,0.153296,2.144654
max,155.0,7.537,7.62203,7.479556,1.870766,1.610574,0.949492,0.658249,0.838075,0.464308,3.117485


In [5]:
# Inspect column dtypes and non-null counts to spot missing values early.
df_happiness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     155 non-null    object 
 1   rank        155 non-null    int64  
 2   score       155 non-null    float64
 3   high        155 non-null    float64
 4   low         155 non-null    float64
 5   gdp         155 non-null    float64
 6   family      155 non-null    float64
 7   lifexp      155 non-null    float64
 8   freedom     155 non-null    float64
 9   generosity  155 non-null    float64
 10  corruption  155 non-null    float64
 11  dystopia    155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 14.7+ KB


In [6]:
# List the column labels available for choosing features and target.
df_happiness.columns

Index(['country', 'rank', 'score', 'high', 'low', 'gdp', 'family', 'lifexp',
       'freedom', 'generosity', 'corruption', 'dystopia'],
      dtype='object')

## Split Data

In [7]:
# Predictors: the seven per-country factor columns. Target: the overall score.
# NOTE(review): the plain linear model further down recovers coefficients of
# ~1.0 for every feature with a near-zero test error, which suggests `score`
# is (almost exactly) the sum of these columns — confirm that feeding all of
# them, 'dystopia' included, is intended for this comparison.
X = df_happiness.loc[:, ['gdp', 'family', 'lifexp', 'freedom', 'generosity', 'corruption', 'dystopia']]
y = df_happiness.loc[:, 'score']

# Keep a quarter of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Standardize

In [8]:
# Feature standardization is currently disabled: the models below are fit on
# the unscaled X_train / X_test. The penalties applied by Lasso, Ridge and
# ElasticNet depend on the scale of each feature, so re-enable this block and
# pass the scaled arrays to the models if the features should be penalized on
# a common scale.
# TODO(review): confirm whether leaving the features unscaled is intentional.
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

## Model

In [12]:
# Unpenalized least-squares baseline: train on the training split, predict the
# held-out split, then report the test MSE and the learned per-feature weights.
model_linear = LinearRegression().fit(X_train, y_train)
y_test_pred_linear = model_linear.predict(X_test)

print("Linear Regression Model")
print(
    "Mean Squared Error:",
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_linear),
)
print("Coefficients:", model_linear.coef_)


Linear Regression Model
Mean Squared Error: 9.89333728308725e-08
Coefficients: [1.00012843 0.99994621 0.99983515 1.00003428 1.00025981 0.99977126
 0.99993814]


In [13]:
# Lasso (L1-penalized) regression with a small penalty strength: train on the
# training split, predict the held-out split, then report the test MSE and the
# learned weights (the L1 penalty can drive some of them to exactly zero).
model_lasso = Lasso(alpha=0.02).fit(X_train, y_train)
y_test_pred_lasso = model_lasso.predict(X_test)

print("Lasso Regression Model")
print(
    "Mean Squared Error:",
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_lasso),
)
print("Coefficients:", model_lasso.coef_)


Lasso Regression Model
Mean Squared Error: 0.04960575113982908
Coefficients: [1.28921417 0.91969417 0.47686397 0.73297273 0.14245522 0.
 0.89965327]


In [14]:
# Ridge (L2-penalized) regression: train on the training split, predict the
# held-out split, then report the test MSE and the shrunken per-feature weights.
model_ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_test_pred_ridge = model_ridge.predict(X_test)

print("Ridge Regression Model")
print(
    "Mean Squared Error:",
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_ridge),
)
print("Coefficients:", model_ridge.coef_)


Ridge Regression Model
Mean Squared Error: 0.0056501244999628
Coefficients: [1.07234856 0.97048582 0.85605399 0.87400159 0.73285696 0.68583271
 0.96206567]


In [16]:
# ElasticNet regression, mixing the L1 and L2 penalties in equal parts
# (l1_ratio=0.5): train on the training split, predict the held-out split,
# then report the test MSE and the learned per-feature weights.
model_elasticnet = ElasticNet(alpha=0.01, l1_ratio=0.5).fit(X_train, y_train)
y_test_pred_elasticnet = model_elasticnet.predict(X_test)

print("ElasticNet Regression Model")
print(
    "Mean Squared Error:",
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_elasticnet),
)
print("Coefficients:", model_elasticnet.coef_)

ElasticNet Regression Model
Mean Squared Error: 0.00912409728272295
Coefficients: [1.10654244 0.96288317 0.80301977 0.86167394 0.65466699 0.554539
 0.95372006]
