In [32]:
import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [8]:
# Load the happiness dataset. The path is relative, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv("../clear_data/happiness_data.csv")

In [9]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,country,continent,year,health,social_support,economy,corruption_perception,freedom,generosity,happiness_rank,happiness_score
0,Switzerland,Europe,2015,0.94143,1.34951,1.39651,0.41978,0.66557,0.29678,1,7.587
1,Iceland,Europe,2015,0.94784,1.40223,1.30232,0.14145,0.62877,0.4363,2,7.561
2,Denmark,Europe,2015,0.87464,1.36058,1.32548,0.48357,0.64938,0.34139,3,7.527
3,Norway,Europe,2015,0.88521,1.33095,1.459,0.36503,0.66973,0.34699,4,7.522
4,Canada,North America,2015,0.90563,1.32261,1.32629,0.32957,0.63297,0.45811,5,7.427


In [10]:
# Inspect column dtypes and non-null counts before encoding and modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   country                782 non-null    object 
 1   continent              782 non-null    object 
 2   year                   782 non-null    int64  
 3   health                 782 non-null    float64
 4   social_support         782 non-null    float64
 5   economy                782 non-null    float64
 6   corruption_perception  782 non-null    float64
 7   freedom                782 non-null    float64
 8   generosity             782 non-null    float64
 9   happiness_rank         782 non-null    int64  
 10  happiness_score        782 non-null    float64
dtypes: float64(7), int64(2), object(2)
memory usage: 67.3+ KB


# Data splitting and creating dummies

In [11]:
def generate_dummy_variables(df):
    """One-hot encode the ``continent`` column.

    The ``continent`` column is replaced by one indicator column per distinct
    value, named ``continent_<value>``. Spaces in the generated column names
    are replaced with underscores (e.g. ``continent_North America`` becomes
    ``continent_North_America``), so every indicator column follows the same
    naming scheme regardless of which continents appear in the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``continent`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``continent`` replaced by its indicator columns.
        All other columns keep their original names.
    """
    encoded = pd.get_dummies(df, columns=["continent"])

    # Rename only the columns get_dummies just created, so pre-existing
    # columns that happen to contain spaces are left untouched.
    rename_columns = {
        column: column.replace(" ", "_")
        for column in encoded.columns
        if column not in df.columns
    }

    return encoded.rename(columns=rename_columns)

# Rebind df to the encoded frame. The call consumes the "continent" column, so
# re-running this cell without first reloading the CSV raises a KeyError.
df = generate_dummy_variables(df)


## Splitting data

To proceed, we divide the data into training and testing sets. Before that, we separate the target and drop the columns that must not be used as features:

happiness_score: This is our target variable, which we want to predict.

happiness_rank: The rank is derived directly from the happiness score (rank 1 is the highest score), so keeping it as a feature would leak the target into the model.

country: This column is a high-cardinality categorical variable. Encoding it would add a large number of columns to the training data without offering meaningful, generalizable insights for prediction.

In [None]:
# The target is the score itself; the feature matrix is every remaining column
# except the rank and the country name.
target = df["happiness_score"]
features = df.drop(columns=["happiness_score", "happiness_rank", "country"])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=200,
)

In [None]:
# Report the (rows, columns) shape of each split. The labels are Spanish for
# "training set size" and "test set size".
print("Tamaño del conjunto de entrenamiento:", X_train.shape)
print("Tamaño del conjunto de prueba:", X_test.shape)

Tamaño del conjunto de entrenamiento: (547, 14)
Tamaño del conjunto de prueba: (235, 14)


In [15]:
# List the feature columns of the test split.
X_test.columns

Index(['year', 'health', 'social_support', 'economy', 'corruption_perception',
       'freedom', 'generosity', 'continent_Africa', 'continent_Asia',
       'continent_Central_America', 'continent_Europe',
       'continent_North_America', 'continent_Oceania',
       'continent_South_America'],
      dtype='object')

In [16]:
# List the feature columns of the training split.
X_train.columns

Index(['year', 'health', 'social_support', 'economy', 'corruption_perception',
       'freedom', 'generosity', 'continent_Africa', 'continent_Asia',
       'continent_Central_America', 'continent_Europe',
       'continent_North_America', 'continent_Oceania',
       'continent_South_America'],
      dtype='object')

# Model training

In [30]:
# Seed shared by every estimator that has a stochastic component, so the
# comparison table (and the model persisted afterwards) can be reproduced.
# It is the same value used for the train/test split.
RANDOM_STATE = 200


def fit_and_evaluate(model):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on ``X_train`` / ``y_train``.

    Returns
    -------
    tuple
        ``(y_pred, mse, r2)``: the predictions for ``X_test``, followed by the
        mean squared error and the R² score of those predictions against
        ``y_test``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred, mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)


# Linear Regression Model
lr_model = LinearRegression()
y_pred_lr, mse_lr, r2_lr = fit_and_evaluate(lr_model)

# Random Forest Regressor Model
rf_model = RandomForestRegressor(n_estimators=50, random_state=RANDOM_STATE)
y_pred_rf, mse_rf, r2_rf = fit_and_evaluate(rf_model)

# Gradient Boosting Regressor Model
gb_model = GradientBoostingRegressor(random_state=RANDOM_STATE)
y_pred_gb, mse_gb, r2_gb = fit_and_evaluate(gb_model)

# Decision Tree Regressor Model
dt_model = DecisionTreeRegressor(random_state=RANDOM_STATE)
y_pred_dt, mse_dt, r2_dt = fit_and_evaluate(dt_model)

# NOTE(review): KNN and SVR below are distance/kernel based and are fit on the
# raw, unscaled features (``year`` is in the thousands, while the other values
# shown by df.head() are roughly between 0 and 1.5). Consider standardizing
# the features before drawing conclusions about these two models.

# K-Nearest Neighbors Regressor Model
knn_model = KNeighborsRegressor()
y_pred_knn, mse_knn, r2_knn = fit_and_evaluate(knn_model)

# Support Vector Regressor Model
svr_model = SVR()
y_pred_svr, mse_svr, r2_svr = fit_and_evaluate(svr_model)

# XGBoost Regressor Model
xgb_model = XGBRegressor(random_state=RANDOM_STATE)
y_pred_xgb, mse_xgb, r2_xgb = fit_and_evaluate(xgb_model)

# Create a DataFrame to display results (one row per model, same order as above)
results = pd.DataFrame({
    "Model": [
        "Linear Regression", "Random Forest Regressor", "Gradient Boosting Regressor",
        "Decision Tree Regressor", "K-Nearest Neighbors Regressor", "Support Vector Regressor", "XGBoost Regressor"
    ],
    "Mean Squared Error": [mse_lr, mse_rf, mse_gb, mse_dt, mse_knn, mse_svr, mse_xgb],
    "R2 Score": [r2_lr, r2_rf, r2_gb, r2_dt, r2_knn, r2_svr, r2_xgb]
})

# Display the results in a table format
print(results)


                           Model  Mean Squared Error  R2 Score
0              Linear Regression            0.210874  0.833289
1        Random Forest Regressor            0.171708  0.864253
2    Gradient Boosting Regressor            0.171200  0.864654
3        Decision Tree Regressor            0.347161  0.725545
4  K-Nearest Neighbors Regressor            0.308126  0.756405
5       Support Vector Regressor            1.265592 -0.000540
6              XGBoost Regressor            0.175235  0.861465


In [33]:
# Persist the fitted gradient boosting model to disk with joblib.
# NOTE(review): assumes the ../model directory already exists — confirm, or
# create it before running this cell.
joblib.dump(gb_model, '../model/gb_model.pkl')

['../model/gb_model.pkl']

## Model Selection and Training
In this section, we assess various regression models to predict happiness scores for countries based on socioeconomic and continental indicators. Each model has unique strengths and methodologies for tackling our prediction task.

_Linear Regression_
We begin with a linear regression model that assumes a straight-line relationship between the independent features and the target variable.

Results:
Mean Squared Error (MSE): 0.2109

Coefficient of Determination (R²): 0.8333

The linear regression model explains approximately 83% of the variance in the happiness scores, suggesting a reasonable fit, though there is potential for improvement with more complex models.

_Random Forest Regressor_
Next, we apply a Random Forest Regressor, an ensemble technique that aggregates multiple decision trees to produce more reliable and accurate predictions.

Results:
Mean Squared Error (MSE): 0.1717

Coefficient of Determination (R²): 0.8643

The Random Forest model achieves a lower MSE and a higher R² score than linear regression, explaining about 86% of the variance. This suggests it captures more complex, non-linear relationships in the data.

_Gradient Boosting Regressor_
We then test a Gradient Boosting Regressor, another ensemble model that builds trees sequentially, with each tree aiming to correct errors made by the previous one.

Results:
Mean Squared Error (MSE): 0.1712

Coefficient of Determination (R²): 0.8647

The Gradient Boosting model performs comparably to Random Forest, with a marginally lower MSE and similar R² score, indicating both ensemble methods are effective for this data.

_Decision Tree Regressor_
We also examine a single Decision Tree Regressor, a simple, interpretable model.

Results:
Mean Squared Error (MSE): 0.3472

Coefficient of Determination (R²): 0.7255

The Decision Tree model shows higher error and lower R² than the ensemble models, as it is more prone to overfitting and lacks the robustness provided by averaging multiple trees.

_K-Nearest Neighbors Regressor_
The K-Nearest Neighbors (KNN) Regressor predicts based on the average of nearest neighbors in the feature space.

Results:
Mean Squared Error (MSE): 0.3081

Coefficient of Determination (R²): 0.7564

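The KNN model shows moderate performance, with a higher error than ensemble models. It can be useful for capturing localized patterns but may struggle with global relationships in the data. Note that KNN is distance-based and was trained here on unscaled features, so the year column (values in the thousands) dominates the distance computation; standardizing the features would likely change this result.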

_Support Vector Regressor_
The Support Vector Regressor (SVR) tries to find a hyperplane that best fits the data within a margin of tolerance.

Results:
Mean Squared Error (MSE): 1.2656

Coefficient of Determination (R²): -0.0005

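The SVR performs poorly here, with a high MSE and an R² score of essentially zero (slightly negative), meaning it does no better than always predicting the mean happiness score. A likely cause is that the features were not standardized before training: SVR is sensitive to feature scale, and the year column is several orders of magnitude larger than the other features. This result should therefore not be read as evidence that SVR is unsuitable for this dataset.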

_XGBoost Regressor_
Finally, we use an XGBoost Regressor, an optimized gradient boosting model known for high performance.

Results:
Mean Squared Error (MSE): 0.1752

Coefficient of Determination (R²): 0.8615

The XGBoost model provides competitive results, with performance close to the other ensemble models, capturing non-linear patterns effectively.