Importing Libraries (yet again)

In [12]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")

✅ Libraries imported successfully


Loading the DataFrame

In [13]:
# Location of the pickled DataFrame. Set the CAPSTONE_DF_PATH environment
# variable to point at a copy on another machine; the fallback is the
# original local path, kept so existing runs behave exactly as before.
DF_PATH = os.environ.get(
    "CAPSTONE_DF_PATH",
    "C:/Users/LENOVO/Desktop/Javin Programming/machine learning files/Cohort/Capstone Project/Models & Dataset/df.pkl",
)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load .pkl files that come from a trusted source.
df = joblib.load(DF_PATH)
print("Dataframes imported successfully!")

Dataframes imported successfully!


In [14]:
# Remove the two columns that are excluded before the features are built.
# errors='ignore' keeps this cell idempotent: it rebinds `df`, so re-running
# it without reloading the data would otherwise raise KeyError on the
# columns that were already removed.
df = df.drop(columns=['Country', 'age_group'], errors='ignore')

In [15]:
# Target: the 'Age' column. Features: every other column of df.
y = df['Age']
X = df.drop('Age', axis=1)

In [16]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Columns handed to the one-hot encoder, one per line so that additions and
# removals show up as single-line diffs.
cat_col = [
    "Gender",
    "family_history",
    "work_interfere",
    "no_employees",
    "remote_work",
    "benefits",
    "care_options",
    "wellness_program",
    "seek_help",
    "anonymity",
    "leave",
    "mental_health_consequence",
    "coworkers",
    "supervisor",
    "treatment",
    "obs_consequence",
]

In [None]:
# Preprocessor: one-hot encode the columns listed in cat_col.
# - handle_unknown="ignore": a category not seen during fit is encoded as
#   all zeros instead of raising at predict time.
# - remainder="drop" is the ColumnTransformer default, spelled out here so it
#   is visible that any column of X not listed in cat_col is discarded.
preprocessing = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), cat_col)],
    remainder="drop",
)

In [19]:
# 1. Linear Regression
print("Applying Linear Regression...\n")
print("=" * 40)

# Chain the shared preprocessor with an ordinary least-squares regressor.
pipe_lr = Pipeline(steps=[
    ("preprocess", preprocessing),
    ("regressor", LinearRegression()),
])

# Fit on the training split, then predict the held-out rows.
y_pred_lr = pipe_lr.fit(X_train, y_train).predict(X_test)

# Report the test-set metrics.
print("MAE: ", mean_absolute_error(y_true=y_test, y_pred=y_pred_lr))
print("RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=y_pred_lr))
print("R² Score: ", r2_score(y_true=y_test, y_pred=y_pred_lr))
print("=" * 40)
print("\nLinear Regression applied successfully!")


Applying Linear Regression...

MAE:  5.523413472260937
RMSE:  7.129080567397488
R² Score:  0.0073463043136894335

Linear Regression applied successfully!


In [20]:
# 2. Random Forest
print("Applying Random Forest Regression...")
print("=" * 40)

# Shared preprocessor followed by a seeded random forest.
pipe_rf = Pipeline(steps=[
    ("preprocess", preprocessing),
    ("regressor", RandomForestRegressor(random_state=42)),
])

# Hyperparameter grid (2 x 3 x 2 x 2 = 24 combinations). The "regressor__"
# prefix routes each setting to the pipeline step named "regressor".
param_grid_rf = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_split=[2, 5],
    regressor__min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold cross-validated search on the training split, using all cores.
grid_rf = GridSearchCV(estimator=pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_  # pipeline with the winning settings

# Score the selected pipeline on the held-out test split.
y_pred_rf = best_rf.predict(X_test)
print("MAE: ", mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))
print("RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=y_pred_rf))
print("R² Score: ", r2_score(y_true=y_test, y_pred=y_pred_rf))
print("Best hyperparameters: ", grid_rf.best_params_)
print("=" * 40)
print("Random Forest Regression applied successfully!")

Applying Random Forest Regression...
MAE:  5.234136686300276
RMSE:  6.883705110898743
R² Score:  0.07450253595221035
Best hyperparameters:  {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}
Random Forest Regression applied successfully!


In [21]:
# 3. XGB Regressor
print("Applying XGB Regression...")
print("=" * 40)

# Shared preprocessor followed by an XGBoost regressor with library defaults.
pipe_xgb = Pipeline(steps=[
    ("preprocess", preprocessing),
    ("regressor", XGBRegressor()),
])

# Hyperparameter grid (2 x 3 x 2 x 2 x 2 = 48 combinations). The "regressor__"
# prefix routes each setting to the pipeline step named "regressor".
param_grid_xgb = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[3, 5, 7],
    regressor__learning_rate=[0.01, 0.1],
    regressor__subsample=[0.8, 1.0],
    regressor__colsample_bytree=[0.8, 1.0],
)

# Exhaustive 5-fold cross-validated search on the training split.
grid_xgb = GridSearchCV(estimator=pipe_xgb, param_grid=param_grid_xgb, cv=5)
grid_xgb.fit(X_train, y_train)
best_xgb = grid_xgb.best_estimator_  # pipeline with the winning settings

# Score the selected pipeline on the held-out test split.
y_pred_xgb = best_xgb.predict(X_test)
print("MAE: ", mean_absolute_error(y_true=y_test, y_pred=y_pred_xgb))
print("RMSE: ", root_mean_squared_error(y_true=y_test, y_pred=y_pred_xgb))
print("R² Score: ", r2_score(y_true=y_test, y_pred=y_pred_xgb))
print("Best hyperparameters: ", grid_xgb.best_params_)
print("=" * 40)
print("XGB Regression applied successfully!")

Applying XGB Regression...
MAE:  5.261291980743408
RMSE:  6.862639427185059
R² Score:  0.08015835285186768
Best hyperparameters:  {'regressor__colsample_bytree': 1.0, 'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__n_estimators': 200, 'regressor__subsample': 0.8}
XGB Regression applied successfully!


In [None]:
# Making a dataframe to compare the models
def evaluate_model(name, y_true, y_pred):
    """Summarise one model's regression metrics as a single table row.

    Args:
        name: Label stored under the "Model" key.
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        dict with the keys "Model", "MAE", "RMSE" and "R² Score", in that
        order (the order becomes the column order of the comparison table).
    """
    row = {"Model": name}
    row["MAE"] = mean_absolute_error(y_true, y_pred)
    row["RMSE"] = root_mean_squared_error(y_true, y_pred)
    row["R² Score"] = r2_score(y_true, y_pred)
    return row

# One metrics row per fitted model, all scored against the same test split.
results = [
    evaluate_model(model_name, y_test, model_pred)
    for model_name, model_pred in (
        ("Linear Regression", y_pred_lr),
        ("Random Forest", y_pred_rf),
        ("XGBoost Regression", y_pred_xgb),
    )
]

# Tabulate the rows with the highest R² first.
results_df = pd.DataFrame(results).sort_values(by="R² Score", ascending=False)

print("📊 Regression Model Comparison:")
print("=" * 40)
display(results_df)


📊 Regression Model Comparison:


Unnamed: 0,Model,MAE,RMSE,R² Score
2,XGBoost Regression,5.261292,6.862639,0.080158
1,Random Forest,5.234137,6.883705,0.074503
0,Linear Regression,5.523413,7.129081,0.007346


In [None]:
# Persist the tuned XGBoost pipeline (preprocessing step + regressor) to a
# file in the current working directory.
joblib.dump(value=best_xgb, filename='regression_model.pkl')