**Importing Libraries**

In [123]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [124]:
from sklearn.model_selection import train_test_split, GridSearchCV , cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler , LabelEncoder , OrdinalEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [125]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

**Importing dataset**

In [126]:

df=pd.read_csv('cleaned_survey2.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,self_employed,family_history,treatment,work_interfere,remote_work,tech_company,benefits,care_options,...,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,Cleaned_Gender,no_employees
0,0,37,No,No,Yes,Often,No,Yes,Yes,Not sure,...,No,No,Some of them,Yes,No,Maybe,Yes,No,Female,15
1,1,44,No,No,No,Rarely,No,No,Don't know,No,...,Maybe,No,No,No,No,No,Don't know,No,Male,1000
2,2,32,No,No,No,Rarely,No,Yes,No,No,...,No,No,Yes,Yes,Yes,Yes,No,No,Male,15
3,3,31,Yes,Yes,Yes,Often,No,Yes,No,Yes,...,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,Male,63
4,4,31,Yes,No,No,Never,Yes,Yes,Yes,No,...,No,No,Some of them,Yes,Yes,Yes,Don't know,No,Male,300


In [127]:
df.drop('Unnamed: 0',axis=1,inplace=True)

**Train Test Split**

In [128]:
X=df.drop('Age',axis=1)
y=df['Age']

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**preparing for encoding data**

In [130]:
binary=['Yes','No']

In [131]:
work_int=['Often','Sometimes','Rarely','Never','Dont know']

In [132]:
bene=['Yes',"No", "Don't know"]

In [133]:
care_opt=['Yes','No','Not sure']

In [134]:
well_pro=['Yes','No',"Don't know"]
seek=['Yes','No',"Don't know"]

In [135]:
anoy_mity=['Yes','No',"Don't know"]
ment_vs_phy=['Yes','No',"Don't know"]

In [136]:
lea_ve=['Very easy','Somewhat easy','Somewhat difficult','Very difficult',"Don't know"]

In [137]:
men_hel_cons=['Yes','Maybe','No']
phy_hel_cons=['Yes','Maybe','No']

In [138]:
co_workers=['Yes','Some of them','No']
super_visor=['Yes','Some of them','No']

In [139]:
men_heal_inter=['Yes','Maybe','No']
phy_heal_inter=['Yes','Maybe','No']

In [140]:
cln_gen=['Male','Female']

In [141]:
binary_columns = ['self_employed',
 'family_history',
 'remote_work',
 'tech_company',
 'obs_consequence',
                  'treatment']


In [142]:
df.columns

Index(['Age', 'self_employed', 'family_history', 'treatment', 'work_interfere',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'Cleaned_Gender',
       'no_employees'],
      dtype='object')

In [143]:
preprocessor=ColumnTransformer(transformers=[('trf1',OrdinalEncoder(categories=[binary] * len(binary_columns)),binary_columns),
                                             ('trf2',OrdinalEncoder(categories=[work_int]),["work_interfere"]),
                                             ('trf3',OrdinalEncoder(categories=[bene]),["benefits"]),
                                             ('trf4',OrdinalEncoder(categories=[care_opt]),["care_options"]),
                                             ('trf5',OrdinalEncoder(categories=[well_pro]),["wellness_program"]),
                                             ('trf6',OrdinalEncoder(categories=[seek]),["seek_help"]),
                                             ('trf7',OrdinalEncoder(categories=[anoy_mity]),["anonymity"]),
                                             ('trf8',OrdinalEncoder(categories=[ment_vs_phy]),["mental_vs_physical"]),
                                             ('trf9',OrdinalEncoder(categories=[lea_ve]),["leave"]),
                                             ('trf10',OrdinalEncoder(categories=[men_hel_cons]),["mental_health_consequence"]),
                                             ('trf11',OrdinalEncoder(categories=[phy_hel_cons]),["phys_health_consequence"]),
                                             ('trf12',OrdinalEncoder(categories=[co_workers]),["coworkers"]),
                                             ('trf13',OrdinalEncoder(categories=[super_visor]),["supervisor"]),
                                             ('trf14',OrdinalEncoder(categories=[men_heal_inter]),["mental_health_interview"]),
                                             ('trf15',OrdinalEncoder(categories=[phy_heal_inter]),["phys_health_interview"]),
                                             ('trf16',OrdinalEncoder(categories=[cln_gen]),["Cleaned_Gender"]),

                                             ('trf17',StandardScaler(),['no_employees'])

                                             ],remainder='passthrough')

In [144]:
X_train_encoded=preprocessor.fit_transform(X_train)
X_test_encoded=preprocessor.fit_transform(X_test)

**Linear Regression**

In [145]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train_encoded, y_train)
preds = lr_model.predict(X_test_encoded)
print("MSE:", mean_squared_error(y_test, preds))
print("RMSE:",np.sqrt(mean_squared_error(y_test, preds)))
print("R2 Score:", r2_score(y_test, preds))

MSE: 41.66017109076887
RMSE: 6.454469078922671
R2 Score: 0.05108698866536188


In [146]:
import pandas as pd
results = []
results.append({
    'Model': 'Linear Regression',
    'MSE': mean_squared_error(y_test, preds),
    'RMSE': np.sqrt(mean_squared_error(y_test, preds)),
    'R2 Score': r2_score(y_test, preds)
})


**Ridge Regression**

In [147]:
from sklearn.linear_model import Ridge

ridge = Ridge()
param_grid = {'alpha': [0.01, 0.1, 1.0]}
grid = GridSearchCV(ridge, param_grid, cv=5, scoring='r2')
grid.fit(X_train_encoded, y_train)
preds = grid.predict(X_test_encoded)

print("Best Params:", grid.best_params_)
print("MSE:", mean_squared_error(y_test, preds))
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
print("R2 Score:", r2_score(y_test, preds))


Best Params: {'alpha': 1.0}
MSE: 41.64429524479578
RMSE: 6.453239128127501
R2 Score: 0.05144860016179531


In [148]:
results.append({
    'Model': 'Ridge Regression',
    'MSE': mean_squared_error(y_test, preds),
    'RMSE': np.sqrt(mean_squared_error(y_test, preds)),
    'R2 Score': r2_score(y_test, preds)
})

**Lasso Regression**

In [149]:
from sklearn.linear_model import Lasso

lasso = Lasso()
param_grid = {'alpha': [0.01, 0.1, 1.0]}
grid = GridSearchCV(lasso, param_grid, cv=5, scoring='r2')
grid.fit(X_train_encoded, y_train)
preds = grid.predict(X_test_encoded)

print("Best Params:", grid.best_params_)
print("MSE:", mean_squared_error(y_test, preds))
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
print("R2 Score:", r2_score(y_test, preds))


Best Params: {'alpha': 0.01}
MSE: 41.583852376893
RMSE: 6.448554285798717
R2 Score: 0.0528253353573942


In [150]:
results.append({
    'Model': 'Lasso Regression',
    'MSE': mean_squared_error(y_test, preds),
    'RMSE': np.sqrt(mean_squared_error(y_test, preds)),
    'R2 Score': r2_score(y_test, preds)
})

**Decision Tree Regressor**

In [151]:
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor()
param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid = GridSearchCV(tree, param_grid, cv=5, scoring='r2')
grid.fit(X_train_encoded, y_train)
preds = grid.predict(X_test_encoded)

print("Best Params:", grid.best_params_)
print("MSE:", mean_squared_error(y_test, preds))
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
print("R2 Score:", r2_score(y_test, preds))


Best Params: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
MSE: 44.568474064983114
RMSE: 6.675962407397387
R2 Score: -0.01515677512337188


In [152]:
results.append({
    'Model': 'Decision Tree',
    'MSE': mean_squared_error(y_test, preds),
    'RMSE': np.sqrt(mean_squared_error(y_test, preds)),
    'R2 Score': r2_score(y_test, preds)
})

**Random Forest Regressor**

In [153]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5]
}
grid = GridSearchCV(rf, param_grid, cv=5, scoring='r2', verbose=0)
grid.fit(X_train_encoded, y_train)
preds = grid.predict(X_test_encoded)

print("Best Params:", grid.best_params_)
print("MSE:", mean_squared_error(y_test, preds))
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
print("R2 Score:", r2_score(y_test, preds))

Best Params: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100}
MSE: 42.87626772392533
RMSE: 6.547997229987604
R2 Score: 0.02338739243163168


In [154]:
results.append({
    'Model': 'Random Forest',
    'MSE': mean_squared_error(y_test, preds),
    'RMSE': np.sqrt(mean_squared_error(y_test, preds)),
    'R2 Score': r2_score(y_test, preds)
})

XGBoost Regressor

In [155]:
from xgboost import XGBRegressor

xgb = XGBRegressor(objective='reg:squarederror', eval_metric='rmse')
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}
grid = GridSearchCV(xgb, param_grid, cv=5, scoring='r2')
grid.fit(X_train_encoded, y_train)
preds = grid.predict(X_test_encoded)

print("Best Params:", grid.best_params_)
print("MSE:", mean_squared_error(y_test, preds))
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
print("R2 Score:", r2_score(y_test, preds))

Best Params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
MSE: 41.942684173583984
RMSE: 6.476317176728143
R2 Score: 0.04465204477310181


In [156]:
results.append({
    'Model': 'XGBoost',
    'MSE': mean_squared_error(y_test, preds),
    'RMSE': np.sqrt(mean_squared_error(y_test, preds)),
    'R2 Score': r2_score(y_test, preds)
})

**Comparison**

In [157]:
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="R2 Score", ascending=False))

               Model        MSE      RMSE  R2 Score
2   Lasso Regression  41.583852  6.448554  0.052825
1   Ridge Regression  41.644295  6.453239  0.051449
0  Linear Regression  41.660171  6.454469  0.051087
5            XGBoost  41.942684  6.476317  0.044652
4      Random Forest  42.876268  6.547997  0.023387
3      Decision Tree  44.568474  6.675962 -0.015157
