In [40]:
import pandas as pd
import numpy as np


df=pd.read_csv("/home/Ima/work/Dataset/cleaned_burnout_dataset.csv")



In [41]:
#encoding
original_numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
from sklearn.preprocessing import OneHotEncoder

categorical_cols = [col for col in df.columns if df[col].dtype == "object"]
print("Categorical columns:", categorical_cols)

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_array = encoder.fit_transform(df[categorical_cols])

encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_cols))
encoded_df.index = df.index

df_num = df.drop(columns=categorical_cols)

df_encoded = pd.concat([df_num, encoded_df], axis=1)


Categorical columns: ['Gender', 'Country', 'JobRole', 'Department', 'RemoteWork', 'HasMentalHealthSupport', 'HasTherapyAccess', 'SalaryRange']


In [42]:
#normalize
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df_encoded[original_numeric_cols] = scaler.fit_transform(df_encoded[original_numeric_cols])

df_enoded.to_csv("cleaned_burnout_dataset.csv", index=False)

In [43]:
#2 interaction feature

df_encoded['Stress_Work_Interaction'] = df_encoded['StressLevel'] * df_encoded['WorkHoursPerWeek']
df_encoded['Sleep_Stress_Ratio'] = df_encoded['SleepHours'] / (df_encoded['StressLevel'] + 1)


In [44]:
#Feature Selection

from sklearn.feature_selection import mutual_info_regression

X = df_encoded.drop(columns=['StressLevel'])
y = df_encoded['StressLevel']

mi_scores = mutual_info_regression(X, y)

mi_scores_df = pd.DataFrame({'Feature': X.columns, 'MI Score': mi_scores})
mi_scores_df = mi_scores_df.sort_values(by='MI Score', ascending=False)

print(mi_scores_df)

X_selected = X[mi_scores_df[mi_scores_df['MI Score'] > 0.01]['Feature']]




                       Feature  MI Score
53     Stress_Work_Interaction  0.725132
54          Sleep_Stress_Ratio  0.118551
23             Country_Germany  0.011331
52            SalaryRange_<40K  0.010839
43              RemoteWork_Yes  0.009960
14           CareerGrowthScore  0.009690
7                   SleepHours  0.007985
10         ManagerSupportScore  0.007936
48           SalaryRange_100K+  0.007631
31   JobRole_Marketing Manager  0.007400
21              Country_Brazil  0.006017
41           RemoteWork_Hybrid  0.005854
34   JobRole_Software Engineer  0.005687
38        Department_Marketing  0.004962
37               Department_IT  0.004565
15                 BurnoutRisk  0.004449
29       JobRole_HR Specialist  0.003506
51        SalaryRange_80K-100K  0.003099
20           Country_Australia  0.002984
22              Country_Canada  0.001891
36               Department_HR  0.001270
28      JobRole_Data Scientist  0.000895
6            ProductivityScore  0.000736
2               

In [45]:
#model training 
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

X = df_encoded.drop(columns=['StressLevel'])
y = df_encoded['StressLevel']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
#Linear ,Ridge and Lasso model training 

lr = LinearRegression()
ridge=Ridge(alpha=1.0)
lasso=Lasso(alpha=0.1)

lr.fit(X_train, y_train)
ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train)




0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [49]:
#predictions

y_pred_lr = lr.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_lasso = lasso.predict(X_test)


In [50]:
#MSE and R² scores of each model training
print("Linear Regression:")
print("MSE:", mean_squared_error(y_test, y_pred_lr))
print("R²:", r2_score(y_test, y_pred_lr))

print("Ridge Regression:")
print("MSE:", mean_squared_error(y_test, y_pred_ridge))
print("R²:", r2_score(y_test, y_pred_ridge))

print("Lasso Regression:")
print("MSE:", mean_squared_error(y_test, y_pred_lasso))
print("R²:", r2_score(y_test, y_pred_lasso))


Linear Regression:
MSE: 0.013413586718019387
R²: 0.835935896498945
Ridge Regression:
MSE: 0.013735341218281778
R²: 0.8320004566540551
Lasso Regression:
MSE: 0.08195472453703702
R²: -0.0024036591780627337


In [52]:
#Linear regression model performed the best in this case because it has a MSE of 0.134 and R^2 of 0.836. This shows that the selected features to predict the target have strong relationsthips with the target