In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Loading the csv file

In [2]:
file = ('mental_health_workplace_survey.csv')
df = pd.read_csv(file)
df.head()

Unnamed: 0,EmployeeID,Age,Gender,Country,JobRole,Department,YearsAtCompany,WorkHoursPerWeek,RemoteWork,BurnoutLevel,...,CommuteTime,HasMentalHealthSupport,ManagerSupportScore,HasTherapyAccess,MentalHealthDaysOff,SalaryRange,WorkLifeBalanceScore,TeamSize,CareerGrowthScore,BurnoutRisk
0,1001,50,Male,UK,Sales Associate,HR,14,47,No,3.37,...,117,No,3.15,Yes,8,40K-60K,8.82,6,9.2,0
1,1002,36,Male,Germany,Software Engineer,IT,1,59,Hybrid,7.39,...,8,Yes,4.4,Yes,4,80K-100K,2.8,45,8.46,1
2,1003,29,Non-binary,India,IT Admin,IT,13,59,Hybrid,7.1,...,75,No,3.63,No,6,80K-100K,7.28,7,7.96,1
3,1004,42,Male,Australia,HR Specialist,IT,15,31,Yes,4.18,...,43,Yes,4.5,Yes,9,60K-80K,1.31,11,8.9,0
4,1005,40,Male,Brazil,Customer Support,Support,6,34,Yes,8.28,...,58,Yes,5.51,Yes,6,<40K,1.17,18,8.88,1


# <h1>Encoding categorical features<h1><h3>I used Label Encoding technique instead of One hot Encoding as no of columns after doing Label Encoding is 25 wheras 54 columns when i use One hot Encoding. So I used Label Encoding technique to reduce the dimension of dataframe.<h3>

In [3]:
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col].astype(str))

# Feature Selection using correlation

In [4]:
corr = df.corr()['StressLevel'].sort_values(ascending=False)
selected_features = corr[abs(corr) > 0.01].index.tolist()
selected_features.remove('StressLevel')

# Interaction Features

In [5]:
df['Stress_WorkHours'] = df['StressLevel'] * df['WorkHoursPerWeek']
df['Sleep_Stress'] = df['SleepHours'] / (df['StressLevel'] + 0.001) 
selected_features.extend(['Stress_WorkHours', 'Sleep_Stress'])

# Train-test split

In [6]:
X = df[selected_features]
y = df['StressLevel']

In [7]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=5)

# Standardization

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Models

In [9]:
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso()
}

for model_name, model in models.items():
    print(f"Training {model_name} model..")
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} trained - MSE: {mse:.2f}, R²: {r2:.2f}")

Training Linear model..
Linear trained - MSE: 0.83, R²: 0.88
Training Ridge model..
Ridge trained - MSE: 0.83, R²: 0.88
Training Lasso model..
Lasso trained - MSE: 2.17, R²: 0.70


# <h1>Short Summary:<h1>
<h3>The Linear model worked best, with an MSE of 0.83 and an R² of 0.88, and the Ridge model was just as good with the same scores. The Lasso model didn’t do as well, with an MSE of 2.17 and an R² of 0.70. The Linear and Ridge models are better because they handle the data’s patterns well without too much adjustment, while Lasso’s strong adjustment might have made it less accurate.<h3>