Step 1: Load the Dataset

In [48]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/post_pandemic_remote_work_health_impact_2025.csv")

# Quick look
print(df.shape)
print(df.head())


(3157, 14)
  Survey_Date  Age  Gender         Region               Industry  \
0  2025-06-01   27  Female           Asia  Professional Services   
1  2025-06-01   37  Female           Asia  Professional Services   
2  2025-06-01   32  Female         Africa              Education   
3  2025-06-01   40  Female         Europe              Education   
4  2025-06-01   30    Male  South America          Manufacturing   

           Job_Role Work_Arrangement  Hours_Per_Week Mental_Health_Status  \
0      Data Analyst           Onsite              64      Stress Disorder   
1      Data Analyst           Onsite              37      Stress Disorder   
2  Business Analyst           Onsite              36                 ADHD   
3      Data Analyst           Onsite              63                 ADHD   
4   DevOps Engineer           Hybrid              65                  NaN   

  Burnout_Level  Work_Life_Balance_Score     Physical_Health_Issues  \
0          High                        3   Sho

Step 2: Check Missing Values

In [49]:
# Check missing values per column
df.isnull().sum()


Unnamed: 0,0
Survey_Date,0
Age,0
Gender,0
Region,0
Industry,0
Job_Role,0
Work_Arrangement,0
Hours_Per_Week,0
Mental_Health_Status,799
Burnout_Level,0


Step 3: Handle Missing Values

In [50]:
# List of categorical features with missing values
missing_cat_cols = ['Mental_Health_Status', 'Physical_Health_Issues']

# Fill missing values with mode
for col in missing_cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Verify missing values are handled
df.isnull().sum()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Unnamed: 0,0
Survey_Date,0
Age,0
Gender,0
Region,0
Industry,0
Job_Role,0
Work_Arrangement,0
Hours_Per_Week,0
Mental_Health_Status,0
Burnout_Level,0


Step 04: Categorical encoding

In [51]:
#identified categorical features
cat_cols = ['Gender', 'Region', 'Industry', 'Job_Role',
            'Work_Arrangement', 'Mental_Health_Status',
            'Physical_Health_Issues', 'Salary_Range']


Step : Encode based on type

Nominal categorical variables → One-Hot Encoding

Prevents introducing false order among categories.

Ordinal variable (target: Burnout_Level) → Label Encoding

In [52]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Burnout_Level'] = le.fit_transform(df['Burnout_Level'])


In [53]:
df.columns.tolist()


['Survey_Date',
 'Age',
 'Gender',
 'Region',
 'Industry',
 'Job_Role',
 'Work_Arrangement',
 'Hours_Per_Week',
 'Mental_Health_Status',
 'Burnout_Level',
 'Work_Life_Balance_Score',
 'Physical_Health_Issues',
 'Social_Isolation_Score',
 'Salary_Range']

In [54]:
# 1. Ensure column names have no extra spaces
df.columns = df.columns.str.strip()

# 2. Label encode the target variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Burnout_Level'] = le.fit_transform(df['Burnout_Level'])  # Low=0, Medium=1, High=2

# 3. Define input categorical columns for one-hot encoding
cat_cols = ['Gender', 'Region', 'Industry', 'Job_Role',
            'Work_Arrangement', 'Mental_Health_Status',
            'Physical_Health_Issues', 'Salary_Range']

# 4. Apply one-hot encoding
df = pd.get_dummies(df, columns=cat_cols)

# 5. Verify encoding
print("Dataset shape after encoding:", df.shape)
print(df.head())


Dataset shape after encoding: (3157, 94)
  Survey_Date  Age  Hours_Per_Week  Burnout_Level  Work_Life_Balance_Score  \
0  2025-06-01   27              64              0                        3   
1  2025-06-01   37              37              0                        4   
2  2025-06-01   32              36              0                        3   
3  2025-06-01   40              63              2                        1   
4  2025-06-01   30              65              2                        5   

   Social_Isolation_Score  Gender_Female  Gender_Male  Gender_Non-binary  \
0                       2           True        False              False   
1                       2           True        False              False   
2                       2           True        False              False   
3                       2           True        False              False   
4                       4          False         True              False   

   Gender_Prefer not to say  ... 

Step 05: Identify numerical features

In [55]:
num_cols = ['Age', 'Hours_Per_Week', 'Work_Life_Balance_Score', 'Social_Isolation_Score']


Step 06: Apply Min-Max Scaling

In [56]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Verify scaling
print(df[num_cols].head())


        Age  Hours_Per_Week  Work_Life_Balance_Score  Social_Isolation_Score
0  0.116279        0.966667                     0.50                    0.25
1  0.348837        0.066667                     0.75                    0.25
2  0.232558        0.033333                     0.50                    0.25
3  0.418605        0.933333                     0.00                    0.25
4  0.186047        1.000000                     1.00                    0.75
