In [1]:
import pandas as pd
import numpy as np

# Location of the IBM HR attrition CSV used throughout this notebook.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless the file exists at this exact location.
ATTRITION_CSV = "D:/projects/workwell-ai-burnout-prediction/data/ibm_hr_employee_attrition.csv"

# Read the raw table and preview the first rows as the cell output.
df = pd.read_csv(ATTRITION_CSV)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
# Columns this notebook treats as burnout-related signals.
# NOTE(review): YearsAtCompany and JobLevel are previewed here, but the score
# built in the visible cells does not use them; confirm whether that is intended.
burnout_features = [
    "OverTime", "WorkLifeBalance", "JobSatisfaction",
    "EnvironmentSatisfaction", "YearsAtCompany", "JobLevel",
]

# Preview only the selected columns (first five rows).
df.loc[:, burnout_features].head()

Unnamed: 0,OverTime,WorkLifeBalance,JobSatisfaction,EnvironmentSatisfaction,YearsAtCompany,JobLevel
0,Yes,1,4,2,6,2
1,No,3,2,3,10,2
2,Yes,3,3,4,0,1
3,Yes,3,3,4,8,1
4,No,3,2,1,2,1


In [5]:
# Build a weighted burnout risk score (range 0-6) from four 0/1 indicators:
#   OverTime == "Yes"              -> 2 points
#   WorkLifeBalance <= 2           -> 2 points
#   JobSatisfaction <= 2           -> 1 point
#   EnvironmentSatisfaction <= 2   -> 1 point
works_overtime = df["OverTime"].eq("Yes").astype(int)
low_work_life_balance = df["WorkLifeBalance"].le(2).astype(int)
low_job_satisfaction = df["JobSatisfaction"].le(2).astype(int)
low_env_satisfaction = df["EnvironmentSatisfaction"].le(2).astype(int)

burnout_score = (
    works_overtime * 2
    + low_work_life_balance * 2
    + low_job_satisfaction
    + low_env_satisfaction
)

# Store the score on the frame and show how many rows fall at each score value.
df["burnout_score"] = burnout_score
df["burnout_score"].value_counts().sort_index()

burnout_score
0    270
1    371
2    335
3    286
4    136
5     55
6     17
Name: count, dtype: int64

In [6]:
# mapping score to burnout risk
def map_burnout_risk(score, low_max=1, medium_max=3):
    """Translate a numeric burnout score into a risk tier label.

    Parameters
    ----------
    score : int or float
        Burnout score to classify.
    low_max : int or float, default 1
        Highest score that is still labelled "Low".
    medium_max : int or float, default 3
        Highest score that is still labelled "Medium". Must not be smaller
        than ``low_max``.

    Returns
    -------
    str
        "Low" when ``score <= low_max``, "Medium" when
        ``score <= medium_max``, otherwise "High".

    Raises
    ------
    ValueError
        If ``medium_max`` is smaller than ``low_max``.

    Notes
    -----
    A NaN score fails both comparisons and is therefore labelled "High".
    """
    # Reject inverted thresholds: they would make the "Medium" tier unreachable.
    if medium_max < low_max:
        raise ValueError("medium_max must be greater than or equal to low_max")

    if score <= low_max:
        return "Low"
    if score <= medium_max:
        return "Medium"
    return "High"
# Label every row by passing its burnout score through the tier mapping,
# then show how many rows land in each tier.
df["burnout_risk"] = df["burnout_score"].map(map_burnout_risk)
df["burnout_risk"].value_counts()

burnout_risk
Low       641
Medium    621
High      208
Name: count, dtype: int64

In [7]:
# Show the score next to its assigned tier for the first ten rows.
df.loc[:, ["burnout_score", "burnout_risk"]].head(10)


Unnamed: 0,burnout_score,burnout_risk
0,5,High
1,1,Low
2,2,Medium
3,2,Medium
4,2,Medium
5,2,Medium
6,5,High
7,0,Low
8,0,Low
9,2,Medium


In [8]:
# Share of rows in each risk tier, expressed as a percentage.
df["burnout_risk"].value_counts(normalize=True).mul(100)

burnout_risk
Low       43.605442
Medium    42.244898
High      14.149660
Name: proportion, dtype: float64