In [1]:
import numpy as np
import pandas as pd

In [None]:
import numpy as np
import pandas as pd

def generate_realistic_employee_data(num_employees=200, random_state=42):
    np.random.seed(random_state)

    # ----- Calendar -----
    meeting_hours = np.random.normal(8, 3, num_employees).clip(2, 15)
    meeting_counts = np.random.poisson(12, num_employees).clip(5, 25)

    # ----- Communication -----
    messages_sent = np.random.normal(80, 25, num_employees).clip(20, 150).astype(int)
    messages_received = np.random.normal(100, 30, num_employees).clip(30, 180).astype(int)
    latency = np.random.normal(10, 5, num_employees).clip(1, 30)
    burstiness = np.random.beta(2, 4, num_employees)
    after_hours_ratio = np.random.beta(1.5, 10, num_employees)
    balance = np.clip(messages_sent / (messages_received + 1), 0.6, 1.4)
    conversation_len = np.random.normal(12, 5, num_employees).clip(5, 30)

    # ----- Project Management -----
    tasks_assigned = np.random.poisson(25, num_employees).clip(10, 40)
    completion_rate = np.random.uniform(0.7, 1.0, num_employees)
    tasks_completed = (tasks_assigned * completion_rate).round().astype(int)
    task_age = np.random.normal(6, 3, num_employees).clip(1, 15)
    overdue_ratio = np.clip(1 - completion_rate + np.random.normal(0, 0.05, num_employees), 0, 0.3)
    sentiment = np.random.normal(0.1 - overdue_ratio, 0.3).clip(-1, 1)

    # ----- Attendance -----
    logged_hours = np.random.normal(42, 4, num_employees).clip(35, 50)
    var_hours = np.random.normal(1.5, 0.5, num_employees).clip(0, 3)
    late_starts = np.random.poisson(2, num_employees).clip(0, 6)
    early_exits = np.random.poisson(1, num_employees).clip(0, 4)
    
    # <<< ADDED these two lines to generate the new data
    early_start_counts = np.random.poisson(2, num_employees).clip(0, 6)
    late_exit_counts = np.random.poisson(1.5, num_employees).clip(0, 5)
    
    absenteeism = np.random.uniform(0, 0.08, num_employees)
    avg_break = np.random.normal(45, 15, num_employees).clip(30, 90)

    # ----- Outcomes (y) -----
    performance = (
        0.4 * completion_rate
        + 0.2 * (1 - absenteeism)
        + 0.1 * (1 - overdue_ratio)
        + 0.1 * sentiment
        + 0.1 * (1 - after_hours_ratio)
        + np.random.normal(0, 0.03, num_employees)
    )
    performance = np.clip(performance, 0, 1)

    burnout = (
        0.3 * (meeting_hours / 15)
        + 0.3 * after_hours_ratio
        + 0.2 * (logged_hours / 50)
        - 0.2 * sentiment
        + np.random.normal(0, 0.05, num_employees)
    )
    burnout = np.clip(burnout, 0, 1)

    # ----- Assemble -----
    df = pd.DataFrame({
        "meeting_hours_per_week": meeting_hours.round(1),
        "meeting_counts_per_week": meeting_counts,
        "messages_sent_per_week": messages_sent,
        "messages_received_per_week": messages_received,
        "avg_response_latency_min": latency.round(1),
        "communication_burstiness": burstiness.round(2),
        "after_hours_message_ratio": after_hours_ratio.round(3),
        "communication_balance": balance.round(2),
        "conversation_length_avg": conversation_len.round(1),
        "avg_tasks_assigned_per_week": tasks_assigned,
        "avg_tasks_completed_per_week": tasks_completed,
        "task_completion_rate": completion_rate.round(2),
        "avg_task_age_days": task_age.round(1),
        "overdue_task_ratio": overdue_ratio.round(2),
        "task_comment_sentiment_mean": sentiment.round(2),
        "logged_hours": logged_hours.round(1),
        "variance_in_work_hours": var_hours.round(2),
        "late_start_count_per_week": late_starts,
        "early_exit_count_per_week": early_exits,
        
        # <<< ADDED these two lines to assemble the DataFrame
        "early_start_count_per_week": early_start_counts,
        "late_exit_count_per_week": late_exit_counts,
        
        "absenteeism_rate": absenteeism.round(3),
        "avg_break_length_minutes_per_week": avg_break.round(1),
        "performance_score": performance.round(2),
        "burnout_risk_score": burnout.round(2),
    })

    return df



In [3]:
df = generate_realistic_employee_data(2000)

In [4]:
df.head()


Unnamed: 0,meeting_hours_per_week,meeting_counts_per_week,messages_sent_per_week,messages_received_per_week,avg_response_latency_min,communication_burstiness,after_hours_message_ratio,communication_balance,conversation_length_avg,avg_tasks_assigned_per_week,...,logged_hours_per_week,variance_in_work_hours,late_start_count_per_week,early_exit_count_per_week,early_start_count_per_week,late_exit_count_per_week,absenteeism_rate,avg_break_length_minutes_per_week,performance_score,burnout_risk_score
0,9.5,11,93,133,9.6,0.37,0.201,0.69,12.2,19,...,43.4,1.04,5,0,2,0,0.064,59.0,0.68,0.42
1,7.6,10,61,112,16.6,0.2,0.072,0.6,20.7,23,...,40.1,1.27,4,0,3,1,0.057,35.9,0.75,0.32
2,9.9,12,106,71,6.5,0.29,0.104,1.4,11.6,19,...,46.1,1.93,3,0,2,2,0.02,57.8,0.78,0.39
3,12.6,6,48,42,19.3,0.48,0.18,1.12,9.8,30,...,41.2,1.38,0,1,1,1,0.033,34.8,0.65,0.53
4,7.3,15,92,47,6.6,0.54,0.054,1.4,17.7,25,...,42.4,1.51,3,0,1,1,0.012,51.5,0.65,0.38


In [5]:
df.to_csv("employee_data.csv", index=False)

In [6]:
# get the data
df = pd.read_csv("employee_data.csv")

In [7]:
df.head()

Unnamed: 0,meeting_hours_per_week,meeting_counts_per_week,messages_sent_per_week,messages_received_per_week,avg_response_latency_min,communication_burstiness,after_hours_message_ratio,communication_balance,conversation_length_avg,avg_tasks_assigned_per_week,...,logged_hours_per_week,variance_in_work_hours,late_start_count_per_week,early_exit_count_per_week,early_start_count_per_week,late_exit_count_per_week,absenteeism_rate,avg_break_length_minutes_per_week,performance_score,burnout_risk_score
0,9.5,11,93,133,9.6,0.37,0.201,0.69,12.2,19,...,43.4,1.04,5,0,2,0,0.064,59.0,0.68,0.42
1,7.6,10,61,112,16.6,0.2,0.072,0.6,20.7,23,...,40.1,1.27,4,0,3,1,0.057,35.9,0.75,0.32
2,9.9,12,106,71,6.5,0.29,0.104,1.4,11.6,19,...,46.1,1.93,3,0,2,2,0.02,57.8,0.78,0.39
3,12.6,6,48,42,19.3,0.48,0.18,1.12,9.8,30,...,41.2,1.38,0,1,1,1,0.033,34.8,0.65,0.53
4,7.3,15,92,47,6.6,0.54,0.054,1.4,17.7,25,...,42.4,1.51,3,0,1,1,0.012,51.5,0.65,0.38


In [8]:
# create feature and target variables
x = df.drop(columns=["burnout_risk_score"])
y = df["burnout_risk_score"]

In [9]:
# split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [10]:
# create a list of 3 classification models to compare the evaluate 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

models = [
    RandomForestRegressor(),
    LinearRegression(),
    SVR()
]

In [11]:
# fit the data to each model and evaluate performance
for model in models:
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    print(f"{model.__class__.__name__} R^2 Score: {score:.4f}")

RandomForestRegressor R^2 Score: 0.6984
LinearRegression R^2 Score: 0.7529
SVR R^2 Score: 0.3618


In [12]:
# use randamised cv to evaluate the best model LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression

param_dist = {
    "fit_intercept": [True, False],
    "copy_X": [True, False],
}

lr = LinearRegression()
lr_random = RandomizedSearchCV(
    estimator=lr,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
lr_random.fit(x_train, y_train)
best_lr = lr_random.best_estimator_
best_score = best_lr.score(x_test, y_test)
print(f"Best LinearRegression R^2 Score after RandomizedSearchCV: {best_score:.4f}")

Fitting 3 folds for each of 4 candidates, totalling 12 fits




[CV] END ...................copy_X=True, fit_intercept=False; total time=   0.0s
[CV] END ...................copy_X=True, fit_intercept=False; total time=   0.0s
[CV] END ....................copy_X=True, fit_intercept=True; total time=   0.0s
[CV] END ....................copy_X=True, fit_intercept=True; total time=   0.0s
[CV] END ....................copy_X=True, fit_intercept=True; total time=   0.0s
[CV] END ...................copy_X=True, fit_intercept=False; total time=   0.0s
[CV] END ...................copy_X=False, fit_intercept=True; total time=   0.0s
[CV] END ..................copy_X=False, fit_intercept=False; total time=   0.0s
[CV] END ...................copy_X=False, fit_intercept=True; total time=   0.0s
[CV] END ...................copy_X=False, fit_intercept=True; total time=   0.0s
[CV] END ..................copy_X=False, fit_intercept=False; total time=   0.0s
[CV] END ..................copy_X=False, fit_intercept=False; total time=   0.0s
Best LinearRegression R^2 Sc