# Workforce Wellbeing Analysis - Model Training

This notebook trains three separate models to predict:
1. **Burnout Risk Score** (0-1)
2. **Wellbeing Score** (0-100)
3. **Efficiency Score** (0-100)

The models are trained on a realistic dataset with 300 samples and 110 features.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from datetime import datetime

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# Seed NumPy's global random number generator
np.random.seed(42)

# Apply one matplotlib style sheet and one seaborn colour palette to every
# plot drawn later in the notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("âœ“ All libraries imported successfully")

## 2. Load and Explore Dataset

In [None]:
# Read the employee dataset from disk
df = pd.read_csv('dataset/realistic_emp_data.csv')

# Report the table dimensions, then preview the first rows
sample_count, column_count = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Total samples: {sample_count}")
print(f"Total features: {column_count}")
print("\nFirst few rows:")
df.head()

In [None]:
# Count missing entries per column and keep only the affected columns
missing_values = df.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]
if columns_with_gaps.empty:
    print("âœ“ No missing values found")
else:
    print("Missing values:")
    print(columns_with_gaps)

# Show how many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()
print(f"\nData types:\n{dtype_counts}")

## 3. Exploratory Data Analysis

In [None]:
# Summary statistics for the three prediction targets
target_columns = ['burnout_risk_score', 'wellbeing_score', 'efficiency_score']
target_stats = df[target_columns].describe()
print("Target Variable Statistics:")
target_stats

In [None]:
# One histogram per target, each with its mean marked by a dashed red line
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (column, panel title, x-axis label, bar colour or None for the default,
#  format spec used for the mean in the legend)
histogram_specs = [
    ('burnout_risk_score', 'Burnout Risk Score Distribution', 'Burnout Risk Score (0-1)', None, '.3f'),
    ('wellbeing_score', 'Wellbeing Score Distribution', 'Wellbeing Score (0-100)', 'green', '.2f'),
    ('efficiency_score', 'Efficiency Score Distribution', 'Efficiency Score (0-100)', 'orange', '.2f'),
]

for ax, (column, panel_title, x_label, bar_color, mean_format) in zip(axes, histogram_specs):
    hist_kwargs = {'bins': 30, 'edgecolor': 'black', 'alpha': 0.7}
    if bar_color is not None:
        hist_kwargs['color'] = bar_color
    ax.hist(df[column], **hist_kwargs)
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    column_mean = df[column].mean()
    ax.axvline(column_mean, color='red', linestyle='--',
               label=f'Mean: {format(column_mean, mean_format)}')
    ax.legend()

plt.tight_layout()
plt.show()

print("âœ“ Target variable distributions plotted")

In [None]:
# Count employees per role and express the counts as percentages
role_counts = df['role'].value_counts()
role_percentages = role_counts / len(df) * 100
print("Role Distribution:")
print(role_counts)
print("\nRole Percentages:")
print(role_percentages)

# Bar chart of the role counts
plt.figure(figsize=(10, 5))
role_counts.plot.bar(color='skyblue', edgecolor='black')
plt.title('Employee Role Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Role')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of the three targets, printed and drawn as a heatmap
target_corr = df[['burnout_risk_score', 'wellbeing_score', 'efficiency_score']].corr()
print("Target Variable Correlations:")
print(target_corr)

heatmap_options = {
    'annot': True,
    'fmt': '.3f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {"shrink": 0.8},
}
plt.figure(figsize=(8, 6))
sns.heatmap(target_corr, **heatmap_options)
plt.title('Target Variable Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Data Preparation

In [None]:
# Everything except the identifier and the three targets is a model input
non_feature_columns = ['employee_id', 'burnout_risk_score', 'wellbeing_score', 'efficiency_score']
X = df.drop(columns=non_feature_columns)

# One target series per model
y_burnout = df['burnout_risk_score']
y_wellbeing = df['wellbeing_score']
y_efficiency = df['efficiency_score']

print(f"Feature matrix shape: {X.shape}")
print(f"Number of features: {len(X.columns)}")
print("\nFeature columns:")
print(X.columns.tolist())

In [None]:
# Replace the categorical 'role' column with one indicator column per role
X_encoded = pd.get_dummies(X, columns=['role'], prefix='role', drop_first=False)
role_columns = [col for col in X_encoded.columns if col.startswith('role_')]

print(f"Encoded feature matrix shape: {X_encoded.shape}")
print(f"Number of features after encoding: {len(X_encoded.columns)}")
print(f"\nNew role columns: {role_columns}")

In [None]:
# Remember the encoded column names and order for later use
feature_columns = X_encoded.columns.tolist()
print(f"Total features for training: {len(feature_columns)}")

## 5. Model Training Functions

In [None]:
def train_models(X_train, y_train, X_test, y_test, target_name, cv=5):
    """
    Train several candidate regressors and keep the best one by cross-validation.

    Each candidate (XGBoost, GradientBoosting, RandomForest) is scored with
    cross-validated R-squared on the training data only. The candidate with
    the highest mean score is then fitted on the full training set and
    evaluated on both the training and the test set. Progress and metrics are
    printed along the way.

    Args:
        X_train: Training features
        y_train: Training target
        X_test: Test features
        y_test: Test target
        target_name: Name of the target variable (used in printed output only)
        cv: Cross-validation setting forwarded to ``cross_val_score``
            (number of folds when an int). Defaults to 5.

    Returns:
        Tuple of (best_model, model_name, cv_scores, metrics) where
        ``best_model`` is the fitted winning estimator, ``model_name`` is its
        key in the candidate dict, ``cv_scores`` maps every candidate name to
        its array of per-fold R-squared scores, and ``metrics`` is a dict of
        plain floats (CV mean/std plus train/test R-squared, RMSE and MAE)
        together with the winning model name.

    Raises:
        ValueError: If no candidate produced a comparable mean CV score
            (for example when every mean score is NaN).
    """
    print(f"\n{'='*60}")
    print(f"Training models for: {target_name}")
    print(f"{'='*60}")

    # Define models to test
    models = {
        'XGBoost': XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.1),
        'GradientBoosting': GradientBoostingRegressor(random_state=42, n_estimators=100, learning_rate=0.1),
        'RandomForest': RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
    }

    best_score = -float('inf')
    best_model = None
    best_model_name = None
    all_cv_scores = {}

    # Score each candidate on the training data
    for name, model in models.items():
        print(f"\nTraining {name}...")

        # Cross-validated R-squared; the candidate is only fitted for real
        # later, if it wins
        cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')
        mean_cv_score = cv_scores.mean()
        all_cv_scores[name] = cv_scores

        print(f"  CV RÂ² scores: {cv_scores}")
        print(f"  Mean CV RÂ²: {mean_cv_score:.4f} (+/- {cv_scores.std() * 2:.4f})")

        # Track best model (a NaN mean never compares greater, so it is skipped)
        if mean_cv_score > best_score:
            best_score = mean_cv_score
            best_model = model
            best_model_name = name

    # Fail with a clear message instead of an AttributeError on None.fit
    if best_model is None:
        raise ValueError(
            f"No model produced a valid cross-validation score for {target_name}"
        )

    # Train best model on full training set
    print(f"\n{'='*60}")
    print(f"Best model: {best_model_name} (CV RÂ²: {best_score:.4f})")
    print(f"Training {best_model_name} on full training set...")
    best_model.fit(X_train, y_train)

    # Evaluate on train and test sets
    train_pred = best_model.predict(X_train)
    test_pred = best_model.predict(X_test)

    # Cast to built-in floats so the metrics can be written out with json
    metrics = {
        'model_name': best_model_name,
        'cv_mean_r2': float(best_score),
        'cv_std_r2': float(all_cv_scores[best_model_name].std()),
        'train_r2': float(r2_score(y_train, train_pred)),
        'test_r2': float(r2_score(y_test, test_pred)),
        'train_rmse': float(np.sqrt(mean_squared_error(y_train, train_pred))),
        'test_rmse': float(np.sqrt(mean_squared_error(y_test, test_pred))),
        'train_mae': float(mean_absolute_error(y_train, train_pred)),
        'test_mae': float(mean_absolute_error(y_test, test_pred))
    }

    print(f"\nTraining Metrics:")
    print(f"  RÂ²: {metrics['train_r2']:.4f}")
    print(f"  RMSE: {metrics['train_rmse']:.4f}")
    print(f"  MAE: {metrics['train_mae']:.4f}")

    print(f"\nTest Metrics:")
    print(f"  RÂ²: {metrics['test_r2']:.4f}")
    print(f"  RMSE: {metrics['test_rmse']:.4f}")
    print(f"  MAE: {metrics['test_mae']:.4f}")

    return best_model, best_model_name, all_cv_scores, metrics

print("âœ“ Model training function defined")

In [None]:
def plot_predictions(y_true, y_pred, title, target_name):
    """
    Scatter predicted values against actual values and show the figure.

    A dashed red diagonal marks where predictions would equal the actual
    values; it spans the combined range of both inputs.

    Args:
        y_true: Actual target values (must provide .min() and .max())
        y_pred: Predicted target values (must provide .min() and .max())
        title: Figure title
        target_name: Target label used in both axis captions
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_true, y_pred, alpha=0.5, edgecolor='black')

    # Diagonal reference line covering the full value range of both series
    lower = min(y_true.min(), y_pred.min())
    upper = max(y_true.max(), y_pred.max())
    diagonal = [lower, upper]
    ax.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')

    ax.set_xlabel(f'Actual {target_name}')
    ax.set_ylabel(f'Predicted {target_name}')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

print("âœ“ Prediction plotting function defined")

In [None]:
def get_feature_importance(model, feature_names, top_n=15):
    """
    Build a table of feature importances sorted from most to least important.

    Args:
        model: Fitted estimator; only its ``feature_importances_`` attribute
            is read
        feature_names: Feature labels, in the same order as the importances
        top_n: Not used by this function; the full ranking is always returned

    Returns:
        DataFrame with 'feature' and 'importance' columns sorted by
        descending importance, or None when the model has no
        ``feature_importances_`` attribute.
    """
    if not hasattr(model, 'feature_importances_'):
        return None

    ranking = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    })
    return ranking.sort_values('importance', ascending=False)

def plot_feature_importance(importance_df, title, top_n=15):
    """
    Draw a horizontal bar chart of the first ``top_n`` rows of an importance table.

    Args:
        importance_df: DataFrame with 'feature' and 'importance' columns, or
            None (in which case nothing is drawn)
        title: Figure title
        top_n: Number of leading rows to plot
    """
    if importance_df is None:
        return

    top_features = importance_df.head(top_n)
    positions = range(len(top_features))

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(positions, top_features['importance'], color='steelblue', edgecolor='black')
    ax.set_yticks(positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(title, fontsize=14, fontweight='bold')
    # Put the first row of the table at the top of the chart
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

print("âœ“ Feature importance functions defined")

## 6. Train Model 1: Burnout Risk Score

In [None]:
# Hold out 20% of the rows as the test set for the burnout risk model
burnout_split = train_test_split(X_encoded, y_burnout, test_size=0.2, random_state=42)
X_train_burnout, X_test_burnout, y_train_burnout, y_test_burnout = burnout_split

# Fit the scaler on the training rows only, then apply it to both sets
scaler_burnout = StandardScaler().fit(X_train_burnout)
X_train_burnout_scaled = scaler_burnout.transform(X_train_burnout)
X_test_burnout_scaled = scaler_burnout.transform(X_test_burnout)

print(f"Training set size: {len(X_train_burnout_scaled)}")
print(f"Test set size: {len(X_test_burnout_scaled)}")

In [None]:
# Select and fit the best regressor for burnout risk
burnout_result = train_models(
    X_train=X_train_burnout_scaled,
    y_train=y_train_burnout,
    X_test=X_test_burnout_scaled,
    y_test=y_test_burnout,
    target_name='Burnout Risk Score',
)
model_burnout, model_name_burnout, cv_scores_burnout, metrics_burnout = burnout_result

In [None]:
# Compare burnout risk predictions with the held-out actual values
y_pred_burnout_test = model_burnout.predict(X_test_burnout_scaled)
burnout_plot_title = f'Burnout Risk: Actual vs Predicted ({model_name_burnout})'
plot_predictions(y_test_burnout, y_pred_burnout_test, burnout_plot_title, 'Burnout Risk Score')

In [None]:
# Rank the features used by the burnout risk model, when the model exposes them
importance_burnout = get_feature_importance(model_burnout, feature_columns)
if importance_burnout is not None:
    burnout_importance_title = f'Top 15 Features for Burnout Risk ({model_name_burnout})'
    print("\nTop 15 Features for Burnout Risk Prediction:")
    print(importance_burnout.head(15))
    plot_feature_importance(importance_burnout, burnout_importance_title)

## 7. Train Model 2: Wellbeing Score

In [None]:
# Hold out 20% of the rows as the test set for the wellbeing model
wellbeing_split = train_test_split(X_encoded, y_wellbeing, test_size=0.2, random_state=42)
X_train_wellbeing, X_test_wellbeing, y_train_wellbeing, y_test_wellbeing = wellbeing_split

# Fit the scaler on the training rows only, then apply it to both sets
scaler_wellbeing = StandardScaler().fit(X_train_wellbeing)
X_train_wellbeing_scaled = scaler_wellbeing.transform(X_train_wellbeing)
X_test_wellbeing_scaled = scaler_wellbeing.transform(X_test_wellbeing)

print(f"Training set size: {len(X_train_wellbeing_scaled)}")
print(f"Test set size: {len(X_test_wellbeing_scaled)}")

In [None]:
# Select and fit the best regressor for the wellbeing score
wellbeing_result = train_models(
    X_train=X_train_wellbeing_scaled,
    y_train=y_train_wellbeing,
    X_test=X_test_wellbeing_scaled,
    y_test=y_test_wellbeing,
    target_name='Wellbeing Score',
)
model_wellbeing, model_name_wellbeing, cv_scores_wellbeing, metrics_wellbeing = wellbeing_result

In [None]:
# Compare wellbeing predictions with the held-out actual values
y_pred_wellbeing_test = model_wellbeing.predict(X_test_wellbeing_scaled)
wellbeing_plot_title = f'Wellbeing: Actual vs Predicted ({model_name_wellbeing})'
plot_predictions(y_test_wellbeing, y_pred_wellbeing_test, wellbeing_plot_title, 'Wellbeing Score')

In [None]:
# Rank the features used by the wellbeing model, when the model exposes them
importance_wellbeing = get_feature_importance(model_wellbeing, feature_columns)
if importance_wellbeing is not None:
    wellbeing_importance_title = f'Top 15 Features for Wellbeing ({model_name_wellbeing})'
    print("\nTop 15 Features for Wellbeing Prediction:")
    print(importance_wellbeing.head(15))
    plot_feature_importance(importance_wellbeing, wellbeing_importance_title)

## 8. Train Model 3: Efficiency Score

In [None]:
# Hold out 20% of the rows as the test set for the efficiency model
efficiency_split = train_test_split(X_encoded, y_efficiency, test_size=0.2, random_state=42)
X_train_efficiency, X_test_efficiency, y_train_efficiency, y_test_efficiency = efficiency_split

# Fit the scaler on the training rows only, then apply it to both sets
scaler_efficiency = StandardScaler().fit(X_train_efficiency)
X_train_efficiency_scaled = scaler_efficiency.transform(X_train_efficiency)
X_test_efficiency_scaled = scaler_efficiency.transform(X_test_efficiency)

print(f"Training set size: {len(X_train_efficiency_scaled)}")
print(f"Test set size: {len(X_test_efficiency_scaled)}")

In [None]:
# Select and fit the best regressor for the efficiency score
efficiency_result = train_models(
    X_train=X_train_efficiency_scaled,
    y_train=y_train_efficiency,
    X_test=X_test_efficiency_scaled,
    y_test=y_test_efficiency,
    target_name='Efficiency Score',
)
model_efficiency, model_name_efficiency, cv_scores_efficiency, metrics_efficiency = efficiency_result

In [None]:
# Compare efficiency predictions with the held-out actual values
y_pred_efficiency_test = model_efficiency.predict(X_test_efficiency_scaled)
efficiency_plot_title = f'Efficiency: Actual vs Predicted ({model_name_efficiency})'
plot_predictions(y_test_efficiency, y_pred_efficiency_test, efficiency_plot_title, 'Efficiency Score')

In [None]:
# Rank the features used by the efficiency model, when the model exposes them
importance_efficiency = get_feature_importance(model_efficiency, feature_columns)
if importance_efficiency is not None:
    efficiency_importance_title = f'Top 15 Features for Efficiency ({model_name_efficiency})'
    print("\nTop 15 Features for Efficiency Prediction:")
    print(importance_efficiency.head(15))
    plot_feature_importance(importance_efficiency, efficiency_importance_title)

## 9. Model Comparison

In [None]:
# One row per target model, one column per metric
metrics_by_model = [metrics_burnout, metrics_wellbeing, metrics_efficiency]
comparison_df = pd.DataFrame({
    'Model': ['Burnout Risk', 'Wellbeing', 'Efficiency'],
    'Algorithm': [model_name_burnout, model_name_wellbeing, model_name_efficiency],
    'CV RÂ²': [m['cv_mean_r2'] for m in metrics_by_model],
    'Train RÂ²': [m['train_r2'] for m in metrics_by_model],
    'Test RÂ²': [m['test_r2'] for m in metrics_by_model],
    'Test RMSE': [m['test_rmse'] for m in metrics_by_model],
    'Test MAE': [m['test_mae'] for m in metrics_by_model],
})

separator = "=" * 80
print("\n" + separator)
print("MODEL COMPARISON SUMMARY")
print(separator)
print(comparison_df.to_string(index=False))
print(separator)

In [None]:
# Side-by-side grouped bar charts: R² scores on the left, test errors on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
r2_ax, error_ax = axes

x_pos = np.arange(len(comparison_df))
width = 0.25

# R² comparison: three bars per model; each legend label equals its column name
for offset, column in zip((-width, 0, width), ('CV RÂ²', 'Train RÂ²', 'Test RÂ²')):
    r2_ax.bar(x_pos + offset, comparison_df[column], width, label=column, alpha=0.8)

# Error comparison: two bars per model
for offset, column, legend_label in ((-width / 2, 'Test RMSE', 'RMSE'), (width / 2, 'Test MAE', 'MAE')):
    error_ax.bar(x_pos + offset, comparison_df[column], width, label=legend_label, alpha=0.8)

# Axis decoration shared by both panels
panel_settings = (
    (r2_ax, 'RÂ² Score', 'Model Performance Comparison (RÂ²)'),
    (error_ax, 'Error', 'Model Error Comparison'),
)
for ax, y_label, panel_title in panel_settings:
    ax.set_xlabel('Model')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title, fontsize=12, fontweight='bold')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Zero line so negative R² values stand out
r2_ax.axhline(y=0, color='red', linestyle='--', linewidth=1)

plt.tight_layout()
plt.show()

## 10. Save Models and Artifacts

In [None]:
# Create the directory that all saved artifacts are written to
import os

# Relative path, so it resolves against the current working directory
output_dir = 'model_realistic'
# exist_ok=True: do not raise if the directory is already there
os.makedirs(output_dir, exist_ok=True)

# Printed whether the directory was newly created or already existed
print(f"Created output directory: {output_dir}")

In [None]:
# Persist each fitted model as <target>_model.pkl in the output directory
trained_models = {
    'burnout_risk': model_burnout,
    'wellbeing': model_wellbeing,
    'efficiency': model_efficiency,
}
for file_stem, fitted_model in trained_models.items():
    joblib.dump(fitted_model, f'{output_dir}/{file_stem}_model.pkl')

print("âœ“ Models saved")

In [None]:
# Persist each fitted scaler as <target>_scaler.pkl in the output directory
fitted_scalers = {
    'burnout_risk': scaler_burnout,
    'wellbeing': scaler_wellbeing,
    'efficiency': scaler_efficiency,
}
for file_stem, fitted_scaler in fitted_scalers.items():
    joblib.dump(fitted_scaler, f'{output_dir}/{file_stem}_scaler.pkl')

print("âœ“ Scalers saved")

In [None]:
# Write the encoded feature names, in training order, as a JSON list
feature_columns_path = f'{output_dir}/feature_columns.json'
with open(feature_columns_path, 'w') as f:
    f.write(json.dumps(feature_columns, indent=2))

print("âœ“ Feature columns saved")

In [None]:
# Write one CSV per model; models without an importance table are skipped
importance_tables = {
    'burnout_risk': importance_burnout,
    'wellbeing': importance_wellbeing,
    'efficiency': importance_efficiency,
}
for file_stem, importance_table in importance_tables.items():
    if importance_table is not None:
        importance_table.to_csv(f'{output_dir}/{file_stem}_feature_importance.csv', index=False)

print("âœ“ Feature importance saved")

In [None]:
# Collect the per-model metrics, then append run-level metadata
all_metrics = {
    'burnout_risk': metrics_burnout,
    'wellbeing': metrics_wellbeing,
    'efficiency': metrics_efficiency,
}
all_metrics.update(
    training_date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    dataset='realistic_emp_data.csv',
    total_samples=len(df),
    total_features=len(feature_columns),
    test_size=0.2,
)

# Write everything to a single JSON file in the output directory
metrics_path = f'{output_dir}/model_metrics.json'
with open(metrics_path, 'w') as f:
    f.write(json.dumps(all_metrics, indent=2))

print("âœ“ Model metrics saved")

In [None]:
# Generate training summary report: run metadata, then one section per model
# with its metrics and top five features, then the list of saved files
summary_report = f"""
{'='*80}
WORKFORCE WELLBEING ANALYSIS - MODEL TRAINING SUMMARY
{'='*80}

Training Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Dataset: realistic_emp_data.csv
Total Samples: {len(df)}
Training Samples: {len(X_train_burnout)}
Test Samples: {len(X_test_burnout)}
Total Features: {len(feature_columns)}

{'='*80}
MODEL 1: BURNOUT RISK SCORE
{'='*80}
Algorithm: {model_name_burnout}
Cross-Validation RÂ²: {metrics_burnout['cv_mean_r2']:.4f} (+/- {metrics_burnout['cv_std_r2']:.4f})
Training RÂ²: {metrics_burnout['train_r2']:.4f}
Test RÂ²: {metrics_burnout['test_r2']:.4f}
Test RMSE: {metrics_burnout['test_rmse']:.4f}
Test MAE: {metrics_burnout['test_mae']:.4f}

Top 5 Important Features:
{importance_burnout.head(5).to_string(index=False) if importance_burnout is not None else 'N/A'}

{'='*80}
MODEL 2: WELLBEING SCORE
{'='*80}
Algorithm: {model_name_wellbeing}
Cross-Validation RÂ²: {metrics_wellbeing['cv_mean_r2']:.4f} (+/- {metrics_wellbeing['cv_std_r2']:.4f})
Training RÂ²: {metrics_wellbeing['train_r2']:.4f}
Test RÂ²: {metrics_wellbeing['test_r2']:.4f}
Test RMSE: {metrics_wellbeing['test_rmse']:.4f}
Test MAE: {metrics_wellbeing['test_mae']:.4f}

Top 5 Important Features:
{importance_wellbeing.head(5).to_string(index=False) if importance_wellbeing is not None else 'N/A'}

{'='*80}
MODEL 3: EFFICIENCY SCORE
{'='*80}
Algorithm: {model_name_efficiency}
Cross-Validation RÂ²: {metrics_efficiency['cv_mean_r2']:.4f} (+/- {metrics_efficiency['cv_std_r2']:.4f})
Training RÂ²: {metrics_efficiency['train_r2']:.4f}
Test RÂ²: {metrics_efficiency['test_r2']:.4f}
Test RMSE: {metrics_efficiency['test_rmse']:.4f}
Test MAE: {metrics_efficiency['test_mae']:.4f}

Top 5 Important Features:
{importance_efficiency.head(5).to_string(index=False) if importance_efficiency is not None else 'N/A'}

{'='*80}
FILES SAVED
{'='*80}
- burnout_risk_model.pkl
- wellbeing_model.pkl
- efficiency_model.pkl
- burnout_risk_scaler.pkl
- wellbeing_scaler.pkl
- efficiency_scaler.pkl
- burnout_risk_feature_importance.csv
- wellbeing_feature_importance.csv
- efficiency_feature_importance.csv
- feature_columns.json
- model_metrics.json
- training_summary.txt

{'='*80}
TRAINING COMPLETE
{'='*80}
"""

# Save summary report. The text contains non-ASCII characters, so the file
# encoding is fixed to UTF-8 instead of depending on the platform default.
with open(f'{output_dir}/training_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print(summary_report)
print("\nâœ“ Training summary saved")

## 11. Model Usage Example

In [None]:
# Example: Load models and make predictions
def load_models_and_predict(employee_data, model_dir=None):
    """
    Load trained models and make predictions for a single employee.

    The input may hold either the raw features (including a categorical
    'role' entry, which is one-hot encoded here) or features that are already
    one-hot encoded (role_* entries and no 'role'). Features listed in
    feature_columns.json but absent from the input are filled with 0, and
    entries not listed there are dropped.

    Args:
        employee_data: Dictionary with employee features
        model_dir: Directory holding the saved models, scalers and
            feature_columns.json. Defaults to the notebook-level
            ``output_dir``.

    Returns:
        Dictionary with predictions, keyed by 'burnout_risk_score',
        'wellbeing_score' and 'efficiency_score' (plain floats)
    """
    # Fall back to the directory the notebook saved its artifacts to
    if model_dir is None:
        model_dir = output_dir

    # Prediction key -> file name stem of the matching model/scaler pair
    artifact_stems = {
        'burnout_risk_score': 'burnout_risk',
        'wellbeing_score': 'wellbeing',
        'efficiency_score': 'efficiency',
    }

    # Load feature columns
    with open(f'{model_dir}/feature_columns.json', 'r') as f:
        feature_cols = json.load(f)

    # Prepare features
    employee_df = pd.DataFrame([employee_data])

    # Encode 'role' only when it is present: get_dummies raises KeyError for a
    # missing column, which would reject input that is already encoded.
    if 'role' in employee_df.columns:
        employee_df = pd.get_dummies(employee_df, columns=['role'], prefix='role', drop_first=False)

    # Align to the training layout in one step: missing columns become 0,
    # unknown columns are dropped, and the column order matches training.
    employee_encoded = employee_df.reindex(columns=feature_cols, fill_value=0)

    predictions = {}
    for prediction_key, stem in artifact_stems.items():
        # NOTE: joblib files are pickle-based; only load artifacts from a
        # directory you trust.
        model = joblib.load(f'{model_dir}/{stem}_model.pkl')
        scaler = joblib.load(f'{model_dir}/{stem}_scaler.pkl')

        # Each model is paired with the scaler saved under the same stem
        employee_scaled = scaler.transform(employee_encoded)
        predictions[prediction_key] = float(model.predict(employee_scaled)[0])

    return predictions

print("âœ“ Prediction function defined")
print("\nExample usage:")
print("predictions = load_models_and_predict(employee_data)")

In [None]:
# Test with a sample employee from test set
sample_idx = 0

# Take the employee's row from the raw feature table X (before one-hot
# encoding). load_models_and_predict() encodes the 'role' field itself, and
# the rows of X_test_burnout are already encoded and no longer have it.
sample_employee = X.loc[X_test_burnout.index[sample_idx]].to_dict()

print("Sample Employee Features (first 10):")
for key, value in list(sample_employee.items())[:10]:
    print(f"  {key}: {value}")
print("  ...")

# Make predictions
predictions = load_models_and_predict(sample_employee)

print("\n" + "="*60)
print("PREDICTIONS FOR SAMPLE EMPLOYEE")
print("="*60)
print(f"Burnout Risk Score: {predictions['burnout_risk_score']:.4f}")
print(f"Wellbeing Score: {predictions['wellbeing_score']:.2f}")
print(f"Efficiency Score: {predictions['efficiency_score']:.2f}")
print("="*60)

# Compare with actual values. All three splits use the same random_state and
# test_size, so the same position refers to the same employee in each.
print("\nActual Values:")
print(f"Burnout Risk Score: {y_test_burnout.iloc[sample_idx]:.4f}")
print(f"Wellbeing Score: {y_test_wellbeing.iloc[sample_idx]:.2f}")
print(f"Efficiency Score: {y_test_efficiency.iloc[sample_idx]:.2f}")

## Training Complete! 🎉

All three models have been successfully trained and saved to the `model_realistic/` directory.

### Next Steps:
1. Review model performance metrics above
2. Integrate models into your API for real-time predictions
3. Collect real data from APIs to further improve models
4. Deploy models to production environment