# HR Overtime Prediction and Forecasting

# Predicción y Pronóstico de Horas Extra de RRHH

Este notebook implementa el pronóstico de horas extra utilizando Prophet y visualiza datos históricos, predicciones e intervalos de confianza usando Plotly. Los componentes principales incluyen:
- Carga de datos históricos de horas extra desde SQL Server
- Entrenamiento de modelos Prophet para cada departamento
- Generación de predicciones con intervalos de incertidumbre 
- Visualización interactiva con Plotly
- Almacenamiento de modelos y predicciones

Objetivo: Predecir las horas extras acumuladas por departamento semana a semana para las próximas 4 semanas.

## 1. Importar Librerías and Setup


In [11]:
import pandas as pd
import pymssql
from prophet import Prophet
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib
import datetime
import logging
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Configure logging
logging.basicConfig(level=logging.INFO)

## 2. Conectar a la base de datos

Setup de SQL Server

In [12]:
# SQL Server Setup
SQL_SERVER = "172.28.192.1:50121\\SQLEXPRESS"
SQL_DB = "HR_Analytics"
SQL_USER = "sa"
SQL_PASSWORD = "123456"

# Conectar a SQL Server
def get_db_connection():
    server_name = SQL_SERVER.split('\\')[0]
    try:
        conn = pymssql.connect(
            server=server_name,
            database=SQL_DB,
            user=SQL_USER,
            password=SQL_PASSWORD
        )
        return conn
    except Exception as e:
        logging.error(f"Error de conexión a la base de datos: {e}")
        raise

## 3. Cargar y procesar datos históricos de horas extras

Consulta y preparación

In [13]:
def load_historical_data():
    conn = get_db_connection()
    query = """
    SELECT k.work_date, w.department, SUM(k.overtime_hours) as total_overtime
    FROM Kronos_TimeEntries k
    JOIN Workday_Employees w ON k.employee_id = w.employee_id
    GROUP BY k.work_date, w.department
    """
    
    try:
        df = pd.read_sql(query, conn)
        df['work_date'] = pd.to_datetime(df['work_date']).dt.strftime('%Y-%m-%d')
        df['work_date'] = pd.to_datetime(df['work_date'])
        
        # Aggregate by week
        df = df.groupby([pd.Grouper(key='work_date', freq='W'), 'department'])['total_overtime'].sum().reset_index()
        
        logging.info(f"Loaded data: {len(df)} records")
        return df
    
    finally:
        conn.close()

# Load and display data
historical_data = load_historical_data()
print("Muestra de datos históricos:")
display(historical_data.head(10))


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.

INFO:root:Loaded data: 324 records


Muestra de datos históricos:


Unnamed: 0,work_date,department,total_overtime
0,2024-05-19,Finance,0.0
1,2024-05-19,HR,0.0
2,2024-05-19,IT,1.29
3,2024-05-19,Inventory,0.0
4,2024-05-19,Marketing,3.04
5,2024-05-19,Sales,2.44
6,2024-05-26,Finance,1.69
7,2024-05-26,HR,8.84
8,2024-05-26,IT,3.85
9,2024-05-26,Inventory,7.03


## 4. Entrenar Modelo Prophet

Crear y entrenar modelo Prophet para cada departamento.

In [14]:
def train_prophet_model(dept_data):

    
    # Preparar datos para Prophet
    df = dept_data[['work_date', 'total_overtime']].copy()
    df.columns = ['ds', 'y']
    
    # Split de datos: usar últimas 4 semanas para evaluación
    train = df.iloc[:-4] if len(df) > 4 else df
    test = df.iloc[-4:] if len(df) > 4 else pd.DataFrame()
    
    # Crear y entrenar modelo
    model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=False,
        interval_width=0.95
    )
    model.fit(train)
    
    # Evaluar modelo en datos de prueba
    metrics = {}
    if not test.empty:
        # Generar predicciones para el período de prueba
        forecast = model.predict(test[['ds']])
        y_pred = forecast['yhat']
        y_true = test['y']
        
        # Calcular métricas
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
        r2 = 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - y_true.mean()) ** 2)
        
        metrics = {
            'rmse': rmse,
            'mape': mape,
            'r2': r2,
            'model_quality': evaluate_model_quality(mape, r2)
        }
    
    return model, metrics

def evaluate_model_quality(mape, r2):
    """Evaluate model quality based on MAPE and R2"""
    if mape <= 10 and r2 >= 0.8:
        return "Excelente"
    elif mape <= 20 and r2 >= 0.7:
        return "Bueno"
    elif mape <= 30 and r2 >= 0.6:
        return "Aceptable"
    else:
        return "Necesita mejoras"

# Entrenamiento por cada departamento
department_models = {}
metrics = {}

for department in historical_data['department'].unique():
    dept_data = historical_data[historical_data['department'] == department]
    
    if len(dept_data) < 10:
        logging.warning(f"Datos insuficientes para departamento: {department}")
        continue

    # Entrenar y evaluar modelo
    model, model_metrics = train_prophet_model(dept_data)
    department_models[department] = model
    metrics[department] = model_metrics
    
    # Mostrar métricas
    print(f"\nMétricas para departamento {department}:")
    print(f"RMSE: {model_metrics['rmse']:.2f}")
    print(f"MAPE: {model_metrics['mape']:.2f}%")
    print(f"R²: {model_metrics['r2']:.2f}")
    print(f"Calidad del modelo: {model_metrics['model_quality']}")

# Guardar resultados
results_df = pd.DataFrame([
    {
        'department': dept,
        'rmse': metrics[dept]['rmse'],
        'mape': metrics[dept]['mape'],
        'r2': metrics[dept]['r2'],
        'model_quality': metrics[dept]['model_quality']
    }
    for dept in metrics.keys()
])

print("\nResumen de calidad de modelos:")
display(results_df)

DEBUG:cmdstanpy:cmd: where.exe tbb.dll
cwd: None


DEBUG:cmdstanpy:TBB already found in load path
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\fn0ehijk.json
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\ntjgo6jq.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['C:\\Users\\joey_\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=18630', 'data', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\fn0ehijk.json', 'init=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\ntjgo6jq.json', 'output', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\prophet_modelu3hds9od\\prophet_model-20250521115732.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
11:57:32 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:57:33 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1


Métricas para departamento Finance:
RMSE: 8.89
MAPE: nan%
R²: 1.00
Calidad del modelo: Necesita mejoras


DEBUG:cmdstanpy:TBB already found in load path
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\5_hfpz2e.json
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\v9nutm1w.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['C:\\Users\\joey_\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=5487', 'data', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\5_hfpz2e.json', 'init=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\v9nutm1w.json', 'output', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\prophet_modelxqnzz1g7\\prophet_model-20250521115734.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
11:57:34 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:57:35 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1]


Métricas para departamento HR:
RMSE: 9.62
MAPE: nan%
R²: 1.00
Calidad del modelo: Necesita mejoras


DEBUG:cmdstanpy:TBB already found in load path
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\4_e7j01l.json
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\6fhtu2gb.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['C:\\Users\\joey_\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=49582', 'data', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\4_e7j01l.json', 'init=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\6fhtu2gb.json', 'output', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\prophet_model91nzoyj5\\prophet_model-20250521115735.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
11:57:35 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:57:36 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1


Métricas para departamento IT:
RMSE: 22.86
MAPE: nan%
R²: 1.00
Calidad del modelo: Necesita mejoras


DEBUG:cmdstanpy:TBB already found in load path
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\25vmw971.json
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\6o0ruu7c.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['C:\\Users\\joey_\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=65144', 'data', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\25vmw971.json', 'init=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\6o0ruu7c.json', 'output', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\prophet_modelgjkofxvf\\prophet_model-20250521115737.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
11:57:37 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:57:38 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1


Métricas para departamento Inventory:
RMSE: 60.10
MAPE: nan%
R²: 1.00
Calidad del modelo: Necesita mejoras


DEBUG:cmdstanpy:TBB already found in load path
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\_y0bpd1_.json
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\8sbik3t5.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['C:\\Users\\joey_\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=8047', 'data', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\_y0bpd1_.json', 'init=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\8sbik3t5.json', 'output', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\prophet_modelj1ee3o5_\\prophet_model-20250521115739.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
11:57:39 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:57:39 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1]


Métricas para departamento Marketing:
RMSE: 26.64
MAPE: nan%
R²: 1.00
Calidad del modelo: Necesita mejoras


DEBUG:cmdstanpy:TBB already found in load path
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\nvvktzdm.json
DEBUG:cmdstanpy:input tempfile: C:\Users\joey_\AppData\Local\Temp\tmpbhueithd\z2cgypy_.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['C:\\Users\\joey_\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=596', 'data', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\nvvktzdm.json', 'init=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\z2cgypy_.json', 'output', 'file=C:\\Users\\joey_\\AppData\\Local\\Temp\\tmpbhueithd\\prophet_model_l729ahq\\prophet_model-20250521115740.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
11:57:40 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:57:40 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] 


Métricas para departamento Sales:
RMSE: 4.27
MAPE: nan%
R²: 1.00
Calidad del modelo: Necesita mejoras

Resumen de calidad de modelos:


Unnamed: 0,department,rmse,mape,r2,model_quality
0,Finance,8.89133,,1.0,Necesita mejoras
1,HR,9.624972,,1.0,Necesita mejoras
2,IT,22.857598,,1.0,Necesita mejoras
3,Inventory,60.095096,,1.0,Necesita mejoras
4,Marketing,26.640833,,1.0,Necesita mejoras
5,Sales,4.267392,,1.0,Necesita mejoras


## 5. Generar Predicciones

Predicción para las próximas 4 semanas de Overtime

In [15]:
def generate_predictions(model, periods=4):
    """Generate future predictions with confidence intervals"""
    future = model.make_future_dataframe(periods=periods, freq='W')
    forecast = model.predict(future)
    return forecast

# Generate predictions for each department
department_forecasts = {}
for department, model in department_models.items():
    forecast = generate_predictions(model)
    department_forecasts[department] = forecast

print("Generated predictions for departments:", list(department_forecasts.keys()))

Generated predictions for departments: ['Finance', 'HR', 'IT', 'Inventory', 'Marketing', 'Sales']


## 6. Visualize Results with Plotly

Create interactive plots showing historical data, predictions, and confidence intervals.

In [16]:
def plot_forecast(department, historical_data, forecast):
    """Create interactive plot for a department's forecast"""
    
    # Filter historical data for department
    hist_dept = historical_data[historical_data['department'] == department]
    
    # Create figure with secondary y-axis
    fig = go.Figure()
    
    # Add historical data
    fig.add_trace(
        go.Scatter(
            x=hist_dept['work_date'],
            y=hist_dept['total_overtime'],
            name='Historical',
            mode='markers+lines',
            line=dict(color='blue')
        )
    )
    
    # Add forecast
    fig.add_trace(
        go.Scatter(
            x=forecast['ds'],
            y=forecast['yhat'],
            name='Forecast',
            mode='lines',
            line=dict(color='red')
        )
    )
    
    # Add confidence interval
    fig.add_trace(
        go.Scatter(
            x=forecast['ds'].tolist() + forecast['ds'].tolist()[::-1],
            y=forecast['yhat_upper'].tolist() + forecast['yhat_lower'].tolist()[::-1],
            fill='toself',
            fillcolor='rgba(0,100,80,0.2)',
            line=dict(color='rgba(255,255,255,0)'),
            name='95% Confidence Interval'
        )
    )
    
    # Update layout
    fig.update_layout(
        title=f'Overtime Forecast - {department}',
        xaxis_title='Date',
        yaxis_title='Overtime Hours',
        hovermode='x unified',
        showlegend=True,
        template='plotly_white'
    )
    
    return fig

# Create and display plots for each department
for department in department_models.keys():
    fig = plot_forecast(
        department,
        historical_data,
        department_forecasts[department]
    )
    fig.show()

## 7. Save Model and Predictions

Store trained models and predictions in the database.

In [None]:
def save_predictions():
    conn = get_db_connection()
    cursor = conn.cursor()
    
    try:
        # Save predictions for each department
        for department, forecast in department_forecasts.items():
            # Get future predictions only
            future_predictions = forecast[forecast['ds'] > datetime.datetime.now()]
            
            for _, row in future_predictions.iterrows():
                cursor.execute(
                    """
                    INSERT INTO Overtime_Predictions 
                    (department, prediction_date, predicted_overtime)
                    VALUES (%s, %s, %s)
                    """,
                    (department, row['ds'], row['yhat'])
                )
        
        # Save model metrics
        for department, metric in metrics.items():
            cursor.execute(
                """
                INSERT INTO ML_Model_Accuracy 
                (model_name, run_date, accuracy, precision, recall, f1_score)
                VALUES (%s, %s, %s, %s, %s, %s)
                """,
                (f'Overtime_Forecast_{department}', 
                 datetime.datetime.now(),
                 1 - metric['rmse']/100,  # Proxy accuracy
                 0, 0, 0)
            )
        
        conn.commit()
        logging.info("Successfully saved predictions and metrics")
        
    except Exception as e:
        conn.rollback()
        logging.error(f"Error saving predictions: {e}")
        raise
    finally:
        conn.close()

# Save models and predictions
save_predictions()
print("Models and predictions saved successfully")