# 📞 Call Center Forecasting — end-to-end notebook

Description: Demonstrates the full pipeline: load -> anomaly detection -> feature engineering -> training (RandomForest & LSTM) -> evaluation -> forecasting -> saving results.

In [None]:
# === 0. Imports and global settings ===
import warnings
warnings.filterwarnings("ignore")

import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 5)

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib   # for saving models
import json

# Project layout, resolved relative to the working directory.
# The notebook is expected to live in notebooks/, so the project root is "..".
ROOT = Path("..")
DATA_DIR, RESULTS_DIR, SRC_DIR = (ROOT / folder for folder in ("data", "results", "src"))

# Create the output folder up front so later save calls find it in place.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)


## 1. Utility functions
Здесь вспомогательные функции: загрузка, базовая визуализация, оценка метрик.


In [None]:
# === 1. Utilities ===
def load_csv(path: Path, sep: str=';', parse_dates: bool=True) -> pd.DataFrame:
    """Read a delimited text file into a DataFrame.

    The first line of the file is used as the header row and ``sep`` is the
    column delimiter. ``parse_dates`` is accepted for call compatibility but
    is not forwarded to pandas, so columns come back exactly as
    ``pd.read_csv`` infers them.
    """
    read_options = {"sep": sep, "header": 0}
    frame = pd.read_csv(path, **read_options)
    return frame

def evaluate_regression(y_true, y_pred):
    """Score predictions against the ground truth.

    Returns a dict with the keys "MAE", "RMSE" and "R2"; every value is
    converted to a plain Python float so the dict can be dumped to JSON.
    """
    scores = {
        "MAE": mean_absolute_error(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "R2": r2_score(y_true, y_pred),
    }
    return {name: float(value) for name, value in scores.items()}

def plot_series(ts, label='value', title=None):
    """Draw ``ts`` as a single line on a new 14x4 figure and show it.

    ``label`` is used for the legend entry and the y-axis; it also serves as
    the figure title when ``title`` is empty or None.
    """
    heading = title if title else label
    plt.figure(figsize=(14, 4))
    plt.plot(ts, label=label)
    plt.title(heading)
    plt.ylabel(label)
    plt.xlabel('Time')
    plt.legend()
    plt.show()

def save_json(obj, path: Path):
    """Serialize ``obj`` as 2-space-indented JSON and write it to ``path``.

    The file is opened with an explicit UTF-8 encoding so the result does not
    depend on the platform's default locale encoding. An existing file at
    ``path`` is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2)


## 2. Load & initial preprocessing
Загружаем CSV, парсим даты, создаём TIMESTAMP, проверяем типы.
(Адаптируй названия столбцов под свой файл: `DATESTART`, `RAZREZ`, `MONTH`, `CNT_CALLS`.)


In [None]:
# === 2. Load and initial preprocessing ===
csv_path = DATA_DIR / "calls.csv"   # <- replace with your own file if needed

# No input file yet: fabricate 30 days of hourly data so the rest of the
# notebook can be run end to end.
if not csv_path.exists():
    # A Timedelta step is used instead of the deprecated "H" frequency alias.
    rng = pd.date_range(start="2024-01-01", periods=24*30, freq=pd.Timedelta(hours=1))
    # Seeded generator so the synthetic file is reproducible between runs.
    noise = np.random.default_rng(42).poisson(10, len(rng))
    df_synth = pd.DataFrame({
        "DATESTART": rng.date,
        # DatetimeIndex.time is a plain NumPy array (no .str accessor), so the
        # 'HH:MM' text is produced with strftime instead.
        "RAZREZ": rng.strftime("%H:%M"),
        "CNT_CALLS": (200 + 50*np.sin(np.arange(len(rng))/24) + noise).astype(int),
        # 'YYYY-MM', because prepare_timestamp appends '-01' before parsing.
        "MONTH": rng.strftime("%Y-%m")
    })
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    df_synth.to_csv(csv_path, sep=';', index=False)
    print("Synthetic dataset created:", csv_path)

raw = load_csv(csv_path, sep=';')
raw.head(3)

In [None]:
# Normalize and parse date/time columns - robust approach with checks
def prepare_timestamp(df: pd.DataFrame,
                      date_col='DATESTART',
                      time_col='RAZREZ',
                      month_col='MONTH') -> pd.DataFrame:
    """Parse the date/time columns and add a combined ``TIMESTAMP`` column.

    Works on a copy of ``df``; the input frame is not modified.

    Parameters:
        df: frame containing at least ``date_col`` and ``time_col``.
        date_col: column with calendar dates, parsed day-first.
        time_col: column with time of day as 'HH:MM' or 'HH:MM:SS' text
            (or anything ``pd.to_timedelta`` accepts).
        month_col: optional column; when present its text gets '-01' appended
            and is parsed to a datetime (first day of the month).

    Returns:
        The copy with ``date_col`` as datetime, ``time_col`` as timedelta,
        ``TIMESTAMP = date + time`` and, if present, ``month_col`` as datetime.
        Values that cannot be parsed become NaT instead of raising.
    """
    df = df.copy()

    times = df[time_col]
    if pd.api.types.is_object_dtype(times) or pd.api.types.is_string_dtype(times):
        # Remember real gaps: stringifying them would yield 'nan'/'None',
        # which must not be turned into a bogus 'nan:00' time.
        missing = times.isna()
        text = times.where(~missing, '').astype(str).str.strip()
        # 'HH:MM' has a single colon; append the seconds part where needed.
        text = text.where(text.str.count(':') == 2, text + ':00')
        df[time_col] = text.mask(missing)

    # Parse date and time; unparseable entries become NaT in both columns.
    df[date_col] = pd.to_datetime(df[date_col], dayfirst=True, errors='coerce')
    df[time_col] = pd.to_timedelta(df[time_col], errors='coerce')
    df['TIMESTAMP'] = df[date_col] + df[time_col]

    # Optional: month -> first day of month
    if month_col in df.columns:
        try:
            df[month_col] = pd.to_datetime(df[month_col].astype(str) + '-01', errors='coerce')
        except (ValueError, TypeError):
            # Best effort only: the month column is auxiliary, so it is left
            # untouched when pandas cannot interpret it at all.
            pass
    return df

# Build the TIMESTAMP column, then order the rows chronologically and
# renumber them 0..n-1.
df = (
    prepare_timestamp(raw)
    .sort_values('TIMESTAMP')
    .reset_index(drop=True)
)
plot_series(df['CNT_CALLS'], title='Raw call counts (unsmoothed)', label='Raw calls')
df.head(3)

## 3. Anomaly detection & correction
Используем IsolationForest (можно заменить на другой метод). После детекции — заменяем аномалии медианой нормальных значений.


In [None]:
# === 3. Anomaly detection and correction ===
def detect_and_fix_anomalies(df: pd.DataFrame,
                             value_col='CNT_CALLS',
                             contamination=0.001,
                             random_state=42):
    """Flag outliers in ``value_col`` with IsolationForest and overwrite them.

    Works on a copy of ``df``. Missing values are treated as 0 for detection
    only. Every row the model labels as an outlier gets the median of the
    remaining (non-outlier) rows.

    Parameters:
        df: input frame.
        value_col: numeric column to inspect.
        contamination: expected outlier share, passed to IsolationForest.
        random_state: seed passed to IsolationForest.

    Returns:
        ``(cleaned_df, n_anomalies, median_used)``; ``median_used`` is None
        when nothing was flagged.
    """
    df = df.copy()
    vals = df[[value_col]].fillna(0).values
    vals_scaled = StandardScaler().fit_transform(vals)
    iso = IsolationForest(contamination=contamination, random_state=random_state)
    # fit_predict labels outliers with -1. The mask is kept in a local array
    # rather than a temporary frame column, so an input column that happens
    # to be called 'anomaly_flag' is neither overwritten nor dropped.
    is_anomaly = iso.fit_predict(vals_scaled) == -1
    n_anom = int(is_anomaly.sum())

    median_val = None
    if n_anom > 0:
        median_val = df.loc[~is_anomaly, value_col].median()
        # The median can be fractional; cast first so the assignment does not
        # rely on implicit upcasting of an integer column.
        df[value_col] = df[value_col].astype(float)
        df.loc[is_anomaly, value_col] = median_val
    return df, n_anom, median_val

# Clean the series, report what was replaced, then plot before and after.
df_clean, n_anom, med = detect_and_fix_anomalies(df, contamination=0.001, value_col='CNT_CALLS')
print(f"Detected anomalies: {n_anom}; replaced with median={med}")
for frame, caption in ((df, 'Original'), (df_clean, 'Cleaned')):
    plot_series(frame['CNT_CALLS'], label=caption)


## 4. Feature engineering
Создаём временные признаки, лаги, скользящие средние, billing/critical days, one-hot для weekday и получасовые индикаторы.


In [None]:
# === 4. Feature engineering ===
def create_time_features(df: pd.DataFrame, timestamp_col='TIMESTAMP', value_col='CNT_CALLS', max_lag=10):
    """Build the model feature table on a uniform 30-minute grid.

    The frame is indexed by ``timestamp_col`` and reindexed to a 30-minute
    frequency; gaps in ``value_col`` are linearly interpolated (leading gaps
    are back-filled). The input frame is not modified.

    Added columns:
        * calendar features: hour, minute, weekday, is_day, is_lunch,
          is_worktime, is_morning, workdays, holidays, day, billing_day,
          critical_day;
        * ``weekday_0`` .. ``weekday_6`` one-hot columns (always all seven,
          so the column set does not depend on which weekdays occur);
        * ``IS_half_00`` .. ``IS_half_47`` half-hour-of-day indicators;
        * ``lag_1`` .. ``lag_<max_lag>`` of ``value_col``;
        * ``rolling_mean_7`` / ``rolling_std_3`` over the preceding values
          (shifted by one step so the current value is never included).

    Only rows where a lag or rolling feature is still undefined are dropped.
    Other columns of the input are passed through and do not influence which
    rows are kept, so time slots filled in by the reindexing are retained.
    """
    # '30min' is the non-deprecated spelling of the '30T' alias.
    df = df.set_index(timestamp_col).asfreq('30min')
    df[value_col] = df[value_col].interpolate().bfill().astype(float)
    idx = df.index

    # Basic time features
    df['hour'] = idx.hour
    df['minute'] = idx.minute
    df['weekday'] = idx.weekday  # Monday=0
    df['is_day'] = ((df['hour'] >= 8) & (df['hour'] < 21)).astype(int)
    # The 0.2 / 0.4 multipliers are kept from the original feature definition.
    df['is_lunch'] = df['hour'].between(12, 13).astype(int) * 0.2
    df['is_worktime'] = df['hour'].between(8, 18).astype(int) * 0.2
    df['is_morning'] = df['hour'].between(8, 9).astype(int) * 0.4
    df['workdays'] = (df['weekday'] < 5).astype(int)
    df['holidays'] = (df['weekday'] >= 5).astype(int)  # weekends only
    df['day'] = idx.day

    # billing & critical day flags (adapt lists if needed)
    df['billing_day'] = df['day'].isin([5, 15, 25]).astype(int)
    df['critical_day'] = df['day'].isin([3, 4, 13, 14, 23, 24]).astype(int)

    # One-hot blocks are assembled separately and attached with one concat.
    weekday_onehot = pd.DataFrame(
        {f'weekday_{d}': (df['weekday'] == d).astype(int) for d in range(7)},
        index=idx,
    )
    half_hour_slot = idx.hour * 2 + idx.minute // 30  # 0..47 within the day
    half_hour_onehot = pd.DataFrame(
        {f'IS_half_{slot:02d}': (half_hour_slot == slot).astype(int) for slot in range(48)},
        index=idx,
    )

    # Lag and rolling features of the target.
    series = df[value_col]
    lagged = pd.DataFrame(
        {f'lag_{k}': series.shift(k) for k in range(1, max_lag + 1)},
        index=idx,
    )
    previous = series.shift(1)
    rolling = pd.DataFrame(
        {
            'rolling_mean_7': previous.rolling(window=7).mean(),
            'rolling_std_3': previous.rolling(window=3).std(),
        },
        index=idx,
    )

    df = pd.concat([df, weekday_onehot, half_hour_onehot, lagged, rolling], axis=1)

    # Drop only the warm-up rows whose lag/rolling values are undefined.
    engineered = list(lagged.columns) + list(rolling.columns)
    return df.dropna(subset=engineered)

# Build the feature table from the cleaned series (max_lag=10 lag columns);
# the trailing expression shows its (rows, columns) shape in the notebook.
feature_df = create_time_features(df_clean, timestamp_col='TIMESTAMP', value_col='CNT_CALLS', max_lag=10)
feature_df.shape


In [None]:
# Quick peek and sanity plots
plot_series(feature_df['CNT_CALLS'], label='CNT_CALLS (processed)', title='Processed Call Volume')
# The preview comes last: a notebook cell only renders the value of its
# final expression, so a head() call in the middle of the cell shows nothing.
feature_df.head(3)

## 5. Train/Test split and scaling
Делаем разрез по времени без перемешивания (первые 70% наблюдений — train, последние 30% — test) и масштабируем числовые фичи.

In [None]:
# === 5. Train/Test split and scaling ===
target = 'CNT_CALLS'
# Only numeric/boolean columns are usable as model inputs; pass-through
# columns such as parsed dates or time offsets are left out.
features = [
    c for c in feature_df.columns
    if c != target and pd.api.types.is_numeric_dtype(feature_df[c])
]

# Time-based train/test split (70%/30% or custom): the first 70% of the rows
# are used for training, the remainder for testing; nothing is shuffled.
train_size = int(len(feature_df) * 0.7)
train_df = feature_df.iloc[:train_size]
test_df = feature_df.iloc[train_size:]

# astype(float) returns independent copies, so the scaling below never
# writes into a slice of feature_df, and both models get a float matrix.
X_train = train_df[features].astype(float)
y_train = train_df[target]
X_test = test_df[features].astype(float)
y_test = test_df[target]

# Scale numeric columns only (lags and rolling); the scaler is fitted on the
# training part and merely applied to the test part.
numeric_cols = [c for c in X_train.columns if c.startswith('lag_') or 'rolling' in c]
scaler_x = StandardScaler()
X_train[numeric_cols] = scaler_x.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler_x.transform(X_test[numeric_cols])

# The target is scaled separately (scaled values are used by the LSTM cell).
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1,1))
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1,1))

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

## 6. RandomForest baseline model
Тренируем RandomForest с дефолтными / tuned параметрами; оцениваем на тесте.

In [None]:
# === 6. RandomForest baseline ===
rf = RandomForestRegressor(n_estimators=1000, max_depth=None, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

metrics_rf = evaluate_regression(y_test, y_pred_rf)
print("RandomForest metrics:", metrics_rf)

# Actual and predicted values are drawn on ONE figure. plot_series is not
# used here because it calls plt.show() itself, which would finish the
# figure before the prediction line could be added to it.
plt.figure(figsize=(14, 4))
plt.plot(y_test.values, label='Actual')
plt.plot(y_pred_rf, label='RF Predictions', alpha=0.8)
plt.title('RandomForest — Actual vs Pred')
plt.xlabel('Time')
plt.ylabel('CNT_CALLS')
plt.legend()
plt.show()

# Save RF model and its test metrics
joblib.dump(rf, RESULTS_DIR / "rf_model.joblib")
save_json(metrics_rf, RESULTS_DIR / "metrics_rf.json")


## 7. LSTM model (simple 1-step LSTM)
Подготовим данные в формате (samples, timesteps, features). Здесь используем time_steps=1, можно расширить при желании.


In [None]:
# === 7. LSTM model ===
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# reshape inputs for LSTM: (samples, timesteps, features)
# The explicit float32 cast guarantees a numeric array even if some feature
# columns are stored as bool or int.
time_steps = 1
X_train_lstm = X_train.to_numpy(dtype='float32').reshape((X_train.shape[0], time_steps, X_train.shape[1]))
X_test_lstm = X_test.to_numpy(dtype='float32').reshape((X_test.shape[0], time_steps, X_test.shape[1]))

# Two stacked LSTM layers with dropout in between and a single linear output.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    Dropout(0.2),
    LSTM(64),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Early stopping is monitored on the tail of the TRAINING data
# (validation_split takes the last samples, before shuffling). Using the test
# set here would tune the stopping point on the data the model is later
# scored on and make the reported test metrics optimistic.
history = model.fit(X_train_lstm, y_train_scaled, validation_split=0.2,
                    epochs=50, batch_size=32, callbacks=[es], verbose=1)

# Predictions and inverse scaling back to call counts
y_pred_lstm_scaled = model.predict(X_test_lstm)
y_pred_lstm = scaler_y.inverse_transform(y_pred_lstm_scaled).ravel()

metrics_lstm = evaluate_regression(y_test, y_pred_lstm)
print("LSTM metrics:", metrics_lstm)

# Save LSTM model (path passed as str for Keras versions that do not accept Path)
model.save(str(RESULTS_DIR / "lstm_model.h5"))
save_json(metrics_lstm, RESULTS_DIR / "metrics_lstm.json")

# Plot training history
plt.figure(figsize=(12,4))
plt.plot(history.history['loss'], label='train_loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.title("LSTM training loss")
plt.show()

# Actual vs predicted on one figure (plot_series would call plt.show() before
# the prediction line is added, splitting the two curves across figures).
plt.figure(figsize=(14, 4))
plt.plot(y_test.values, label='Actual')
plt.plot(y_pred_lstm, label='LSTM Predictions')
plt.title('LSTM — Actual vs Pred')
plt.xlabel('Time')
plt.ylabel('CNT_CALLS')
plt.legend()
plt.show()


## 8. Forecasting (rolling predictions for future timestamps)
Генерируем будущие признаки и делаем предсказания пошагово (auto-regressive style): на каждом шаге добавляем предсказание в качестве нового lag.


In [None]:
# === 8. Forecasting future periods (auto-regressive loop) ===
from datetime import timedelta

def generate_future_index(last_timestamp, periods=48, freq='30min'):
    """Return ``periods`` timestamps that follow ``last_timestamp``.

    The first entry is one ``freq`` step after ``last_timestamp`` and the
    entries are ``freq`` apart. ``freq`` may be an offset alias string or an
    offset object; it is resolved with ``to_offset``, so aliases without a
    numeric multiplier (for example 'D') are accepted as well. The default
    '30min' is the current spelling of the former '30T' alias.
    """
    step = pd.tseries.frequencies.to_offset(freq)
    first = pd.Timestamp(last_timestamp) + step
    return pd.date_range(start=first, periods=periods, freq=step)

def prepare_future_df(feature_df, last_timestamp, periods=48, freq='30min'):
    """Build a feature skeleton for the time slots after ``last_timestamp``.

    The calendar features are computed with the same rules that are applied
    to the historical data. Lag and rolling columns are created as NaN
    placeholders, to be filled step by step during forecasting.

    Parameters:
        feature_df: historical feature table; only its column names are
            inspected, to mirror its ``lag_*`` columns. When it has none
            (or no ``columns`` attribute), ``lag_1`` .. ``lag_10`` are used.
        last_timestamp: last known timestamp; the skeleton starts one step
            after it.
        periods: number of future rows.
        freq: step between rows ('30min' is the current spelling of '30T').

    Returns:
        DataFrame indexed by the future timestamps.
    """
    future_idx = generate_future_index(last_timestamp, periods=periods, freq=freq)
    future_df = pd.DataFrame(index=future_idx)

    # Calendar features
    future_df['hour'] = future_idx.hour
    future_df['minute'] = future_idx.minute
    future_df['weekday'] = future_idx.weekday
    future_df['is_day'] = ((future_df['hour'] >= 8) & (future_df['hour'] < 21)).astype(int)
    future_df['is_lunch'] = future_df['hour'].between(12, 13).astype(int) * 0.2
    future_df['is_worktime'] = future_df['hour'].between(8, 18).astype(int) * 0.2
    future_df['is_morning'] = future_df['hour'].between(8, 9).astype(int) * 0.4
    future_df['workdays'] = (future_df['weekday'] < 5).astype(int)
    future_df['holidays'] = (future_df['weekday'] >= 5).astype(int)
    future_df['day'] = future_idx.day
    future_df['billing_day'] = future_df['day'].isin([5, 15, 25]).astype(int)
    future_df['critical_day'] = future_df['day'].isin([3, 4, 13, 14, 23, 24]).astype(int)

    # Weekday one-hot: all seven columns are always emitted. A short horizon
    # covers only a few weekdays, and deriving the columns from the values
    # present would leave the others missing (NaN after concatenation with
    # the history).
    for d in range(7):
        future_df[f'weekday_{d}'] = (future_df['weekday'] == d).astype(int)

    # half-hour indicators
    half_hour_index = future_idx.hour * 2 + future_idx.minute // 30
    for hh in range(48):
        future_df[f'IS_half_{hh:02d}'] = (half_hour_index == hh).astype(int)

    # initialize lag and rolling features as NaN (will be filled iteratively)
    known_columns = getattr(feature_df, 'columns', [])
    lag_columns = [c for c in known_columns if str(c).startswith('lag_')]
    if not lag_columns:
        lag_columns = [f'lag_{lag}' for lag in range(1, 11)]
    for col in lag_columns:
        future_df[col] = np.nan
    future_df['rolling_mean_7'] = np.nan
    future_df['rolling_std_3'] = np.nan
    return future_df

# Prepare combined df with history + future skeleton
last_time = feature_df.index[-1]
horizon = 48*2  # 96 half-hour steps = 48 hours ahead; change as needed
future_skeleton = prepare_future_df(feature_df, last_time, periods=horizon)
future_index = future_skeleton.index
combined = pd.concat([feature_df, future_skeleton], axis=0)
# Uniform float dtype for all model inputs (concatenation can leave mixed or
# object columns where one side lacked a column).
combined = combined.astype({col: float for col in features})

# Weekday one-hots of the future rows are derived from the timestamps
# themselves, so they are correct even when the skeleton lacks some of the
# weekday columns that exist in the history.
for col in features:
    suffix = col[len('weekday_'):]
    if col.startswith('weekday_') and suffix.isdigit():
        combined.loc[future_index, col] = (future_index.weekday == int(suffix)).astype(float)

# Map every lag column to its step count, e.g. 'lag_3' -> 3.
lag_steps = {
    col: int(col.split('_')[1])
    for col in combined.columns
    if col.startswith('lag_') and col.split('_')[1].isdigit()
}


def _refresh_history_features(frame):
    """Recompute lag and rolling columns from the current CNT_CALLS values."""
    calls = frame['CNT_CALLS']
    for col, step in lag_steps.items():
        frame[col] = calls.shift(step)
    previous = calls.shift(1)
    frame['rolling_mean_7'] = previous.rolling(window=7).mean()
    frame['rolling_std_3'] = previous.rolling(window=3).std()


# Iterative (auto-regressive) forecasting: each prediction is written back
# into CNT_CALLS and becomes lag/rolling input for the following steps.
predictions = []
model_used = 'lstm'  # or 'rf' if you prefer
for t in future_index:
    # Lags and rolling stats are derived from all values known so far,
    # including the predictions made in earlier iterations.
    _refresh_history_features(combined)

    features_row = combined.loc[[t], features]
    if features_row.isna().to_numpy().any():
        # Fallback for anything still missing: carry forward the last value
        # known up to t (never a later one); 0 if there is none at all.
        features_row = combined.loc[:t, features].ffill().iloc[[-1]].fillna(0.0)
    features_row = features_row.astype(float)  # independent one-row copy

    # Scale numeric cols with the scaler fitted on the training data
    features_row[numeric_cols] = scaler_x.transform(features_row[numeric_cols])

    if model_used == 'rf':
        pred = float(rf.predict(features_row)[0])
    else:
        # reshape for LSTM: (1 sample, time_steps, n_features)
        X_row = features_row.to_numpy(dtype='float32').reshape(1, time_steps, -1)
        pred_scaled = model.predict(X_row, verbose=0)
        pred = float(scaler_y.inverse_transform(pred_scaled.reshape(-1, 1))[0, 0])

    # write prediction into combined so the next iteration can use it
    combined.loc[t, 'CNT_CALLS'] = pred
    predictions.append((t, pred))

# Collect forecasted series; call counts are whole numbers and cannot be negative
forecast_df = pd.DataFrame(predictions, columns=['timestamp','forecast_calls']).set_index('timestamp')
forecast_df['forecast_calls'] = forecast_df['forecast_calls'].round().clip(lower=0).astype(int)

# Save forecast
forecast_df.to_csv(RESULTS_DIR / "forecast_output.csv")
plot_series(forecast_df['forecast_calls'], label='Forecast (auto-regressive)', title='Future Forecast')

# Preview last: a notebook cell only renders its final expression.
forecast_df.head(20)


## 9. Operator staffing calculation (Erlang-C)
Пример применения прогноза для расчёта количества операторов через библиотеку `pyworkforce` (опционально).


In [None]:
# === 9. Staffing calculation (optional) ===
# Best-effort staffing estimate for the first forecast slot; any failure is
# reported and the cell continues, because this step is optional.
try:
    from pyworkforce.queuing import ErlangC
    print("pyworkforce available — computing required operators for first forecast window")
    sample_calls = int(forecast_df['forecast_calls'].iloc[0])
    # The forecast holds calls per 30-minute slot, so the interval length
    # (in minutes, like asa and aht) has to be 30; with 60 the same calls
    # would be spread over an hour and the load would be halved.
    # NOTE(review): units follow the pyworkforce ErlangC documentation — confirm
    # against the installed version.
    erlang = ErlangC(transactions=sample_calls, asa=20/60, aht=3.14, interval=30, shrinkage=0.29)
    req = erlang.required_positions(service_level=0.8, max_occupancy=0.75)
    print("Example required operators (sample):", req)
except ImportError as e:
    print("pyworkforce not installed. Skip staffing calc. Error:", e)
except Exception as e:
    print("Staffing calculation failed. Skip staffing calc. Error:", e)


## 10. Save models, metrics and wrap-up
Сохраняем модели и отчёты в /results.


In [None]:
# === 10. Save models & metrics (already saved earlier for RF & LSTM) ===
# Final picture: the last 48*7 historical points next to the first 48*3
# forecast points, written to the results folder and shown inline.
recent_actual = feature_df['CNT_CALLS'].iloc[-48*7:]
forecast_head = forecast_df['forecast_calls'].iloc[:48*3]
plot_path = RESULTS_DIR / "forecast_plot.png"

plt.figure(figsize=(12,5))
plt.plot(recent_actual, label='Recent actual')
plt.plot(forecast_head, label='Forecast (first 3 days)')
plt.title("Recent actual vs forecast")
plt.legend()
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
plt.show()

# One JSON file holding the test metrics of both models
all_metrics = dict(rf=metrics_rf, lstm=metrics_lstm)
save_json(all_metrics, RESULTS_DIR / "summary_metrics.json")
print("Saved metrics and forecast in:", RESULTS_DIR.resolve())


# Done
This notebook demonstrates an end-to-end approach: data cleaning, anomaly detection, feature engineering, model training, evaluation, and forecasting.
