# In [None]:
# ============================================
# STEP 1: Core Imports & Reproducibility
# ============================================
import pandas as pd
import numpy as np
from datetime import timedelta
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

np.random.seed(42)  # seed NumPy's global RNG so repeated runs are comparable

# ============================================
# STEP 2: Load & Prepare Data
# ============================================
# Read the prepared extract and put it in chronological order; the lag
# features computed from this frame use row order within each group.
df = pd.read_csv("DEL_SBY_prepared_data (UAT).csv")
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

# Encode Rank as an integer (FO -> 0, CP -> 1).
# NOTE(review): any other rank code becomes NaN here; confirm FO/CP are the
# only values present in the extract.
df['Rank'] = df['Rank'].map({'FO': 0, 'CP': 1})

# Target: standby activations per pairing start, capped to [0, 0.6].
# A zero denominator previously produced inf (which the clip silently turned
# into a fabricated 0.6) or NaN for 0/0 (not a usable regression label).
# Treat a zero denominator as "ratio undefined" and drop those rows instead.
df['activation_ratio'] = (
    df['Standby Activation Count']
    / df['Pairing Start Count'].replace(0, np.nan)
).clip(0, 0.6)
df = df.dropna(subset=['activation_ratio']).reset_index(drop=True)

# ============================================
# STEP 3: SAFE FEATURE ENGINEERING (NO LEAKAGE)
# ============================================
GROUP_COLS = ['Station', 'Duty Window Number', 'Rank']
TARGET = 'activation_ratio'


def _prior_ema(history):
    """Span-7 exponential mean of the values strictly before each row."""
    return history.shift(1).ewm(span=7).mean()


# One target series per (station, duty window, rank) combination. Every
# history feature below only reads rows that precede the current row in
# its own series.
_target_by_group = df.groupby(GROUP_COLS)[TARGET]

# Previous and 7th-previous observation of the target.
# NOTE: shift() counts rows, not calendar days.
df['lag_1'] = _target_by_group.shift(1)
df['lag_7'] = _target_by_group.shift(7)

# Exponentially weighted mean of the already-shifted target.
df['ema_7'] = _target_by_group.transform(_prior_ema)

# Calendar features (Monday=0 ... Sunday=6; 5 and 6 are the weekend).
df['dayofweek'] = df['Date'].dt.weekday
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

# 1 when the previous observation had a non-zero ratio, else 0.
df['was_active_yesterday'] = df['lag_1'].gt(0).astype(int)

# Rows without a previous observation cannot be modelled; keep the rest.
df_model = df[df['lag_1'].notna()].reset_index(drop=True)

# ============================================
# STEP 4: Feature Matrix
# ============================================
# Column order is part of the model input, so keep it fixed.
FEATURES = [
    'lag_1',
    'lag_7',
    'ema_7',
    'dayofweek',
    'is_weekend',
    'Duty Window Number',
    'Rank',
    'was_active_yesterday',
]

X = df_model.loc[:, FEATURES]
y = df_model.loc[:, TARGET]

# ============================================
# STEP 5: Temporal Train / Val / Test Split
# ============================================
# Cut the covered date range at 70% and 85% of its length (whole days), so
# validation and test data are strictly later than the training data.
start_date = df_model['Date'].min()
end_date = df_model['Date'].max()
total_days = (end_date - start_date).days

train_end = start_date + timedelta(days=int(total_days * 0.7))
val_end = start_date + timedelta(days=int(total_days * 0.85))

# Boolean row masks: train <= train_end < val <= val_end < test.
train_idx = df_model['Date'].le(train_end)
val_idx = df_model['Date'].gt(train_end) & df_model['Date'].le(val_end)
test_idx = df_model['Date'].gt(val_end)

X_train, y_train = X.loc[train_idx], y.loc[train_idx]
X_val, y_val = X.loc[val_idx], y.loc[val_idx]
X_test, y_test = X.loc[test_idx], y.loc[test_idx]

# ============================================
# STEP 6: Evaluation Metrics (MAE, RMSE, mean bias, under-prediction rate)
# ============================================
def evaluate(y_true, y_pred, label):
    """Print and return error metrics for one set of predictions.

    Args:
        y_true: Observed values; any array-like (list, ndarray, Series).
        y_pred: Predicted values with the same number of elements.
        label: Heading printed above the metrics.

    Returns:
        dict with keys ``'mae'``, ``'rmse'``, ``'bias'`` (mean of
        ``y_true - y_pred``, so a positive value means the predictions are
        too low on average) and ``'under_pred_rate'`` (percentage of
        samples where ``y_pred < y_true``).

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Compare by position on flat arrays. Without this, two Series with
    # different indexes are aligned by label, plain lists fail on
    # subtraction, and an (n, 1) prediction array broadcasts against an
    # (n,) target into an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    bias = np.mean(y_true - y_pred)
    under_pred_rate = np.mean(y_pred < y_true) * 100

    # The heading icon is written as a \N{...} escape so the source stays
    # ASCII; the literal had previously been corrupted into mojibake.
    print(f"\n\N{BAR CHART} {label}")
    print("-" * 40)
    print(f"MAE              : {mae:.4f}")
    print(f"RMSE             : {rmse:.4f}")
    print(f"Mean Bias        : {bias:.4f}")
    print(f"Under-pred Rate  : {under_pred_rate:.1f}%")

    return {
        'mae': mae,
        'rmse': rmse,
        'bias': bias,
        'under_pred_rate': under_pred_rate
    }

# ============================================
# STEP 7: Model Evolution
# ============================================
# Three regressors of increasing feature scope, all fitted on the training
# split only.

# ---- Model 1: Baseline (Lag-1 only)
model_1 = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)
model_1.fit(X_train[['lag_1']], y_train)

# ---- Model 2: Core Temporal Model
# subsample_freq=1 is required for subsample to take effect: LightGBM only
# performs row bagging when the bagging frequency is non-zero, and the
# default of 0 silently ignored subsample=0.8.
model_2 = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42
)
model_2.fit(X_train[['lag_1','lag_7','ema_7']], y_train)

# ---- Model 3: Production Model (FINAL)
# Uses every feature; n_estimators is an upper bound because training stops
# once the validation score has not improved for 50 rounds.
model_3 = lgb.LGBMRegressor(
    n_estimators=600,
    learning_rate=0.03,
    max_depth=6,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,  # see Model 2: enables the 0.8 row subsampling
    colsample_bytree=0.8,
    random_state=42
)
model_3.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(50, verbose=False)]
)

# ============================================
# STEP 8: Final Evaluation (TEST SET)
# ============================================
# The printed text previously contained mojibake (UTF-8 bytes decoded as
# cp1252). The intended characters are restored as \N{...} escapes so the
# source stays ASCII and cannot be corrupted the same way again.
print("\n\N{ROCKET} MODEL EVOLUTION \N{EM DASH} TEST SET RESULTS")

# Each model is scored on the held-out test split with the same feature
# subset it was trained on.
pred_1 = model_1.predict(X_test[['lag_1']])
pred_2 = model_2.predict(X_test[['lag_1','lag_7','ema_7']])
pred_3 = model_3.predict(X_test)

m1 = evaluate(y_test, pred_1, "Model 1 \N{EM DASH} Lag-1 Baseline")
m2 = evaluate(y_test, pred_2, "Model 2 \N{EM DASH} Lag + EMA")
m3 = evaluate(y_test, pred_3, "Model 3 \N{EM DASH} Production Model")

# ============================================
# STEP 9: Final Model Output
# ============================================
# Model 3 is the one carried forward, together with its test predictions.
best_model = model_3
final_predictions = pred_3

print("\n\N{WHITE HEAVY CHECK MARK} FINAL MODEL READY FOR PRODUCTION")
