In [None]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# Optional LightGBM
try:
    import lightgbm as lgb
    LGB = True
    print("Using LightGBM")
except Exception:
    LGB = False
    from sklearn.ensemble import RandomForestRegressor
    print("LightGBM not installed → Using RandomForest")


In [None]:
def robust_parse_date_hour(s):
    """Parse a single date-hour value into a pandas timestamp.

    The strict ``'%Y-%m-%d-%H'`` layout is tried first. If that attempt
    raises, the value is handed to pandas' general parser with
    ``errors='coerce'``; ``pd.NaT`` is returned if that second attempt
    raises as well.
    """
    try:
        return pd.to_datetime(s, format='%Y-%m-%d-%H')
    except Exception:
        # Not in the expected layout; fall through to the lenient parser.
        pass

    try:
        return pd.to_datetime(s, errors='coerce')
    except Exception:
        return pd.NaT


def load_and_prepare(csv_path):
    """Read the KPI CSV and return a cleaned DataFrame.

    Steps, in order:
      * strip surrounding whitespace from the column names;
      * parse ``date_hour`` into a new ``date_hour_dt`` column and drop the
        rows whose timestamp could not be parsed;
      * coerce the KPI columns to numeric (unparseable values become NaN);
      * drop rows that have no ``cell_name``.
    """
    frame = pd.read_csv(csv_path)
    frame.columns = [name.strip() for name in frame.columns]

    # Values are stringified first so every row goes through the same parser.
    parsed = frame['date_hour'].astype(str).apply(robust_parse_date_hour)
    frame['date_hour_dt'] = parsed
    frame = frame.dropna(subset=['date_hour_dt'])

    for kpi in ('ps_traffic_mb', 'avg_rrc_connected_user',
                'prb_dl_used', 'prb_dl_available_total'):
        frame[kpi] = pd.to_numeric(frame[kpi], errors='coerce')

    return frame.dropna(subset=['cell_name'])


In [None]:
def aggregate_weekly(df):
    """Aggregate timestamped KPI rows to one row per cell, eNodeB and week.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cell_name``, ``enodeb``, a datetime ``date_hour_dt``
        column and the four KPI columns aggregated below. The frame is NOT
        modified; the week key is computed on a derived copy.

    Returns
    -------
    pandas.DataFrame
        Columns ``cell_name``, ``enodeb``, ``week_start`` plus one column per
        (KPI, statistic) pair named ``<kpi>_<stat>``. The weekly traffic sum
        is exposed as ``weekly_ps_traffic``. Missing aggregates (for example
        the std of a single-row week) are filled with 0.
    """
    # assign() returns a new frame, so the caller's DataFrame does not grow a
    # `week_start` column as a side effect. Weeks use pandas' default 'W'
    # period; `.dt.start_time` is the vectorized form of p.start_time.
    with_week = df.assign(
        week_start=df['date_hour_dt'].dt.to_period('W').dt.start_time
    )

    agg_funcs = {
        'ps_traffic_mb': ['sum', 'mean', 'max', 'std'],
        'avg_rrc_connected_user': ['mean', 'max'],
        'prb_dl_used': ['mean', 'max'],
        'prb_dl_available_total': ['mean']
    }

    weekly = with_week.groupby(['cell_name', 'enodeb', 'week_start']).agg(agg_funcs)
    # Flatten the (kpi, stat) column MultiIndex into "<kpi>_<stat>" names.
    weekly.columns = ['_'.join(col) for col in weekly.columns]
    weekly = weekly.reset_index().fillna(0)

    # 'ps_traffic_mb_sum' always exists here because 'sum' is requested above,
    # so the rename needs no fallback branch.
    return weekly.rename(columns={'ps_traffic_mb_sum': 'weekly_ps_traffic'})


In [None]:
def create_lag_features(weekly, weeks_lag=3):
    """Add per-cell lagged copies of the weekly KPIs plus a traffic trend.

    For every lag ``1..weeks_lag`` and every base KPI column present in
    ``weekly``, a ``<column>_lag<k>`` column is added holding the value from
    ``k`` rows earlier within the same ``cell_name`` (rows ordered by
    ``week_start``). The shift is positional, so a lag equals ``k`` calendar
    weeks only when a cell has one row for each consecutive week.

    A ``weekly_traffic_trend_<weeks_lag>w`` column is added as
    ``(lag1 - lag<weeks_lag>) / weeks_lag`` of the weekly traffic, and rows
    missing any of the weekly-traffic lags are dropped. The input frame is
    not modified; a sorted copy is returned.
    """
    base_candidates = (
        'weekly_ps_traffic', 'ps_traffic_mb_mean', 'ps_traffic_mb_max',
        'avg_rrc_connected_user_mean', 'prb_dl_used_mean', 'prb_dl_available_total_mean',
    )

    weekly = weekly.sort_values(['cell_name', 'week_start']).copy()
    lags = range(1, weeks_lag + 1)
    # Lag columns never collide with base names, so presence is checked once.
    present = [name for name in base_candidates if name in weekly.columns]

    for lag in lags:
        for name in present:
            weekly[f'{name}_lag{lag}'] = weekly.groupby('cell_name')[name].shift(lag)

    newest = 'weekly_ps_traffic_lag1'
    oldest = f'weekly_ps_traffic_lag{weeks_lag}'
    if {newest, oldest}.issubset(weekly.columns):
        change = weekly[newest] - weekly[oldest]
        weekly[f'weekly_traffic_trend_{weeks_lag}w'] = change / float(weeks_lag)

    # Keep only rows with a complete weekly-traffic history.
    return weekly.dropna(subset=[f'weekly_ps_traffic_lag{lag}' for lag in lags])


In [None]:
def make_target_next_week(weekly):
    """Attach the following row's weekly traffic as the regression target.

    Rows are ordered by ``cell_name`` and ``week_start``; within each cell
    ``target_weekly_ps_traffic`` is the ``weekly_ps_traffic`` of the next row.
    The last row of every cell has no successor and is dropped. The input
    frame is left untouched.
    """
    ordered = weekly.sort_values(['cell_name', 'week_start']).copy()
    traffic_by_cell = ordered.groupby('cell_name')['weekly_ps_traffic']
    ordered['target_weekly_ps_traffic'] = traffic_by_cell.shift(-1)
    return ordered.dropna(subset=['target_weekly_ps_traffic'])


In [None]:
def prepare_features(weekly):
    """One-hot encode ``enodeb`` and assemble the model matrix and target.

    Returns a 4-tuple ``(encoded, X, y, feat_cols)``:
      * ``encoded``   - ``weekly`` with ``enodeb`` replaced by dummy columns
        (first category dropped);
      * ``X``         - the feature columns of ``encoded`` with NaN filled by 0;
      * ``y``         - the ``target_weekly_ps_traffic`` column;
      * ``feat_cols`` - names of the selected features, in column order.

    A column is a feature when its name contains ``_lag``, starts with
    ``enodeb_``, or is one of the three current-week KPI means listed below.
    """
    current_week_kpis = {'ps_traffic_mb_mean', 'avg_rrc_connected_user_mean', 'prb_dl_used_mean'}

    encoded = pd.get_dummies(weekly, columns=['enodeb'], drop_first=True)

    feat_cols = []
    for name in encoded.columns:
        if '_lag' in name or name.startswith('enodeb_') or name in current_week_kpis:
            feat_cols.append(name)

    features = encoded[feat_cols].fillna(0)
    target = encoded['target_weekly_ps_traffic']

    return encoded, features, target, feat_cols


In [None]:
def time_based_split(weekly, test_ratio=0.15):
    """Build chronological train/test boolean masks over ``weekly``.

    The distinct ``week_start`` values are sorted and the last
    ``max(1, int(n_weeks * test_ratio))`` of them form the test period.
    Returns ``(train_mask, test_mask)``: rows strictly before the first test
    week, and rows on or after it.
    """
    week_col = weekly['week_start']
    distinct_weeks = sorted(week_col.unique())

    # Always hold out at least one week, however small the ratio.
    holdout_weeks = max(1, int(len(distinct_weeks) * test_ratio))
    cutoff = distinct_weeks[-holdout_weeks]

    return week_col < cutoff, week_col >= cutoff


In [None]:
def train_and_evaluate(X_train, y_train, X_test, y_test, random_state=42):
    """Fit a regressor on the training split and score it on the test split.

    LightGBM is used when the module-level ``LGB`` flag is true, otherwise a
    scikit-learn ``RandomForestRegressor``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices; ``X_train.columns`` label the feature importances.
    y_train, y_test : array-like
        Targets; converted to float NumPy arrays.
    random_state : int or None, default 42
        Seed handed to the model so repeated runs give the same result.
        ``None` leaves the library defaults in place.

    Returns
    -------
    tuple
        ``(model, y_pred, metrics, importances)`` where ``metrics`` is a dict
        with ``mae``, ``rmse`` and ``mape`` (percent), and ``importances`` is
        a Series sorted from most to least important.
    """
    # Make sure the targets are numeric.
    y_train = np.asarray(y_train).astype(float)
    y_test = np.asarray(y_test).astype(float)

    if LGB:
        params = {
            'objective': 'regression',
            'metric': 'mae',
            'learning_rate': 0.05,
            'num_leaves': 31,
            'verbose': -1
        }
        if random_state is not None:
            params['seed'] = random_state
        dtrain = lgb.Dataset(X_train, label=y_train)
        model = lgb.train(params, dtrain, num_boost_round=500)
        fi = pd.Series(model.feature_importance(), index=X_train.columns)
    else:
        # Without a seed the forest differs on every run, so metrics and
        # importances could not be reproduced.
        model = RandomForestRegressor(
            n_estimators=200, max_depth=12, n_jobs=-1, random_state=random_state
        )
        model.fit(X_train, y_train)
        fi = pd.Series(model.feature_importances_, index=X_train.columns)

    y_pred = np.asarray(model.predict(X_test))

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)   # plain MSE (the `squared` argument is not used)
    rmse = np.sqrt(mse)                        # convert MSE -> RMSE

    # MAPE: skip zero targets to avoid dividing by zero. When every target is
    # zero the metric is undefined and reported as NaN.
    nonzero = y_test != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_test[nonzero] - y_pred[nonzero]) / y_test[nonzero])) * 100
    else:
        mape = np.nan

    return model, y_pred, {'mae': mae, 'rmse': rmse, 'mape': mape}, fi.sort_values(ascending=False)


In [None]:
# Location of the KPI export. Set the KPI_CSV_PATH environment variable to run
# the notebook on another machine without editing this cell. The default is a
# raw string, so each path separator is written as a single backslash.
csv_path = os.environ.get("KPI_CSV_PATH", r"D:\vhproj\power-saving\data\kpi_15_mins.csv")

df = load_and_prepare(csv_path)
print("Loaded rows:", len(df))
df.head()


In [None]:
# Collapse the cleaned KPI rows to one row per (cell_name, enodeb, week_start).
weekly = aggregate_weekly(df)
print("Weekly rows:", weekly.shape[0])
weekly.head()


In [None]:
# Add the 3-week lag features, then attach next week's traffic as the target.
# NOTE: `weekly` is overwritten, so re-running this cell without re-running
# the aggregation cell applies the lags to an already-lagged frame.
weekly = make_target_next_week(create_lag_features(weekly, weeks_lag=3))

print("Rows after lag + target:", weekly.shape[0])
weekly.head()


In [None]:
# prepare_features replaces `enodeb` with dummy columns in the frame it returns;
# because `weekly` is overwritten here, re-running this cell on its own will
# fail since the `enodeb` column no longer exists.
weekly, X_all, y_all, feat_cols = prepare_features(weekly)
(X_all.shape[0], len(feat_cols))


In [None]:
train_mask, test_mask = time_based_split(weekly, test_ratio=0.15)

# The masks are built from `weekly`, whose index X_all and y_all inherit,
# so label-based boolean selection picks the matching rows.
X_train, X_test = X_all.loc[train_mask], X_all.loc[test_mask]
y_train, y_test = y_all.loc[train_mask], y_all.loc[test_mask]

print("Train size:", len(X_train), "Test size:", len(X_test))


In [None]:
# Fit on the training weeks and score on the held-out weeks.
model, y_pred, metrics, fi = train_and_evaluate(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

metrics


In [None]:
# `fi` is already sorted in descending order; show the 20 strongest features.
fi.iloc[:20]


In [None]:
# `weekly` no longer contains each cell's most recent week: make_target_next_week
# dropped those rows because they have no known target. Taking max(week_start)
# from `weekly` would therefore "forecast" a week whose actuals are already in
# the test set. Rebuild the lag features from the cleaned raw frame instead so
# the forecast starts from the true latest observed week.
# weeks_lag must match the value used when the training features were built.
weekly_features = create_lag_features(aggregate_weekly(df), weeks_lag=3)

latest_week = weekly_features['week_start'].max()
latest_df = weekly_features[weekly_features['week_start'] == latest_week]

# One-hot encode enodeb the same way as training, then align to the training
# feature columns: dummy columns missing for this week become 0 and columns
# the model never saw are discarded.
X_latest = (
    pd.get_dummies(latest_df, columns=['enodeb'])
    .reindex(columns=feat_cols, fill_value=0)
    .fillna(0)
    .astype(float)
)
pred_next = model.predict(X_latest)

# week_start is the last observed week; pred_next_week is the forecast for the
# week that follows it.
final_next = latest_df[['cell_name', 'week_start']].copy()
final_next['pred_next_week'] = pred_next

final_next.head()


In [None]:
out_dir = "forecast_output"
os.makedirs(out_dir, exist_ok=True)

# Held-out rows with the model's prediction attached as a `pred` column.
(
    weekly.loc[test_mask]
    .assign(pred=y_pred)
    .to_csv(f"{out_dir}/test_predictions.csv", index=False)
)
# Per-cell forecast produced by the previous cell.
final_next.to_csv(f"{out_dir}/pred_next_week.csv", index=False)

print("Saved to:", out_dir)
