In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [3]:
# XGBoost is optional: the notebook keeps working with RandomForest only
# when the package cannot be loaded.
try:
    from xgboost import XGBRegressor
except Exception:
    print("⚠️ XGBoost não disponível. Instale com: pip install xgboost")
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True

# Global matplotlib defaults: wide figures, no grid.
plt.rcParams.update({'figure.figsize': (12, 4), 'axes.grid': False})

In [5]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / max(|y_true|, eps)) * 100``.

    Notes
    -----
    The denominator uses the absolute value of ``y_true`` floored at a tiny
    epsilon. Flooring the signed value instead would turn every negative
    target into ``eps`` and blow the metric up by many orders of magnitude.
    For non-negative targets the result is unchanged.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    eps = 1e-9
    # Guard against division by zero without destroying negative targets.
    denom = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100

def iterative_forecast(model, X_test_scaled, lag_positions, scaler, original_X_test,
                       lag_columns=("Weekly_Sales_lag_1",)):
    """
    Recursive forecast that feeds each prediction back into lag features.

    Rows are predicted one at a time, in order. After row ``i`` is predicted,
    the prediction is scaled with ``scaler`` and written into every column
    listed in ``lag_columns`` at row ``i + k``, where ``k`` is the integer
    suffix of the column name (``"..._lag_1"`` -> next row, ``"..._lag_2"``
    -> two rows ahead, ...). Every feature that is not listed in
    ``lag_columns`` is used exactly as it appears in ``X_test_scaled``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test_scaled : np.ndarray
        2-D array of already-scaled features. It is not modified; the
        function works on a copy.
    lag_positions : dict
        ``{lag column name: column index in X_test_scaled}``.
    scaler : object
        Fitted transformer exposing ``transform(X)``; used to bring the
        predicted value onto the same scale as the lag column.
    original_X_test : pd.DataFrame
        Unscaled features with the same row and column order as
        ``X_test_scaled``. A row of it serves as the template that is passed
        through ``scaler`` after the lag value has been replaced.
    lag_columns : str or iterable of str, default ("Weekly_Sales_lag_1",)
        Lag columns that receive predictions. The default reproduces the
        previous behaviour (only lag 1 is updated).

    Returns
    -------
    np.ndarray
        One prediction per row of ``X_test_scaled``.

    Raises
    ------
    ValueError
        If a name in ``lag_columns`` does not end in a positive integer.
    KeyError
        If a lag column that has to be updated is missing from
        ``lag_positions``.
    """
    if isinstance(lag_columns, str):
        lag_columns = (lag_columns,)

    # Parse the lag step (trailing integer of the column name) once up front.
    lag_specs = []
    for name in lag_columns:
        step = int(name.rsplit("_", 1)[-1])
        if step < 1:
            raise ValueError(f"lag column {name!r} must end in a positive integer")
        lag_specs.append((name, step))

    Xt = X_test_scaled.copy()
    n_rows = Xt.shape[0]
    preds = []

    for i in range(n_rows):
        p = model.predict(Xt[i].reshape(1, -1))[0]
        preds.append(p)

        for name, step in lag_specs:
            target_row = i + step
            if target_row >= n_rows:
                continue  # the prediction would land beyond the forecast horizon
            pos = lag_positions[name]
            # The scaler works on full feature rows, so the prediction is
            # scaled by placing it in an unscaled template row and keeping
            # only the transformed lag entry.
            ref = original_X_test.iloc[[target_row]].copy()
            ref[name] = p
            Xt[target_row, pos] = scaler.transform(ref.values)[0, pos]

    return np.array(preds)

def plot_series(dates, y_true, y_pred, title):
    """Plot observed and predicted values against ``dates`` in a new figure and show it."""
    fig, ax = plt.subplots()
    for values, label in ((y_true, "Real"), (y_pred, "Previsto")):
        ax.plot(dates, values, label=label)
    ax.set_title(title)
    ax.set_xlabel("Data")
    ax.set_ylabel("Vendas Semanais")
    ax.legend()
    fig.tight_layout()
    plt.show()

In [6]:
# %% 1) load data
DATA_PATH = Path("sales.csv")

# Read the CSV, parse Date as day-first (unparseable values become NaT),
# then put the rows in chronological order with a fresh 0..n-1 index.
df = (
    pd.read_csv(DATA_PATH)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], dayfirst=True, errors="coerce"))
    .sort_values("Date")
    .reset_index(drop=True)
)

In [7]:
# unit adjustments (from the assignment statement):
# if any value in the column exceeds 100, the whole column is divided by 1000
for unit_col in ("Fuel_Price", "Unemployment"):
    if df[unit_col].gt(100).any():
        df[unit_col] = df[unit_col] / 1000.0

In [8]:
# %% 2) advanced feature engineering
# Calendar features derived from the parsed Date column.
df = df.assign(
    year=df["Date"].dt.year,
    month=df["Date"].dt.month,
    weekofyear=df["Date"].dt.isocalendar().week.astype(int),
    quarter=df["Date"].dt.quarter,
    is_month_start=df["Date"].dt.is_month_start.astype(int),
    is_month_end=df["Date"].dt.is_month_end.astype(int),
)

In [9]:
# cyclic seasonality: the month is mapped onto the unit circle (sin/cos pair)
month_angle = 2 * np.pi * df["month"] / 12
df["month_sin"] = np.sin(month_angle)
df["month_cos"] = np.cos(month_angle)

In [10]:
# lags: value of Weekly_Sales `lag` rows earlier
weekly_sales = df["Weekly_Sales"]
for lag in (1, 2, 3, 4, 12, 52):
    df[f"Weekly_Sales_lag_{lag}"] = weekly_sales.shift(lag)

In [11]:
# rolling means, shifted by one row so the current row's own value is excluded
for window in (4, 12):
    df[f"Weekly_Sales_roll_{window}"] = (
        df["Weekly_Sales"].rolling(window).mean().shift(1)
    )

In [12]:
# drop the leading rows left with NaN by the lag / rolling features
# (note: any row with a missing value in any column is removed)
df = df.dropna(axis="index", how="any")
df = df.reset_index(drop=True)

In [13]:
# %% 3) temporal split
target = "Weekly_Sales"
cutoff = pd.Timestamp("2012-01-01")

# Feature list, grouped by origin; the concatenation order is the column
# order of the feature matrices.
exogenous_cols = ["Holiday_Flag", "Temperature", "Fuel_Price", "CPI", "Unemployment"]
calendar_cols = [
    "year", "month", "weekofyear", "quarter",
    "is_month_start", "is_month_end",
    "month_sin", "month_cos",
]
lag_cols = [
    "Weekly_Sales_lag_1", "Weekly_Sales_lag_2", "Weekly_Sales_lag_3",
    "Weekly_Sales_lag_4", "Weekly_Sales_lag_12", "Weekly_Sales_lag_52",
]
rolling_cols = ["Weekly_Sales_roll_4", "Weekly_Sales_roll_12"]
feature_cols = exogenous_cols + calendar_cols + lag_cols + rolling_cols

# Rows dated before the cutoff are used for training; rows on or after it
# form the test set.
train_df = df.loc[df["Date"] < cutoff].copy()
test_df = df.loc[df["Date"] >= cutoff].copy()

X_train, y_train = train_df[feature_cols].copy(), train_df[target].copy()
X_test, y_test = test_df[feature_cols].copy(), test_df[target].copy()

In [14]:
# %% 4) scale
# The scaler is fitted on the training features only and then applied to both sets.
scaler = StandardScaler()
scaler.fit(X_train.values)
X_train_scaled = scaler.transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)

In [15]:
# maps each lag feature name to its column position in the scaled arrays
lag_positions = {
    name: position
    for position, name in enumerate(feature_cols)
    if "lag_" in name
}

In [None]:
# %% 5) models

# ---- larger RandomForest
rf_params = {
    "n_estimators": 700,
    "max_depth": 14,
    "min_samples_leaf": 2,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train_scaled, y_train)

# ---- tuned XGBoost (stays None when the package could not be imported)
xgb_params = {
    "n_estimators": 800,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
}
xgb_model = XGBRegressor(**xgb_params) if XGB_AVAILABLE else None
if xgb_model is not None:
    xgb_model.fit(X_train_scaled, y_train)

In [17]:
# %% 6) iterative forecast for 2012
# Both models are forecast with exactly the same inputs.
forecast_inputs = {
    "X_test_scaled": X_test_scaled,
    "lag_positions": lag_positions,
    "scaler": scaler,
    "original_X_test": X_test,
}

pred_rf = iterative_forecast(model=rf, **forecast_inputs)

# XGBoost predictions exist only when the model was trained.
pred_xgb = (
    iterative_forecast(model=xgb_model, **forecast_inputs)
    if xgb_model is not None
    else None
)

In [18]:

# %% 7) metrics
def _score_row(model_name, y_pred):
    """Build one result row (RMSE, R2, MAPE against y_test) for a model."""
    return {
        "Modelo": model_name,
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R2": r2_score(y_test, y_pred),
        "MAPE": mape(y_test, y_pred),
    }


results = [_score_row("RandomForest_V2", pred_rf)]
if pred_xgb is not None:
    results.append(_score_row("XGBoost_V2", pred_xgb))

pd.DataFrame(results)


Unnamed: 0,Modelo,RMSE,R2,MAPE
0,RandomForest_V2,75076.5448,0.564276,3.434496
1,XGBoost_V2,92337.465289,0.340888,4.219404


In [22]:
# Mean of Weekly_Sales over the rows currently in df.
# NOTE(review): presumably a scale reference for the RMSE values — confirm intent.
df['Weekly_Sales'].mean()


np.float64(1578504.6800000002)

In [23]:
# Displays df as the cell output (pandas truncates the rendering of large frames).
df

Unnamed: 0,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,weekofyear,...,month_sin,month_cos,Weekly_Sales_lag_1,Weekly_Sales_lag_2,Weekly_Sales_lag_3,Weekly_Sales_lag_4,Weekly_Sales_lag_12,Weekly_Sales_lag_52,Weekly_Sales_roll_4,Weekly_Sales_roll_12
0,2011-02-04,1606629.58,0,42.27,2.989,212.566881,7.742,2011,2,5,...,0.866025,5.000000e-01,1316899.31,1327405.42,1391013.96,1444732.28,1494479.49,1643690.90,1.370013e+06,1.607574e+06
1,2011-02-11,1649614.93,1,36.39,3.022,212.936705,7.742,2011,2,6,...,0.866025,5.000000e-01,1606629.58,1316899.31,1327405.42,1391013.96,1483784.18,1641957.44,1.410487e+06,1.616920e+06
2,2011-02-18,1686842.78,0,57.36,3.045,213.247885,7.742,2011,2,7,...,0.866025,5.000000e-01,1649614.93,1606629.58,1316899.31,1327405.42,1955624.11,1611968.17,1.475137e+06,1.630739e+06
3,2011-02-25,1456800.28,0,62.90,3.065,213.535609,7.742,2011,2,8,...,0.866025,5.000000e-01,1686842.78,1649614.93,1606629.58,1316899.31,1548033.78,1409727.59,1.564997e+06,1.608341e+06
4,2011-03-04,1636263.41,0,59.58,3.288,213.823333,7.742,2011,3,9,...,1.000000,6.123234e-17,1456800.28,1686842.78,1649614.93,1606629.58,1682614.26,1554806.68,1.599972e+06,1.600738e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,2012-09-28,1437059.26,0,76.08,3.666,222.981658,6.908,2012,9,39,...,-1.000000,-1.836970e-16,1506126.06,1517428.87,1661767.33,1582083.40,1769854.16,1394561.83,1.566851e+06,1.568074e+06
87,2012-10-05,1670785.97,0,68.55,3.617,223.181477,6.573,2012,10,40,...,-0.866025,5.000000e-01,1437059.26,1506126.06,1517428.87,1661767.33,1527014.04,1630989.95,1.530595e+06,1.540341e+06
88,2012-10-12,1573072.81,0,62.99,3.601,223.381296,6.573,2012,10,41,...,-0.866025,5.000000e-01,1670785.97,1437059.26,1506126.06,1517428.87,1497954.76,1493525.93,1.532850e+06,1.552322e+06
89,2012-10-19,1508068.77,0,67.97,3.594,223.425723,6.573,2012,10,42,...,-0.866025,5.000000e-01,1573072.81,1670785.97,1437059.26,1506126.06,1439123.71,1502562.78,1.546761e+06,1.558582e+06
