# Stage 10b — Time-Series Modeling (Regression)

In [ ]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---- File locations --------------------------------------------------------
DATA_DIR = Path("../data")
PROC = DATA_DIR / "processed"
FEAT_PATH = PROC / "IYR_features_project.csv"
OUT_PRED = PROC / "IYR_predictions_stage10b.csv"
MODEL_DIR = Path("../model")
MODEL_DIR.mkdir(exist_ok=True)

# ---- Load the feature table in chronological order -------------------------
df = pd.read_csv(FEAT_PATH, parse_dates=["Date"])
df = df.sort_values("Date").reset_index(drop=True)

# Supervised target: the Close of the following row (t+1).
# Rows without a target (at least the final row) are dropped.
df["target_close_t1"] = df["Close"].shift(-1)
df = df.dropna(subset=["target_close_t1"]).copy()

# Every column except the date and the target itself is used as a predictor.
features = [col for col in df.columns if col not in ("Date", "target_close_t1")]
X, y = df[features], df["target_close_t1"]

# ---- Chronological 80/20 hold-out (no shuffling) ---------------------------
split_idx = int(len(df) * 0.8)
X_train, X_valid = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_valid = y.iloc[:split_idx], y.iloc[split_idx:]
dates_valid = df["Date"].iloc[split_idx:]

# ---- Model: scaling step + random forest, tuned with time-ordered CV -------
tscv = TimeSeriesSplit(n_splits=5)
pipe = Pipeline(
    steps=[
        ("scale", StandardScaler(with_mean=False)),
        ("rf", RandomForestRegressor(n_jobs=-1, random_state=42)),
    ]
)

param_grid = {
    "rf__n_estimators": [200, 400],
    "rf__max_depth": [None, 10, 20],
    "rf__min_samples_leaf": [1, 3, 5],
}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=tscv,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
)
gcv.fit(X_train, y_train)

best_model, best_params = gcv.best_estimator_, gcv.best_params_
# The scorer is *negative* MAE, so flip the sign to report a plain MAE.
best_mae_cv = -gcv.best_score_
best_params, best_mae_cv


In [ ]:
# Evaluate the tuned pipeline on the chronological hold-out set.
pred_valid = best_model.predict(X_valid)

mae = mean_absolute_error(y_valid, pred_valid)
# RMSE is computed as sqrt(MSE) instead of `mean_squared_error(..., squared=False)`:
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the old call raises TypeError on current releases. For a single target
# the two forms give the same value.
rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
r2 = r2_score(y_valid, pred_valid)
print("MAE:", round(mae, 4))
print("RMSE:", round(rmse, 4))
print("R2:", round(r2, 4))

# Actual vs. predicted next-day Close over the hold-out period.
plt.figure(figsize=(12, 5))
plt.plot(dates_valid, y_valid.values, label="Actual Close (t+1)")
plt.plot(dates_valid, pred_valid, label="Predicted")
plt.title("IYR: Actual vs Predicted (hold-out)")
plt.xlabel("Date")
plt.ylabel("Close")
plt.legend()
plt.show()

# Residuals = actual - predicted; positive values mean the model under-predicted.
resid = y_valid.values - pred_valid
plt.figure(figsize=(10, 4))
plt.hist(resid, bins=40, edgecolor="k")
plt.title("Residuals (hold-out)")
plt.show()

# Last expression of the cell: summary statistics of the residuals are displayed.
pd.Series(resid).describe()


In [ ]:
# ---- Persist hold-out predictions as CSV -----------------------------------
OUT_PRED.parent.mkdir(parents=True, exist_ok=True)
out = pd.DataFrame(
    {
        "Date": dates_valid.values,
        "y_actual": y_valid.values,
        "y_pred": pred_valid,
    }
)
out.to_csv(OUT_PRED, index=False)
print("Saved:", OUT_PRED)

# ---- Persist the fitted pipeline (scaling step + forest) for reuse ---------
import joblib

model_path = MODEL_DIR / "rf_time_pipeline_stage10b.joblib"
joblib.dump(best_model, model_path)
print("Saved:", model_path)


## Assumptions & Risks

- Time-aware split (no shuffling) reduces look-ahead leakage.
- Predicting next-day Close using lag/rolling features.
- Tree model handles non-linearities but is less interpretable.
- Regime shifts (rates/inflation) can break relationships; retrain periodically.
- Add prediction intervals / alternative metrics for business use.