# Time-Series Template – Forecasting with Tabular + Temporal Structure

This notebook is a template for **time-series problems**, especially those where the data is in **tabular form with a time column** and, optionally, contains multiple entities (e.g. `store_id`, `symbol`, `sensor_id`).

It focuses on:

- Avoiding **leakage** with time-based splits
- Creating **lag** and **rolling window** features
- Handling **per-entity vs global** models


In [None]:
# ========== 1. Imports & Config (Time-Series) ==========

import os
from pathlib import Path
from typing import Optional, List

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Display defaults: show up to 100 rows / 100 columns when rendering DataFrames.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
# Plot defaults: seaborn "whitegrid" style, plus a default figure size and DPI.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams["figure.dpi"] = 100

# ---- Config ----
# These constants are used by the later cells as default arguments and as
# column names; edit them to match the dataset at hand.
DATA_DIR = Path("../input")  # directory that holds the CSV files
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"   # optional

TIME_COL = "date"        # name of time column
TARGET_COL = "target"    # name of numeric target
ID_COL = None            # e.g. "store_id" or "series_id", or None for single series

# Seed passed to the RandomForestRegressor so repeated runs give the same model.
RANDOM_STATE = 42


In [None]:
# ========== 2. Load, Sort & Basic EDA ==========

def load_data(
    data_dir: Path = DATA_DIR,
    train_file: str = TRAIN_FILE,
    test_file: Optional[str] = TEST_FILE,
):
    """Read the training CSV and, when available, the test CSV.

    Args:
        data_dir: Directory that contains the CSV files.
        train_file: File name of the training set inside ``data_dir``.
        test_file: File name of the test set inside ``data_dir``, or ``None``
            to skip loading a test set.

    Returns:
        A ``(train_df, test_df)`` tuple. ``test_df`` is ``None`` when
        ``test_file`` is ``None`` or the file does not exist.

    Raises:
        FileNotFoundError: If the training file does not exist.
    """
    train_csv = data_dir / train_file
    if not train_csv.exists():
        raise FileNotFoundError(f"Train file not found: {train_csv}")
    train_frame = pd.read_csv(train_csv)

    # No test file requested: nothing more to load.
    if test_file is None:
        return train_frame, None

    test_csv = data_dir / test_file
    if not test_csv.exists():
        # A missing test set is tolerated; only the training set is mandatory.
        print(f"Test file not found: {test_csv} (continuing without test_df)")
        return train_frame, None

    return train_frame, pd.read_csv(test_csv)


# Load the raw frames using the paths from the config cell.
train_df, test_df = load_data()

# Ensure time column is datetime.
# errors="coerce" turns unparseable values into NaT instead of raising, so bad
# timestamps show up as missing values rather than stopping the notebook.
train_df[TIME_COL] = pd.to_datetime(train_df[TIME_COL], errors="coerce")
if test_df is not None and TIME_COL in test_df.columns:
    test_df[TIME_COL] = pd.to_datetime(test_df[TIME_COL], errors="coerce")

# Sort by time (and ID if provided) so that later shift/rolling operations see
# each series in chronological order; the index is rebuilt after sorting.
if ID_COL is not None and ID_COL in train_df.columns:
    train_df = train_df.sort_values([ID_COL, TIME_COL]).reset_index(drop=True)
else:
    train_df = train_df.sort_values(TIME_COL).reset_index(drop=True)

print("Train shape:", train_df.shape)
# NOTE(review): `display` is not imported in this file; presumably it is the
# notebook-provided (IPython) helper - confirm before running as a plain script.
display(train_df.head())

# Simple target-over-time plot. It is only drawn for the single-series case
# (ID_COL is None); no plot is produced when an entity column is configured.
if ID_COL is None:
    plt.plot(train_df[TIME_COL], train_df[TARGET_COL])
    plt.title("Target over time")
    plt.xlabel(TIME_COL)
    plt.ylabel(TARGET_COL)
    plt.show()


In [None]:
def add_datetime_parts(df: pd.DataFrame, time_col: str = TIME_COL) -> pd.DataFrame:
    """Return a copy of ``df`` with calendar features derived from ``time_col``.

    The time column is (re)parsed with ``errors="coerce"``, so unparseable
    values become ``NaT``. Four columns are added, named
    ``{time_col}_year``, ``{time_col}_month``, ``{time_col}_day`` and
    ``{time_col}_dow`` (day of week). The input frame is not modified.
    """
    out = df.copy()
    out[time_col] = pd.to_datetime(out[time_col], errors="coerce")

    # (column suffix, name of the ``.dt`` accessor attribute), in output order.
    calendar_parts = (
        ("year", "year"),
        ("month", "month"),
        ("day", "day"),
        ("dow", "dayofweek"),
    )
    for suffix, accessor_attr in calendar_parts:
        out[f"{time_col}_{suffix}"] = getattr(out[time_col].dt, accessor_attr)
    return out


def add_lag_features(
    df: pd.DataFrame,
    target_col: str = TARGET_COL,
    lags: Optional[List[int]] = None,
    group_col: Optional[str] = ID_COL,
    time_col: str = TIME_COL,
) -> pd.DataFrame:
    """Return a sorted copy of ``df`` with lagged copies of the target added.

    For every ``lag`` a column ``{target_col}_lag{lag}`` is created that holds
    the target value ``lag`` rows earlier in time. When ``group_col`` names an
    existing column, the shift is done inside each group so values never cross
    from one entity to another.

    Args:
        df: Input frame; it is copied, not modified.
        target_col: Column to lag.
        lags: Row offsets to create. ``None`` means the default ``[1, 7, 14]``.
        group_col: Optional entity column. Ignored when ``None`` or when the
            column is not present in ``df``.
        time_col: Column that defines chronological order.

    Returns:
        The copy, sorted by ``[group_col, time_col]`` (or by ``time_col``
        alone), with the lag columns appended. The original index labels are
        kept. The first ``lag`` rows of each series are NaN.
    """
    # A None sentinel replaces the former mutable list default.
    lags = [1, 7, 14] if lags is None else lags

    df = df.copy()
    if group_col is not None and group_col in df.columns:
        df = df.sort_values([group_col, time_col])
        # Group a standalone Series so the grouping can be built once and is
        # unaffected by the columns added to ``df`` in the loop below.
        target = df[target_col].groupby(df[group_col])
    else:
        df = df.sort_values(time_col)
        target = df[target_col]

    for lag in lags:
        df[f"{target_col}_lag{lag}"] = target.shift(lag)
    return df


def add_rolling_features(
    df: pd.DataFrame,
    target_col: str = TARGET_COL,
    windows: Optional[List[int]] = None,
    group_col: Optional[str] = ID_COL,
    time_col: str = TIME_COL,
) -> pd.DataFrame:
    """Return a sorted copy of ``df`` with rolling mean/std of past targets.

    For every window ``w`` two columns are added, ``{target_col}_rollmean_{w}``
    and ``{target_col}_rollstd_{w}``. The target is shifted by one row before
    rolling, so each statistic uses only values strictly before the current
    row and never the row's own target.

    When ``group_col`` names an existing column, both the shift and the
    rolling window are evaluated inside each group. The previous version
    applied ``.rolling`` to the flat result of ``groupby(...).shift(1)``, so
    the window ran across group boundaries and mixed the tail of one entity
    into the first rows of the next.

    Args:
        df: Input frame; it is copied, not modified.
        target_col: Column to aggregate.
        windows: Window lengths in rows. ``None`` means the default ``[7, 28]``.
        group_col: Optional entity column. Ignored when ``None`` or when the
            column is not present in ``df``.
        time_col: Column that defines chronological order.

    Returns:
        The copy, sorted by ``[group_col, time_col]`` (or by ``time_col``
        alone), with the rolling columns appended. Rows without ``w`` past
        observations in their own series are NaN.
    """
    # A None sentinel replaces the former mutable list default.
    windows = [7, 28] if windows is None else windows

    df = df.copy()
    if group_col is not None and group_col in df.columns:
        df = df.sort_values([group_col, time_col])
        keys = df[group_col]
        # Shift inside each entity so a row never sees another entity's target.
        past = df[target_col].groupby(keys).shift(1)
        for w in windows:
            # Re-group the shifted values so the window restarts per entity;
            # transform returns a Series aligned to the frame's index.
            per_entity = past.groupby(keys)
            df[f"{target_col}_rollmean_{w}"] = per_entity.transform(
                lambda s: s.rolling(window=w).mean()
            )
            df[f"{target_col}_rollstd_{w}"] = per_entity.transform(
                lambda s: s.rolling(window=w).std()
            )
    else:
        df = df.sort_values(time_col)
        past = df[target_col].shift(1)
        for w in windows:
            df[f"{target_col}_rollmean_{w}"] = past.rolling(window=w).mean()
            df[f"{target_col}_rollstd_{w}"] = past.rolling(window=w).std()
    return df


# Apply basic FE
ts_fe = add_datetime_parts(train_df, TIME_COL)
ts_fe = add_lag_features(ts_fe, TARGET_COL, lags=[1, 7, 14], group_col=ID_COL)
ts_fe = add_rolling_features(ts_fe, TARGET_COL, windows=[7, 28], group_col=ID_COL)

print("After TS feature engineering:", ts_fe.shape)
display(ts_fe.head())


In [None]:
def time_based_split(
    df: pd.DataFrame,
    time_col: str = TIME_COL,
    valid_fraction: float = 0.2,
):
    """Split ``df`` chronologically into a training part and a validation part.

    Rows are sorted by ``time_col``; roughly the last ``valid_fraction`` of
    them become the validation set. The boundary is then moved back to the
    first row that carries the cutoff timestamp, so rows sharing one timestamp
    (e.g. several entities observed on the same date) are never divided
    between the two parts. With unique timestamps the result equals a plain
    row-count split.

    Args:
        df: Frame to split; it is not modified.
        time_col: Column that defines chronological order.
        valid_fraction: Share of rows, between 0 and 1, targeted for
            validation. The validation part can be slightly larger when the
            boundary falls inside a run of equal timestamps.

    Returns:
        A ``(train_df, valid_df)`` tuple of independent copies, both sorted by
        ``time_col``.

    Raises:
        ValueError: If ``valid_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= valid_fraction <= 1.0:
        raise ValueError(
            f"valid_fraction must be between 0 and 1, got {valid_fraction!r}"
        )

    df = df.sort_values(time_col)
    n = len(df)
    split_idx = int((1 - valid_fraction) * n)

    if 0 < split_idx < n:
        cutoff = df[time_col].iloc[split_idx]
        # Missing timestamps sort last; a missing cutoff gives no boundary to
        # align on, so the row-count split is kept in that case.
        if pd.notna(cutoff):
            # Rows strictly earlier than the cutoff occupy the first positions
            # of the sorted frame, so their count is the index of the first
            # row at the cutoff timestamp.
            first_at_cutoff = int((df[time_col] < cutoff).sum())
            # If every earlier row shares the cutoff timestamp, aligning would
            # empty the training part; keep the row-count split instead.
            if first_at_cutoff > 0:
                split_idx = first_at_cutoff

    train_df = df.iloc[:split_idx].copy()
    valid_df = df.iloc[split_idx:].copy()
    return train_df, valid_df


# Drop rows where lag/rolling features are NaN (at the beginning of series),
# plus rows without a target or a valid timestamp. A bare dropna() would also
# discard every row with a gap in any other feature column, even though the
# modelling pipeline imputes those gaps itself.
engineered_prefixes = (f"{TARGET_COL}_lag", f"{TARGET_COL}_roll")
required_cols = [TARGET_COL, TIME_COL] + [
    col for col in ts_fe.columns if str(col).startswith(engineered_prefixes)
]
ts_fe_clean = ts_fe.dropna(subset=required_cols).reset_index(drop=True)
train_ts, valid_ts = time_based_split(ts_fe_clean, TIME_COL, valid_fraction=0.2)

print("Train TS shape:", train_ts.shape)
print("Valid TS shape:", valid_ts.shape)


In [None]:
# Imputers plugged into the preprocessing pipelines below: numeric gaps are
# filled with the column median, categorical gaps with the most frequent value.
numeric_imputer = SimpleImputer(strategy="median")
categorical_imputer = SimpleImputer(strategy="most_frequent")

def get_feature_target(df: pd.DataFrame, target_col: str = TARGET_COL):
    """Separate ``df`` into a feature frame and a target series.

    The target column and the time column are always removed from the
    features; the entity column is removed as well when ``ID_COL`` is set and
    present in ``df``.

    Returns:
        An ``(X, y)`` tuple where ``X`` holds the remaining columns and ``y``
        is ``df[target_col]``.
    """
    has_entity_col = ID_COL is not None and ID_COL in df.columns
    non_feature_cols = [target_col, TIME_COL] + ([ID_COL] if has_entity_col else [])
    features = df.drop(columns=non_feature_cols)
    target = df[target_col]
    return features, target


# Separate features and target for both chronological partitions.
X_train_ts, y_train_ts = get_feature_target(train_ts)
X_valid_ts, y_valid_ts = get_feature_target(valid_ts)

# Route columns to a preprocessing branch by dtype, as seen in the training set.
num_cols = X_train_ts.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train_ts.select_dtypes(include=["object", "category"]).columns.tolist()

# Numeric branch: fill gaps, then scale to unit variance without centering.
numeric_pipeline = Pipeline([
    ("imputer", numeric_imputer),
    ("scaler", StandardScaler(with_mean=False)),
])
# Categorical branch: fill gaps, then one-hot encode. Categories that appear
# only in validation data are ignored instead of raising an error.
cat_pipeline = Pipeline([
    ("imputer", categorical_imputer),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Register a branch only when it has columns to work on.
transformers = []
if num_cols:
    transformers.append(("num", numeric_pipeline, num_cols))
if cat_cols:
    transformers.append(("cat", cat_pipeline, cat_cols))

# Columns that are neither numeric nor object/category are dropped.
preprocessor = ColumnTransformer(transformers=transformers, remainder="drop")

rf_ts = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Preprocessing lives inside the pipeline, so it is fitted on training rows only.
ts_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", rf_ts),
])

ts_pipeline.fit(X_train_ts, y_train_ts)
y_pred_ts = ts_pipeline.predict(X_valid_ts)

# RMSE is taken as the square root of the MSE. The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# passing it fails on current releases; this form works on every version.
rmse = np.sqrt(mean_squared_error(y_valid_ts, y_pred_ts))
mae = mean_absolute_error(y_valid_ts, y_pred_ts)

print(f"Time-series RF baseline – RMSE: {rmse:.4f}, MAE: {mae:.4f}")

# Overlay predictions on the actual values across the validation period.
plt.plot(valid_ts[TIME_COL], y_valid_ts, label="Actual")
plt.plot(valid_ts[TIME_COL], y_pred_ts, label="Predicted")
plt.legend()
plt.title("Forecast vs Actual (Validation)")
plt.xlabel(TIME_COL)
plt.ylabel(TARGET_COL)
plt.show()
