# Multi-Target Regression Template – Tabular

This notebook is a template for **multi-output regression** problems, where you predict **multiple numeric targets** simultaneously.

It is similar to the single-target regression workflow, but uses:

- A list of target columns (`TARGET_COLS`)
- Multi-output wrappers (`MultiOutputRegressor`)
- Per-target and overall metrics.


In [None]:
# ========== 1. Imports & Config (Multi-Target Regression) ==========

import os
from pathlib import Path
from typing import Optional, List

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet

try:
    from xgboost import XGBRegressor
except ImportError:
    XGBRegressor = None

# ---- Display & plotting defaults ----
# Let pandas show up to 100 rows / 100 columns before truncating output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# Seaborn "whitegrid" style; default figure size (10, 6) at 100 dpi.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6), "figure.dpi": 100})

# ---- Config ----
# Directory containing the CSV files, and the file names expected inside it.
DATA_DIR = Path("../input")
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"

# Columns to predict (one regression target per entry).
TARGET_COLS = ["target1", "target2"]  # list of numeric targets
# Row-identifier column; it is dropped from the feature set when present.
ID_COL = "id"

# Seed shared by the train/validation split and the base estimators.
RANDOM_STATE = 42


In [None]:
# ========== 2. Load Data & Helpers ==========

def load_data(
    data_dir: Path = DATA_DIR,
    train_file: str = TRAIN_FILE,
    test_file: Optional[str] = TEST_FILE,
):
    """Read the training CSV and, when available, the test CSV.

    Args:
        data_dir: Directory that contains the CSV files.
        train_file: File name of the training set inside *data_dir*.
        test_file: File name of the test set inside *data_dir*, or ``None``
            to skip loading a test set.

    Returns:
        A ``(train_df, test_df)`` tuple. ``test_df`` is ``None`` when
        *test_file* is ``None`` or the file does not exist.

    Raises:
        FileNotFoundError: If the training file does not exist.
    """
    train_csv = data_dir / train_file
    if not train_csv.exists():
        raise FileNotFoundError(f"Train file not found: {train_csv}")
    train_frame = pd.read_csv(train_csv)

    # The test set is optional: a missing file is reported, not raised.
    test_frame = None
    if test_file is not None:
        test_csv = data_dir / test_file
        if not test_csv.exists():
            print(f"Test file not found: {test_csv} (continuing without test_df)")
        else:
            test_frame = pd.read_csv(test_csv)

    print("Train shape:", train_frame.shape)
    if test_frame is not None:
        print("Test shape:", test_frame.shape)
    return train_frame, test_frame


def get_numeric_features(df: pd.DataFrame, exclude: Optional[List[str]] = None) -> List[str]:
    """Return the names of the numeric columns of *df*, in column order.

    Args:
        df: Frame whose columns are inspected.
        exclude: Column names to leave out of the result; ``None`` or an
            empty list keeps every numeric column.
    """
    numeric_frame = df.select_dtypes(include=[np.number])
    candidates = numeric_frame.columns.tolist()
    if not exclude:
        return candidates
    return [name for name in candidates if name not in exclude]


def get_categorical_features(df: pd.DataFrame) -> List[str]:
    """Return the names of the ``object``- and ``category``-typed columns of *df*, in column order."""
    categorical_frame = df.select_dtypes(include=["object", "category"])
    return categorical_frame.columns.tolist()


# Load the train/test frames using load_data's default paths, then preview
# the first rows of the target columns.
# NOTE(review): `display` is not imported in this file; presumably it is
# supplied by the IPython/Jupyter runtime -- confirm before running as a script.
train_df, test_df = load_data()
print("Targets:", TARGET_COLS)
display(train_df[TARGET_COLS].head())


### 3️⃣ Simple Preprocessing & Multi-Output Baseline Models

We’ll start with:

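- Median imputation for numeric features, followed by scaling to unit variance (without centering)
- Most-frequent-value imputation for categorical features
- One-hot encoding for categorical features (categories not seen during fitting are ignored)
- A `MultiOutputRegressor` wrapper that fits one copy of the base model per target.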


In [None]:
# Imputers used by the pipelines assembled in build_preprocessor.
numeric_imputer = SimpleImputer(strategy="median")
categorical_imputer = SimpleImputer(strategy="most_frequent")


def build_preprocessor(df: pd.DataFrame):
    """Build the feature preprocessor for *df*.

    Numeric columns are median-imputed and scaled (``with_mean=False``);
    ``object``/``category`` columns are imputed with the most frequent value
    and one-hot encoded, ignoring categories unseen at fit time. Columns of
    any other dtype are dropped.

    The columns in ``TARGET_COLS`` and the ``ID_COL`` column are never used
    as features, whatever their dtype. Previously they were excluded only
    from the numeric list, so an ``object``-typed id or target column ended
    up in the categorical transformer and broke fitting on a feature frame
    from which that column had already been removed.

    Args:
        df: Frame used to infer column names and dtypes. It may still
            contain the target and id columns.

    Returns:
        An unfitted ``ColumnTransformer`` with a ``"num"`` and/or ``"cat"``
        entry, depending on which column kinds are present.
    """
    non_features = list(TARGET_COLS)
    if ID_COL in df.columns:
        non_features.append(ID_COL)

    num_cols = get_numeric_features(df, exclude=non_features)
    # Apply the same exclusion to categoricals so both lists agree on what
    # counts as a feature.
    cat_cols = [c for c in get_categorical_features(df) if c not in non_features]

    numeric_pipeline = Pipeline([
        ("imputer", numeric_imputer),
        ("scaler", StandardScaler(with_mean=False)),
    ])
    cat_pipeline = Pipeline([
        ("imputer", categorical_imputer),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    # Only register a transformer when it has columns to work on.
    transformers = []
    if num_cols:
        transformers.append(("num", numeric_pipeline, num_cols))
    if cat_cols:
        transformers.append(("cat", cat_pipeline, cat_cols))

    preprocessor = ColumnTransformer(transformers=transformers, remainder="drop")
    return preprocessor


def build_multioutput_regressor(base_type: str):
    """Return a ``MultiOutputRegressor`` wrapping the estimator named by *base_type*.

    Recognised values are ``"rf"`` (``RandomForestRegressor``),
    ``"elasticnet"`` (``ElasticNet``) and ``"xgb"`` (``XGBRegressor``).

    Raises:
        ImportError: *base_type* is ``"xgb"`` but ``XGBRegressor`` is ``None``.
        ValueError: *base_type* is not one of the recognised values.
    """
    if base_type == "rf":
        forest = RandomForestRegressor(
            n_estimators=300,
            max_depth=None,
            n_jobs=-1,
            random_state=RANDOM_STATE,
        )
        return MultiOutputRegressor(forest)

    if base_type == "elasticnet":
        linear = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=RANDOM_STATE)
        return MultiOutputRegressor(linear)

    if base_type == "xgb":
        # The module-level name is None when xgboost could not be imported.
        if XGBRegressor is None:
            raise ImportError("xgboost not installed")
        xgb_params = {
            "n_estimators": 500,
            "learning_rate": 0.05,
            "max_depth": 6,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "tree_method": "hist",
            "random_state": RANDOM_STATE,
        }
        return MultiOutputRegressor(XGBRegressor(**xgb_params))

    raise ValueError(f"Unknown base_type: {base_type}")


In [None]:
def evaluate_multioutput_models(
    df: pd.DataFrame,
    target_cols: List[str] = TARGET_COLS,
    id_col: Optional[str] = ID_COL,
    base_types: Optional[List[str]] = None,
    test_size: float = 0.2,
    random_state: int = RANDOM_STATE,
):
    """Fit each base model on a hold-out split and report per-target and overall scores.

    Args:
        df: Frame containing the features, the target columns and
            optionally the id column.
        target_cols: Names of the columns to predict.
        id_col: Identifier column to drop from the features, or ``None``.
        base_types: Keys understood by ``build_multioutput_regressor``;
            defaults to ``["rf", "elasticnet", "xgb"]``.
        test_size: Fraction of rows held out for validation.
        random_state: Seed for the train/validation split.

    Returns:
        ``(results, results_df)``. ``results`` is a list with one dict per
        successfully evaluated model (``base_type``, ``overall_rmse``,
        ``per_target``). ``results_df`` has the columns ``base_type`` and
        ``overall_rmse``, sorted by ascending ``overall_rmse``; it is empty
        (but still has both columns) when every model failed.
    """
    if base_types is None:
        base_types = ["rf", "elasticnet", "xgb"]

    # Copy the list so the caller's (or the default) list is never mutated.
    drop_cols = list(target_cols)
    if id_col is not None and id_col in df.columns:
        drop_cols.append(id_col)

    X = df.drop(columns=drop_cols)
    Y = df[target_cols]

    X_train, X_valid, Y_train, Y_valid = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    results = []

    for b in base_types:
        print(f"\n=== Base model: {b} ===")
        try:
            reg = build_multioutput_regressor(b)
            # Build the preprocessor from the feature frame, so it follows the
            # target_cols/id_col arguments of this call, and build a fresh one
            # per model so pipelines do not share a fitted transformer.
            preprocessor = build_preprocessor(X)
            pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", reg)])
            pipe.fit(X_train, Y_train)

            Y_pred = pipe.predict(X_valid)

            # Take the square root of the per-target MSE ourselves: the
            # `squared=False` argument of mean_squared_error was deprecated in
            # scikit-learn 1.4 and removed in 1.6.
            rmse_by_target = np.sqrt(
                mean_squared_error(Y_valid, Y_pred, multioutput="raw_values")
            )
            r2_by_target = r2_score(Y_valid, Y_pred, multioutput="raw_values")

            # Per-target RMSE and R2
            per_target = {}
            for t, rmse_t, r2_t in zip(target_cols, rmse_by_target, r2_by_target):
                per_target[t] = {"rmse": float(rmse_t), "r2": float(r2_t)}
                print(f"  {t}: RMSE={rmse_t:.4f}, R²={r2_t:.4f}")

            # Aggregate RMSE: unweighted mean of the per-target RMSEs, which is
            # what the uniform-average `squared=False` call used to return.
            overall_rmse = float(np.mean(rmse_by_target))
            print(f"Overall RMSE (all targets): {overall_rmse:.4f}")

            results.append({"base_type": b, "overall_rmse": overall_rmse, "per_target": per_target})
        except Exception as e:
            # Best effort: one failing base model (e.g. xgboost not installed)
            # must not stop the remaining models from being evaluated.
            print("Error:", e)

    # Explicit columns keep sort_values working when no model succeeded.
    results_df = pd.DataFrame(
        [{
            "base_type": r["base_type"],
            "overall_rmse": r["overall_rmse"],
        } for r in results],
        columns=["base_type", "overall_rmse"],
    ).sort_values("overall_rmse")
    display(results_df)
    return results, results_df


# Run the baseline comparison on the training frame with the default
# targets, id column, base models and split settings.
multi_results, multi_results_df = evaluate_multioutput_models(train_df)
