# 04a – Linear Regression Baseline (Tabular Only)

**Goal**

Train a simple linear regression baseline on tabular features only and evaluate:

- Performance on the **log-transformed target** (e.g., `log_sold_price`)
- Back-transformed metrics in **dollar space** (MAE, MAPE)

This will serve as a baseline to compare against more complex models (e.g., TabNet, multimodal deep learning).


In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Seed NumPy's global random generator.
np.random.seed(0)

# Show the library versions this run is using.
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")


NumPy version: 2.0.2
Pandas version: 2.2.2


In [None]:
# Decide where the project lives: a successful import of the Colab drive
# helper means we are on Google Colab, otherwise we are running locally.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    # Local run: the project root is the resolved current directory.
    PROJECT_ROOT = Path(".").resolve()
else:
    IN_COLAB = True
    # Colab run: mount Google Drive and point at the project folder on it.
    drive.mount("/content/drive")
    PROJECT_ROOT = Path("/content/drive/My Drive/SH")

DATA_DIR = PROJECT_ROOT.joinpath("data")
PROC_DIR = DATA_DIR.joinpath("processed")

print(f"PROJECT_ROOT: {PROJECT_ROOT}")
print(f"PROC_DIR: {PROC_DIR}")


Mounted at /content/drive
PROJECT_ROOT: /content/drive/My Drive/SH
PROC_DIR: /content/drive/My Drive/SH/data/processed


In [None]:
# Adjust filenames here if your processed files use different names
train_path = PROC_DIR.joinpath("train_multimodal.csv")
val_path = PROC_DIR.joinpath("val_multimodal.csv")
test_path = PROC_DIR.joinpath("test_multimodal.csv")

# Read the three splits in train / val / test order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

print("Train shape:", train_df.shape)
print("Val shape  :", val_df.shape)
print("Test shape :", test_df.shape)

# Load prep summary for feature/target info
summary_path = PROC_DIR.joinpath("multimodal_prep_summary.json")
with summary_path.open("r") as f:
    prep_summary = json.load(f)

criteria = prep_summary["criteria"]

TARGET_RAW_COL = criteria["target_column"]        # e.g. "sold_price"
TARGET_LOG_COL = criteria["log_target_column"]    # e.g. "log_sold_price"

# Take the feature lists from the summary, keeping only names that are
# actual columns of the training frame (some may have been dropped earlier).
NUMERIC_FEATURES = [
    col for col in criteria["numeric_features"] if col in train_df.columns
]
CATEGORICAL_FEATURES = [
    col for col in criteria["categorical_features"] if col in train_df.columns
]

print("Target (raw):", TARGET_RAW_COL)
print("Target (log):", TARGET_LOG_COL)
print("Numeric features:", len(NUMERIC_FEATURES))
print("Categorical features:", len(CATEGORICAL_FEATURES))


Train shape: (143643, 27)
Val shape  : (17955, 27)
Test shape : (17956, 27)
Target (raw): sold_price
Target (log): log_sold_price
Numeric features: 9
Categorical features: 9


## Preprocessing (imputation + safe categorical encoding)

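Use the same simple scheme as for TabNet:

- **Numeric:** fill NaNs with the training-set median.

- **Categorical:** map each category to an integer code fitted on the training set; missing values are encoded as `"__MISSING__"`, and `"__UNKNOWN__"` is reserved for values in val/test that were not seen in training.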

In [None]:
def fill_numeric_with_train_median(train_df, other_dfs, numeric_cols):
    """
    Fill missing numeric values in every split with training-set medians.

    The medians are computed from train_df only and then used to fill NaNs
    in train_df and in each frame of other_dfs. All frames are modified in
    place.

    Args:
      train_df: training DataFrame; the medians come from here
      other_dfs: iterable of further DataFrames (e.g. val, test) to fill
      numeric_cols: list of column names to impute
    Returns:
      med: Series of training medians, indexed by column name
    """
    train_medians = train_df[numeric_cols].median(numeric_only=True)
    # The training frame is filled first, then the remaining splits.
    for frame in (train_df, *other_dfs):
        frame[numeric_cols] = frame[numeric_cols].fillna(train_medians)
    return train_medians


def fit_safe_category_maps(train_df, cat_cols):
    """
    Learn an integer code for every category value seen in training.

    For each column in cat_cols the values are cast to strings, missing
    values become "__MISSING__", and the sorted unique strings are numbered
    from 0. One extra code is stored under "__UNKNOWN__" after them.
    train_df is encoded in place with the fitted codes (int64).

    Returns:
      maps: dict[col] -> dict[value(str)] -> int
    """
    maps = {}
    for col in cat_cols:
        as_text = train_df[col].astype("string").fillna("__MISSING__")
        ordered_values = sorted(as_text.unique().tolist())

        codes = dict(zip(ordered_values, range(len(ordered_values))))
        # The next free index becomes the code for unseen values.
        codes["__UNKNOWN__"] = len(codes)
        maps[col] = codes

        train_df[col] = as_text.map(codes).astype("int64")

    return maps


def apply_safe_category_maps(df, cat_cols, maps):
    """
    Encode categorical columns of df with previously fitted code tables.

    Each column in cat_cols is cast to strings with missing values replaced
    by "__MISSING__", then looked up in maps[col]. Any value without an
    entry receives the column's "__UNKNOWN__" code. df is modified in place.

    Args:
      df: DataFrame to encode
      cat_cols: names of the columns to encode
      maps: dict[col] -> dict[value(str)] -> int, including "__UNKNOWN__"
    Returns:
      the same df object, with the columns replaced by int64 codes
    """
    for col in cat_cols:
        codes = maps[col]
        fallback = codes["__UNKNOWN__"]
        as_text = df[col].astype("string").fillna("__MISSING__")
        # A dict lookup leaves NaN for values that have no entry; those are
        # the ones that get the fallback code.
        looked_up = as_text.map(codes)
        df[col] = looked_up.fillna(fallback).astype("int64")
    return df


# 1) Numeric imputation (the returned medians are not needed afterwards)
_ = fill_numeric_with_train_median(train_df, [val_df, test_df], NUMERIC_FEATURES)

# 2) Categorical encoding: fit the code tables on train, then reuse them
#    for val and test. With no categorical features the table stays empty.
if not CATEGORICAL_FEATURES:
    cat_maps = {}
else:
    cat_maps = fit_safe_category_maps(train_df, CATEGORICAL_FEATURES)
    val_df, test_df = (
        apply_safe_category_maps(frame, CATEGORICAL_FEATURES, cat_maps)
        for frame in (val_df, test_df)
    )

print("Preprocessing complete.")


Preprocessing complete.


## Build design matrices

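Simply concatenate the numeric features and the integer-encoded categorical features, without one-hot encoding. The linear model therefore treats each category code as an ordinal number (this is a quick baseline, not necessarily optimal).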

In [None]:
# Column order of the design matrix: numeric features first, then the
# encoded categorical ones.
FEATURES = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]
print("Total number of features:", len(FEATURES))

# Feature matrices as float64 arrays, one per split.
X_train, X_val, X_test = (
    frame[FEATURES].to_numpy(dtype=np.float64)
    for frame in (train_df, val_df, test_df)
)

# Targets are the log-transformed column, again one array per split.
y_train, y_val, y_test = (
    frame[TARGET_LOG_COL].to_numpy(dtype=np.float64)
    for frame in (train_df, val_df, test_df)
)

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val  :", X_val.shape,   "y_val  :", y_val.shape)
print("X_test :", X_test.shape,  "y_test :", y_test.shape)


Total number of features: 18
X_train: (143643, 18) y_train: (143643,)
X_val  : (17955, 18) y_val  : (17955,)
X_test : (17956, 18) y_test : (17956,)


## Fit linear regression on log target

In [None]:
# Fit ordinary least squares on the training split; fit() returns the
# estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)

print("Number of coefficients:", len(lr.coef_))
print("Intercept:", lr.intercept_)


Number of coefficients: 18
Intercept: 8.164610542389227


## Metrics in log space

In [None]:
def regression_metrics(y_true, y_pred):
    """
    Compute RMSE, MAE and R² for a set of predictions.

    Args:
      y_true: ground-truth target values
      y_pred: predicted values, aligned with y_true
    Returns:
      (rmse, mae, r2) tuple
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

def eval_split(name, X, y):
    """
    Score the fitted module-level model `lr` on one split and print the result.

    Args:
      name: label used as the prefix of each printed line
      X: feature matrix passed to lr.predict
      y: target values to compare the predictions against
    Returns:
      dict with plain-float "rmse", "mae" and "r2" entries
    """
    rmse, mae, r2 = regression_metrics(y, lr.predict(X))

    print(f"{name} RMSE (log): {rmse:.3f}")
    print(f"{name} MAE  (log): {mae:.3f}")
    print(f"{name} R²         : {r2:.3f}")

    # Convert to built-in floats before returning.
    return dict(rmse=float(rmse), mae=float(mae), r2=float(r2))

print("=== Linear Regression baseline (log target, tabular only) ===")
# Entries are evaluated in order, so the output is printed train, val, test.
metrics_log = {
    "train": eval_split("Train", X_train, y_train),
    "val": eval_split("Val", X_val, y_val),
    "test": eval_split("Test", X_test, y_test),
}


=== Linear Regression baseline (log target, tabular only) ===
Train RMSE (log): 0.467
Train MAE  (log): 0.351
Train R²         : 0.536
Val RMSE (log): 0.468
Val MAE  (log): 0.349
Val R²         : 0.541
Test RMSE (log): 0.464
Test MAE  (log): 0.347
Test R²         : 0.548


## Detect log transform and evaluate in dollars

In [None]:
def detect_log_transform(df, raw_col, log_col, n=2000):
    """
    Heuristically detect whether log_col ≈ log(raw_col) or log1p(raw_col).

    Rows with a missing value in either column are dropped, and at most n
    of the remaining rows are compared (sampled with a fixed random_state).
    The candidate with the smaller mean absolute difference to log_col wins;
    a tie goes to "log". If any compared raw value is <= 0, "log1p" is
    returned without trying log, because log is undefined there.

    Args:
      df: DataFrame holding both columns
      raw_col: name of the untransformed column
      log_col: name of the transformed column
      n: maximum number of rows to compare
    Returns:
      (kind, diff): kind is "log" or "log1p"; diff is the mean absolute
      difference between log_col and the chosen transform of raw_col
    Raises:
      ValueError: if no row has both columns present, so nothing can be
      compared
    """
    sub = df[[raw_col, log_col]].dropna()
    if sub.empty:
        # Without this guard the means below are taken over empty arrays and
        # the function would silently answer ("log1p", nan).
        raise ValueError(
            f"Cannot detect log transform: no rows with both "
            f"{raw_col!r} and {log_col!r} present."
        )
    if len(sub) > n:
        sub = sub.sample(n, random_state=0)

    raw = sub[raw_col].to_numpy(dtype=np.float64)
    logv = sub[log_col].to_numpy(dtype=np.float64)

    # If raw has non-positive values, log(raw) invalid; prefer log1p
    if np.any(raw <= 0):
        # Negative raw values are clamped to 0 before log1p is applied.
        diff_log1p = np.nanmean(np.abs(logv - np.log1p(np.maximum(raw, 0))))
        return "log1p", diff_log1p

    diff_log = np.nanmean(np.abs(logv - np.log(raw)))
    diff_log1p = np.nanmean(np.abs(logv - np.log1p(raw)))
    if diff_log <= diff_log1p:
        return "log", diff_log
    return "log1p", diff_log1p

log_kind, err = detect_log_transform(train_df, TARGET_RAW_COL, TARGET_LOG_COL)
print("Detected log transform:", log_kind, "| mean abs diff:", err)

# Pick the inverse of the detected transform: exp undoes log, and expm1
# undoes log1p.
if log_kind == "log":
    inv = np.exp
else:
    inv = np.expm1

def dollar_metrics(name, X, df):
    """
    Evaluate the fitted model `lr` on the raw target scale and print MAE and MAPE.

    Predictions are made with lr.predict, mapped back with the module-level
    inverse transform `inv`, and compared with the TARGET_RAW_COL column of df.

    Args:
      name: label used as the prefix of each printed line
      X: feature matrix passed to lr.predict
      df: DataFrame containing the raw target column
    Returns:
      dict with plain-float "mae_dollars" and "mape_pct" entries
    """
    actual = df[TARGET_RAW_COL].to_numpy(dtype=np.float64)
    predicted = inv(lr.predict(X))

    mae = mean_absolute_error(actual, predicted)

    # Floor the denominator at 1 so zero targets cannot cause a division
    # by zero.
    safe_actual = np.maximum(np.abs(actual), 1.0)
    relative_error = np.abs((actual - predicted) / safe_actual)
    mape = np.mean(relative_error) * 100.0

    print(f"{name} MAE ($)  : {mae:,.0f}")
    print(f"{name} MAPE (%) : {mape:.2f}")
    return dict(mae_dollars=float(mae), mape_pct=float(mape))

print("\n=== Back-transformed ($) metrics ===")
# Only the val and test splits are scored here, in that order.
metrics_dollars = {
    "val": dollar_metrics("Val", X_val, val_df),
    "test": dollar_metrics("Test", X_test, test_df),
}


Detected log transform: log1p | mean abs diff: 5.337952302397752e-16

=== Back-transformed ($) metrics ===
Val MAE ($)  : 172,616
Val MAPE (%) : 40.01
Test MAE ($)  : 171,544
Test MAPE (%) : 39.48
