# 1a. Train-test data split
Run this cell only ONCE: it performs the data cleaning and the train-test split.

In [None]:
# Cleaning steps and the train-test split are done in src/data/main.py.
# Recommended: run `python -m src.data.main` on the command line instead
# (more robust due to the relative imports).
import runpy
import sys

# The `src` package is resolved through the parent directory (the data-loading
# cell further down appends the same entry). Without it, a fresh kernel cannot
# import `src.data.main` from this cell. The membership check keeps re-runs from
# stacking duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

# run_name="__main__" sets the module's __name__, so an
# `if __name__ == "__main__":` guard inside it is executed.
runpy.run_module("src.data.main", run_name="__main__")

# 1b. Load clean train-test dataset

In [1]:
import sys
sys.path.append("../")  # make the parent directory importable for the `src` imports used later
import os
import pandas as pd


# The project root is the parent of the current working directory.
PROJECT_ROOT = os.path.abspath(os.pardir)
_processed_dir = os.path.join(PROJECT_ROOT, "data", "processed")
TRAIN_PATH = os.path.join(_processed_dir, "train.csv")
TEST_PATH = os.path.join(_processed_dir, "test.csv")

# Load both processed splits and report their (rows, columns) shapes.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))
print(train_df.shape, test_df.shape)


(10868, 14) (2718, 14)


# 2. Separate features and target

In [2]:
# X = feature columns, y = target column
TARGET_COL = "price"


def _split_features_target(frame):
    """Return (features, target): every column except TARGET_COL, and TARGET_COL itself."""
    return frame.drop(columns=[TARGET_COL]), frame[TARGET_COL]


X_train, y_train = _split_features_target(train_df)
X_test, y_test = _split_features_target(test_df)

# One line per split: feature-matrix shape followed by target shape.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)


(10868, 13) (10868,)
(2718, 13) (2718,)


# 3. Preprocessing pipeline

In [3]:
# ---------- INITIATE TRAINER ----------

from src.models.trainer import ModelTrainer

# Both splits are handed to the trainer as keyword arguments.
split_data = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
# save_dir is passed as a path relative to the working directory.
trainer = ModelTrainer(**split_data, save_dir="../models/")
print("Trainer initiated...")

Trainer initiated...


## 3.1. Cross validation for all models

In [5]:
# ---------- CROSS VALIDATION FOR ALL MODELS ----------
# ------------ Ridge, RandomForest, XGBoost -----------

# Cross-validation (cv=5) on the training dataset only.
# NOTE(review): in the recorded output the Ridge scores (~0.29) are on a different
# scale than the RandomForest/XGBoost scores (~170k) -- confirm in ModelTrainer
# before comparing the numbers across models.
cv_results = trainer.cross_validate_models(cv=5)
for model_name, cv_result in cv_results.items():
    print(f"=== {model_name} ===")
    fold_scores = cv_result["cv_scores"]
    print("CV RMSE scores:", fold_scores)
    mean_rmse = round(cv_result["mean_cv_rmse"], 3)
    print("Mean CV RMSE:", mean_rmse)
    print()



=== Ridge ===
CV RMSE scores: [0.3007527  0.29237358 0.28234049 0.29509142 0.28424697]
Mean CV RMSE: 0.291

=== RandomForest ===
CV RMSE scores: [232644.9636167  151613.12029042 137196.39559553 167398.75817744
 176404.70641951]
Mean CV RMSE: 173051.589

=== XGBoost ===
CV RMSE scores: [230536.38647916 146572.48095369 134985.08467546 155287.73976227
 170309.56469084]
Mean CV RMSE: 167538.251





**INTERPRETATIONS**

**1. Ridge Regression**
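* Ridge is still using the **log-transformed price** internally, so its CV RMSE is on the log scale. On this scale, RMSE ≈ 0.291 corresponds roughly to a **29% relative error** in predicting house price (more precisely, assuming a natural log, a typical multiplicative factor of $e^{0.291} \approx 1.34$, i.e. about −25% / +34%).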
* Because of log-transform, the model predicts **multiplicative deviations** well but may underestimate large absolute errors on very expensive houses.

   Business translation:
* If the average house is €300k, expected prediction error ≈ €87k.
* Ridge gives a *solid baseline* for linear relationships between features and price, but may miss nonlinear effects.

**2. RandomForest**
* RandomForest is trained on **raw price**. Mean RMSE ≈ €173k, which is **quite large relative to median house prices** (likely due to skew in price distribution).
* Large variation across folds (from ~€137k to ~€233k) indicates the model struggles with **heterogeneity in prices** (e.g., very high-end vs. mid-range properties).

   Business translation:
* Typical house price prediction error ≈ €173k, so the model might be reliable for *broad pricing ranges* but not for precise valuation.
* Best for *flagging extreme under- or over-priced houses* rather than fine-grained pricing.

**3. XGBoost**
* XGBoost slightly improves over RandomForest (mean RMSE ≈ €167.5k).
* This model captures **nonlinear interactions better**, but the RMSE is still large, which suggests that **current features alone aren’t enough** for high-precision predictions.

   Business translation:
* Typical prediction error ≈ €168k for an average house.
* Useful for **estimating price ranges** but not precise enough for exact listing prices.

**4. Key Takeaways**

1. **Ridge** performs well in relative terms (log-scale), giving *stable predictions for multiplicative price changes*; note that its log-scale CV RMSE is not directly comparable to the €-scale RMSE of the tree models.
2. **Tree models** (RF/XGBoost) perform similarly, slightly better with XGBoost, but absolute errors are high because prices are skewed.
3. **High RMSE in absolute terms** indicates:
   * Skewed price distribution (very high-end properties).
   * Feature set may be **incomplete for capturing full price variability**.

**Business-friendly summary table**

| Model        | Mean CV RMSE | Relative / Business Interpretation                     |
| ------------ | ------------ | ------------------------------------------------------ |
| Ridge (log)  | 0.291        | ~29% typical deviation; €87k on €300k house            |
| RandomForest | €173,051     | High variation; good for broad price estimates         |
| XGBoost      | €167,538     | Slightly better; captures nonlinearities; still coarse |


# 4. Train and evaluate on test dataset
## 4.1. Ridge linear-model training

In [4]:
# Sanity check: print the trainer's `models` attribute to see which models are available.
# The keys shown in the recorded output ('Ridge', 'RandomForest', 'XGBoost') are the
# values passed as `model_name` in the training cells below.
print(trainer.models)

{'Ridge': Ridge(), 'RandomForest': RandomForestRegressor(random_state=42), 'XGBoost': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=100,
             n_jobs=None, num_parallel_tree=None, ...)}


In [5]:
# Train and evaluate the Ridge model; the returned dict is indexed by model name.
# Per the recorded output, the metrics are train/test RMSE and R^2.
ridge_results = trainer.train_and_evaluate(model_name="Ridge")

print("Ridge prediction results:", ridge_results["Ridge"], sep="\n")

Saved Ridge model to: ../models/Ridge_pipeline.pkl
Ridge prediction results:
{'rmse_train': 158389.90654430288, 'r2_train': 0.6594917659966447, 'rmse_test': 168892.24429194772, 'r2_test': 0.6514293086700105}


## 4.2. RandomForest tree-model training

In [6]:
# Train and evaluate the RandomForest model; the returned dict is indexed by model name.
# Per the recorded output, the metrics are train/test RMSE and R^2.
rf_results = trainer.train_and_evaluate(model_name="RandomForest")

print("RandomForest prediction results:", rf_results["RandomForest"], sep="\n")



Saved RandomForest model to: ../models/RandomForest_pipeline.pkl
RandomForest prediction results:
{'rmse_train': 68240.08871262903, 'r2_train': 0.9367949500837908, 'rmse_test': 185849.78336192787, 'r2_test': 0.5779192237192274}


## 4.3. XGBoost tree-model training

In [7]:
# Train and evaluate the XGBoost model; the returned dict is indexed by model name.
# Per the recorded output, the metrics are train/test RMSE and R^2.
xgb_results = trainer.train_and_evaluate(model_name="XGBoost")

print("XGBoost prediction results:", xgb_results["XGBoost"], sep="\n")

Saved XGBoost model to: ../models/XGBoost_pipeline.pkl
XGBoost prediction results:
{'rmse_train': 97071.34223351079, 'r2_train': 0.8721046712131224, 'rmse_test': 175517.57981089983, 'r2_test': 0.6235453222970497}




## 4.4. Interpretations of the model-training results



# 5. Feature engineering and selection

In [None]:
# skipped for now because it is time-consuming

# 6. Baseline modeling and model comparison

In [None]:
# skipped for now...

# 7. Hyperparameter tuning (optional)