In [1]:
# Core
import pandas as pd
import numpy as np

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

# SciPy (sampling distributions for the randomized hyperparameter searches)
from scipy.stats import loguniform, randint

# ML
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingRegressor

# File system
from pathlib import Path
from joblib import dump

In [2]:
# Output folder for the fitted pipelines saved with dump() further down;
# created up front (including parents) and left alone if it already exists.
ARTIFACTS = Path("artifacts")
ARTIFACTS.mkdir(exist_ok=True, parents=True)

In [3]:
# Load the preprocessed listings and take a first look at their size and columns
df = pd.read_csv(Path("data") / "cleaned_automarket_autos.csv")

print(df.shape)
print(df.head())

(19182, 23)
                                  Title    Price    Year  Condition  \
0        mercedes e trieda 400 e 4matic  22000.0  2014.0  pouzivane   
1   ford kuga 2 0 tdci dpf titanium 4wd   8900.0  2011.0  pouzivane   
2          peugeot 308 1 6 hdi business   3999.0  2014.0  pouzivane   
3  volvo xc90 b5 d awd at8 plus dark 7s  54990.0  2022.0  pouzivane   
4              suzuki swift 1 2 acc glx   4499.0  2015.0  pouzivane   

            Fuel     Transmission       Body Drive  Color EmissionStandard  \
0         Petrol  7st automaticka      sedan   AWD  black           Euro 6   
1         Diesel     6st manualna        suv   AWD  white           Euro 5   
2         Diesel     5st manualna  hatchback   FWD  white           Euro 5   
3  Hybrid Diesel  8st automaticka        suv   AWD  white           Euro 6   
4         Petrol     5st manualna  hatchback   FWD  white           Euro 5   

   ...     Brand Mileage_km Engine_cc Engine_l  Power_kW  Power_PS  \
0  ...  Mercedes   123

In [4]:
# Drop obvious outliers (same logic as in EDA). A row is kept only if:
#   * Price lies within its 1st–99th percentile band (bounds inclusive),
#   * Mileage_km lies between 100 km and its 99.5th percentile (bounds inclusive),
#   * Condition, compared case-insensitively, is not "havarovane".
p_lo, p_hi = df["Price"].quantile([0.01, 0.99])
m_lo = 100
m_hi = df["Mileage_km"].quantile(0.995)

price_in_band = df["Price"].between(p_lo, p_hi)
mileage_in_band = df["Mileage_km"].between(m_lo, m_hi)
not_havarovane = df["Condition"].str.lower() != "havarovane"

# .copy() so the feature columns added later do not write into a view of df
df_ml = df.loc[price_in_band & mileage_in_band & not_havarovane].copy()

In [5]:
# Car age in years, measured against a fixed reference year of 2025
df_ml["Car_Age"] = 2025 - df_ml["Year"]

# Bucket the model year: decade-wide bins up to 2010, 5-year bins from 2010 on.
# Intervals are left-closed / right-open, so every label reads
# "<first year>–<last year>" and is derived directly from the bin edges.
year_edges = [1950, 1960, 1970, 1980, 1990, 2000, 2010, 2015, 2020, 2025, 2030]
year_labels = [f"{lo}–{hi - 1}" for lo, hi in zip(year_edges[:-1], year_edges[1:])]

df_ml["Year_bin"] = pd.cut(
    df_ml["Year"],
    bins=year_edges,
    labels=year_labels,
    right=False,
)
print(df_ml[["Year", "Year_bin", "Car_Age"]].head(10))

     Year   Year_bin  Car_Age
0  2014.0  2010–2014     11.0
1  2011.0  2010–2014     14.0
2  2014.0  2010–2014     11.0
3  2022.0  2020–2024      3.0
4  2015.0  2015–2019     10.0
5  2016.0  2015–2019      9.0
6  2002.0  2000–2009     23.0
7  2023.0  2020–2024      2.0
8  2021.0  2020–2024      4.0
9  2002.0  2000–2009     23.0


In [6]:
# Group brands by market segment.
# The table is written segment-first so each tier's membership can be read in
# one place; the brand -> segment lookup is derived from it.
# NOTE(review): keys must match the spelling used in the Brand column exactly
# (e.g. "Citroën" with the diaeresis, "Skoda" without a caron) — confirm against the data.
_SEGMENT_BRANDS = {
    # Luxury / Premium
    "Luxury": [
        "Audi", "BMW", "Mercedes", "Porsche", "Jaguar", "Lexus",
        "Land Rover", "Maserati", "Bentley", "Aston Martin", "Ferrari",
        "Cadillac", "Lincoln", "Alpina", "Hongqi",
    ],
    # Upper Midrange
    "Upper Midrange": ["Volkswagen", "Volvo", "Mini", "Cupra"],
    # Mainstream / Midrange
    "Midrange": [
        "Skoda", "Kia", "Hyundai", "Toyota", "Ford", "Peugeot", "Renault",
        "Citroën", "Seat", "Mazda", "Honda", "Nissan", "Subaru",
    ],
    # Budget / Economy
    "Budget": ["Dacia", "Fiat", "Opel", "Suzuki", "Lada", "SsangYong"],
    # Commercial / Utility
    "Commercial": ["MAN", "Iveco", "Isuzu", "Piaggio"],
    # US Brands
    "US SUV": ["Jeep", "Dodge", "Chevrolet", "Chrysler", "Buick"],
    # Electric Focused
    "Electric Focused": ["Tesla", "Smart", "MG"],
    # Other / Rare
    "Other": [
        "DS", "Dongfeng", "Mahindra", "Simca", "Daewoo", "Infiniti",
        "Abarth", "Alfa Romeo",
    ],
}

brand_to_segment = {
    brand: segment
    for segment, brands in _SEGMENT_BRANDS.items()
    for brand in brands
}

In [7]:
# Look up each row's segment; brands absent from the mapping fall back to "Other"
segment_lookup = df_ml["Brand"].map(brand_to_segment)
df_ml["Brand_Segment"] = segment_lookup.fillna("Other")

# Check results
print(df_ml["Brand_Segment"].value_counts())

Brand_Segment
Midrange            9576
Luxury              3045
Upper Midrange      2688
Budget              2052
Other                226
US SUV               188
Commercial            71
Electric Focused      56
Name: count, dtype: int64


In [8]:
# Quick look at the distinct levels of each categorical feature
for col in ("Fuel", "Body", "Transmission_simple", "Brand_Segment"):
    print(df_ml[col].unique())

['Petrol' 'Diesel' 'Hybrid Diesel' 'Electric' 'Hybrid Petrol'
 'Plug-in Hybrid' 'LPG' 'CNG' 'Unknown']
['sedan' 'suv' 'hatchback' 'van/mpv' 'combi' 'coupe' 'cabrio' 'other'
 'unknown' 'utility' 'pickup']
['automaticka' 'manualna']
['Luxury' 'Midrange' 'Upper Midrange' 'Budget' 'Other' 'Electric Focused'
 'US SUV' 'Commercial']


In [9]:
# Category schema: the fixed, ordered level lists passed to the one-hot encoders.
# Keep the order stable — the encoders receive these lists as-is.
FUEL_CATS = [
    "Petrol",
    "Diesel",
    "Hybrid Diesel",
    "Electric",
    "Hybrid Petrol",
    "Plug-in Hybrid",
    "LPG",
    "CNG",
    "Unknown",
]
BODY_CATS = [
    "sedan",
    "suv",
    "hatchback",
    "van/mpv",
    "combi",
    "coupe",
    "cabrio",
    "other",
    "unknown",
    "utility",
    "pickup",
]
BRANDSEG_CATS = [
    "Midrange",
    "Luxury",
    "Upper Midrange",
    "Budget",
    "Other",
    "Electric Focused",
    "US SUV",
    "Commercial",
]
TRANS_CATS = ["automaticka", "manualna"]

In [10]:
# Quick sanity checks so warnings become actionable
def _assert_known(series, allowed, name):
    unknown = set(series.dropna().unique()) - set(allowed)
    if unknown:
        print(f"[WARN] {name}: unseen categories in data: {sorted(unknown)} (they'll be ignored)")

# Compare every categorical column against its declared schema
for _column, _levels in (
    ("Fuel", FUEL_CATS),
    ("Body", BODY_CATS),
    ("Brand_Segment", BRANDSEG_CATS),
    ("Transmission_simple", TRANS_CATS),
):
    _assert_known(df_ml[_column], _levels, _column)

In [11]:
# Encoders wired to the schema + chosen baselines.
# Each encoder drops one named level per feature (automaticka / Petrol / sedan /
# Midrange), which therefore acts as that feature's baseline. Levels not listed
# in the schema are ignored rather than raising.
_OHE_SHARED = {"handle_unknown": "ignore", "dtype": float}

ohe_bin = OneHotEncoder(
    categories=[TRANS_CATS],
    drop=["automaticka"],
    **_OHE_SHARED,
)

ohe_cat = OneHotEncoder(
    categories=[FUEL_CATS, BODY_CATS, BRANDSEG_CATS],
    drop=["Petrol", "sedan", "Midrange"],
    **_OHE_SHARED,
)

In [12]:
# ============================================
# MODEL 1: Base Linear Regression
# ============================================

# Feature groups (also reused by later cells)
num_cols = ["Car_Age", "Mileage_km", "Power_kW"]
cat_cols = ["Fuel", "Body", "Brand_Segment"]
bin_cols = ["Transmission_simple"]
X_base = df_ml[num_cols + bin_cols + cat_cols]

# Target: log1p(Price). Predictions are mapped back with expm1 before the
# €-scale metrics are computed.
df_ml["Log_Price"] = np.log1p(df_ml["Price"])
y = df_ml["Log_Price"]

# Preprocessing: standardize the numerics, one-hot encode the categoricals
preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("bin", ohe_bin, bin_cols),
    ("cat", ohe_cat, cat_cols),
])
pipe = Pipeline([("prep", preprocess), ("model", LinearRegression())])

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_base,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training part, predict the held-out part (log space)
pipe.fit(X_train, y_train)
y_pred_log = pipe.predict(X_test)

# Back to the original price scale
y_pred_price = np.expm1(y_pred_log)
y_test_price = np.expm1(y_test)

# Metrics on the price scale
mae = mean_absolute_error(y_test_price, y_pred_price)
r2 = r2_score(y_test_price, y_pred_price)

# Metrics in log space, for reference
mae_log = mean_absolute_error(y_test, y_pred_log)
r2_log = r2_score(y_test, y_pred_log)

print("Model 1")
print(f"MAE: {mae:.2f} €")
print(f"R²: {r2:.3f}")
print(f"MAE (log space): {mae_log:.3f}")
print(f"R² (log space):  {r2_log:.3f}")

Model 1
MAE: 2227.95 €
R²: 0.781
MAE (log space): 0.233
R² (log space):  0.746


In [13]:
# ================================================
# MODEL 2: Linear Regression with Mileage_per_Year
# ================================================

# Average mileage per year of age. The divisor is floored at one year so that
#   * cars from the reference year (Car_Age == 0) do not divide by zero, and
#   * a model year later than the reference year (Car_Age < 0) cannot produce
#     a negative mileage rate — replacing only the value 0 left that case open.
# For whole-year ages >= 0 this gives exactly the same values as before.
# NOTE(review): assumes Year holds whole years; a fractional age below 1 would
# now also be floored to 1 — confirm.
df_ml["Mileage_per_Year"] = df_ml["Mileage_km"] / df_ml["Car_Age"].clip(lower=1)

# Numeric features for this variant: Year replaces Car_Age and the mileage
# rate replaces raw Mileage_km
num_cols_v2 = ["Year","Mileage_per_Year", "Power_kW"]
X_base_v2 = df_ml[num_cols_v2 + bin_cols + cat_cols]

# Same preprocessing layout as Model 1, applied to the new numeric columns
preprocess_v2 = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols_v2),
        ("bin", ohe_bin, ["Transmission_simple"]),
        ("cat", ohe_cat, ["Fuel","Body","Brand_Segment"]),
    ],
    remainder="drop"
)

pipe_v2 = Pipeline([
    ("prep", preprocess_v2),
    ("model", LinearRegression())
])

# Same target, test fraction and seed as Model 1, so the hold-out rows match
X_train_v2, X_test_v2, y_train_v2, y_test_v2 = train_test_split(X_base_v2, y, test_size=0.2, random_state=42)

# Fit in log space, then score on the price scale
pipe_v2.fit(X_train_v2, y_train_v2)
y_pred_log_v2 = pipe_v2.predict(X_test_v2)
y_pred_price_v2 = np.expm1(y_pred_log_v2)

mae_v2 = mean_absolute_error(np.expm1(y_test_v2), y_pred_price_v2)
r2_v2 = r2_score(np.expm1(y_test_v2), y_pred_price_v2)

print("Model 2")
print(f"MAE: {mae_v2:.2f} €")
print(f"R²: {r2_v2:.3f}")

Model 2
MAE: 2386.68 €
R²: 0.756


In [14]:
# ============================================
# MODEL 3: RidgeCV with Age + Mileage_km + Mileage_per_Year
# ============================================

# Age, total mileage and mileage rate are used together here; the ridge
# penalty is there to cope with the resulting multicollinearity.
num_cols_v3 = ["Car_Age", "Mileage_km", "Mileage_per_Year", "Power_kW"]
X_base_v3 = df_ml[num_cols_v3 + bin_cols + cat_cols]

# Preprocessing: scaled numerics + schema-driven one-hot encoders
preprocess_v3 = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols_v3),
        ("bin", ohe_bin, bin_cols),
        ("cat", ohe_cat, cat_cols),
    ],
    remainder="drop",
)

# Candidate penalties: 25 values on a log grid from 1e-3 to 1e3, chosen by RidgeCV
alphas = np.logspace(-3, 3, 25)
pipe_v3 = Pipeline([("prep", preprocess_v3), ("model", RidgeCV(alphas=alphas))])

# Same split pattern as Model 2 for a fair comparison
X_train_v3, X_test_v3, y_train_v3, y_test_v3 = train_test_split(
    X_base_v3,
    y,
    test_size=0.2,
    random_state=42,
)

pipe_v3.fit(X_train_v3, y_train_v3)
y_pred_log_v3 = pipe_v3.predict(X_test_v3)
y_pred_price_v3 = np.expm1(y_pred_log_v3)

# Evaluate on the price scale and in log space
y_test_price_v3 = np.expm1(y_test_v3)
mae_v3 = mean_absolute_error(y_test_price_v3, y_pred_price_v3)
r2_v3 = r2_score(y_test_price_v3, y_pred_price_v3)
mae_log_v3 = mean_absolute_error(y_test_v3, y_pred_log_v3)
r2_log_v3 = r2_score(y_test_v3, y_pred_log_v3)

print("Model 3")
print(f"MAE: {mae_v3:.2f} €")
print(f"R²:  {r2_v3:.3f}")
print(f"MAE (log): {mae_log_v3:.3f}")
print(f"R² (log):  {r2_log_v3:.3f}")

Model 3
MAE: 2249.86 €
R²:  0.775
MAE (log): 0.232
R² (log):  0.757


In [15]:
# Report the penalty strength that RidgeCV selected
best_alpha = pipe_v3["model"].alpha_
print(f"Ridge best alpha: {best_alpha:.4f}")

# Inspect top coefficients
def show_top_coefs(pipe, k=15):
    """Print the ``k`` largest and ``k`` smallest coefficients of a fitted pipeline.

    Args:
        pipe: fitted Pipeline with a ``"prep"`` ColumnTransformer step, whose
            first transformer covers the numeric columns and which has
            transformers named ``"bin"`` and ``"cat"``, and a ``"model"`` step
            exposing ``coef_``.
        k: number of rows shown in each of the two tables.

    Feature names for the encoded columns are built from the module-level
    ``bin_cols`` and ``cat_cols`` lists, so the pipeline must have been fitted
    on those same columns.
    """
    column_transformer = pipe.named_steps["prep"]
    estimator = pipe.named_steps["model"]

    # Column names in transformer order: numeric, binary one-hot, categorical one-hot
    numeric_names = column_transformer.transformers_[0][2]
    encoders = column_transformer.named_transformers_
    binary_names = encoders["bin"].get_feature_names_out(bin_cols)
    categorical_names = encoders["cat"].get_feature_names_out(cat_cols)
    feat_names = np.r_[numeric_names, binary_names, categorical_names]

    ranked = pd.DataFrame(
        {"feature": feat_names, "coef": estimator.coef_.ravel()}
    ).sort_values("coef", ascending=False)

    print("\nTop positive coefficients:")
    print(ranked.head(k).to_string(index=False))
    print("\nTop negative coefficients:")
    # tail(k) holds the k smallest values; re-sort so the most negative comes first
    print(ranked.tail(k).sort_values("coef").to_string(index=False))

show_top_coefs(pipe_v3, k=15)


Ridge best alpha: 3.1623

Top positive coefficients:
                     feature     coef
    Brand_Segment_Commercial 0.328333
                  Body_other 0.259219
                Body_utility 0.253410
                    Power_kW 0.223592
        Brand_Segment_Luxury 0.217050
                 Body_cabrio 0.212337
Brand_Segment_Upper Midrange 0.198414
          Fuel_Hybrid Diesel 0.194317
          Fuel_Hybrid Petrol 0.181337
                    Body_suv 0.174705
                 Fuel_Diesel 0.165799
                  Body_coupe 0.148914
         Fuel_Plug-in Hybrid 0.121061
            Mileage_per_Year 0.107510
                Body_unknown 0.085239

Top negative coefficients:
                       feature      coef
Brand_Segment_Electric Focused -0.312186
                    Mileage_km -0.289335
                       Car_Age -0.267508
  Transmission_simple_manualna -0.141507
                Body_hatchback -0.100986
          Brand_Segment_Budget -0.096087
                 Fuel_El

In [16]:
# ============================================================
# DECISION POINT: Moving from Linear Regression to Random Forest
# ============================================================
# Tried three linear variants:
#   1. OLS on log-price
#   2. OLS + engineered features (Mileage_per_Year)
#   3. RidgeCV regularized regression
# Best MAE ≈ 2230 € (R² ≈ 0.78, plain OLS). Good, but linear models assume additivity
# and miss non-linear effects and interactions (mileage impact differing
# by fuel, body, or brand segment). Next, switch to Random Forest, Decision Tree and HGBR to capture
# non-linearities and interactions without manual feature crosses.

In [17]:
# ================================
# MODEL 4: Random Forest Regressor
# ================================

# Same feature set as Model 1 (works well and is robust)
num_cols_rf = ["Car_Age", "Mileage_km", "Power_kW"]
cat_cols_rf = ["Fuel", "Body", "Brand_Segment"]
bin_cols_rf = ["Transmission_simple"]
X_rf = df_ml[num_cols_rf + bin_cols_rf + cat_cols_rf]

# Impute-then-encode pipelines for the binary and multi-level categoricals.
# NOTE(review): neither bin_pipe nor cat_pipe is referenced by preprocess_rf
# below, which uses the schema-driven ohe_bin / ohe_cat encoders instead —
# confirm whether these two are still needed.
bin_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(drop="if_binary", handle_unknown="ignore")),
])
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore")),
])

# Random Forest preprocessing: numerics pass through unscaled, categoricals are one-hot encoded
preprocess_rf = ColumnTransformer([
    ("num", "passthrough", num_cols_rf),
    ("bin", ohe_bin, bin_cols_rf),
    ("cat", ohe_cat, cat_cols_rf),
])

# Log-price target and the same split settings as the linear models
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X_rf,
    y,
    test_size=0.2,
    random_state=42,
)

# Model and pipeline (n_estimators is overridden by the search below)
rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
pipe_rf = Pipeline([("prep", preprocess_rf), ("model", rf)])

# Search space
param_dist = {
    "model__n_estimators": randint(200, 800),
    "model__max_depth": [None, 10, 14, 18, 22, 26],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 4, 8],
    "model__max_features": ["sqrt", "log2", 0.3, 0.5, 0.7],
}

# 25 random draws, 3-fold CV, scored by MAE on the log target
search = RandomizedSearchCV(
    estimator=pipe_rf,
    param_distributions=param_dist,
    n_iter=25,
    scoring="neg_mean_absolute_error",
    cv=3,
    random_state=42,
    n_jobs=4,
    pre_dispatch="2*n_jobs",
    verbose=0,
)
search.fit(X_train_rf, y_train_rf)

# Evaluate the refitted best pipeline on the hold-out set, on the price scale
best_rf = search.best_estimator_
y_pred_log_rf = best_rf.predict(X_test_rf)
y_pred_price_rf = np.expm1(y_pred_log_rf)
y_test_price_rf = np.expm1(y_test_rf)
mae_rf = mean_absolute_error(y_test_price_rf, y_pred_price_rf)
r2_rf = r2_score(y_test_price_rf, y_pred_price_rf)

print("Model 4: Random Forest (log target)")
print(f"RF best params: {search.best_params_}")
print(f"RF MAE: {mae_rf:.2f} €")
print(f"RF R²:  {r2_rf:.3f}")

# Persist the best RF pipeline (preprocessing + model)
dump(best_rf, ARTIFACTS / "rf_pipeline.joblib")
print("Saved RF pipeline → artifacts/rf_pipeline.joblib")


Model 4: Random Forest (log target)
RF best params: {'model__max_depth': None, 'model__max_features': 0.3, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 430}
RF MAE: 1825.07 €
RF R²:  0.852
Saved RF pipeline → artifacts/rf_pipeline.joblib


In [18]:
# ===============================
# MODEL 5: DecisionTree Regressor
# ===============================

# Reuses preprocess_rf and the RF train/test split for a fair comparison
dt = DecisionTreeRegressor(random_state=42)
pipe_dt = Pipeline([("prep", preprocess_rf), ("model", dt)])

# Search space: depth, leaf/split sizes and cost-complexity pruning strength
param_dist_dt = {
    "model__max_depth": [6, 8, 10, 12, 14, 16, 18],
    "model__min_samples_leaf": [20, 50, 100, 150, 200],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__ccp_alpha": [0.0, 1e-4, 3e-4, 1e-3, 3e-3],
}

# 30 random draws, 3-fold CV, scored by MAE on the log target
search_dt = RandomizedSearchCV(
    estimator=pipe_dt,
    param_distributions=param_dist_dt,
    n_iter=30,
    scoring="neg_mean_absolute_error",
    cv=3,
    random_state=42,
    n_jobs=4,
    pre_dispatch="2*n_jobs",
    verbose=0,
)
search_dt.fit(X_train_rf, y_train_rf)

# Best pipeline, evaluated on the price scale
best_dt = search_dt.best_estimator_
pred_dt = np.expm1(best_dt.predict(X_test_rf))
dt_true_price = np.expm1(y_test_rf)
dt_mae = mean_absolute_error(dt_true_price, pred_dt)
dt_r2 = r2_score(dt_true_price, pred_dt)

print("Model 5: DecisionTree Regressor")
print("DT MAE €:", dt_mae)
print("DT R²  :", dt_r2)
print("DT best params:", search_dt.best_params_)


Model 5: DecisionTree Regressor
DT MAE €: 2342.8393985028224
DT R²  : 0.7705461531726443
DT best params: {'model__min_samples_split': 20, 'model__min_samples_leaf': 20, 'model__max_depth': 14, 'model__ccp_alpha': 0.0001}


In [19]:
# =========================================
# MODEL 6: Hist Gradient Boosting Regressor
# =========================================

# Same columns and split as RF. The preprocessor mirrors preprocess_rf but sets
# sparse_threshold=0.0 so the stacked output is always a dense array.
preprocess_hgb = ColumnTransformer(
    transformers=[
        ("num", "passthrough", ["Car_Age", "Mileage_km", "Power_kW"]),
        ("bin", ohe_bin, ["Transmission_simple"]),
        ("cat", ohe_cat, ["Fuel", "Body", "Brand_Segment"]),
    ],
    sparse_threshold=0.0,
)

hgb = HistGradientBoostingRegressor(
    loss="squared_error",
    early_stopping=True,
    random_state=42,
)
pipe_hgb = Pipeline([("prep", preprocess_hgb), ("model", hgb)])

# Search space.
# NOTE: this rebinds `param_dist`, which previously held the Random Forest search space.
param_dist = {
    "model__learning_rate": loguniform(0.01, 0.3),
    "model__max_iter": randint(400, 1600),
    "model__max_leaf_nodes": randint(31, 512),
    "model__max_depth": [None, 6, 8, 10, 12],
    "model__min_samples_leaf": randint(10, 200),
    "model__l2_regularization": loguniform(1e-6, 1.0),
    "model__max_bins": [255],
}

# 25 random draws, 3-fold CV, scored by MAE on the log target
search_hgb = RandomizedSearchCV(
    estimator=pipe_hgb,
    param_distributions=param_dist,
    n_iter=25,
    scoring="neg_mean_absolute_error",
    cv=3,
    random_state=42,
    n_jobs=4,
    pre_dispatch="2*n_jobs",
    verbose=0,
)
search_hgb.fit(X_train_rf, y_train_rf)

# Best pipeline, evaluated on the price scale.
# NOTE: `y_pred_log` is rebound here (it held Model 1's predictions before);
# `y_pred` is read again by the summary cell.
best_hgb = search_hgb.best_estimator_
y_pred_log = best_hgb.predict(X_test_rf)
y_pred = np.expm1(y_pred_log)
hgb_true_price = np.expm1(y_test_rf)

print("Model 6: Hist Gradient Boosting Regressor")
print("HGB MAE €:", mean_absolute_error(hgb_true_price, y_pred))
print("HGB R²  :", r2_score(hgb_true_price, y_pred))
print("HGB best params:", search_hgb.best_params_)

# Persist the best HGB pipeline (preprocessing + model)
dump(best_hgb, ARTIFACTS / "hgb_pipeline.joblib")
print("Saved HGB pipeline → artifacts/hgb_pipeline.joblib")

Model 6: Hist Gradient Boosting Regressor
HGB MAE €: 1850.3506440647457
HGB R²  : 0.8548980165577322
HGB best params: {'model__l2_regularization': 0.0055734523025838985, 'model__learning_rate': 0.02733296850684206, 'model__max_bins': 255, 'model__max_depth': 12, 'model__max_iter': 1045, 'model__max_leaf_nodes': 314, 'model__min_samples_leaf': 37}
Saved HGB pipeline → artifacts/hgb_pipeline.joblib


In [20]:
# ===============================================
# Baseline and Relative MAE (rMAE) for RF and HGB
# ===============================================

from sklearn.metrics import mean_absolute_error, r2_score

# Back-transform the hold-out and training targets from log space to prices
y_true = np.expm1(y_test_rf)
y_train_prices = np.expm1(y_train_rf)
median_price = np.median(y_train_prices)

# Baseline: predict the training-set median price for every hold-out row
baseline_pred = np.full(y_true.shape, median_price, dtype=float)
mae_base = mean_absolute_error(y_true, baseline_pred)
r2_base = r2_score(y_true, baseline_pred)
# Relative MAE: the error expressed as a fraction of the training median price
rmae_base = mae_base / median_price

# Helper to compute rMAE of a pipeline that predicts log-price
def rmae(model, X, y_true, median_price):
    """Return the MAE on the price scale divided by ``median_price``.

    Args:
        model: fitted estimator whose ``predict`` returns log1p-prices.
        X: feature rows to predict on.
        y_true: true prices on the original (non-log) scale.
        median_price: reference price the MAE is normalized by.
    """
    price_pred = np.expm1(model.predict(X))  # undo the log1p target transform
    abs_error = mean_absolute_error(y_true, price_pred)
    return abs_error / median_price

# Relative MAE of the two strongest models on the shared hold-out set
rmae_rf = rmae(model=best_rf, X=X_test_rf, y_true=y_true, median_price=median_price)
rmae_hgb = rmae(model=best_hgb, X=X_test_rf, y_true=y_true, median_price=median_price)

print(f"Baseline → MAE: {mae_base:.0f} €, R²: {r2_base:.3f}, rMAE: {100*rmae_base:.1f}%")
print(f"RF   rMAE: {100*rmae_rf:.1f}%")
print(f"HGB  rMAE: {100*rmae_hgb:.1f}%")

Baseline → MAE: 5028 €, R²: -0.025, rMAE: 52.9%
RF   rMAE: 19.2%
HGB  rMAE: 19.5%


In [21]:
# ===============================
# Summary of all models
# ===============================
# DT and HGB metrics were only printed by their cells, so they are recomputed
# here from the stored hold-out predictions (pred_dt, y_pred).
holdout_price = np.expm1(y_test_rf)

results = [
    {"model": "Linear", "MAE": mae, "R2": r2},
    {"model": "Linear+MPY", "MAE": mae_v2, "R2": r2_v2},
    {"model": "Ridge", "MAE": mae_v3, "R2": r2_v3},
    {"model": "RF", "MAE": mae_rf, "R2": r2_rf},
    {
        "model": "DT",
        "MAE": mean_absolute_error(holdout_price, pred_dt),
        "R2": r2_score(holdout_price, pred_dt),
    },
    {
        "model": "HGB",
        "MAE": mean_absolute_error(holdout_price, y_pred),
        "R2": r2_score(holdout_price, y_pred),
    },
]
print(pd.DataFrame(results).sort_values("MAE"))

        model          MAE        R2
3          RF  1825.074310  0.852450
5         HGB  1850.350644  0.854898
0      Linear  2227.951571  0.780894
2       Ridge  2249.860146  0.774547
4          DT  2342.839399  0.770546
1  Linear+MPY  2386.682451  0.756281


In [22]:
### What’s working best (so far):
# Random Forest (log-target): MAE ~ €1.83k, R² ~ 0.85, rMAE ~ 19%
# HistGradientBoosting (log-target): MAE ~ €1.85k, R² ~ 0.85, rMAE ~ 19–20%
### What I’ll do next
# I’ll try a few quick improvements and see if I can squeeze the MAE down a bit more.