In [None]:
# Open Questions (maybe ask Ricardo):
# - Is there downside of using sklearn pipelines or is it best practice if we understood it and can explain it well?
# - How far can we go with cleaning the data before it becomes outlier handling? (e.g. only keeping cars with mpg between 5 and 150 vs. between 10 and 80)
# - can we remove hasDamage before the actual feature selection or is it only allowed to remove features in the FS part even though this would lead to unnecessary steps and wasting computing power

**Your work will be evaluated according to the following criteria:**
- Project Structure and Notebook(s) Quality (4/20)
- Data Exploration & Initial Preprocessing (4/20)
- Regression Benchmarking and Optimization (7/20)
- Open-Ended Section (4/20)
- Deployment (1/20)
- Extra Point: Have Project Be Publicly Available on GitHub (1/20)


**Project Timeline**
- 22.11.: Preprocessing and Model Preparation
    - Finish clean preprocessing all included in pipeline
    - Finish clean Hyperparameter Tuning
- 29.11.: Feature Selection
    - Clean and structured approach for feature selection for all models (best case: consistent approach imo)
- 29.11.: Regression Benchmarking and Optimization
    - Automate Optimization (add something like mlflow)
- 06.12.: Open-End Section and Deployment
    - Added 4 open-end-experiments
    - Deployment
- 13.12.: Final Notebook Polish
    - Super clean notebook structure similar to lab-notebooks by Ricardo
    - Show and explain results of different models clearly in markdown tables etc. (see the lab-notebooks)
- 14.12.: Submission

In [None]:
# TODO Open End Section:
# Interface for new Car

<div style="
    background: rgba(25, 25, 25, 0.55);
    backdrop-filter: blur(16px) saturate(150%);
    -webkit-backdrop-filter: blur(16px) saturate(150%);
    border: 1px solid rgba(255, 255, 255, 0.12);
    border-radius: 18px;
    padding: 45px 30px;
    text-align: center;
    font-family: 'Inter', 'Segoe UI', 'Helvetica Neue', Arial, sans-serif;
    color: #e0e0e0;
    box-shadow: 0 0 30px rgba(0, 0, 0, 0.35);
    margin: 40px auto;
    max-width: 800px;
">

  <h1 style="
      font-size: 2.8em;
      font-weight: 700;
      margin: 0 0 8px 0;
      letter-spacing: -0.02em;
      background: linear-gradient(90deg, #00e0ff, #9c7eff);
      -webkit-background-clip: text;
      -webkit-text-fill-color: transparent;
  ">
      Machine Learning Project
  </h1>

  <h2 style="
      font-size: 1.6em;
      font-weight: 500;
      margin: 0 0 25px 0;
      color: #b0b0b0;
      letter-spacing: 0.5px;
  ">
      Cars 4 You - Predicting Car Prices
  </h2>

  <p style="
      font-size: 1.25em;
      font-weight: 500;
      color: #c0c0c0;
      margin-bottom: 6px;
  ">
      Group 5 - Lukas Belser, Samuel Braun, Elias Karle, Jan Thier
  </p>

  <p style="
      font-size: 1.05em;
      font-weight: 400;
      color: #8a8a8a;
      font-style: italic;
      letter-spacing: 0.5px;
  ">
      Machine Learning End Results · 22.12.2025
  </p>
</div>


<img src="images/process_ML.png" alt="Drawing" style="width: 1000px;"/>

### **Table of Contents**
 
- [1. Import Packages and Data](#1-import-packages-and-data)  
  - [1.1 Import Required Packages](#11-import-required-packages)  
  - [1.2 Load Datasets](#12-load-datasets)  
  - [1.3 Kaggle Setup](#13-kaggle-setup)  
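- [2. Preprocessing](#2-preprocessing)  
  - [2.1 Data Cleaning](#21-data-cleaning)  
  - [2.2 (No) Data Split](#22-no-data-split)  
  - [2.3 Handling Missing Values](#23-handling-missing-values)  
  - [2.4 Outlier Handling](#24-outlier-handling)  
  - [2.5 Feature Engineering](#25-feature-engineering)  
  - [2.6 Encoding, Transforming and Scaling](#26-encoding-transforming-and-scaling)  
  - [2.7 Feature Selection](#27-feature-selection)  
  - [2.9 Create Final Preprocessing Pipeline](#29-create-final-preprocessing-pipeline)  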
- [4. Model Evaluation Metrics, Baselining, Setup](#4-model-evaluation-metrics-baselining-setup)  
- [5. Hyperparameter Tuning and Model Evaluation](#5-hyperparameter-tuning-and-model-evaluation)  
  - [5.1 ElasticNet](#51-elasticnet)  
  - [5.2 HistGradientBoost](#52-histgradientboost)  
  - [5.3 RandomForest](#53-randomforest)  
  - [5.4 ExtraTrees](#54-extratrees)  
- [6. Feature Importance of Tree Models (with SHAP)](#6-feature-importance-of-tree-models-with-shap)  
  - [6.1 HGB](#61-hgb)  
  - [6.2 RF](#62-rf)  
- [7. Kaggle Competition](#7-kaggle-competition)  

TODO finish + update toc > at the end of project

### 1. Import Packages and Data

#### 1.1 Import Required Packages

In [None]:
!pip install kaggle
!pip install shap
!pip install -U scikit-learn
!pip install category_encoders
!pip install ydata-profiling

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt; plt.rcParams.update({"figure.max_open_warning": 0, "figure.dpi": 100})
import joblib
import shap

from collections import Counter
from sklearn.feature_selection import VarianceThreshold, RFE
from scipy.stats import spearmanr, uniform, randint
from sklearn.metrics import mean_absolute_error
 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, StandardScaler, FunctionTransformer, RobustScaler
from sklearn.base import clone, BaseEstimator, TransformerMixin
 
from sklearn.model_selection import RandomizedSearchCV, KFold, GridSearchCV, cross_validate
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import SelectFromModel



from tqdm.auto import tqdm

from category_encoders import QuantileEncoder # used for median target encoding (sklearn only supports mean target encoding with their TargetEncoder class)
 
from car_functions import clean_car_dataframe
from pipeline_functions import GroupImputer, m_estimate_mean, CarFeatureEngineer, NamedFunctionTransformer, to_float_array, model_hyperparameter_tuning, DebugTransformer, MajorityVoteSelectorTransformer, CorrelationThresholdSelector, MutualInfoThresholdSelector, plot_selector_agreement

from collections import Counter
from sklearn.inspection import permutation_importance
from tqdm.auto import tqdm

#### 1.2 Load Datasets

In [None]:
# Read the train and test CSV files (paths are relative to the notebook's working directory)
df_cars_train, df_cars_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

#### 1.3 Kaggle Setup

In [None]:
# Kaggle API Connect

# Folder containing kaggle.json (add your own kaggle.json API token there).
# setdefault keeps a KAGGLE_CONFIG_DIR that is already configured in the environment
# and only falls back to the project default below when nothing has been set yet.
os.environ.setdefault("KAGGLE_CONFIG_DIR", "/Workspace/Users/20250355@novaims.unl.pt")

# Test: show the directory that is in effect (pure Python, so it also works without a POSIX shell)
print(os.environ["KAGGLE_CONFIG_DIR"])

### 2. Preprocessing

#### 2.1 Data Cleaning

In [None]:
# Outlier Preprocessing, Missing Value Handling and Decision justifying happens here
# TODO add clean_car_dataframe to pipeline
df_cars_train = clean_car_dataframe(df_cars_train)
df_cars_test = clean_car_dataframe(df_cars_test)

# Safety check: list the unique values of every column, first for the cleaned train frame and then
# for the cleaned test frame, to see whether the cleaning worked and whether odd values remain.
# A line of 150 "X" characters separates the two listings.
for frame_position, frame in enumerate((df_cars_train, df_cars_test)):
    if frame_position > 0:
        print("X"*150)
    for col in frame.columns:
        print(col, frame[col].unique())

#### 2.2 (No) Data Split

**Our approach:**
- Train: use cross-validation in the sklearn pipeline on the training data
- Test: Use external hold-out set from kaggle as final test set    
-> An additional val set is therefore not necessary and would waste training data


<u>Place in the pipe:</u> The split is decided here because the data has to be split before all of the following preprocessing steps to avoid data leakage

In [None]:
# Separate the target column ("price") from the predictor columns
y_train = df_cars_train["price"]
X_train = df_cars_train.drop(columns=["price"])

#### 2.3 Handling missing values

**Our approach:**
- `Group Imputer`: We use a custom GroupImputer that imputes the missing values to be the median of entries within the same group
    - For that we use a hierarchical structure to identify the most similar group to the one with the missing value:
        - 1st level: ...
        - 2nd level: ...
        - ...
        - 4th level: Model
        - 5th level: Brand

<u>Place in the pipe:</u> The Imputation is decided here because the data has to be imputed on original values before engineering new features

#### 2.4 Outlier Handling

**Our approach:**
- Outlier detection through multiple methods to increase the probability that it's actually an outlier
- ...


<u>Place in the pipe</u>: After imputation to have no missing values left and before FE to not create new features based on (unrealistic) outliers

In [None]:
# TODO maybe even before Imputation: "If you impute first, you fill the original gaps based on a distribution that includes the massive outliers (skewing the mean/median). If you kill the outliers first (set to NaN), the imputation for everyone becomes cleaner."

# TODO
# e.g. how to handle Zeros in tax (use groupimputer?) -> features that are computed with tax are also affected and need to be handled then ~J
# e.g. maybe outlier handling per model (if sample size big enough) ~J

# TODO maybe use different outlier handling pipes for tree-based vs. linear vs. NN models to show that we understand the differences ~J

#### 2.5 Feature Engineering

**Our approach:**
- We use the class CarFeatureEngineer to be able to create the engineered features within the pipeline, which prevents data leakage

<u>Base Feature Creation</u>

These are foundational features derived directly from the original data, often to create linear relationships or capture interactions.
- `age`: Calculated as `2020 - year`. Creates a simple linear feature representing the car's age. Newer cars (lower age) generally have higher prices.
- `miles_per_year`: Calculated as `mileage / age`. This normalizes the car's usage, preventing high correlation (multicollinearity) between `mileage` and `age`. A 3-year-old car with 60,000 miles is different from a 6-year-old car with 60,000 miles.
- `age_x_engine`: An interaction term `age * engineSize`. This helps the model capture non-linear relationships, such as the possibility that the value of cars with large engines might depreciate faster (or slower) than cars with small engines.
- `mpg_x_engine`: An interaction term `mpg * engineSize`. This captures the combined effect of fuel efficiency and engine power.
- `tax_per_engine`: Calculated as `tax / engineSize`. This feature represents the tax cost relative to the engine's power, which could be an indicator of overall running costs or vehicle class.
- `mpg_per_engine`: Calculated as `mpg / engineSize`. This creates an "efficiency" metric, representing how many miles per gallon the car achieves for each unit of engine size.


<u>Popularity & Demand Features</u>

These features attempt to quantify a car's popularity or market demand, which directly influences price.
- `model_freq`: Calculates the frequency (percentage) of each `model` in the training dataset. Popular, common models often have more stable and predictable pricing and demand.


<u>Price Anchor Features</u>

These features "anchor" a car's price relative to its group. They provide a strong baseline price signal based on brand, model, and configuration.
- `brand_med_price`: The median price for the car's `Brand` (e.g., the typical price for a BMW vs. a Skoda). This captures overall brand positioning.
- `model_med_price`: The median price for the car's `model` (e.g., the typical price for a 3-Series vs. a 1-Series). This captures the model's positioning within the brand.
- `brand_fuel_med_price`: The median price for the car's specific `Brand` and `fuelType` combination (e.g., a Diesel BMW vs. a Petrol BMW).
- `brand_trans_med_price`: The median price for the `Brand` and `transmission` combination (e.g., an Automatic BMW vs. a Manual BMW).


<u>Normalized & Relative Features</u>

These features compare a car to its peers rather than using absolute values.
- `*_anchor` (e.g., `brand_med_price_anchor`): Created by dividing the median price features (from the *Price Anchor Features* group above) by the `overall_mean_price`. This makes the feature dimensionless and represents the group's price *relative* to the entire market (e.g., "this brand is 1.5x the market average").
- `age_rel_brand`: Calculated as `age - brand_median_age`. This shows if a car is newer or older than the *typical* car for that specific brand, capturing relative age within its own group.


<u>CV-Safe Target Encodings</u>

This is an advanced technique to encode categorical variables (like `model` or `Brand`) using information from the target variable (`price`) without causing data leakage.
- `*_te` (e.g., `model_te`): Represents the *average price* for that category (e.g., the average price for a "Fiesta").
- **Why is it "CV-Safe"?** Instead of just calculating the global average price for "Fiesta" and applying it to all rows (which leaks target information), this method uses K-Fold cross-validation. For each fold of the data, the target encoding is calculated *only* from the *other* folds. This ensures the encoding for any given row never includes its own price, preventing leakage and leading to a more robust model.

In [None]:
# TODO maybe add ('poly', PolynomialFeatures(degree=2)) to the pipeline for interaction terms ~J

#### 2.6 Encoding, Transforming and Scaling

**Our approach:**
- We identify different treatments for different groups of variables and combine all of them in the ColumnTransformer

In [None]:
# Original features
orig_numeric_features = ["year", "mileage", "tax", "mpg", "engineSize", "previousOwners", "hasDamage"]
# TODO create origic_boolean_features for hasDamage ~J
orig_categorical_features = ["Brand", "model", "transmission", "fuelType"]

##### [2.6.0 Baseline Pipe]

**Our approach:**
- We create two different pipelines to compare our fully adjusted model to the baseline
    - original features with simple imputer
    - engineered features with group imputer

In [None]:
# Baseline preprocessing (preprocessor_orig): ONLY ORIGINAL FEATURES, SIMPLE IMPUTATION

# Categorical branch: fill missing values with the mode (instead of an "Unknown" category), then one-hot encode
categorical_transformer_orig = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: simple global median imputation, then robust scaling
numeric_transformer_orig = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    # FunctionTransformer() without a func passes the data through unchanged
    # TODO dont we have to add the to_float_array function here too? ~J
    ("to_float", FunctionTransformer()),
    ("scaler", RobustScaler()),
])

# Route the original numeric / categorical columns to their branch
preprocessor_orig = ColumnTransformer(transformers=[
    ("num", numeric_transformer_orig, orig_numeric_features),
    ("cat", categorical_transformer_orig, orig_categorical_features),
])

##### 2.6.1 Categorize Features

**Our approach:**
- Numeric Features:
    - For scaling:
        - ...
    - For log-transformation before scaling (due to right-skew identified in EDA):
        - ...
- Categorical Features:
    - For OHE:
        - ...
    - For TE:
        - ...
- Unused Features:
    - `year`: dropped because replaced by derived feature 'age'
    - `hasDamage`: dropped because unclear what 0 and NaN mean
    - `paintQuality`: dropped because added by mechanic so not available for our predictions in production

In [None]:
# Assign every feature (original and engineered) to the treatment it receives in the ColumnTransformer.
# TODO maybe this step can also be automated within the pipeline ~J
numeric_features = [
    "hasDamage",
    "age", "tax", "mpg", "engineSize", "previousOwners",        # Original features (mileage is handled separately because of log transformation)
    "mpg_x_engine",                                             # TODO this feature does not really make sense, however it improves MAE slightly (3) ~J
    "engine_x_age", "mpg_x_age", "tax_x_age",                   # multiplication interaction features (multiplying for amplification)
    "engine_per_mpg", "tax_per_mpg",                            # division interaction features (division for normalization for ratios (efficiency))
    "model_freq",
    "age_rel_brand", "age_rel_model", "engine_rel_model"
]
numeric_features_for_log = ["mileage", "miles_per_year"] #, "mileage_x_age"] # mileage_x_age decreases performance slightly
boolean_features = ["hasDamage"]                                # TODO create logic for boolean features in GroupImputer and ColumnTransformer
categorical_features_ohe = ["transmission", "fuelType"]
# categorical_features_te_mean = ["Brand", "model"]             # TODO currently not used because median TE is used
categorical_features_te_median = ["Brand", "model",             # original features
                                  "brand_fuel", "brand_trans"]  # engineered features for anchors
unused_columns = ["year"]                                       # replaced by age

# Every distinct input column, in first-seen order.
# "hasDamage" is listed in both numeric_features and boolean_features, so a plain concatenation
# would contain it twice and the printed sanity count would be one too high.
# dict.fromkeys drops the duplicate while keeping the order.
all_feature_names_before_encoding = list(dict.fromkeys(
    numeric_features
    + numeric_features_for_log
    + boolean_features
    + categorical_features_ohe
    + categorical_features_te_median
))
print(len(all_feature_names_before_encoding))

##### 2.6.2 Create Pipelines for each feature type

**Our approach:**
- `Transformation` for skewed Numerics:
    - Log-transform for right-skewed variables
    - box-cox not used in final pipe because ...
- `Scaler` for Numerics:
    - StandardScaler because ...
    - MinMaxScaler performed worse because ...
    - RobustScaler performed worse because ...
- `Encoding` for Categoricals:
    - Low cardinality:
        - OHE because best performance with tree-based models
    - High Cardinality:
        - Median TE on categorical features because performs better than Mean TE
- `ColumnTransformer`: All operations are combined in the ColumnTransformer which applies the different steps to different columns of the data in one unified pipeline (reproducible and prevents data leakage)    
  -> outputs a combined feature matrix

In [None]:
# One sub-pipeline per feature type; all of them are combined in the ColumnTransformer at the end of this cell.
# TODO two different scalers are used here. Isnt it better to use exactly one? ~J

# Numeric features without log transform: to_float_array, then RobustScaler
numeric_scaler = Pipeline(steps=[
    ("to_float", NamedFunctionTransformer(to_float_array, feature_names=numeric_features, validate=False)),
    ("scaler", RobustScaler()),
])

# Numeric features that are log-transformed before scaling: to_float_array, log1p, then RobustScaler
log_transformer_and_scaler = Pipeline(steps=[
    ("to_float", NamedFunctionTransformer(to_float_array, feature_names=numeric_features_for_log, validate=False)),
    # log1p handles zeros safely
    ("log", NamedFunctionTransformer(np.log1p, feature_names=numeric_features_for_log, validate=False)),
    ("scaler", RobustScaler()),
])

# One-hot encoding; categories unseen during fit are ignored, output is a dense array
categorical_transformer_ohe = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Keep mean target encoder in the code but dont use it for now because median TE seems more robust and we use only one method for consistency ~J
# categorical_transformer_te_mean = Pipeline([ 
#     ("encoder", TargetEncoder(target_type='continuous', cv=5, smooth='auto', random_state=42)), # Prevents data leakage with CV (e.g. for the samples in Fold 1, it calculates the target mean using the data from Folds 2, 3, 4, and 5) # TODO If it overfits test data too much, increasing the smoothing parameter can help
#     ("scaler", StandardScaler()),
# ])

# Median target encoding.
# Output names: one per input column, since QuantileEncoder outputs 1 column per feature.
median_te_feature_names = [f"{te_col}_median_te" for te_col in categorical_features_te_median]
categorical_transformer_te_median = Pipeline(steps=[
    # Not specifying the cols means it encodes all columns.
    # m is the smoothing parameter -> smoothing mitigates but doesnt eliminate leakage. # TODO tune m?
    ("median_encoder", QuantileEncoder(quantile=0.5, m=10.0)),
    # TODO Why is to_float not needed here but in other pipelines? ~J
    ("scaler", RobustScaler()),
    ("name_wrapper", NamedFunctionTransformer(feature_names=median_te_feature_names, validate=False)),
])

# Combine all branches; the order of the entries determines the column order of the output matrix
enc_transf_scale = ColumnTransformer(transformers=[
    ("log", log_transformer_and_scaler, numeric_features_for_log),
    ("num", numeric_scaler, numeric_features),
    ("cat", categorical_transformer_ohe, categorical_features_ohe),
    # ("mean_te", categorical_transformer_te_mean, categorical_features_te_mean), # Mean TE is currently not used but we keep it in the code for reference or later experimenting ~J
    ("median_te", categorical_transformer_te_median, categorical_features_te_median),
])

#### 2.7 Feature Selection

**Our approach:**
- We try to generalize feature selection as much as possible to find features that are most likely actually irrelevant/redundant and therefore generate noise in the model that might lead to overfitting
- To achieve that goal we use a two-step technique: (1) We filter, (2) We use majority voting of multiple techniques:
    - *Filter* methods to make an initial screening of the statistical properties of the data: 
        - `VarianceThreshold` to remove constant features. It serves as a cheap, fast garbage collector.
        - `Correlation Indices` to identify redundant features and analyze the relationship with the target to identify irrelevant features (leakage-proof)
        - `Statistical Hypothesis Testing` to identify irrelevant features by measuring the relationship with the target
        - `Mutual Information` also considers nonlinear features and is implemented in the following way in sklearn (KNN approach for regression)
            1. It looks at a data point in the Feature space (e.g., a specific car's mileage)
            2. It finds the k closest neighbors (other cars with similar mileage)
            3. It checks if those neighbors also have similar Prices (Target)
            4. If neighbors in Mileage are consistently neighbors in Price, the MI score goes up.
    - *Wrapper*:
        - `Recursive Feature Elimination` to eliminate weak features stepwise 
    - *Embedded*: SelectFromModel(RandomForest) inside each model pipeline as a supervised feature selector.
        - This selector is trained within cross-validation and shared across all models, ensuring: _No data leakage, Consistent feature selection logic, Model-agnostic, non-linear evaluation of feature relevance._
        - We only use this supervised selector for our models that are more sensitive to high dimensionality and collinearity (ElasticNet, SVR)

<u>Place in the pipe:</u> The Feature Selection is placed after the scaling to have the features on one scale (just like in the lab)

**Our findings:**
- For tree-based ensemble models (RF, ET, HGB, and SR), our runs showed that explicit feature selection did **not improve** performance, as these models already handle redundant features well through mechanisms like greedy node splitting, feature subsampling (bagging), and iterative error correction.

-----

SKlearn elements we also considered:
- Filter Methods: SelectKBest & SelectPercentile
- Embedded: No more than SelectFromModel
- Wrapper Methods: RFE, RFECV, SequentialFeatureSelector (too expensive)

In [None]:
# TODO maybe use different pipes for tree-based vs. linear vs. NN models to show that we understand the differences ~J

######################## Filter methods ########################
# Variance Threshold (If using a different value than threshold 0, VarianceThreshold has to be applied before scaling while most of the other FS techniques should be applied after scaling)
vt = VarianceThreshold(threshold=0.0)

# Random Forest estimator for model-based (embedded) feature selection: select top features based on importance from the model
fs_estimator_for_fs = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)


######### Define voters
### SelectKBest is a univariate filter -> runs a statistical test on each feature individually (e.g. corr with target) and takes the top k
# Voter: Statistical linear Correlation -> misses nonlinear relationships so has to be combined with other techniques in majority voting
stat_voter_linear_corr = CorrelationThresholdSelector(threshold=0.05)                     # TODO tune threshold? # Use custom correlation threshold selector instead of SelectKBest(score_func=f_regression, k=x) because we dont want to fix that k

# Voter: Mutual Information (Non-Linear Dependency) MI calculates how much the "uncertainty" (Entropy) drops when using different features
# stat_voter_nonlinear_mi = SelectKBest(score_func=mutual_info_regression(n_neighbors=10), k=20)     # TODO tune k? # Increasing n_neighbors makes the estimation more stable but computationally slower
stat_voter_nonlinear_mi = MutualInfoThresholdSelector(threshold=0.01, n_neighbors=10)                # Use custom instead of SelectKBest because we dont want to fix k

### SelectFromModel is a model-based feature selector that uses model importance scores
# Voter: Linear Model (Lasso) (only selects linear features and will kill interaction terms) (!Lasso needs scaling beforehand!)
lasso_for_fs = Lasso(alpha=0.1) # TODO do I have to use LassoCV?
select_from_lasso = SelectFromModel(lasso_for_fs, threshold=0.01)         # Very low threshold to filter out the values set to 0 by lasso. Default prefit=False to fit in the pipeline

# Voter: Tree Importance
# A fixed random_state makes the forest (and therefore this voter's selection) reproducible between runs,
# consistent with fs_estimator_for_fs above; n_jobs=-1 only parallelizes the fit.
rf_for_fs = RandomForestRegressor(max_depth=5, n_jobs=-1, random_state=42)
select_from_rf = SelectFromModel(rf_for_fs, threshold='0.001*mean')         # threshold relative because it sums to 1 and if we have many features, many features will have a low importance but are still important


# Voter: RFE (Recursive Feature Elimination) is excluded for now because it is very computationally expensive


##### Initialize Custom Selector
majority_selector = MajorityVoteSelectorTransformer(
    selectors=[stat_voter_linear_corr,
               stat_voter_nonlinear_mi,
               select_from_lasso,
               select_from_rf],
    min_votes=2                                     # TODO can be tuned in hyperparameter tuning ~J
)


# ==> Final FS pipeline
fs_pipe = Pipeline([
    ("vt", vt), # Apply VT first to remove constant features (it serves as a "dictator" and not a "voter" in our pipeline)
    ('selector', majority_selector),
])

#### 2.9 Create Final Preprocessing Pipeline

**Our approach:**
- The `Pipeline` combines group imputation, feature engineering, the column transformer and feature selection (in this order) into the final preprocessing pipe
    - Group Imputer: see Markdown in Section 2.3
    - Feature Engineering: see Markdown in Section 2.5
    - Column Transformer: see Markdown in Section 2.6
    - Feature Selection: see Markdown in Section 2.7

In [None]:
# Final preprocessing pipe: group imputation -> feature engineering -> column transformer -> feature selection
preprocessor_pipe = Pipeline(steps=[
    (
        "group_imputer",
        GroupImputer(
            fallback="__MISSING__",
            group_cols=("Brand", "model"),
            # We have to use the original features for both column lists
            # because the other features are only engineered in the next step
            num_cols=orig_numeric_features,
            cat_cols=orig_categorical_features,
        ),
    ),
    # TODO add outlier handling step here (maybe even before Imputation)
    ("fe", CarFeatureEngineer(ref_year=2020)),
    ("ct", enc_transf_scale),
    ("fs", fs_pipe),
])

In [None]:
# Visualize outputs of each step in the preprocessing pipeline
show_data = True # needs to be set to False when running the models because 'display' is used
y_data_profiling = False
debug_preprocessor_pipe = Pipeline([
    ('debug_start', DebugTransformer('START', show_data=show_data, y_data_profiling=y_data_profiling)),
    ("group_imputer", GroupImputer(
        group_cols=("Brand", "model"),
        num_cols=orig_numeric_features, # numeric_features + numeric_features_for_log,                      # We have to use the original features here because the others are engineered in the next step
        cat_cols=orig_categorical_features, # categorical_features_ohe + categorical_features_te_median,    # We have to use the original features here because the others are engineered in the next step
        fallback="__MISSING__",
    )),
    ('debug_after_impute', DebugTransformer('AFTER IMPUTATION', show_data=show_data, y_data_profiling=y_data_profiling)),
    ("fe", CarFeatureEngineer(ref_year=2020)),
    ('debug_after_fe', DebugTransformer('AFTER FEATURE ENGINEERING', show_data=show_data, y_data_profiling=y_data_profiling)),
    ("ct", (enc_transf_scale)),
    ('debug_after_ct', DebugTransformer('AFTER COLUMN TRANSFORMER', show_data=show_data, y_data_profiling=y_data_profiling)),
    ("fs", (fs_pipe)),
    ('debug_after_fs', DebugTransformer('AFTER FEATURE SELECTION', show_data=show_data, y_data_profiling=y_data_profiling))
])

# Switch the column transformer and the feature selection to pandas output for the debug run.
# NOTE: enc_transf_scale and fs_pipe are the very same objects that preprocessor_pipe uses,
# so this setting (and the fit below) also affects them.
enc_transf_scale.set_output(transform="pandas")
fs_pipe.set_output(transform="pandas")
try:
    print("Show outputs of each step in the preprocessing pipeline:") # Set show_data=True in DebugTransformer to see the data at each step
    X_result = debug_preprocessor_pipe.fit_transform(X_train, y_train)
finally:
    # Reset output to default (numpy arrays) for model training.
    # Done in `finally` so that a failing debug run cannot leave the shared transformers in pandas mode.
    enc_transf_scale.set_output(transform="default")
    fs_pipe.set_output(transform="default")

In [None]:
# Feed the feed names after VT because VT is applied before the majority voting to remove constant features
feature_names_after_vt = debug_preprocessor_pipe.named_steps['fs'].named_steps['vt'].get_feature_names_out()
plot_selector_agreement(
    majority_selector = debug_preprocessor_pipe.named_steps['fs'].named_steps['selector'], 
    feature_names = feature_names_after_vt
)

In [None]:
# # TODO delete following cell later - this is for us to see if the group imputer works - but it is GPT slop

# brand = "VW"
# model = "golf"

# # 1) Get the fitted steps from preprocessor_pipe
# preprocessor_pipe.fit(X_train, y_train)
# fe = preprocessor_pipe.named_steps["fe"]              # CarFeatureEngineer
# imp = preprocessor_pipe.named_steps["group_imputer"]  # GroupImputer

# # 2) Inspect GroupImputer internal numeric stats
# pair_table = getattr(imp, "num_pair_", None)    # indexed by (_g0, _g1) = (Brand, model)
# brand_table = getattr(imp, "num_first_", None)  # indexed by _g0 = Brand
# global_med = getattr(imp, "num_global_", None)  # Series of global medians

# print("Has pair-level medians table:",
#       pair_table is not None and not getattr(pair_table, "empty", True))
# print("Has brand-level medians table:",
#       brand_table is not None and not getattr(brand_table, "empty", True))
# print("Has global median:",
#       global_med is not None and not global_med.empty if global_med is not None else False)
# print()

# _g0 = brand
# _g1 = model

# # 2a) Pair-level
# if pair_table is not None and (_g0, _g1) in pair_table.index:
#     print(f"Pair-level median FOUND for ({brand}, {model}):")
#     display(pair_table.loc[(_g0, _g1)])
# else:
#     print(f"No pair-level median for ({brand}, {model}).")
#     if pair_table is not None and not pair_table.empty:
#         print("Sample of pair-level medians (top 5):")
#         display(pair_table.head())

# # 2b) Brand-level
# if brand_table is not None and _g0 in brand_table.index:
#     print(f"\nBrand-level median for {brand}:")
#     display(brand_table.loc[_g0])
# else:
#     print("\nNo brand-level median for", brand)
#     if brand_table is not None and not brand_table.empty:
#         print("Sample of brand-level medians (top 5):")
#         display(brand_table.head())

# # 2c) Global medians
# print("\nGlobal median (fallback):")
# display(global_med)

# # 3) Apply CarFeatureEngineer + GroupImputer to VW Golf rows and compare
# #    (GroupImputer was fitted after CarFeatureEngineer, so we must mimic that order)

# # 3a) Feature engineering on full X_train
# X_train_fe = fe.transform(X_train)

# # 3b) Filter for VW Golf in the feature-engineered space
# vw_golf = X_train_fe[(X_train_fe["Brand"] == brand) & (X_train_fe["model"] == model)].copy()

# if vw_golf.empty:
#     print("\nNo VW Golf rows found in X_train.")
# else:
#     print(f"\nFound {len(vw_golf)} VW Golf rows in X_train.")

#     # 3c) GroupImputer expects the columns it saw at fit time
#     cols_for_imp = imp.feature_names_in_
#     vw_input = vw_golf.loc[:, cols_for_imp]

#     vw_imp = imp.transform(vw_input)
#     vw_imp_df = pd.DataFrame(vw_imp, columns=cols_for_imp, index=vw_golf.index)

#     print("\nImputed data (first 8 rows):")
#     display(vw_imp_df[["mpg", "mileage", "tax"]].head(8))

#     # 4) Build comparison table (original vs imputed, for selected columns)
#     comp = pd.DataFrame(index=vw_golf.index)
#     comp["orig_mpg"] = vw_golf["mpg"]
#     comp["imp_mpg"] = vw_imp_df["mpg"]
#     comp["orig_tax"] = vw_golf["tax"]
#     comp["imp_tax"] = vw_imp_df["tax"]
#     comp["orig_mileage"] = vw_golf["mileage"]
#     comp["imp_mileage"] = vw_imp_df["mileage"]

#     print("\nOriginal vs imputed (first 12 rows):")
#     display(comp.head(12))

#     # 5) Determine imputation source per row
#     def source_of_imputation(col):
#         srcs = []
#         for idx, row in comp.iterrows():
#             val = row[f"imp_{col}"]
#             src = "other"

#             # Pair-level
#             if pair_table is not None and (_g0, _g1) in pair_table.index and col in pair_table.columns:
#                 pair_val = pair_table.loc[(_g0, _g1), col]
#                 if pd.notna(pair_val) and pd.notna(val) and val == pair_val:
#                     src = "pair"

#             # Brand-level
#             if src == "other" and brand_table is not None and _g0 in brand_table.index and col in brand_table.columns:
#                 brand_val = brand_table.loc[_g0, col]
#                 if pd.notna(brand_val) and pd.notna(val) and val == brand_val:
#                     src = "brand"

#             # Global
#             if src == "other" and global_med is not None and col in global_med.index:
#                 glob_val = global_med[col]
#                 if pd.notna(glob_val) and pd.notna(val) and val == glob_val:
#                     src = "global"

#             srcs.append(src)
#         return srcs

#     comp["src_mpg"] = source_of_imputation("mpg")
#     comp["src_tax"] = source_of_imputation("tax")
#     comp["src_mileage"] = source_of_imputation("mileage")

#     print("\nImputation sources for the shown rows:")
#     display(comp.head(12))

#     # 6) Summary counts: NaN before vs after imputation
#     print("\nSummary counts: NaN before -> NaN after")
#     before = vw_golf[["mpg", "mileage", "tax"]].isna().sum()
#     after = pd.DataFrame({
#         "mpg": comp["imp_mpg"],
#         "mileage": comp["imp_mileage"],
#         "tax": comp["imp_tax"],
#     }).isna().sum()
#     display(pd.DataFrame({"na_before": before, "na_after": after}))

### 4. Model Evaluation Metrics, Baselining, Setup

#### 4.1 Model Evaluation Metrics

**MAE (Mean Absolute Error):**
- average absolute deviation between predicted and true car prices
- easy to interpret in pounds, same metric used by Kaggle competition

**RMSE (Root Mean Squared Error):**
- sensitive to outliers, helps identify large prediction errors

**R²:**
- Coefficient of determination: proportion of variance explained by the model
- 1.0 = perfect predictions, 0.0 = same as predicting mean, < 0.0 = worse than mean

#### 4.2 Baseline (median)

In [None]:
# Reference point for all later models: a DummyRegressor that always predicts
# the median price, run through the same preprocessing and the same CV scorers.
baseline_pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor_pipe),
        ("model", DummyRegressor(strategy="median")),
    ]
)

baseline_cv = cross_validate(
    estimator=baseline_pipe,
    X=X_train,
    y=y_train,
    scoring=dict(
        neg_mae="neg_mean_absolute_error",
        neg_mse="neg_mean_squared_error",
        r2="r2",
    ),
    cv=3,
    n_jobs=-2,
    verbose=0,
)

# The scorers are negated losses, so flip the sign back.
# RMSE is the square root of the fold-averaged MSE.
baseline_mae = -np.mean(baseline_cv["test_neg_mae"])
baseline_rmse = np.sqrt(-np.mean(baseline_cv["test_neg_mse"]))
baseline_r2 = np.mean(baseline_cv["test_r2"])

print("Baseline (median) performance on CV:")
print(f"MAE:  {baseline_mae:,.4f}")
print(f"RMSE: {baseline_rmse:,.4f}")
print(f"R2:   {baseline_r2:,.4f}")

# Baseline (median) with engineered features & CV:
# MAE:  6,801.8131
# RMSE: 9,981.0186
# R2:   -0.0508

#### 4.3 Pipeline Definitions (preprocessor + model)

##### 4.3.1 Using Baseline Preprocessing

In [None]:
# Benchmark pipelines built on the baseline ("orig") preprocessing.
# Every pipeline below receives the same `preprocessor_orig` object as its
# "preprocess" step, followed by one estimator as the "model" step.

# Linear model: ElasticNet with an even L1/L2 mix (l1_ratio=0.5).
elastic_pipe_orig = Pipeline([
    ("preprocess", preprocessor_orig),
    ("model", ElasticNet(
        alpha=0.01,
        l1_ratio=0.5,
        max_iter=30000,
        tol=1e-4,
        selection="cyclic",
        random_state=42,
    )),
])

# Gradient boosting with early stopping on a 10% internal validation split.
hgb_pipe_orig = Pipeline([
    ("preprocess", preprocessor_orig),
    ("model", HistGradientBoostingRegressor(
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=20,
        l2_regularization=0.5,
        random_state=42,
    )),
])

# Random forest: 300 bootstrapped trees, sqrt feature sampling per split.
rf_pipe_orig = Pipeline([
    ("preprocess", preprocessor_orig),
    ("model", RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        min_samples_split=3,
        min_samples_leaf=2,
        max_features="sqrt",
        bootstrap=True,
        n_jobs=-1,
        random_state=42,
    )),
])

# Extra trees: 400 trees fitted without bootstrapping (bootstrap=False).
et_pipe_orig = Pipeline([
    ("preprocess", preprocessor_orig),
    ("model", ExtraTreesRegressor(
        n_estimators=400,
        max_depth=None,
        min_samples_leaf=2,
        max_features="sqrt",
        bootstrap=False,
        n_jobs=-1,
        random_state=42,
    )),
])

# Kernel model: RBF support vector regression.
svr_pipe_orig = Pipeline([
    ("preprocess", preprocessor_orig),
    ("model", SVR(
        kernel="rbf",
        C=10,
        epsilon=0.1,
        gamma="scale",
    )),
])

# Stacking ensemble: the three pipelines above act as base learners and a small
# HistGradientBoostingRegressor combines their predictions. With
# passthrough=False the meta-learner sees only the base predictions.
# NOTE(review): this stack uses ElasticNet/HGB/RF, whereas the engineered-feature
# stack uses HGB/RF/ET - confirm the difference is intended.
stack_pipe_orig = StackingRegressor(
    estimators=[
        ("elastic_orig", elastic_pipe_orig),
        ("hgb_orig", hgb_pipe_orig),
        ("rf_orig", rf_pipe_orig),
    ],
    final_estimator=HistGradientBoostingRegressor(
        learning_rate=0.05,
        max_depth=5,
        l2_regularization=0.5,
        random_state=42,
    ),
    passthrough=False,
    n_jobs=-1,
)

##### 4.3.2 Using optimized Preprocessing

In [None]:
def create_model_pipe(prepro_pipe, model):
    """
    Chain a preprocessing pipeline and an estimator into one sklearn Pipeline.

    Parameters
    ----------
    prepro_pipe : transformer or Pipeline
        Used as the "preprocess" step. It is passed through as-is (not cloned).
    model : estimator
        Used as the final "model" step.

    Returns
    -------
    Pipeline
        Two-step pipeline: "preprocess" followed by "model".
    """
    steps = [("preprocess", prepro_pipe), ("model", model)]
    return Pipeline(steps)


# Model definitions on the optimized ("fe") preprocessing.
# Each estimator is created once and wrapped with `create_model_pipe`, which
# puts the same `preprocessor_pipe` object in front of it as the "preprocess" step.

### LINEAR MODEL
# ElasticNet with an even L1/L2 mix (l1_ratio=0.5)
elastic_net_model = ElasticNet(
        alpha=0.01,
        l1_ratio=0.5,
        max_iter=30000,
        tol=1e-4,
        selection="cyclic",
        random_state=42,
    )
elastic_pipe_fe = create_model_pipe(preprocessor_pipe, elastic_net_model)

### TREE MODELS
# HistGradientBoostingRegressor with early stopping on a 10% internal validation split
hgb_model = HistGradientBoostingRegressor(
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=20,
        l2_regularization=0.5,
        random_state=42,
    )
hgb_pipe_fe = create_model_pipe(preprocessor_pipe, hgb_model)

# RandomForestRegressor: 300 bootstrapped trees, sqrt feature sampling per split
rf_model = RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        min_samples_split=3,
        min_samples_leaf=2,
        max_features="sqrt",
        bootstrap=True,
        n_jobs=-1,
        random_state=42,
    )
rf_pipe_fe = create_model_pipe(preprocessor_pipe, rf_model)

# ExtraTreesRegressor: 400 trees fitted without bootstrapping
et_model = ExtraTreesRegressor(
        n_estimators=400,
        max_depth=None,
        min_samples_leaf=2,
        max_features="sqrt",
        bootstrap=False,
        n_jobs=-1,
        random_state=42,
    )
et_pipe_fe = create_model_pipe(preprocessor_pipe, et_model)

### KERNEL-BASED MODEL (SVR)
# RBF support vector regression
svr_model = SVR(
        kernel="rbf",
        C=10,
        epsilon=0.1,
        gamma="scale",
    )
svr_pipe_fe = create_model_pipe(preprocessor_pipe, svr_model)

### ENSEMBLE META MODEL (Stacking)
# The three tree pipelines are the base learners; a small
# HistGradientBoostingRegressor combines their predictions. With
# passthrough=False the meta-learner sees only the base predictions.
stack_pipe_fe = StackingRegressor(
    estimators=[
        ("hgb_fe", hgb_pipe_fe),
        ("rf_fe", rf_pipe_fe),
        ("et_fe", et_pipe_fe),
    ],
    final_estimator=HistGradientBoostingRegressor(
        learning_rate=0.05,
        max_depth=5,
        l2_regularization=0.5,
        random_state=42,
    ),
    passthrough=False,
    n_jobs=-1,
)

#### 4.4 First run of models

In [None]:
# # TODO uncomment (currently its commented to save time during experimentation)

# # First evaluation of metrics based on original and engineered feature pipeline to decide how to proceed


# models_orig = {
#     # "ElasticNet_orig": elastic_pipe_orig,
#     "HGB_orig": hgb_pipe_orig,
#     "RF_orig": rf_pipe_orig,
#     "ET_orig": et_pipe_orig,
#     "SVR_orig": svr_pipe_orig,
#     "Stack_orig": stack_pipe_orig,
# }

# models_fe = {
#     # "ElasticNet_fe": elastic_pipe_fe,
#     "HGB_fe": hgb_pipe_fe,
#     "RF_fe": rf_pipe_fe,
#     "ET_fe": et_pipe_fe,
#     "SVR_fe": svr_pipe_fe,
#     "Stack_fe": stack_pipe_fe,
# }

# results = []

# for name, model in {**models_orig, **models_fe}.items():
#     print(f"Fitting {name} with cross-validation...")
    
#     # Perform cross-validation on the entire training set
#     cv_results = cross_validate(
#         model, 
#         X_train, 
#         y_train,
#         cv=3,
#         scoring={
#             'neg_mae': 'neg_mean_absolute_error',
#             'neg_mse': 'neg_mean_squared_error',
#             'r2': 'r2'
#         },
#         return_train_score=False,
#         verbose=3,
#         n_jobs=-2
#     )
    
#     # Calculate mean metrics across folds
#     mae = -cv_results['test_neg_mae'].mean()
#     rmse = np.sqrt(-cv_results['test_neg_mse'].mean())
#     r2 = cv_results['test_r2'].mean()
    
#     results.append({
#         "model": name,
#         "feature_set": "original" if name.endswith("_orig") else "engineered",
#         "MAE": mae,
#         "RMSE": rmse,
#         "R2": r2,
#     })

# results_df = (
#     pd.DataFrame(results)
#       .sort_values(["feature_set", "MAE"])
#       .reset_index(drop=True)
# )

# print(results_df)

# # Long Duration (with orig ca 25mins VS without orig ca 6mins VS with CV ca 16mins VS with njobs=-1 ca )

# # Predicted on hold-out val set (20%):
# #       model feature_set          MAE          RMSE        R2
# # 0     RF_fe  engineered  1299.728938  4.509435e+06  0.950490
# # 1  Stack_fe  engineered  1321.130612  4.831609e+06  0.946953
# # 2     ET_fe  engineered  1328.051439  4.707534e+06  0.948315
# # 3    HGB_fe  engineered  1534.496164  5.609255e+06  0.938415
# # 4    SVR_fe  engineered  2955.064750  3.242891e+07  0.643956

# # Predicted using 3-fold CV on entire data:
# #       model feature_set          MAE         RMSE        R2
# # 0     RF_fe  engineered  1336.806163  2375.850617  0.940424
# # 1  Stack_fe  engineered  1357.266391  2505.029128  0.933786
# # 2     ET_fe  engineered  1364.212656  2399.654669  0.939223
# # 3    HGB_fe  engineered  1551.419964  2503.445871  0.933858
# # 4    SVR_fe  engineered  3068.524237  6130.420383  0.603579

### 5. Hyperparameter Tuning and Model Evaluation

**Our approach:**
- After first experiments we decided to skip hyperparameter-tuning for SVR and ET

##### 5.1 ElasticNet

In [None]:
# TODO this cell is commented because we dont evaluate elasticnet for final performance (save time)
# # Hyperparameter Tuning: ElasticNet

# elastic_param_grid = {
#     "model__alpha": [0.001],    # also tried 0.01, 0.05, 0.1, 0.5
#     "model__l1_ratio": [0.9]    # also tried 0.1, 0.3, 0.5, 0.7  
# }

# # CV: Calculate all metrics but use MAE for selecting best model
# elastic_grid = GridSearchCV(
#     elastic_pipe_fe, 
#     param_grid=elastic_param_grid,
#     cv=5,
#     scoring={
#         'mae': 'neg_mean_absolute_error',
#         'mse': 'neg_mean_squared_error',
#         'r2': 'r2'
#     },
#     refit='mae', # Refit the best model based on MAE on the whole training set
#     n_jobs=-2,
#     verbose=3,
#     return_train_score=False
# )
# elastic_grid.fit(X_train, y_train)

# # Get mean metrics across folds
# mae = -elastic_grid.cv_results_['mean_test_mae'][elastic_grid.best_index_]
# mse = -elastic_grid.cv_results_['mean_test_mse'][elastic_grid.best_index_]
# rmse = np.sqrt(mse)
# r2 = elastic_grid.cv_results_['mean_test_r2'][elastic_grid.best_index_]
# print("ElasticNet Results (CV on entire train set):")
# print(f"MAE: {mae:.4f}")
# print(f"RMSE: {rmse:.4f}")
# print(f"R²: {r2:.4f}")
# print("Best ElasticNet params:", elastic_grid.best_params_)

# elastic_best = elastic_grid.best_estimator_ # Final model trained on entire training set with best hyperparameters minimizing MAE

# # Long Duration (Before removal of OHE-categoricals interrupted kernel after 64mins VS after removal ca 1min -> now 15secs with njobs=-2)

# # ElasticNet Results: 
# # MAE: 2353.9112 | RMSE: 13356867.7860 | R2: 0.8534
# # Best ElasticNet params: {'model__alpha': 0.001, 'model__l1_ratio': 0.9}

# # MAE: 2589.6100
# # RMSE: 4104.4515
# # R²: 0.8222
# # Best ElasticNet params: {'model__alpha': 0.001, 'model__l1_ratio': 0.9}

In [None]:
# # TODO this cell is commented because of time constraints
# # Use GridSearchCV for features_to_select
# # Base model: tuned ElasticNet from above
# en_base = clone(elastic_best.named_steps["model"])

# # Pipeline: clean preprocessing -> RFE -> model
# rfe_pipe_linear = Pipeline([
#     ("preprocess", preprocessor_fe_clean),
#     ("rfe", RFE(
#         estimator=en_base,
#         step=0.5,               # float in (0, 1): fraction of features removed per iteration (0.5 = 50%)
#         importance_getter="auto"
#     )),
#     ("model", clone(en_base))
# ])

# # Try a few target feature counts (adjust as needed)
# number_of_all_features = preprocessor_fe_clean.transform(X_train).shape[1]
# rfe_param_grid = {
#     "rfe__n_features_to_select": [int(number_of_all_features*0.5)]# , int(number_of_all_features*1)] # use only these two extremes to save time ~J
# }

# rfe_grid = GridSearchCV(
#     rfe_pipe_linear,
#     param_grid=rfe_param_grid,
#     cv=2,
#     scoring="neg_mean_absolute_error",
#     n_jobs=-1,
#     verbose=3,
#     return_train_score=False,
# )

# rfe_grid.fit(X_train, y_train)

# print("Best n_features_to_select:", rfe_grid.best_params_["rfe__n_features_to_select"])
# print("MAE (CV):", -rfe_grid.best_score_)
# rfe_best = rfe_grid.best_estimator_

# # list kept features
# best_rfe = rfe_best.named_steps["rfe"]
# all_feats = rfe_best.named_steps["preprocess"].get_feature_names_out()
# kept = [f for f, keep in zip(all_feats, best_rfe.support_) if keep]
# print("Kept features:", kept)


**Reasoning**: We used 100 features as an initial, arbitrary cutoff for feature selection in the ElasticNet model. Preliminary experiments and insights from the EDA (see separate notebook) indicated that tree-based methods are likely to perform better. Therefore, we prioritized feature selection for the tree-based models based on SHAP values.
 

##### 5.2 HistGradientBoosting

In [None]:
# Broad search space used in earlier tuning runs. It is kept for reference only:
# the assignment further down overwrites `hgb_param_dist` before it is used.
# NOTE(review): `uniform(a, b)` / `randint(a, b)` are presumably scipy.stats
# distributions (uniform samples from [a, a + b], randint excludes the upper
# bound) - confirm against the import cell.
hgb_param_dist = {
    "preprocess__fs__vt__threshold": [0.0, 0.005, 0.01],
    # "preprocess__fs__selector__min_votes": [1, 2, 3],  # TODO try different vote thresholds for MajorityVoteSelectorTransformer in FS pipeline (key must start with the pipeline step name "preprocess")
    "model__learning_rate": uniform(0.01, 0.15),       # samples values
    "model__max_leaf_nodes": randint(50, 170),         
    "model__min_samples_leaf": randint(2, 20),         # samples leaf sizes between 2–20
    "model__max_iter": randint(200, 900),              # tries 200–900 iterations
    "model__l2_regularization": uniform(0.0, 1.0),      # samples small regularization values
    "model__early_stopping": [True],
    "model__validation_fraction": [0.1],
    "model__n_iter_no_change": [20],
    "model__random_state":[42]
}

# optimized the parameter distributions based on previous runs to focus search space
# Every entry is a single-value list, so this pins the best configuration found
# so far; this is the dict that is actually passed to the tuning helper below.
hgb_param_dist = {
    "preprocess__fs__vt__threshold": [0.0],
    # "preprocess__fs__selector__min_votes": [1, 2, 3],  # TODO try different vote thresholds for MajorityVoteSelectorTransformer in FS pipeline (key must start with the pipeline step name "preprocess")
    "model__learning_rate": [0.05889383578028271],
    "model__max_leaf_nodes": [139],
    "model__min_samples_leaf": [4],
    "model__max_iter": [602],
    "model__l2_regularization": [0.8583588048137198],
    "model__early_stopping": [True],
    "model__validation_fraction": [0.1],
    "model__n_iter_no_change": [20],
    "model__random_state":[42]
}
    
# NOTE(review): with only one candidate configuration, n_iter=100 cannot sample
# 100 distinct settings - confirm how `model_hyperparameter_tuning` handles this.
hgb_best = model_hyperparameter_tuning(X_train, y_train, hgb_pipe_fe, hgb_param_dist, n_iter=100, splits=5)

# Before FS (last step was: Add "mpg_x_age", "tax_x_age")
# MAE: 1335.6473
# RMSE: 2308.0557
# R²: 0.9439
# Best Model params: {'model__validation_fraction': 0.1, 'model__random_state': 42, 'model__n_iter_no_change': 20, 'model__min_samples_leaf': 4, 'model__max_leaf_nodes': 139, 'model__max_iter': 602, 'model__learning_rate': 0.05889383578028271, 'model__l2_regularization': 0.8583588048137198, 'model__early_stopping': True, 'fs__vt__threshold': 0.0}

# use mean instead of median in  "age_rel_brand" because most of the values were 0 otherwise
# MAE: 1323.1182
# RMSE: 2300.0353
# R²: 0.9443
# Best Model params: {'model__validation_fraction': 0.1, 'model__random_state': 42, 'model__n_iter_no_change': 20, 'model__min_samples_leaf': 4, 'model__max_leaf_nodes': 139, 'model__max_iter': 602, 'model__learning_rate': 0.05889383578028271, 'model__l2_regularization': 0.8583588048137198, 'model__early_stopping': True, 'fs__vt__threshold': 0.0}

# Add 'age_rel_model'
# MAE: 1323.7268
# RMSE: 2307.2668
# R²: 0.9439
# Best Model params: {'preprocess__fs__vt__threshold': 0.0, 'model__validation_fraction': 0.1, 'model__random_state': 42, 'model__n_iter_no_change': 20, 'model__min_samples_leaf': 4, 'model__max_leaf_nodes': 139, 'model__max_iter': 602, 'model__learning_rate': 0.05889383578028271, 'model__l2_regularization': 0.8583588048137198, 'model__early_stopping': True}

# Add 'engine_rel_model'
# MAE: 1314.7631
# RMSE: 2300.8177
# R²: 0.9442
# Best Model params: {'preprocess__fs__vt__threshold': 0.0, 'model__validation_fraction': 0.1, 'model__random_state': 42, 'model__n_iter_no_change': 20, 'model__min_samples_leaf': 4, 'model__max_leaf_nodes': 139, 'model__max_iter': 602, 'model__learning_rate': 0.05889383578028271, 'model__l2_regularization': 0.8583588048137198, 'model__early_stopping': True}

# Use engine_per_mpg instead of mpg_per_engine
# MAE: 1319.4956
# RMSE: 2301.7438
# R²: 0.9442
# Best Model params: {'preprocess__fs__vt__threshold': 0.0, 'model__validation_fraction': 0.1, 'model__random_state': 42, 'model__n_iter_no_change': 20, 'model__min_samples_leaf': 4, 'model__max_leaf_nodes': 139, 'model__max_iter': 602, 'model__learning_rate': 0.05889383578028271, 'model__l2_regularization': 0.8583588048137198, 'model__early_stopping': True}

# After adding FS pipe
# MAE: 1305.3789
# RMSE: 2286.3747
# R²: 0.9449
# Best Model params: {'preprocess__fs__vt__threshold': 0.0, 'model__validation_fraction': 0.1, 'model__random_state': 42, 'model__n_iter_no_change': 20, 'model__min_samples_leaf': 4, 'model__max_leaf_nodes': 139, 'model__max_iter': 602, 'model__learning_rate': 0.05889383578028271, 'model__l2_regularization': 0.8583588048137198, 'model__early_stopping': True}

##### 5.3 RandomForest

In [None]:
# Old parameter distribution
# Kept for reference only: the assignment further down overwrites
# `rf_param_dist` before it is passed to the tuning helper.
rf_param_dist = {
    "preprocess__fs__vt__threshold": [0.0, 0.005, 0.01],
    "model__n_estimators": randint(200, 600),        # number of trees
    "model__max_depth": randint(5, 40),              # depth of each tree
    "model__min_samples_split": randint(2, 10),      # min samples to split an internal node
    "model__min_samples_leaf": randint(1, 8),        # min samples per leaf
    "model__max_features": ["sqrt"],           # feature sampling strategy (sqrt performed better than log2 and None in previous tests)
    "model__bootstrap": [False]                      # use bootstrapping or not (False performed better than True in previous tests)
}

# So far best parameter distribution based on previous runs to focus search space
# Every entry is a single-value list, i.e. one fixed configuration.
rf_param_dist = {
    "preprocess__fs__vt__threshold": [0.005],
    "model__n_estimators": [328],
    "model__max_depth": [20],
    "model__min_samples_split": [5],
    "model__min_samples_leaf": [1],
    "model__max_features": ["sqrt"],
    "model__bootstrap": [False],
}

# Uses the helper's default n_iter / splits (defined elsewhere in the notebook).
rf_best_rand = model_hyperparameter_tuning(X_train, y_train, rf_pipe_fe, rf_param_dist)

# Persist the tuned result; the relative path resolves against the current working directory.
joblib.dump(rf_best_rand, "rf_best_rand.pkl")


# Long Duration (~1min)

# Before FS (last step was: Add "mpg_x_age", "tax_x_age")
# MAE: 1310.7979
# RMSE: 2313.0862
# R²: 0.9437
# Best Model params: {'model__n_estimators': 328, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__bootstrap': False, 'fs__vt__threshold': 0.005}

# use mean instead of median in  "age_rel_brand" because most of the values were 0 otherwise
# MAE: 1309.6709
# RMSE: 2309.8228
# R²: 0.9438
# Best Model params: {'model__n_estimators': 328, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__bootstrap': False, 'fs__vt__threshold': 0.005}

# Add 'age_rel_model' 
# MAE: 1304.7703
# RMSE: 2302.0800
# R²: 0.9442
# Best Model params: {'preprocess__fs__vt__threshold': 0.005, 'model__n_estimators': 328, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__bootstrap': False}

# Add 'engine_rel_model'
# MAE: 1298.8499
# RMSE: 2299.7010
# R²: 0.9443
# Best Model params: {'preprocess__fs__vt__threshold': 0.005, 'model__n_estimators': 328, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__bootstrap': False}

# Use engine_per_mpg instead of mpg_per_engine
# MAE: 1298.6139
# RMSE: 2299.5233
# R²: 0.9443
# Best Model params: {'preprocess__fs__vt__threshold': 0.005, 'model__n_estimators': 328, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__bootstrap': False}

# After adding FS pipe
# MAE: 1300.1077
# RMSE: 2299.1285
# R²: 0.9443
# Best Model params: {'preprocess__fs__vt__threshold': 0.005, 'model__n_estimators': 328, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__bootstrap': False}


In [None]:
# Unwrap the tuned RandomForest pipeline: use the object itself when it exposes
# `named_steps`, otherwise take its first element.
pipe = rf_best_rand if hasattr(rf_best_rand, "named_steps") else rf_best_rand[0]

importances = pipe.named_steps["model"].feature_importances_

# Feature names must come from the same fitted preprocessing that produced the
# model's inputs; a separately fitted debug pipeline can keep a different set of
# columns (e.g. a different variance threshold). Fall back to the debug
# preprocessor only when the tuned pipeline cannot provide the names itself.
try:
    feature_names_after_fs = (
        pipe.named_steps["preprocess"].named_steps["fs"].get_feature_names_out()
    )
except (AttributeError, KeyError, TypeError, ValueError):
    feature_names_after_fs = debug_preprocessor_pipe.named_steps['fs'].get_feature_names_out()
feat_names = feature_names_after_fs

# Fail with a clear message instead of silently pairing names with the wrong columns.
if len(feat_names) != len(importances):
    raise ValueError(
        f"Feature names ({len(feat_names)}) and importances ({len(importances)}) "
        "are misaligned; the names were not produced by the preprocessing of this model."
    )

feature_importance_df = pd.DataFrame(
    {"feature": feat_names, "importance": importances}
).sort_values("importance", ascending=False)

print("Feature Importances:")
for feature, importance in zip(
    feature_importance_df["feature"], feature_importance_df["importance"]
):
    print(f"{feature:30s}: {importance:.6f}")

In [None]:
# Development guard: deliberately halts "Run All" at this point so the cells
# that follow are not executed by accident. The previous bare text was a
# SyntaxError; this keeps the same stopping behaviour with valid Python and a
# clear message. Set RUN_CELLS_BELOW = True to let execution continue.
RUN_CELLS_BELOW = False
if not RUN_CELLS_BELOW:
    raise RuntimeError(
        "Intentional stop: set RUN_CELLS_BELOW = True to run the cells below."
    )

##### 5.4 Extra Trees

In [None]:
# Old parameter distribution
# Kept for reference only: the assignment further down overwrites
# `et_param_dist` before it is passed to the tuning helper.
et_param_dist = {
    "preprocess__fs__vt__threshold": [0.0, 0.005, 0.01],
    "model__n_estimators": randint(200, 600),        # number of trees
    "model__max_depth": randint(5, 40),              # depth of each tree
    "model__min_samples_split": randint(2, 10),      # min samples to split an internal node
    "model__min_samples_leaf": randint(1, 8),        # min samples per leaf
    "model__max_features": ["sqrt"],           # feature sampling strategy (sqrt performed better than log2 and None in previous tests)
    "model__bootstrap": [False]                      # use bootstrapping or not (False performed better than True in previous tests)
}

# So far best parameter distribution based on previous runs to focus search space
# NOTE(review): these fixed values are identical to the RandomForest ones -
# presumably copied; confirm they were actually tuned for ExtraTrees.
et_param_dist = {
    "preprocess__fs__vt__threshold": [0.005],
    "model__n_estimators": [328],
    "model__max_depth": [20],
    "model__min_samples_split": [5],
    "model__min_samples_leaf": [1],
    "model__max_features": ["sqrt"],
    "model__bootstrap": [False],
}

# Uses the helper's default n_iter / splits (defined elsewhere in the notebook).
et_best_rand = model_hyperparameter_tuning(X_train, y_train, et_pipe_fe, et_param_dist)

# Persist the tuned result; the relative path resolves against the current working directory.
joblib.dump(et_best_rand, "et_best_rand.pkl")

##### 5.5 StackingRegressor

In [None]:
# Old parameter distribution
# Kept for reference only: the assignment further down overwrites
# `stack_param_dist` before it is passed to the tuning helper.
# Only the meta-learner (`final_estimator`) is tuned; the base pipelines are untouched.
stack_param_dist = {
    "final_estimator__learning_rate": uniform(0.02, 0.1),
    "final_estimator__max_depth": randint(3, 10),
    "final_estimator__min_samples_leaf": randint(3, 20),
    "final_estimator__l2_regularization": uniform(0.0, 1.0),
}

# So far best parameter distribution based on previous runs to focus search space
# Every entry is a single-value list, i.e. one fixed configuration.
stack_param_dist = {
    "final_estimator__learning_rate": [0.061135390505667866],
    "final_estimator__max_depth": [5],
    "final_estimator__min_samples_leaf": [10],
    "final_estimator__l2_regularization": [0.19438003399487302]
}

# splits=3 is passed explicitly here, unlike the RF / ET calls which rely on the helper's default.
stack_best = model_hyperparameter_tuning(X_train, y_train, stack_pipe_fe, stack_param_dist, splits=3)
# joblib.dump(stack_best, "stack_best.pkl")


# Long Duration (~3mins)

# MAE: 1351.8682
# RMSE: 2498.2822
# R²: 0.9342

# After RandomizedSearchCV:
# MAE: 1350.4717
# RMSE: 2497.0474
# R²: 0.9343
# Best Model params: {'final_estimator__l2_regularization': np.float64(0.978892858275009), 'final_estimator__learning_rate': np.float64(0.06867421529594551), 'final_estimator__max_depth': 6, 'final_estimator__min_samples_leaf': 13}

# Removed ElasticNet from stacking due to poor performance compared to RF and HGB alone
# canceled but the cv scores didnt seem to show much improvement

# Using transmission and fuelType as OHE instead of TE():
# MAE: 1357.4291
# RMSE: 2516.5470
# R²: 0.9333
# Best Model params: {'final_estimator__l2_regularization': np.float64(0.19438003399487302), 'final_estimator__learning_rate': np.float64(0.061135390505667866), 'final_estimator__max_depth': 5, 'final_estimator__min_samples_leaf': 10}


# Removed fillna(0) in feature engineering for a_x_b and model_freq():
# was worse for hgb and rf so not tested for stacking

# ...

# implemented GroupModeImputer
# MAE: 1329.2379
# RMSE: 2453.0239
# R²: 0.9366
# Best Model params: {'final_estimator__min_samples_leaf': 10, 'final_estimator__max_depth': 5, 'final_estimator__learning_rate': 0.061135390505667866, 'final_estimator__l2_regularization': 0.19438003399487302}

# Fixed GroupImputer and added Feature Engineering to pipeline
# MAE: 1369.6876
# RMSE: 2516.2583
# R²: 0.9333


### 6. Feature Importance of Tree Models (with SHAP)

Use SHAP (SHapley Additive exPlanations) to identify feature importance specifically for our selected tree models (HGB & RF).

**Why SHAP for Trees:**
- Provides exact feature importance values for tree-based models
- Tree models handle irrelevant features, but noise features still impact performance
- Enables data-driven selection rather than statistical filter methods

#### 6.1 Define needed Functions

In [None]:
# Get Feature names aligned with X_proc
def get_pipeline_feature_matrix(pipe, X):
    """
    Return the feature matrix seen by the model step, plus its column names.

    Parameters
    ----------
    pipe : fitted pipeline with the steps 'preprocess' -> 'model'
    X : raw input accepted by the pipeline's 'preprocess' step

    Returns
    -------
    X_proc : output of pipe.named_steps["preprocess"].transform(X)
    feat_names : feature names aligned with the columns of X_proc

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of columns
        in X_proc (names and matrix would be misaligned).
    """
    preprocess = pipe.named_steps["preprocess"]
    X_proc = preprocess.transform(X)

    # Prefer the names reported by the 'fs' step of *this* pipeline: they belong
    # to the same fitted state that produced X_proc. A separately fitted debug
    # pipeline may keep a different set of columns.
    feat_names = None
    fs_step = getattr(preprocess, "named_steps", {}).get("fs")
    if fs_step is not None:
        try:
            feat_names = fs_step.get_feature_names_out()
        except (AttributeError, TypeError, ValueError):
            feat_names = None

    # Fallback (previous behaviour): names from the notebook-level debug preprocessor.
    if feat_names is None:
        feat_names = debug_preprocessor_pipe.named_steps["fs"].get_feature_names_out()

    # Guard against silently pairing names with the wrong columns.
    shape = np.shape(X_proc)
    if len(shape) == 2 and shape[1] != len(feat_names):
        raise ValueError(
            f"Got {len(feat_names)} feature names for {shape[1]} columns; "
            "feature names are not aligned with the preprocessed matrix."
        )

    return X_proc, feat_names


In [None]:
# Compute Shap Importance
def compute_shap_importance(
    pipe,
    X,
    sample_size=1000,
    seed=42,
    model_name=None,
):
    """
    Rank the features of a fitted tree pipeline by mean absolute SHAP value.

    The input is run through the pipeline's preprocessing, a seeded random
    subset of at most `sample_size` rows is drawn without replacement, and
    shap.TreeExplainer is applied to the pipeline's 'model' step on that subset.
    The top 20 rows of the ranking are printed.

    Parameters:
      pipe: fitted pipeline with a 'model' step (tree-based estimator)
      X: raw input for the pipeline's preprocessing
      sample_size: upper bound on the number of rows explained
      seed: seed of the row sampler
      model_name: label used in the printout (defaults to the model's class name)

    Returns:
      shap_df: DataFrame with columns ['feature', 'importance'], sorted descending
      feat_names: feature names aligned with the preprocessed columns
    """
    # Matrix as the model sees it, plus the matching column names
    features, feat_names = get_pipeline_feature_matrix(pipe, X)

    # Seeded row subsample keeps the SHAP computation fast and repeatable
    n_rows = len(features)
    n_keep = sample_size if sample_size < n_rows else n_rows
    row_ids = np.random.default_rng(seed).choice(n_rows, n_keep, replace=False)
    sampled = features[row_ids]

    estimator = pipe.named_steps["model"]
    label = model_name if model_name else estimator.__class__.__name__

    # TreeExplainer handles tree ensembles; for regression the result is
    # one row per sample and one column per feature
    shap_matrix = shap.TreeExplainer(estimator).shap_values(sampled)
    mean_abs = np.abs(shap_matrix).mean(axis=0)

    ranking = pd.DataFrame({"feature": feat_names, "importance": mean_abs})
    ranking = ranking.sort_values("importance", ascending=False).reset_index(drop=True)

    print(f"Top 20 features by SHAP for {label}:")
    print(ranking.head(20).to_string(index=False))

    return ranking, feat_names


In [None]:
# Plot Shap Importance
def plot_top_shap(shap_df, model_name, top_k=20):
    """
    Draw a horizontal bar chart of the first `top_k` rows of `shap_df`.

    `shap_df` needs the columns 'feature' and 'importance' and is expected to be
    sorted by importance already (the function only takes the head).
    """
    # barh places the first row at the bottom, so flip the order to get the
    # most important feature on top
    bars = shap_df.head(top_k)[::-1]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh(y=bars["feature"], width=bars["importance"])
    ax.set(
        xlabel="Average |SHAP| value",
        title=f"Top {top_k} features by SHAP – {model_name}",
    )
    fig.tight_layout()
    plt.show()


In [None]:
# See if smaller k features improve MAE (top k)
def cv_mae_topk_from_shap(
    pipe,
    shap_importance,
    X,
    y,
    n_features_list,
    folds=5,
    seed=42,
    model_name=None,
):
    """
    Cross-validate the pipeline's final estimator on the top-k SHAP features.

    For a fitted pipeline `pipe` and its SHAP importances:
      - Build X_proc, feat_names via get_pipeline_feature_matrix(pipe, X).
      - For each k in n_features_list:
          * Take the k highest-ranked features of `shap_importance`.
          * Run shuffled KFold CV on X_proc[:, idx] with fresh clones of the
            pipeline's 'model' step and record the mean MAE.
      - Print the MAE per k and refit a clone on the best-k columns.

    Note: X is preprocessed once, before the folds are built, so only the model
    (not the preprocessing) is refit inside each fold.

    Parameters:
      pipe: fitted pipeline with a 'model' step
      shap_importance: DataFrame with columns ['feature', 'importance']
      X: raw input for the pipeline's preprocessing
      y: target values (pandas Series/DataFrame or array-like)
      n_features_list: iterable of k values to evaluate
      folds: number of KFold splits
      seed: random_state of the KFold shuffling
      model_name: label used in the printout (defaults to the model's class name)

    Returns:
      best_model: estimator fitted on all rows of X_proc restricted to the best-k features
      best_features: list of the feature names used (in column order of X_proc)

    Raises:
      RuntimeError: if no k in n_features_list matched any feature.
    """
    # 1) Get processed features and names
    X_proc, feat_names = get_pipeline_feature_matrix(pipe, X)
    feat_names = np.asarray(feat_names, dtype=object)

    # 2) SHAP ranking (most important first)
    shap_sorted = shap_importance.sort_values("importance", ascending=False)
    shap_order = shap_sorted["feature"].tolist()

    # helper: column indices of the top-k SHAP features, in X_proc column order
    def indices_for_topk(k):
        top_feats = set(shap_order[:k])  # set -> O(1) membership per column
        return [i for i, fname in enumerate(feat_names) if fname in top_feats]

    # helper: positional row selection that works for pandas objects and arrays
    def rows_of(target, rows):
        if hasattr(target, "iloc"):
            return target.iloc[rows]
        return np.asarray(target)[rows]

    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    model_proto = pipe.named_steps["model"]
    tag = model_name or model_proto.__class__.__name__

    results = []

    for k in n_features_list:
        idx = indices_for_topk(k)
        if len(idx) == 0:
            print(f"Skipping k={k}: no matching feature indices.")
            continue

        mae_folds = []

        for train_idx, val_idx in kf.split(X_proc):
            X_tr, X_val = X_proc[train_idx][:, idx], X_proc[val_idx][:, idx]
            y_tr, y_val = rows_of(y, train_idx), rows_of(y, val_idx)

            # fresh, unfitted copy per fold so folds do not share state
            est = clone(model_proto)
            est.fit(X_tr, y_tr)
            y_pred = est.predict(X_val)
            mae_folds.append(mean_absolute_error(y_val, y_pred))

        mae_mean = float(np.mean(mae_folds))
        results.append({"k": k, "mae": mae_mean, "idx": idx})

    # pick best k
    if not results:
        raise RuntimeError("No valid k in n_features_list produced results.")

    best = min(results, key=lambda r: r["mae"])
    best_k = best["k"]
    best_mae = best["mae"]
    best_idx = best["idx"]
    best_features = [feat_names[i] for i in best_idx]

    print(f"\nTop-k SHAP feature CV – {tag}")
    for r in results:
        print(f"  k={r['k']:3d} | MAE={r['mae']:.2f}")
    print(f"Best: k={best_k} | MAE={best_mae:.2f}")

    # fit final estimator on full X_proc restricted to best_k features
    final_est = clone(model_proto)
    final_est.fit(X_proc[:, best_idx], y)

    return final_est, best_features


In [None]:
class ShapTopKColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that selects a fixed subset of columns by name.

    Parameters
    ----------
    selected_features : list of str
        Feature names (after preprocessing) to keep.

    all_feature_names : array-like of str
        Full list of feature names aligned with the columns of X after preprocessing.
        These are typically obtained from get_pipeline_feature_matrix(...).

    Attributes
    ----------
    idx_ : list of int
        Column positions kept by ``transform``, ordered like ``selected_features``.
        Set by ``fit``.

    Notes
    -----
    The constructor stores both arguments exactly as passed. ``sklearn.base.clone``
    rebuilds an estimator from ``get_params()`` and raises ``RuntimeError`` when the
    constructor modifies a parameter, so all conversions happen in ``fit`` instead.
    This keeps the selector usable with ``clone``, ``cross_val_score`` and the
    search classes.
    """
    def __init__(self, selected_features, all_feature_names):
        # No validation or conversion here (sklearn estimator convention).
        self.selected_features = selected_features
        self.all_feature_names = all_feature_names

    def fit(self, X, y=None):
        """
        Resolve ``selected_features`` to column positions.

        Names that are not present in ``all_feature_names`` are skipped with a
        warning.

        Raises
        ------
        ValueError
            If none of the selected features can be found.
        """
        import warnings

        all_names = list(self.all_feature_names)
        name_to_idx = {name: i for i, name in enumerate(all_names)}
        requested = list(self.selected_features)

        self.idx_ = [name_to_idx[name] for name in requested if name in name_to_idx]
        if len(self.idx_) == 0:
            raise ValueError(
                "ShapTopKColumnSelector: none of the selected_features were found "
                "in all_feature_names."
            )

        missing = [name for name in requested if name not in name_to_idx]
        if missing:
            warnings.warn(
                f"ShapTopKColumnSelector: {len(missing)} selected feature(s) not found "
                f"in all_feature_names and skipped: {missing[:10]}"
            )

        # Selection is positional, so a width mismatch means the names do not describe
        # the matrix this step actually receives.
        shape = getattr(X, "shape", None)
        if shape is not None and len(shape) == 2 and shape[1] != len(all_names):
            warnings.warn(
                f"ShapTopKColumnSelector: X has {shape[1]} columns but "
                f"all_feature_names has {len(all_names)} entries; selected columns "
                "may not match the intended features."
            )
        return self

    def transform(self, X):
        """Return only the columns chosen in ``fit`` (array, sparse matrix or DataFrame)."""
        if hasattr(X, "iloc"):
            # DataFrame input (e.g. set_output(transform="pandas")): select by position
            return X.iloc[:, self.idx_]
        return X[:, self.idx_]

    def get_feature_names_out(self, input_features=None):
        """Names of the columns produced by ``transform``."""
        if hasattr(self, "idx_"):
            # Report what was really selected, so skipped names are not listed
            return np.asarray(self.all_feature_names, dtype=object)[self.idx_]
        return np.asarray(list(self.selected_features), dtype=object)


In [None]:
def build_shap_topk_pipeline(
    base_pipe,
    best_features,
    all_feature_names,
    step_model_name="model",
):
    """
    Build a final pipeline that:
      - reuses the preprocessing (and vt/fs if present) from `base_pipe`
      - inserts a ShapTopKColumnSelector to keep only `best_features`
      - uses a fresh clone of the base model as final estimator

    Parameters
    ----------
    base_pipe : sklearn Pipeline
        Fitted pipeline with steps: 'preprocess' -> optional 'vt'/'fs' -> 'model'.

    best_features : list of str
        Names of the features (after preprocessing) to keep.

    all_feature_names : array-like of str
        Full list of feature names aligned with the output of preprocessing
        (and vt/fs if they were applied when computing SHAP).

    step_model_name : str, default="model"
        Name of the final estimator step in base_pipe.

    Returns
    -------
    final_pipe : sklearn Pipeline
        Unfitted pipeline. Call final_pipe.fit(X, y) to train on full data.
        The estimator step is always called "model", whatever `step_model_name` is.

    Raises
    ------
    KeyError
        If `base_pipe` has no 'preprocess' step or no step named `step_model_name`.
    """
    named = base_pipe.named_steps

    # 1) Preprocess step (clone so we refit on full data)
    steps = [("preprocess", clone(named["preprocess"]))]

    # 2) Optional filter steps, in the order they appear in base_pipe.
    #    Both 'vt' and 'fs' are carried over: the SHAP selector picks columns by
    #    position, so the matrix it receives must have the same layout as the one
    #    `all_feature_names` describes. Disabled steps (None / "passthrough") are skipped.
    for name, step in base_pipe.steps:
        if name in ("vt", "fs") and step is not None and not isinstance(step, str):
            steps.append((name, clone(step)))

    # 3) SHAP-based column selector
    steps.append((
        "shap_select",
        ShapTopKColumnSelector(
            selected_features=best_features,
            all_feature_names=all_feature_names,
        ),
    ))

    # 4) Final estimator – fresh clone of the base model
    steps.append(("model", clone(named[step_model_name])))

    return Pipeline(steps)


#### 6.2 Feature Importance HGB

In [None]:
# Tuned HGB pipeline and the hyperparameter search it came from
hgb_pipe, hgb_search = hgb_best

# CV scores of the winning configuration (MAE/MSE are stored negated by the search)
idx = hgb_search.best_index_
hgb_cv_results = hgb_search.cv_results_
mae_cv = -hgb_cv_results["mean_test_mae"][idx]
rmse_cv = np.sqrt(-hgb_cv_results["mean_test_mse"][idx])  # sqrt of the mean fold MSE
r2_cv = hgb_cv_results["mean_test_r2"][idx]

# Width of the matrix that reaches the model (after preprocess + VT)
X_proc_hgb, feat_names_hgb = get_pipeline_feature_matrix(hgb_pipe, X_train)
n_features_total_hgb = X_proc_hgb.shape[1]

print(
    "Baseline HGB (CV on train):",
    f"MAE:  {mae_cv:.4f}",
    f"RMSE: {rmse_cv:.4f}",
    f"R²:   {r2_cv:.4f}",
    f"Total features used: {n_features_total_hgb}",
    sep="\n",
)


In [None]:
# SHAP importance for the tuned HGB pipeline (sample_size=1000, seed=42).
# NOTE(review): this rebinds feat_names_hgb, which the baseline cell already set from
# get_pipeline_feature_matrix; the RF section keeps the second return value in a separate
# *_check variable instead. Confirm both name lists are identical, because the final
# pipeline cell passes feat_names_hgb to the column selector.
shap_importance_hgb, feat_names_hgb = compute_shap_importance(
    hgb_pipe,
    X_train,
    sample_size=1000,
    seed=42,
    model_name="HGB",
)

# top_k equals the total feature count, so every feature is requested in the plot
plot_top_shap(shap_importance_hgb, model_name="HGB", top_k=n_features_total_hgb)


In [None]:
# Candidate feature counts: every k from 10 up to the full set.
# range() excludes its end point, so the full count is always appended explicitly.
n_features_list_hgb = [*range(10, n_features_total_hgb), n_features_total_hgb]

# Cross-validate each top-k SHAP subset and keep the one with the lowest MAE
best_model_hgb, best_features_hgb = cv_mae_topk_from_shap(
    pipe=hgb_pipe,
    X=X_train,
    y=y_train,
    shap_importance=shap_importance_hgb,
    n_features_list=n_features_list_hgb,
    folds=5,
    seed=42,
    model_name="HGB",
)


In [None]:
# Final pipeline: preprocessing -> (vt) -> SHAP column selector restricted to
# best_features_hgb -> fresh clone of the tuned HGB model
hgb_final_shap_pipe = build_shap_topk_pipeline(
    base_pipe=hgb_pipe,
    best_features=best_features_hgb,
    all_feature_names=feat_names_hgb,
    step_model_name="model",   # name of the final estimator step in hgb_pipe
)

# Fit on full training data
hgb_final_shap_pipe.fit(X_train, y_train)

# Save for later use (predict_on_test loads this file by name)
joblib.dump(hgb_final_shap_pipe, "hgb_final_shap_pipe.pkl")


#### 6.3 Feature Importance RF

In [None]:
# Tuned RF pipeline and its search object (result from model_hyperparameter_tuning)
rf_pipe, rf_search = rf_best_rand

# CV scores of the winning configuration (MAE/MSE are stored negated by the search)
idx = rf_search.best_index_
rf_cv_results = rf_search.cv_results_
mae_cv_rf = -rf_cv_results["mean_test_mae"][idx]
rmse_cv_rf = np.sqrt(-rf_cv_results["mean_test_mse"][idx])  # sqrt of the mean fold MSE
r2_cv_rf = rf_cv_results["mean_test_r2"][idx]

# Width of the matrix that reaches the model (after preprocess + vt/fs if present)
X_proc_rf, feat_names_rf = get_pipeline_feature_matrix(rf_pipe, X_train)
n_features_total_rf = X_proc_rf.shape[1]

print(
    "Baseline RandomForest (CV on train):",
    f"MAE:  {mae_cv_rf:.4f}",
    f"RMSE: {rmse_cv_rf:.4f}",
    f"R²:   {r2_cv_rf:.4f}",
    f"Total features used: {n_features_total_rf}",
    sep="\n",
)

In [None]:
# SHAP importance for the tuned RF pipeline (sample_size=1000, seed=42).
# The returned name list goes into feat_names_rf_check, so feat_names_rf from the
# baseline cell stays untouched.
# NOTE(review): feat_names_rf_check is not compared with feat_names_rf in this cell —
# presumably meant as a sanity check; consider asserting that they are equal.
shap_importance_rf, feat_names_rf_check = compute_shap_importance(
    rf_pipe,
    X_train,
    sample_size=1000,
    seed=42,
    model_name="RandomForest",
)

# top_k equals the total feature count, so every feature is requested in the plot
plot_top_shap(shap_importance_rf, model_name="RandomForest", top_k=n_features_total_rf)


In [None]:
# Candidate feature counts: every k from 10 up to the full set.
# range() excludes its end point, so the full count is always appended explicitly.
n_features_list_rf = [*range(10, n_features_total_rf), n_features_total_rf]

# Cross-validate each top-k SHAP subset and keep the one with the lowest MAE
best_model_rf, best_features_rf = cv_mae_topk_from_shap(
    pipe=rf_pipe,
    X=X_train,
    y=y_train,
    shap_importance=shap_importance_rf,
    n_features_list=n_features_list_rf,
    folds=5,
    seed=42,
    model_name="RandomForest",
)

In [None]:
# Build final pipeline:
#   preprocess -> (vt/fs) -> shap_select(best_features_rf) -> RF
# feat_names_rf comes from get_pipeline_feature_matrix in the RF baseline cell.
rf_final_shap_pipe = build_shap_topk_pipeline(
    base_pipe=rf_pipe,
    best_features=best_features_rf,
    all_feature_names=feat_names_rf,
    step_model_name="model",   # name of the RF step in rf_pipe
)

# Fit final RF SHAP-top-k pipeline on full training data
rf_final_shap_pipe.fit(X_train, y_train)

# Save for later use (predict_on_test loads this file by name)
joblib.dump(rf_final_shap_pipe, "rf_final_shap_pipe.pkl")

### 7. Kaggle Competition

Extra Task (1 Point): Be in the Top 5 Groups on Kaggle

In [None]:
def predict_on_test(model_pipeline, model_name, version=10):
    """
    Predict prices for the Kaggle test set and write the submission CSV.

    Parameters
    ----------
    model_pipeline : str or path-like
        Path of a fitted pipeline saved with joblib. Only load files created by
        this notebook; joblib files can execute code when loaded.
    model_name : str
        Tag used in the output file name.
    version : int, default=10
        Submission version used in the output file name.

    Side effects
    ------------
    - Sets ``df_cars_test['price']`` (notebook-level test frame) to the new predictions.
    - Writes ``Group05_{model_name}_Version{version}.csv`` with the frame index and
      the ``price`` column.
    """
    pipe_best = joblib.load(model_pipeline)

    # A previous call has already added 'price' to the shared test frame. Drop it so
    # one model's predictions are never passed to the next model as an input column.
    features = df_cars_test.drop(columns=["price"], errors="ignore")

    # Predict on test set
    df_cars_test["price"] = pipe_best.predict(features)
    df_cars_test["price"].to_csv(
        f"Group05_{model_name}_Version{int(version):02d}.csv", index=True
    )

In [None]:
# Kaggle submission file from the saved HGB SHAP-top-k pipeline
predict_on_test("hgb_final_shap_pipe.pkl", "HGB")

In [None]:
# Kaggle submission file from the saved RF SHAP-top-k pipeline
predict_on_test("rf_final_shap_pipe.pkl", "RF")

In [None]:
# predict_on_test("stack_pipe.pkl", "Stack")

In [None]:
# !kaggle competitions submit -c cars4you -f Group05_Version05.csv -m "Message" # Uncomment to submit to Kaggle

In [None]:
# Shell call to the Kaggle CLI for the cars4you competition
# (presumably needs the CLI installed and API credentials configured — verify before Run All)
!kaggle competitions submissions -c cars4you

### 8. Open-Ended-Section

#### Open-ended Section: Global vs Brand- and Model-Specific Models

##### a) Objective and motivation (0.5v)

We investigated how far Cars4You should specialize its pricing models:

1. **Brand level:** Is a single global price model for all brands sufficient, or do separate brand-specific models reduce pricing error?
2. **Brand–model level:** For frequent models (e.g. “Skoda Octavia”, “VW Golf”), does an even more specialized model per (brand, model) segment bring additional improvements, or does it overfit?

Concretely, we started from our final production pipeline `hgb_final_shap_pipe` (full preprocessing + SHAP-based feature selection + HGB regressor) and compared:

- **Global model:** trained on all cars, evaluated only on a given segment.
- **Brand-specific model:** same preprocessing and SHAP selector, but the regressor re-fitted only on cars of a given brand.
- **Brand–model-specific model:** same preprocessing and SHAP selector, but the regressor re-fitted only on cars of a given (brand, model) pair.

We measured mean absolute error (MAE) and root mean squared error (RMSE) per segment. This answers how much performance we gain by moving from:

> one global model → several brand models → many brand–model models.

---

##### b) Difficulty of tasks (1v)

Extending the existing solution to this multi-level comparison was non-trivial:

- **Complex pipeline with a custom SHAP selector**  
  The final pipeline contains a `ShapTopKColumnSelector` that is not clone-compatible. Standard `cross_val_score` + `clone` would fail. We therefore implemented manual cross-validation:
  - reuse the fitted preprocessing + SHAP selector from `hgb_final_shap_pipe`;
  - only re-fit the final regressor for each fold and segment.

- **Consistent and fair evaluation protocol**  
  We reused the same 5-fold KFold strategy (`n_splits=5`, `shuffle=True`, `random_state=42`) and the same target (`price`) as in the main project. For each fold and segment:
  - the global model is trained on all training rows but evaluated only on validation rows belonging to that segment;
  - the segment-specific model is trained and evaluated only on that segment’s rows.

- **Handling data imbalance**  
  Data is unevenly distributed across brands and models. We therefore:
  - restricted the analysis to brands with at least 500 training samples;
  - for brand–model analysis, kept only frequent pairs (e.g. Skoda Octavia, VW Golf) with a minimum sample threshold per segment;
  - enforced additional checks per fold (minimum training size) to avoid fits on a handful of cars.

- **Manual metric computation**  
  Due to an older `sklearn` version (no `squared=` parameter), RMSE had to be computed manually as `sqrt(MSE)` inside the CV loops instead of relying on built-in scorers.

Overall, the task required custom CV logic, careful reuse of the production pipeline, and multiple levels of segment-wise filtering.

---

##### c) Correctness and efficiency of implementation (1v)

To keep the analysis correct and reasonably efficient we:

- **Reused the production pipeline as-is**  
  All preprocessing (imputation, scaling, encoding, price anchors) and SHAP-based feature selection are exactly the same as in the final model used on the test set. Only the last regressor is re-fit for segment-specific models.

- **Used a single CV design for all comparisons**  
  The same KFold splits (`splits = list(KFold(...).split(X_train, y_train))`) are reused for:
  - global per-brand evaluation;
  - brand-specific evaluation;
  - global per (brand, model) evaluation;
  - brand–model-specific evaluation.  
  This removes extra randomness and makes differences directly comparable.

- **Implemented clear separation between global and segment-specific training**  
  - For brands:  
    - global: fit on all brands, compute metrics only on that brand’s validation rows;  
    - brand-specific: use the fixed preprocessor, fit a fresh regressor only on that brand’s transformed data.
  - For (brand, model) pairs:  
    - global: fit on all cars, compute metrics only on that (brand, model) validation subset;  
    - brand–model-specific: fixed preprocessor + fresh regressor only on that pair.

- **Guarded against tiny segments**  
  Only segments with enough rows at dataset level and per fold are evaluated. Otherwise, metrics are set to NaN and those segments are excluded via `dropna`.

This design produces stable segment-wise estimates without changing the core production pipeline.

---

##### d) Discussion of results (1v)

###### Brand-level comparison

For the main brands, the final summary table (MAE in GBP) is:

| Brand    | MAE (global) | MAE (brand) | ΔMAE (brand – global) | n_samples |
|----------|--------------|-------------|------------------------|-----------|
| Ford     | 966.7        | 929.2       | -37.6                  | 16,371    |
| BMW      | 1,828.0      | 1,792.8     | -35.2                  | 7,540     |
| Mercedes | 1,968.7      | 1,934.6     | -34.1                  | 11,899    |
| VW       | 1,299.7      | 1,287.8     | -11.9                  | 10,572    |
| Audi     | 1,806.0      | 1,794.5     | -11.5                  | 7,456     |
| Skoda    | 1,174.6      | 1,165.9     | -8.7                   | 4,380     |
| Toyota   |   926.7      |   920.9     | -5.9                   | 4,714     |
| Opel     |   777.1      |   774.5     | -2.6                   | 9,530     |

Key observations:

- **High-volume brands benefit the most from brand-specific models.**  
  Ford, BMW and Mercedes gain about 34–38 GBP lower MAE per car (≈ 4% relative improvement for Ford, ≈ 2% for BMW and Mercedes). This is meaningful at scale and based on large sample sizes.

- **Moderate gains for VW, Audi, Skoda, Toyota.**  
  MAE improvements are smaller (5–12 GBP, typically <1% relative), but still consistent in sign.

- **Minimal benefit for Opel.**  
  The improvement for Opel (≈ 2.6 GBP) is negligible relative to its base MAE. The global model already captures Opel’s pricing patterns.

- **RMSE sometimes increases slightly for brand-specific models.**  
  For some brands, RMSE is marginally higher, indicating that brand-specific models reduce typical errors but can perform worse on rare/extreme cases, hinting at mild overfitting in the tails.

Overall, moving from a global to a brand-specific layer consistently does not harm MAE and clearly helps for some large brands, but the absolute gains are moderate.

###### Brand–model-level comparison

For frequent (brand, model) pairs, the analysis shows a more mixed picture. A selection of results (MAE in GBP):

| Brand   | Model        | MAE global | MAE seg | ΔMAE (seg – global) | n_samples |
|---------|--------------|-----------:|--------:|---------------------:|----------:|
| Skoda   | kamiq        | 1,418.6    | 1,107.1 | -311.5               | 109       |
| VW      | amarok       | 2,988.7    | 2,801.3 | -187.4               | 83        |
| Mercedes| x-class      | 3,592.8    | 3,448.9 | -144.0               | 59        |
| Skoda   | scala        | 1,175.7    | 1,100.5 | -75.2                | 147       |
| Ford    | b-max        |   640.2    |   578.1 | -62.1                | 248       |
| Skoda   | octavia      | 1,089.4    | 1,031.9 | -57.5                | 1,021     |
| Skoda   | fabia        |   845.8    |   795.1 | -50.6                | 1,069     |
| VW      | up           |   645.2    |   608.1 | -37.1                | 608       |
| BMW     | 1 series     | 1,158.2    | 1,130.0 | -28.1                | 1,358     |
| VW      | golf         | 1,151.0    | 1,155.8 |  +4.8                | 3,515     |
| Ford    | fiesta       |   753.3    |   762.7 |  +9.3                | 4,470     |
| Toyota  | aygo         |   557.1    |   576.7 | +19.6                | 1,381     |
| BMW     | 7 series     | 3,146.7    | 4,751.9 | +1,605.2             | 71        |
| Mercedes| gls class    | 3,295.8    | 5,906.4 | +2,610.7             | 54        |

Patterns:

- **Some compact, relatively frequent models benefit from model-level specialization.**  
  Examples: Skoda Kamiq, Scala, Octavia and Fabia; VW up; Ford B-MAX.  
  These segments see sizeable MAE reductions (roughly 35–310 GBP), and RMSE also tends to decrease. Here, the model-level regressor can exploit consistent, model-specific patterns.

- **For many common volume models, gains are small or negative.**  
  VW Golf, Ford Fiesta, Opel Corsa, Toyota Yaris, etc. often show small positive ΔMAE and/or higher RMSE. For these, splitting by model does not significantly improve typical error and can worsen extreme cases.

- **For rare, high-priced models, model-specific fits severely overfit.**  
  BMW 7 series, BMW X6, Mercedes GLS/S/SL/CLS class, VW Beetle, Toyota Avensis/Verso and others exhibit very large increases in MAE (hundreds to thousands of GBP) and often huge increases in RMSE.  
  These models have small sample sizes (often <100 cars), so a separate model per (brand, model) is clearly not robust.

In short:

- Moving from **global → brand** is often beneficial and relatively safe for high-volume brands.
- Moving further from **brand → brand–model** brings strong improvements only for a small subset of frequent models; for many others, especially rare premium models, it clearly overfits.

---

##### e) Alignment with objectives (0.5v)

This extended open-ended study:

- Directly addresses and expands a suggested topic (“global vs brand-specific models”), and pushes it one step further to **brand–model** specialization.
- Fully reuses the final production pipeline and a consistent CV protocol, so the conclusions are directly relevant for deployment.
- Provides a **clear design recommendation**:
  - Use a **single global model** as the base.
  - Optionally introduce **brand-level specialization** for a small set of high-volume brands (e.g. Ford, BMW, Mercedes) where MAE improvements are meaningful.
  - Avoid full **brand–model specialization** except potentially for a handful of very frequent models with demonstrated gains; for most models, especially rare and expensive ones, splitting further clearly overfits.

This shows that we not only tuned a strong model, but also explored the trade-off between model complexity and robustness in a structured, data-driven way.



In [None]:
# Load final production pipeline (preprocessing + SHAP + HGB)
# hgb_final_shap_pipe = load("hgb_final_shap_pipe.pkl")
# The in-memory pipeline fitted earlier in the notebook is used directly; the commented
# line above is the alternative of restoring it from disk.
pipe_global = hgb_final_shap_pipe 

# Fail early with a readable message if the training data is not in the kernel namespace
assert "X_train" in globals() and "y_train" in globals(), "Define X_train and y_train before proceeding."

# Identify the brand column (name may be 'Brand' or 'brand')
brand_col = "Brand" if "Brand" in X_train.columns else "brand"
# If neither spelling exists, show the first columns to help locate the right one
assert brand_col in X_train.columns, (
    f"Brand column not found in X_train. "
    f"First columns: {X_train.columns.tolist()[:20]}"
)

print("Using brand column:", brand_col)

In [None]:
# Brand frequencies in the training data (value_counts sorts most frequent first)
brand_counts = X_train[brand_col].value_counts()
print("Top brands by count:")
print(brand_counts.head(15))

# Selection limits for the comparison
TOP_K = 8          # max number of brands to compare
MIN_SAMPLES = 500  # minimum number of rows per brand; adjust if needed

# Keep brands with enough rows, then take the TOP_K most frequent of them
candidate_brands = brand_counts[brand_counts >= MIN_SAMPLES].index[:TOP_K].tolist()

print("\nCandidate brands used in the comparison:")
print(candidate_brands)


In [None]:
# Cross-validation setup: We reuse the same KFold splits for all evaluations to keep comparisons fair and to reduce randomness.
# The splits are materialised once as a list of (train_idx, val_idx) position arrays,
# so every evaluation helper iterates over exactly the same folds.

cv = KFold(n_splits=5, shuffle=True, random_state=42)
splits = list(cv.split(X_train, y_train))


def eval_global_for_brand(model, X, y, brand_col, brand, splits):
    """
    Evaluate the global pipeline for a single brand.

    In each fold a copy of `model` is trained on all brands, and the error is
    computed only on validation rows belonging to the given brand.

    The passed `model` itself is never refitted. Fitting it in place would leave
    the caller's production pipeline trained on the last fold's subset and would
    change the preprocessor that the brand-specific evaluation reuses.

    Parameters
    ----------
    model : estimator or pipeline with fit / predict
    X : pandas.DataFrame
    y : pandas.Series
        Target, positionally aligned with X.
    brand_col : str
        Column of X holding the brand.
    brand : object
        Brand value to evaluate.
    splits : iterable of (train_idx, val_idx)
        Positional index arrays, e.g. list(KFold(...).split(X, y)).

    Returns
    -------
    dict
        MAE_mean, MAE_std, RMSE_mean, RMSE_std over the folds that contain the
        brand (NaN if none does) and n, the number of evaluated validation rows.
    """
    from copy import deepcopy  # deepcopy also works for estimators that clone() rejects

    maes, rmses = [], []
    n_obs = 0

    for train_idx, val_idx in splits:
        X_val = X.iloc[val_idx]

        # Positional mask, so it works on the numpy predictions as well
        mask_b = (X_val[brand_col] == brand).to_numpy()
        if not mask_b.any():
            # Nothing to score in this fold, so skip the fit as well
            continue

        # Train a private copy of the global model on ALL brands in this fold
        fold_model = deepcopy(model)
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred = np.asarray(fold_model.predict(X_val))

        # Restrict metrics to the target brand in validation
        y_val_b = y.iloc[val_idx].to_numpy()[mask_b]
        y_pred_b = y_pred[mask_b]

        maes.append(mean_absolute_error(y_val_b, y_pred_b))
        # RMSE computed manually as sqrt(MSE)
        rmses.append(float(np.sqrt(mean_squared_error(y_val_b, y_pred_b))))
        n_obs += int(mask_b.sum())

    return {
        "MAE_mean": float(np.mean(maes)) if maes else np.nan,
        "MAE_std":  float(np.std(maes))  if maes else np.nan,
        "RMSE_mean": float(np.mean(rmses)) if rmses else np.nan,
        "RMSE_std":  float(np.std(rmses))  if rmses else np.nan,
        "n": int(n_obs),
    }


def eval_brand_specific(pipe_global, X, y, brand_col, brand, splits,
                        min_train_per_fold=50):
    """
    Cross-validate a regressor that is trained on one brand only.

    The feature steps of `pipe_global` (everything before the final estimator)
    are used as already fitted and are never refitted here. Per fold:
      - keep the rows of `brand` in the training and validation part,
      - transform both with the fixed feature steps,
      - fit a clone of the final estimator on the brand's training rows,
      - score it on the brand's validation rows.

    A fold is skipped when it has no validation rows of the brand or fewer than
    `min_train_per_fold` training rows of it.

    Returns a dict with MAE_mean, MAE_std, RMSE_mean, RMSE_std (NaN when every
    fold was skipped) and n, the number of scored validation rows.
    """
    feature_steps = pipe_global[:-1]       # fitted preprocessing + SHAP selection
    regressor_template = pipe_global[-1]   # final estimator, cloned per fold

    fold_mae = []
    fold_rmse = []
    scored_rows = 0

    for fit_rows, holdout_rows in splits:
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_hold, y_hold = X.iloc[holdout_rows], y.iloc[holdout_rows]

        is_brand_fit = X_fit[brand_col] == brand
        is_brand_hold = X_hold[brand_col] == brand

        # Need something to score and enough rows for a stable brand-only fit
        usable = is_brand_hold.sum() > 0 and is_brand_fit.sum() >= min_train_per_fold
        if not usable:
            continue

        y_fit_brand = y_fit.loc[is_brand_fit]
        y_hold_brand = y_hold.loc[is_brand_hold]

        # Transform only; the feature steps stay as fitted
        feats_fit = feature_steps.transform(X_fit.loc[is_brand_fit])
        feats_hold = feature_steps.transform(X_hold.loc[is_brand_hold])

        brand_reg = clone(regressor_template)
        brand_reg.fit(feats_fit, y_fit_brand)
        predicted = brand_reg.predict(feats_hold)

        fold_mae.append(mean_absolute_error(y_hold_brand, predicted))
        fold_rmse.append(float(np.sqrt(mean_squared_error(y_hold_brand, predicted))))
        scored_rows += len(y_hold_brand)

    summary = {}
    for label, values in (("MAE", fold_mae), ("RMSE", fold_rmse)):
        if values:
            summary[f"{label}_mean"] = float(np.mean(values))
            summary[f"{label}_std"] = float(np.std(values))
        else:
            summary[f"{label}_mean"] = np.nan
            summary[f"{label}_std"] = np.nan
    summary["n"] = int(scored_rows)
    return summary


In [None]:
# Run the global and the brand-specific evaluation for every candidate brand

pipe_global = hgb_final_shap_pipe

global_results = []
brand_specific_results = []

for brand in candidate_brands:
    print("Evaluating brand:", brand)

    # Global: trained on all brands, scored on this brand's validation rows only
    res_g = {
        **eval_global_for_brand(
            pipe_global, X_train, y_train,
            brand_col=brand_col, brand=brand, splits=splits,
        ),
        "brand": brand,
        "model_type": "global",
    }
    global_results.append(res_g)

    # Brand-specific: fixed preprocessing, regressor trained on this brand only
    res_b = {
        **eval_brand_specific(
            pipe_global, X_train, y_train,
            brand_col=brand_col, brand=brand, splits=splits,
        ),
        "brand": brand,
        "model_type": "brand_specific",
    }
    brand_specific_results.append(res_b)

# One row per brand for each model type
df_global = pd.DataFrame(global_results)
df_brand = pd.DataFrame(brand_specific_results)

print("\nGlobal model results per brand:")
display(df_global)

print("\nBrand-specific model results per brand:")
display(df_brand)


In [None]:
# Build the side-by-side comparison table

# Brands whose evaluation produced no metrics (NaN) are left out
metric_cols = ["MAE_mean", "RMSE_mean"]
df_global = df_global.dropna(subset=metric_cols)
df_brand = df_brand.dropna(subset=metric_cols)

# One row per brand; shared column names get a _global / _brand suffix
df_compare = pd.merge(df_global, df_brand, on="brand", suffixes=("_global", "_brand"))

# Differences are brand-specific minus global, so a negative value means the
# brand-specific model is better on that metric
df_compare = df_compare.assign(
    delta_MAE=df_compare["MAE_mean_brand"] - df_compare["MAE_mean_global"],
    delta_RMSE=df_compare["RMSE_mean_brand"] - df_compare["RMSE_mean_global"],
)

# Largest MAE improvement first
df_compare_sorted = df_compare.sort_values(by="delta_MAE")

print("Per-brand comparison (head):")
display(df_compare_sorted)


In [None]:
# Visualizations: grouped MAE bars and ΔMAE bars

# Global vs brand-specific MAE, two bars per brand
x = np.arange(len(df_compare_sorted))
width = 0.35

fig_mae, ax_mae = plt.subplots(figsize=(8, 4))
ax_mae.bar(x - width / 2, df_compare_sorted["MAE_mean_global"], width, label="Global model")
ax_mae.bar(x + width / 2, df_compare_sorted["MAE_mean_brand"], width, label="Brand-specific model")
ax_mae.set_xticks(x)
ax_mae.set_xticklabels(df_compare_sorted["brand"], rotation=45, ha="right")
ax_mae.set_ylabel("MAE (GBP)")
ax_mae.set_title("Global vs Brand-specific models (MAE per brand)")
ax_mae.legend()
fig_mae.tight_layout()
plt.show()

# ΔMAE per brand (negative = improvement with specialization)
fig_delta, ax_delta = plt.subplots(figsize=(8, 3))
ax_delta.bar(df_compare_sorted["brand"], df_compare_sorted["delta_MAE"])
ax_delta.axhline(0, linestyle="--")
plt.setp(ax_delta.get_xticklabels(), rotation=45, ha="right")
ax_delta.set_ylabel("Δ MAE (brand - global)")
ax_delta.set_title("Effect of model specialization per brand\n(negative = brand-specific MAE is lower)")
fig_delta.tight_layout()
plt.show()


**Brand-Model Segmentation**

In [None]:
# Evaluation helpers for brand–model segments

def eval_global_for_brand_model(model, X, y, brand_col, model_col,
                                brand, model_name, splits):
    """
    Evaluate the global pipeline for a specific (brand, model) pair.

    In each fold:
      - Train a copy of `model` on all cars.
      - Compute MAE / RMSE only on validation rows where
        Brand == brand AND model == model_name.

    The passed `model` itself is never refitted. Fitting it in place would leave
    the caller's production pipeline trained on the last fold's subset and would
    change the preprocessor that the segment-specific evaluation reuses.

    Parameters
    ----------
    model : estimator or pipeline with fit / predict
    X : pandas.DataFrame
    y : pandas.Series
        Target, positionally aligned with X.
    brand_col, model_col : str
        Columns of X holding the brand and the car model.
    brand, model_name : object
        Segment to evaluate.
    splits : iterable of (train_idx, val_idx)
        Positional index arrays, e.g. list(KFold(...).split(X, y)).

    Returns
    -------
    dict
        MAE_mean, MAE_std, RMSE_mean, RMSE_std over the folds that contain the
        segment (NaN if none does) and n, the number of evaluated validation rows.
    """
    from copy import deepcopy  # deepcopy also works for estimators that clone() rejects

    maes, rmses = [], []
    n_obs = 0

    for train_idx, val_idx in splits:
        X_val = X.iloc[val_idx]

        # Positional mask, so it works on the numpy predictions as well
        mask_seg = (
            (X_val[brand_col] == brand) &
            (X_val[model_col] == model_name)
        ).to_numpy()
        if not mask_seg.any():
            # Nothing to score in this fold, so skip the fit as well
            continue

        # Train a private copy of the global model on ALL brands and models
        fold_model = deepcopy(model)
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred = np.asarray(fold_model.predict(X_val))

        # Restrict to this brand–model in validation
        y_val_seg = y.iloc[val_idx].to_numpy()[mask_seg]
        y_pred_seg = y_pred[mask_seg]

        maes.append(mean_absolute_error(y_val_seg, y_pred_seg))
        # RMSE computed manually as sqrt(MSE)
        rmses.append(float(np.sqrt(mean_squared_error(y_val_seg, y_pred_seg))))
        n_obs += int(mask_seg.sum())

    return {
        "MAE_mean": float(np.mean(maes)) if maes else np.nan,
        "MAE_std":  float(np.std(maes))  if maes else np.nan,
        "RMSE_mean": float(np.mean(rmses)) if rmses else np.nan,
        "RMSE_std":  float(np.std(rmses))  if rmses else np.nan,
        "n": int(n_obs),
    }


def eval_brand_model_specific(pipe_global, X, y, brand_col, model_col,
                              brand, model_name, splits,
                              min_train_per_fold=40):
    """
    Cross-validate a regressor trained only on one (brand, model) segment.

    All steps of `pipe_global` except the last are reused as they are
    (only .transform is called on them, nothing is refit here); the last
    step serves as the template that is cloned for every fold.

    In each fold:
      - Select the training and validation rows of this (brand, model).
      - Skip the fold if the segment has no validation rows or fewer than
        `min_train_per_fold` training rows.
      - Transform both parts with the reused preprocessing steps.
      - Fit a cloned regressor on the segment's training rows and score
        it (MAE / RMSE) on the segment's validation rows.

    Returns a dict with the mean / std of MAE and RMSE over the evaluated
    folds (NaN when every fold was skipped) and "n", the number of
    validation rows that were scored.
    """
    fixed_steps, regressor_template = pipe_global[:-1], pipe_global[-1]

    fold_mae, fold_rmse = [], []
    scored_rows = 0

    for train_idx, val_idx in splits:
        X_fold_tr, y_fold_tr = X.iloc[train_idx], y.iloc[train_idx]
        X_fold_val, y_fold_val = X.iloc[val_idx], y.iloc[val_idx]

        # Membership of each row in the requested brand–model segment
        in_seg_tr = (X_fold_tr[brand_col] == brand) & (X_fold_tr[model_col] == model_name)
        in_seg_val = (X_fold_val[brand_col] == brand) & (X_fold_val[model_col] == model_name)

        # Nothing to score, or too little data to train on -> skip this fold
        if in_seg_val.sum() == 0 or in_seg_tr.sum() < min_train_per_fold:
            continue

        X_seg_tr, y_seg_tr = X_fold_tr.loc[in_seg_tr], y_fold_tr.loc[in_seg_tr]
        X_seg_val, y_seg_val = X_fold_val.loc[in_seg_val], y_fold_val.loc[in_seg_val]

        # Reused preprocessing: transform only, never fit
        feats_tr = fixed_steps.transform(X_seg_tr)
        feats_val = fixed_steps.transform(X_seg_val)

        # Unfitted copy of the final estimator, trained on this segment only
        segment_reg = clone(regressor_template)
        segment_reg.fit(feats_tr, y_seg_tr)
        preds_val = segment_reg.predict(feats_val)

        fold_mae.append(mean_absolute_error(y_seg_val, preds_val))
        fold_rmse.append(float(np.sqrt(mean_squared_error(y_seg_val, preds_val))))
        scored_rows += len(y_seg_val)

    if not fold_mae:
        # Every fold was skipped
        return {
            "MAE_mean": np.nan,
            "MAE_std":  np.nan,
            "RMSE_mean": np.nan,
            "RMSE_std":  np.nan,
            "n": int(scored_rows),
        }

    return {
        "MAE_mean": float(np.mean(fold_mae)),
        "MAE_std":  float(np.std(fold_mae)),
        "RMSE_mean": float(np.mean(fold_rmse)),
        "RMSE_std":  float(np.std(fold_rmse)),
        "n": int(scored_rows),
    }


In [None]:
# Run evaluation for each (brand, model) pair
# For every candidate segment, score the global pipeline and a
# segment-specific regressor on the same CV splits, then collect one result
# row per segment and approach.

bm_global_results, bm_specific_results = [], []

for (brand, model_name), cnt in candidate_pairs.items():
    print(f"Evaluating pair: {brand} / {model_name} (n={cnt})")

    # Arguments shared by both evaluation helpers for this segment
    segment_kwargs = dict(
        brand_col=brand_col,
        model_col=model_col,
        brand=brand,
        model_name=model_name,
        splits=splits,
    )

    # Global model on this brand–model segment
    res_g = {
        **eval_global_for_brand_model(pipe_global, X_train, y_train, **segment_kwargs),
        "brand": brand,
        "model": model_name,
        "segment_type": "global",
    }
    bm_global_results.append(res_g)

    # Brand–model-specific regressor
    res_bm = {
        **eval_brand_model_specific(pipe_global, X_train, y_train, **segment_kwargs),
        "brand": brand,
        "model": model_name,
        "segment_type": "brand_model_specific",
    }
    bm_specific_results.append(res_bm)

# One row per (brand, model) segment in each table
df_bm_global = pd.DataFrame(bm_global_results)
df_bm_spec   = pd.DataFrame(bm_specific_results)

print("\nGlobal results per (brand, model):")
display(df_bm_global)

print("\nBrand–model-specific results:")
display(df_bm_spec)


In [None]:
# Compare global vs brand–model-specific performance

# Segments without metrics (NaN means) are removed from both tables
bm_required_metrics = ["MAE_mean", "RMSE_mean"]
df_bm_global = df_bm_global.dropna(subset=bm_required_metrics)
df_bm_spec   = df_bm_spec.dropna(subset=bm_required_metrics)

# Side-by-side table: one row per (brand, model) present in both results
df_bm_compare = df_bm_global.merge(
    df_bm_spec,
    on=["brand", "model"],
    suffixes=("_global", "_bm"),
)

# Delta = specific minus global, so negative values favour the specific model
for bm_metric in ("MAE", "RMSE"):
    df_bm_compare[f"delta_{bm_metric}"] = (
        df_bm_compare[f"{bm_metric}_mean_bm"] - df_bm_compare[f"{bm_metric}_mean_global"]
    )

df_bm_sorted = df_bm_compare.sort_values("delta_MAE")

print("Brand–model comparison (most improvement first):")
display(df_bm_sorted)

# Optional readable table
bm_display_cols = [
    "brand", "model",
    "MAE_mean_global", "MAE_mean_bm", "delta_MAE",
    "RMSE_mean_global", "RMSE_mean_bm", "delta_RMSE",
    "n_global",
]
df_bm_display = (
    df_bm_sorted[bm_display_cols]
    .copy()
    .rename(columns={"n_global": "n_samples"})
)

# Round every metric / delta column to one decimal for display
bm_rounded_cols = [
    col for col in df_bm_display.columns
    if "MAE" in col or "RMSE" in col or "delta" in col
]
df_bm_display[bm_rounded_cols] = df_bm_display[bm_rounded_cols].round(1)

print("\nReadable brand–model summary:")
display(df_bm_display)


In [None]:
# Extra plots for brand–model specialization

# Bar chart data: only segments with at least 100 samples (more stable numbers),
# ordered by delta_MAE so the largest improvements come first
df_bm_plot = (
    df_bm_display
    .loc[df_bm_display["n_samples"] >= 100]
    .sort_values("delta_MAE")
)

# 1) Bar plot of ΔMAE for brand–model segments (filtered)
x = np.arange(len(df_bm_plot))
bm_segment_labels = [f"{b} {m}" for b, m in zip(df_bm_plot["brand"], df_bm_plot["model"])]

fig_bm_bar, ax_bm_bar = plt.subplots(figsize=(10, 4))
ax_bm_bar.bar(x, df_bm_plot["delta_MAE"])
ax_bm_bar.axhline(0, linestyle="--")
ax_bm_bar.set_xticks(x)
ax_bm_bar.set_xticklabels(bm_segment_labels, rotation=90, ha="right")
ax_bm_bar.set_ylabel("Δ MAE (brand–model - global)")
ax_bm_bar.set_title("Effect of brand–model specialization\n(negative = lower MAE than global)")
fig_bm_bar.tight_layout()
plt.show()

# 2) Scatter plot: n_samples vs ΔMAE to visualise overfitting at low sample sizes
#    (uses all segments, not only the filtered ones)
fig_bm_scatter, ax_bm_scatter = plt.subplots(figsize=(6, 4))
ax_bm_scatter.scatter(df_bm_display["n_samples"], df_bm_display["delta_MAE"])
ax_bm_scatter.axhline(0, linestyle="--")
ax_bm_scatter.set_xlabel("Number of samples per (brand, model)")
ax_bm_scatter.set_ylabel("Δ MAE (brand–model - global)")
ax_bm_scatter.set_title("ΔMAE vs segment size\n(negative = brand–model-specific is better)")
fig_bm_scatter.tight_layout()
plt.show()
