In [None]:
# Open Questions (maybe ask Ricardo):
# - Allowed to use QuantileEncoder from category_encoders or do we have to use TargetEncoder from sklearn? Can we use both because it improves our MAE or is it generally bad practice to involve multiple encodings of one feature?

**Your work will be evaluated according to the following criteria:**
- Project Structure and Notebook(s) Quality (4/20)
- Data Exploration & Initial Preprocessing (4/20)
- Regression Benchmarking and Optimization (7/20)
- Open-Ended Section (4/20)
- Deployment (1/20)
- Extra Point: Have Project Be Publicly Available on GitHub (1/20)


<div style="
    background: rgba(25, 25, 25, 0.55);
    backdrop-filter: blur(16px) saturate(150%);
    -webkit-backdrop-filter: blur(16px) saturate(150%);
    border: 1px solid rgba(255, 255, 255, 0.12);
    border-radius: 18px;
    padding: 45px 30px;
    text-align: center;
    font-family: 'Inter', 'Segoe UI', 'Helvetica Neue', Arial, sans-serif;
    color: #e0e0e0;
    box-shadow: 0 0 30px rgba(0, 0, 0, 0.35);
    margin: 40px auto;
    max-width: 800px;
">

  <h1 style="
      font-size: 2.8em;
      font-weight: 700;
      margin: 0 0 8px 0;
      letter-spacing: -0.02em;
      background: linear-gradient(90deg, #00e0ff, #9c7eff);
      -webkit-background-clip: text;
      -webkit-text-fill-color: transparent;
  ">
      Machine Learning Project
  </h1>

  <h2 style="
      font-size: 1.6em;
      font-weight: 500;
      margin: 0 0 25px 0;
      color: #b0b0b0;
      letter-spacing: 0.5px;
  ">
      Cars 4 You - Predicting Car Prices
  </h2>

  <p style="
      font-size: 1.25em;
      font-weight: 500;
      color: #c0c0c0;
      margin-bottom: 6px;
  ">
      Group 5 - Lukas Belser, Samuel Braun, Elias Karle, Jan Thier
  </p>

  <p style="
      font-size: 1.05em;
      font-weight: 400;
      color: #8a8a8a;
      font-style: italic;
      letter-spacing: 0.5px;
  ">
      Machine Learning End Results · 22.12.2025
  </p>
</div>


### **Table of Contents**
 
- [1. Import Packages and Data](#1-import-packages-and-data)  
  - [1.1 Import Required Packages](#11-import-required-packages)  
  - [1.2 Load Datasets](#12-load-datasets)  
  - [1.3 Kaggle Setup](#13-kaggle-setup)  
- [2. Data Preparation](#2-data-preparation)  
  - [2.1 Data Split](#21-data-split)  
  - [2.2 Data Cleaning](#22-data-cleaning)  
  - [2.3 Outlier Handling](#23-outlier-handling)  
  - [2.4 Missing Values Handling](#24-missing-values-handling)  
  - [2.5 Feature Engineering](#25-feature-engineering)  
  - [2.6 Encoding, Transforming and Scaling](#26-encoding-transforming-and-scaling)  
  - [2.7 Feature Selection](#27-feature-selection)  
- [4. Model Evaluation Metrics, Baselining, Setup](#4-model-evaluation-metrics-baselining-setup)  
- [5. Hyperparameter Tuning and Model Evaluation](#5-hyperparameter-tuning-and-model-evaluation)  
  - [5.1 ElasticNet](#51-elasticnet)  
  - [5.2 HistGradientBoost](#52-histgradientboost)  
  - [5.3 RandomForest](#53-randomforest)  
  - [5.4 ExtraTrees](#54-extratrees)  
- [6. Feature Importance of Tree Models (with SHAP)](#6-feature-importance-of-tree-models-with-shap)  
  - [6.1 HGB](#61-hgb)  
  - [6.2 RF](#62-rf)  
- [7. Kaggle Competition](#7-kaggle-competition)  

TODO finish + update toc > at the end of project

<img src="images/process_ML.png" alt="Drawing" style="width: 1000px;"/>

**Group Member Contribution**    
What part(s) of the work were done by each member and an estimated %
contribution of each member towards the final work.

In [None]:
# TODO Project Outline (Abstract)

**Identifying Business Needs**      
Overview and main goals of the project:
- ...

Description of the overall process and identification of model assessment approach adopted in the work:
- Cross-Validation approach for model assessment following the steps outlined in the image.
- For more details on the respective steps in the pipeline, refer to the specific part in the notebook below.

[image]

In [None]:
# TODO add visualization of the pipe in a markdown here

**Data Exploration**     
For Data Exploration of the original and the engineered features including the consequences for preprocessing, refer to notebook `group05_exploratory_data_analysis.ipynb`.

Top 3 Key insights:
- ...

The findings from the EDA are used for the following steps taken to clean and prepare the data.

### 1. Import Packages and Data

#### 1.1 Import Required Packages

In [None]:
!pip install kaggle
!pip install shap
!pip install -U scikit-learn
!pip install category_encoders
!pip install ydata-profiling

In [None]:
# TODO group the imports by the part of the notebook they are used in
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt; plt.rcParams.update({"figure.max_open_warning": 0, "figure.dpi": 100})
import joblib
import shap

from collections import Counter
from sklearn.feature_selection import VarianceThreshold, RFE
from scipy.stats import spearmanr, uniform, randint
from sklearn.metrics import mean_absolute_error
 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, StandardScaler, FunctionTransformer, RobustScaler
from sklearn.base import clone, BaseEstimator, TransformerMixin
 
from sklearn.model_selection import RandomizedSearchCV, KFold, GridSearchCV, cross_validate, cross_val_score
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

from tqdm.auto import tqdm

from category_encoders import QuantileEncoder # used for median target encoding (sklearn only supports mean target encoding with their TargetEncoder class)
 
from pipeline_functions import CarDataCleaner, OutlierHandler, GroupImputer, CarFeatureEngineer, NamedFunctionTransformer, model_hyperparameter_tuning, DebugTransformer, MajorityVoteSelectorTransformer, MutualInfoThresholdSelector, SpearmanRelevancyRedundancySelector, create_model_pipe, get_cv_results
from visualization_functions import plot_selector_agreement, plot_val_mae_comparison, plot_train_val_comparison

from collections import Counter
from sklearn.inspection import permutation_importance
from tqdm.auto import tqdm

import pickle

#### 1.2 Load Datasets

In [None]:
# Read train and test with one shared recipe so both frames get the same column names;
# the capitalised "Brand" header is renamed to "brand" directly at load time.
df_cars_train, df_cars_test = (
    pd.read_csv(csv_file).rename(columns={"Brand": "brand"})
    for csv_file in ("train.csv", "test.csv")
)

# TODO rename columns here instead of in cleaning file

#### 1.3 Kaggle Setup

In [None]:
# Kaggle API connect

# Folder containing kaggle.json -> replace the path with the folder that holds
# your own kaggle.json API token, then re-run this cell.
os.environ['KAGGLE_CONFIG_DIR'] = "/Workspace/Users/20250355@novaims.unl.pt" #add your own kaggle.json api token

# Test: show the configured folder. Read back via os.environ instead of `!echo $VAR`,
# because the shell variant only works where the shell expands `$VAR` (POSIX shells).
print(os.environ['KAGGLE_CONFIG_DIR'])

### 2. Data Preparation

#### 2.1 Data Split

**Our approach:**
- Train and Val: We use `Cross-Validation` in the `sklearn pipeline` on the available training data to make use of all data while validating different approaches.    
-> We fix the random states everywhere to ensure that all models use the same split to ensure a fair model comparison
- Test: Use external hold-out set from kaggle as final test set (remains completely unseen to avoid leakage)
-> An additional val set is therefore not necessary and would waste training data

1. **Training Set (n-1 folds from CV)**: Used to fit models.
2. **Validation Set (1 fold from CV)**: Used to evaluate performance of models and tune hyperparameters, detect overfitting. 
3. **Test Set (Kaggle)**: Used only once at the end of the entire process to evaluate final model performance. Not considered before to prevent leakage.



<u>Place in the pipe:</u> The split is decided here because the data has to be split before the preprocessing steps to avoid data leakage. All of the following steps are part of the sklearn pipeline while the CV is not an explicit part of the pipeline but rather the technique that calls the pipeline with its separate folds.

In [None]:
# One shared seed so that every model is evaluated on exactly the same shuffled folds
# (reproducible and comparable across models)
rs = 5
# TODO maybe increase split or use RepeatedKFold(n_splits=10, n_repeats=3) for final tuning
cv = KFold(shuffle=True, n_splits=5, random_state=rs)
# => This cv object is passed to the hyperparameter tuning later when training the models

# Separate the target column from the feature matrix
y_train = df_cars_train["price"]
X_train = df_cars_train.drop(columns=["price"])

**Our findings:**
- CV achieves better results than using a hold-out set

**Consequences/Interpretation:**
- Usage of all available data is better for the model than 'wasting' training data for a hold-out set

#### 2.2 Data Cleaning

**Our approach:**
- We `clean data inconsistencies` and data entry errors that we found in the EDA
- The affected values are `set to NaN` (only the invalid entry, not the whole row), so that no rows are lost by removing them
- Afterwards, these values are imputed (see Section 2.4)

| **Feature** | **Allowed thresholds** | **Reasoning** | **# filtered below threshold** | **# filtered above threshold** |
| :--- | :--- | :--- | :---: | :---: |
| `year` | 1886 to 2020 (inclusive) | The first car was built in 1886; the dataset is from 2020, so newer cars are logically impossible. | . | ... |
| `mileage` | ≥ 0 | Negative mileage is not possible. | . | . |
| `tax` | ≥ 0 | Negative tax is not possible. | . | . |
| `mpg` | 5 to 60 (inclusive) | Realistic range for mass-market road cars. U.S. EPA list (2025): least efficient Bugatti Mistral (9 mpg), most efficient Toyota Prius (57 mpg). | . | . |
| `engineSize` | 0.6 to 12.7 (inclusive) | Practical bounds: smallest mass-production cars are in the Japanese kei class (~0.66L); very large historical production engines up to Bugatti Type 41 Royale (12.7L). | . | . |
| `paintQuality%` → `paintQuality` | 0 to 100 (inclusive) | Percentage values must be between 0 and 100. | . | . |
| `previousOwners` | ≥ 0 | Negative owner counts are not possible. | . | . |
| `hasDamage` | - | Only 0 and NaN values in the data -> no thresholding | . | . |
| TODO Categoricals | | | | |

**Legend**
- `-` means “not applicable”.

In [None]:
# Make the effect of the cleaning visible: run the cleaner on the raw frames and list the
# unique values per column. The cleaner is fitted on train and only applied to test.
cleaner = CarDataCleaner(handle_electric="other", set_carid_index=False, use_fuzzy=True)

df_cars_train_clean = cleaner.fit_transform(df_cars_train)
df_cars_test_clean = cleaner.transform(df_cars_test)

# Only the train values are printed: inspecting the test set would be data leakage
# (the test set remains unseen until the final prediction)
print("CLEANED TRAIN uniques")
for column_name in df_cars_train_clean:
    unique_values = df_cars_train_clean[column_name].unique()
    print(column_name, unique_values)

**Our findings:**
- The findings are already included in the table above for easier overview and direct comparison
    - 'Number of filtered values below threshold'
    - 'Number of filtered values above threshold'

==> In total, [TODO] values are identified as data errors in the available training data and are set to NaN

**Consequences/Interpretation:**
- ...

#### 2.3 Outlier Handling

OLD by Jan     
**Our approach:**
- After the obvious logical inconsistencies in the data were already targeted in the data cleaning section, we aim to identify other extreme values here
- These extreme values have one of two natures: 
    - wrong values for that specific model but missed in the data cleaning 
    - extreme but valid values that distort the distribution
- Outlier detection through multiple methods to increase the probability that it's actually an outlier
- ...


<u>Place in the pipe</u>: Before imputation to use original distribution for identifying the outliers (otherwise we would inflate the distributions with the imputed values)


**Our approach:**
- We treat outliers as a **data quality + robustness problem**, not as a reason to delete rare cars.
- We keep the process **leakage-safe** by implementing outlier handling as an sklearn transformer inside the pipeline (thresholds learned on training folds only).
- We explicitly separate:
  - **logical inconsistencies** found in the EDA (handled deterministically in `CarDataCleaner`),
  - vs. **statistical extremes** (handled in `OutlierHandler`).
- **Identification** of outliers
  - **Voting of robust univariate detectors:**
    - **Tukey IQR fences (1.5×IQR)**
      - Flags a value if it lies outside:
        - `Q1 − 1.5·IQR` or `Q3 + 1.5·IQR`
      - Strength: non-parametric, robust, widely used baseline (boxplot rule).
    - **Modified Z-score using Median Absolute Deviation (MAD)**
      - Robust alternative to z-scores:
        - uses the **median** instead of mean
        - uses **MAD** instead of standard deviation
      - Typical threshold: `|modified_z| > 3.5`
      - Strength: less sensitive to extreme tails than mean/std-based z-scores.
    - **Voting rule** (for robustness):
      - A value is treated as an outlier only if both methods agree (`min_votes=2`).
      - This reduces false positives compared to using only IQR fences on skewed distributions.
- **Treatment of Outliers:**
  - **Winsorization** (clip extreme values):
    - We keep every car in the dataset (no row deletion).
    - We reduce the influence of extreme values while still preserving information and rank order in the feature.
    - We **clip** flagged values to conservative bounds (`action="clip"`):
      - For each numeric feature we compute robust lower/upper bounds (from IQR and MAD-based thresholds).
      - Values outside those bounds are replaced by the nearest bound (winsorization).
- **Benefits** of this approach:
  - Keeps rare cars (no row deletion).
  - Avoids replacing informative extremes with typical medians (which can hurt tree models).
  - Stabilizes downstream steps (imputation, feature engineering, scaling) without collapsing signal into missingness.

<u>Place in the pipe</u>: 
- Before imputation to use original distribution for identifying the outliers (otherwise we would inflate the distributions with the imputed values)
- Then in imputation, fill the original gaps based on a distribution that does not include the massive outliers (which would skew the mean/median)    
  -> handle the outliers first (clip them to the robust bounds) so the imputation for everyone becomes cleaner

Practical considerations:
- **EVs:** handled in Cleaning because there are only 4 electric cars and 2 of these entries are wrong. In addition, their price structure and characteristics differ strongly from combustion cars, so they cannot be generalized.
- **Zeros in tax:** not treated as a global outlier by default; handled via robust pipeline + hierarchical imputation. If EDA confirms systematic tax=0 errors in specific segments, we can add a deterministic cleaner rule.
- **Per-model outlier rules:** can be added later as a refinement (only when sample size per model is large enough).
- **Model-family sensitivity:** winsorization is especially helpful for linear/SVR models (reduces leverage points) and remains safe for trees; we keep one unified default pipeline for comparability.

Unused techniques:
- **Drop outside 1.5*IQR:** We decided against the classical “drop rows outside 1.5×IQR” because of:
    - The classical 1.5×IQR boxplot rule (Tukey fences) is a strong baseline, but real-world car variables (especially mileage) are often skewed / heavy-tailed, which can over-flag valid high values.
    - Dropping rows removes rare but valid cars (e.g., very high mileage vehicles), which is undesirable for production.
- **NaN into Imputation:** Setting the outliers found by the voting of robust univariate detectors to NaN (`action="nan"`) and imputing them later was tested, but this significantly hurt the best averaged CV MAE

In [None]:
# TODO
# e.g. how to handle Zeros in tax (use groupimputer?) -> features that are computed with tax are also affected and need to be handled then ~J
# e.g. maybe outlier handling per model (if sample size big enough) ~J

**Our findings**:
- ...

**Consequences/Interpretation:**
- ...

#### 2.4 Missing Values Handling

OLD by Jan     
**Our approach:**
- `Group Imputer`: We use a custom GroupImputer that imputes the missing values to be the median of entries within the same group
    - For that we use a hierarchical structure to identify the most similar group to the one with the missing value:
        - 1st level: ...
        - 2nd level: ...
        - ...
        - 4th level: Model
        - 5th level: Brand
    - The group values are only computed on the respective train folds and transformed on the val set to prevent leakage.   
        -> When refitting the entire model, the entire train set is used to fit and the kaggle test set is transformed using the fitted values

<u>Place in the pipe:</u> The Imputation is decided here because the data has to be imputed on original values before engineering new features

**Our approach:**
- We use a custom **hierarchical GroupImputer** to impute missing values in a way that matches the structure of the car market.
- Instead of imputing from the full dataset only (global statistics), we first try to impute from **the most similar cars**:
  - same `brand` and same `model` (closest peer group),
  - otherwise same `brand`,
  - otherwise the global dataset.
- This is more realistic than a single global median because many variables (e.g., `engineSize`, `mpg`, `tax`) are strongly segment-dependent.

**Leakage safety:**
- The `GroupImputer` is implemented as an sklearn transformer in the pipeline.
- Therefore, during cross-validation it learns all medians/modes **only on the training fold** in `fit()` and applies them to the validation fold in `transform()` (no leakage).

---

##### Place in the pipe

> `CarDataCleaner` → `OutlierHandler` → `GroupImputer` → `CarFeatureEngineer` → encoding/scaling → FS → model

**Justification:**
- Imputation must happen on **original features** first, because feature engineering creates ratios/interactions (e.g., `miles_per_year`, `engine_per_mpg`) that would otherwise explode or become undefined when inputs are missing.
- We impute **before** feature engineering to ensure engineered features are computed on complete, consistent base variables.

---

##### Why medians/modes:

- **Median** is robust to skewed distributions (common in `mileage`, `tax`) and less sensitive to extreme values than the mean.
- **Mode** is the natural robust default for categorical variables.

---

##### Implementation notes:

- `group_cols` are used only to define groups; they themselves are **not imputed**.
- The transformer is deterministic: ties in categorical mode are handled consistently (pandas `.mode()` → first entry).
---


In [None]:
# TODO maybe a visual here that shows how the group imputer works in practice

**Our findings:**
- GroupImputation improves performance over a simple strategy by ...

**Consequences/Interpretation:**
- ...

#### 2.5 Feature Engineering

**Our approach:**
- We implement feature engineering as an sklearn transformer (`CarFeatureEngineer`) **inside the pipeline**.
  - This makes the process **CV-safe / leakage-free**: all fold-specific statistics (e.g., model frequency, mean ages) are learned only on the training fold in `fit()` and applied to the validation fold in `transform()`.
- We engineer features with two goals:
  1. **Inject domain structure** (age, usage intensity, efficiency, “big engine + old car” effects).
  2. **Create stronger signals for models** by expressing ratios and interactions that are difficult to learn reliably from raw variables.

**Input columns used (after cleaning + imputation):**
- Numeric: `year`, `mileage`, `tax`, `mpg`, `engineSize`, `previousOwners`
- Categorical: `brand`, `model`, `transmission`, `fuelType`

**Important design notes:**
- Interaction features use `(age + 1)` to avoid division by zero for cars in the reference year.

---

| **New Feature** | **Calculation** | **Nature** | **Reasoning** |
| :--- | :--- | :--- | :--- |
| `age` | `ref_year - year` | Base | Captures depreciation; turns a calendar value into a meaningful pricing variable. |
| `miles_per_year` | `mileage / (age + 1)` | Interaction (ratio) | Normalizes mileage by lifetime: 60k miles on a 3-year car is very different from 60k on a 10-year car; reduces collinearity between `mileage` and `age`. |
| `mpg_x_engine` | `mpg * engineSize` | Interaction (product) | Joint signal for “performance vs efficiency” patterns (high engine + low mpg vs small engine + high mpg). |
| `engine_x_age` | `engineSize * (age + 1)` | Interaction (product) | Differentiates large engines in older cars vs newer cars; helps model capture age-dependent valuation of engine size. |
| `mileage_x_age` | `mileage * (age + 1)` | Interaction (product) | Amplifies the “old + heavily used” signal which is typically strongly negative for price. |
| `mpg_x_age` | `mpg * (age + 1)` | Interaction (product) | Captures age-dependent fuel-efficiency patterns (e.g., older fleets / technology differences) that can correlate with price. |
| `tax_x_age` | `tax * (age + 1)` | Interaction (product) | Models that tax effects can differ by car age (policy/regime + car segment composition). |
| `tax_per_mpg` | `tax / mpg` | Interaction (ratio) | “Cost pressure” proxy: high tax relative to efficiency can reflect segment / running cost patterns. |
| `engine_per_mpg` | `engineSize / mpg` | Interaction (ratio) | Performance-style signal: high engine with low mpg tends to indicate sporty/luxury configurations. |
| `brand_fuel` | `brand + "_" + fuelType` | Interaction (categorical) | Creates configuration groups for target encoding (e.g., Diesel BMW differs from Petrol BMW). |
| `brand_trans` | `brand + "_" + transmission` | Interaction (categorical) | Creates configuration groups for target encoding (e.g., Automatic Mercedes vs Manual Mercedes). |
| `model_freq` | `P(model)` from training fold | Popularity | Approximates market supply/demand stability: common models have more stable pricing; learned CV-safe in `fit()`. |
| `age_rel_brand` | `age - mean_age(brand)` | Relative / group-stat | Measures whether a car is newer/older than typical within its brand (brand-relative positioning). |
| `age_rel_model` | `age - mean_age(model)` | Relative / group-stat | Measures whether a car is newer/older than typical within its model (model-relative positioning). |
| `engine_rel_model` | `engineSize / mean_engineSize(model)` | Relative / group-stat | Captures whether a car is under-/over-engined relative to its model’s typical configuration. |

---

Legend (feature “nature”)

- **Base Features**: derived from a single original variable (e.g. `age` from `year`)
- **Interaction Features**: combine multiple variables to capture non-additive effects
  - products (“amplifiers”) and ratios (“normalizers”)
- **Popularity Features**: learned from the training fold distribution (e.g. model frequency)
- **Relative / Group-stat Features**: compare a car to typical peers within `brand` or `model`
  - learned in `fit()` and applied in `transform()` to avoid leakage

---

Relation to encoding (Target Encoded Features)

We also create categorical “group keys” (`brand_fuel`, `brand_trans`) specifically so that our later encoding step (median target encoding / QuantileEncoder inside the preprocessing pipeline) can learn stable, configuration-specific signals.  
This encoding is handled **after** feature engineering and is **CV-safe** because it is part of the pipeline.

---


In [None]:
# TODO model_freq is basically frequency_encoding of model -> maybe add it this way because its cleaner?

In [None]:
# TODO maybe add ('poly', PolynomialFeatures(degree=2)) to the pipeline for interaction terms ~J
# TODO maybe add a plot of correlation with target of new features or show feature importance of new features ~J
# Beware that this should only be for visualization processes because if including this in the decision making process it would be data leakage ~J

**Our findings:**
- The engineered features are main drivers for performance improvement

(This part is already kind of an ablation study if we can determine the impact)

| Feature | Impact |
| :--- | :--- |
| age | ... |


**Consequences/Interpretation:**
- ...


#### 2.6 Encoding, Transforming and Scaling

**Our approach:**
- We separate features in their `groups of variables` and combine their different treatments in the ColumnTransformer
    - Numerics vs. Categoricals (e.g. transformation vs. encoding)
    - Unused features:
        - year: dropped because replaced by derived feature 'age'
        - paintQuality: dropped because added by mechanic so not available for our predictions in production
- We have one `baseline pipe` and one `optimized pipe` to compare basic preprocessing to optimized preprocessing
    - The baseline pipe does the bare minimum for the algorithms to work cleanly
    - The optimized pipe was adjusted iteratively through multiple experiments and trials during the process

##### Summary of preprocessing: Baseline vs Optimized (including outliers)

| | **Baseline** | **Optimized** |
| :--- | :--- | :--- |
| **Data cleaning** | minimal manual cleaning | `CarDataCleaner` (Section 2.2) |
| **Outlier handling** | none | `OutlierHandler` (Section 2.3) (IQR + MAD voting) → winsorize (clip to robust bounds) |
| **Imputation** | SimpleImputer median/mode <br>(simplicity; median more robust than mean) | `GroupImputer` (Section 2.4) |
| **Feature engineering** | none | `CarFeatureEngineer` (Section 2.5) |
| **Transformation** | none | Transform selected skewed numerics (Section 2.6) |
| **Scaling** | StandardScaler | Scaling (Section 2.6) |
| **Encoding** | OneHotEncoder | OneHotEncoder + Target Encoding (Section 2.6) |
| **Feature selection** | none | VT + Majority voting (Section 2.7) |

In [None]:
# Original (pre-engineering) feature columns, grouped by how they are typed
# TODO create orig_boolean_features for hasDamage ~J
orig_categorical_features = ["brand", "model", "transmission", "fuelType"]
orig_numeric_features = [
    "year",
    "mileage",
    "tax",
    "mpg",
    "engineSize",
    "previousOwners",
    "hasDamage",
]

In [None]:
# Feature groups used to route columns to their respective preprocessing branch.
# TODO maybe this step can also be automated within the pipeline ~J
# -> Automatically categorize the features into numeric, boolean and categorical features for further processing in the pipeline
# --> Then look at the numeric ones to see which one might benefit from (log) transformation
numeric_features = [
    "hasDamage",
    "age", "tax", "mpg", "engineSize", "previousOwners",        # Original features (mileage is handled separately because of log transformation)
    "mpg_x_engine",                                             # TODO this feature does not really make sense, however it improves MAE slightly (3) ~J
    "engine_x_age", "mpg_x_age", "tax_x_age",                   # multiplication interaction features (multiplying for amplification)
    "engine_per_mpg", "tax_per_mpg",                            # division interaction features (division for normalization for ratios (efficiency))
    "model_freq",
    "age_rel_brand", "age_rel_model", "engine_rel_model"
]
numeric_features_for_log = ["mileage", "miles_per_year"] #, "mileage_x_age"] # mileage_x_age decreases performance slightly
boolean_features = ["hasDamage"]                                # TODO create logic for boolean features in GroupImputer and ColumnTransformer
categorical_features_ohe = ["transmission", "fuelType"]
# categorical_features_te_mean = ["brand", "model"]             # TODO currently not used because median TE is used
categorical_features_te_median = ["brand", "model",             # original features
                                  "brand_fuel", "brand_trans"]  # engineered features for anchors
unused_columns = ["year"]                                       # replaced by age

# "hasDamage" is listed in both numeric_features and boolean_features, so a plain
# concatenation would contain it twice and the printed feature count would be off by one.
# dict.fromkeys removes repeated names while keeping the first-occurrence order.
all_feature_names_before_encoding = list(dict.fromkeys(
    numeric_features
    + numeric_features_for_log
    + boolean_features
    + categorical_features_ohe
    + categorical_features_te_median
))
print(len(all_feature_names_before_encoding))

**Baseline Pipe:**
- Only the necessary steps for the original variables in the baseline pipe
    - Scaling Numerics (Standard)
    - Encoding Categoricals (OHE)

##### 2.6.2 Optimized Pipe

**Our approach:**
- `Transformation` for skewed Numerics:
    - Log-transform for right-skewed variables
    - box-cox not used in final pipe because ...
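- `Scaler` for Numerics (TODO: the pipeline code below currently uses `RobustScaler` in every branch - reconcile it with the choice documented here):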
    - StandardScaler because ...
    - MinMaxScaler performed worse because ...
    - RobustScaler performed worse because ...     
    -> Scaling only on training data to avoid data leakage and then scale val and later test set with the fitted scaler of the training set

- `Encoding` for Categoricals:
    - Low cardinality:
        - OHE because best performance with tree-based models
    - High Cardinality:
        - Median TE on categorical features because performs better than Mean TE

| **Feature** | Nature | Transformation | Encoding | Scaling |
| :--- | :--- | :--- | :--- | :--- |
| age | Numerical | - | - | Standard |
| ... | ... | ... | ... | ... |
| mileage | Numerical | Log | - | Standard |
| ... | ... | ... | ... | ... |
| hasDamage | Boolean | - | - | TODO |
| transmission | Categorical | - | OHE | - |
| ... | ... | ... | ... | ... |
| brand | Categorical | - | TE | Standard |

==> All operations are combined in a `ColumnTransformer` which applies the different steps to different columns of the data in one unified pipeline (reproducible and prevents data leakage)    
  -> outputs a combined feature matrix

In [None]:
# ---- Numeric branches ----
# Skewed numerics: log1p first (log1p handles zeros safely), then scaling
log_transformer_and_scaler = Pipeline(steps=[
    ("log", NamedFunctionTransformer(np.log1p, feature_names=numeric_features_for_log, validate=False)),
    ("scaler", RobustScaler()),
])

# Remaining numerics: scaling only
numeric_scaler = Pipeline(steps=[("scaler", RobustScaler())])

# ---- Categorical branches ----
# One-hot encoding; sparse_output=False returns a dense array (e.g. necessary for hgb)
# TODO maybe drop='first' (like in prac03)
categorical_transformer_ohe = Pipeline(steps=[
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Mean target encoder stays in the code but is not used for now: median TE seems more robust
# and we use only one method for consistency ~J
# categorical_transformer_te_mean = Pipeline([
#     ("encoder", TargetEncoder(target_type='continuous', cv=cv, smooth='auto', random_state=rs)), # Prevents data leakage with CV (e.g. for the samples in Fold 1, it calculates the target mean using the data from Folds 2, 3, 4, and 5) # TODO If it overfits test data too much, increasing the smoothing parameter can help
#     ("scaler", StandardScaler()),
# ])

# Output names of the median-TE branch (one per input column, since QuantileEncoder outputs 1 column per feature)
median_te_feature_names = [f"{feature}_median_te" for feature in categorical_features_te_median]

# Median target encoding: no `cols` given, so the encoder handles all columns it receives.
# m is the smoothing parameter -> smoothing mitigates but doesnt eliminate leakage  # TODO tune m?
categorical_transformer_te_median = Pipeline(steps=[
    ("median_encoder", QuantileEncoder(quantile=0.5, m=10.0)),
    ("scaler", RobustScaler()),
    ("name_wrapper", NamedFunctionTransformer(feature_names=median_te_feature_names, validate=False)),
])

# ---- Combine all branches ----
# TODO put handling of feature groups directly in columntransformer instead of declaring them separately ~J
enc_transf_scale = ColumnTransformer(transformers=[
    ("log", log_transformer_and_scaler, numeric_features_for_log),
    ("num", numeric_scaler, numeric_features),
    ("cat", categorical_transformer_ohe, categorical_features_ohe),
    # ("mean_te", categorical_transformer_te_mean, categorical_features_te_mean), # Mean TE is currently not used but we keep it in the code for reference or later experimenting ~J
    ("median_te", categorical_transformer_te_median, categorical_features_te_median),
])

#### 2.7 Feature Selection

**Our approach:**     
We apply an automatic feature selection approach in addition to the previously removed features (data cleaning, feature engineering)
- year: dropped because replaced by derived feature 'age'
- paintQuality: dropped because it is filled in by a mechanic and is therefore not available at prediction time in production, where the price prediction skips the mechanic

The goal is to create a very robust feature selection approach that finds features that are most likely actually irrelevant/redundant and therefore generate noise in the model that might lead to overfitting.     
To achieve that goal, we apply **two steps** inside the feature selection:
1) `Variance Threshold` (Filter) to filter constant variables ([docu](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html))
2) Majority voter:
    - `Spearman` handles the clean, obvious trends and cleans up redundancy ([docu](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html))
    - `MI` catches more complex relations that Spearman misses ([docu](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html))
    - `RF feature importance` to account for importance of the features ([docu](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html#selectfrommodel))


==> The different voters capture different aspects:
| **Voter** | **Nature** | **Role & Responsibility** |
| :--- | :--- | :--- |
| **Spearman Voter** <br> *(SpearmanRelevancyRedundancySelector)* | Filter | **Linear/Monotonic**<br>Captures obvious, strong relationships (e.g., "Newer cars are expensive"). Also handles redundancy by filtering out features that are (near-)duplicates of better ones. |
| **MI Voter** <br> *(MutualInfoThresholdSelector)* | Filter | **Non-Linear**<br>Captures complex "physics" and non-monotonic patterns that correlation misses. |
| **RF Voter** <br> *(SelectFromModel)* | Embedded | **Interactions**<br>Captures features that are only important in combination with others. |

==> The feature selection is performed inside the pipeline's cross-validation and is identical across all models, which prevents data leakage and keeps the feature selection logic consistent.

<u>Place in the pipe:</u> The Feature Selection is placed after the scaling to have the features on one scale (just like in the lab)

!!!!! OLD markdown !!!!! More for us to understand the techniques and be able to explain them in the project defense

*Filter* methods to make an initial screening of the statistical properties of the data: 
- `Correlation Indices` to filter irrelevant and redundant features (Maximum Relevance, Minimum Redundancy (mRMR)-style pruning).     
    - Metric: We use Spearman because we want a single, unified pipeline step after encoding even though it treats binary OHE columns as "ranks," which is a fine but rough approximation. Spearman because not all features are normally distributed as it would be necessary for Pearson.
    - Irrelevant: Little correlation with the target
    - Redundant: Important because other methods like MI and RF will likely keep redundant features, since each of them looks important on its own if it contains valuable information. However, one of them should be eliminated for cleaner interpretation of tree models and for correct model building in models that work better without multicollinearity between features. Of the redundant features, we keep the one with the higher correlation with the target.

*Wrapper* methods create multiple models and use their performance as a proxy for the relevance of the features, instead of relying only on statistical properties of the data:
- `RFECV` with Random Forest as the base estimator (repeatedly removes the least important feature according to the base estimator's feature importance) ([docu](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html)).

*Embedded* methods perform feature selection as part of the model training process itself -> FS is integrated into the model and is not a separate step (Train model on all features -> Get FI -> Select based on FI):
- `Random Forest` (tree-based method): impurity-based feature importances, selected via sklearn's SelectFromModel

In [None]:
######################## Pre-Filter ########################
# Variance Threshold: with threshold 0 only constant features are removed.
# (If using a different value than threshold 0, VarianceThreshold has to be applied before scaling)
vt = VarianceThreshold(threshold=0.0)

# TODO Use caching for the prefiltering step to avoid re-computation during hyperparameter tuning ~J

######################## Voters ########################
##### Filter: statistical feature selectors use statistical properties of the data to select features

# Voter: combined relevance and redundancy via Spearman correlation
# (sort by relevance and remove redundant features based on correlation to target)
# If we set redundancy_threshold to 1.01, this becomes similar to just relevance filtering.
# TODO maybe use this as a prefilter as well instead of a majority voter
stat_voter_relevancy_redundancy_corr = SpearmanRelevancyRedundancySelector(
    redundancy_threshold=0.95,
    relevance_threshold=0.05,
)

# Voter: Mutual Information (non-linear dependency)
# Increasing n_neighbors makes the estimation more stable but computationally slower.
stat_voter_nonlinear_mi = MutualInfoThresholdSelector(
    n_neighbors=10,
    threshold=0.01,
)


##### Wrapper:
# [Unused] Voter: RFE (Recursive Feature Elimination) is excluded for now because it is very expensive
# TODO maybe use RFECV in the end to find optimal number of features considering also the feature interactions after removing variables compared to the embedded model which does not do that (optimizes strictly against target metric MAE)
# rf_for_fs = RandomForestRegressor(n_jobs=-1, max_depth=50)
# rfecv_rf = RFECV(estimator = rf_for_fs, step=1, random_state=rs, cv=cv, scoring='neg_mean_absolute_error', min_features_to_select=5)
# -> Unused because of high computational cost

##### Embedded:
# Voter: Tree Importance (SelectFromModel trains the model once and selects features based on importance scores above threshold)
# max_depth not too low (would miss interactions) and not too high (would select noise -> overfitting)
rf_for_fs = RandomForestRegressor(
    random_state=rs,
    n_jobs=-1,
    max_depth=8,
    n_estimators=100,
)
# Threshold is relative to the mean importance: importances sum to 1, so with many features
# each one has a low absolute importance even when it is still useful.
select_from_rf = SelectFromModel(estimator=rf_for_fs, threshold="0.001*mean")


# TODO Maybe add printer/loggers in the majority voter to log which features were selected by which voters (see plots in lab1 FS) ~J

# ==> Final FS pipeline
fs_pipe = Pipeline(
    steps=[
        # VT runs first to remove constant features (it serves as a "dictator" and not a "voter" in our pipeline)
        ("vt", vt),
        # A feature is kept when at least `min_votes` of the three voters select it.
        (
            "selector",
            MajorityVoteSelectorTransformer(
                min_votes=2,
                selectors=[
                    stat_voter_relevancy_redundancy_corr,
                    stat_voter_nonlinear_mi,
                    select_from_rf,
                ],
            ),
        ),
    ]
)

**Our findings:**
- While trees are comparatively robust to unnecessary features, applying the feature selection pipeline improves the performance slightly (TODO add MAE difference here when including fs pipe vs. not including fs pipe)


**Consequences/Interpretation:**
- ...

________

SKlearn elements we also considered but decided not to use:
- Filter Methods:
    - SelectFwe (Family-Wise Error Rate)   
    -> too strict and we don't want to be too conservative in our feature selection (we prefer to keep weak but useful signals)
    - SelectKBest & SelectPercentile     
    -> we didn't want to fix k (number or percentage of selected features)
- Wrapper Methods: 
    - RFECV   
    -> too expensive
    - SequentialFeatureSelector (forward, backward selection)   
    -> too expensive
- Embedded:
    - Regularization Method (Lasso)     
    -> considers only linear relationships so discarded

#### 2.8 Create Final Preprocessing Pipeline

**Our approach:**
- The `Pipeline` combines feature engineering, group imputation and the column transformer into the final preprocessing pipe ([docu](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html))
    - Data Cleaning (Section 2.2)
    - Outlier Handling (Section 2.3)
    - Group Imputer (Section 2.4)
    - Feature Engineering (Section 2.5)
    - Column Transformer (Section 2.6)
    - Feature Selection (Section 2.7)
- Through calling the pipeline for data preparation, we ensure that the data is preprocessed independently for each training fold (filling missing values, scaling, encoding, etc.)     
    -> prevent leakage

In [None]:
# Minimal reference preprocessing on the original columns only:
# numeric -> median imputation + standard scaling, categorical -> mode imputation + one-hot encoding.
preprocessor_orig = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            ),
            orig_numeric_features,
        ),
        (
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    # sparse_output=False returns a dense array (e.g. necessary for hgb)
                    # TODO maybe drop='first' (like in prac03)
                    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
                ]
            ),
            orig_categorical_features,
        ),
    ]
)

In [None]:
# Simple per-column imputation (median for numeric, most frequent value for categorical).
# verbose_feature_names_out=False: don't add prefixes to column names.
# Output is set to pandas so CarFeatureEngineer receives a DataFrame (set_output returns the transformer itself).
simple_imputation_ct = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), orig_numeric_features),
        ("cat", SimpleImputer(strategy="most_frequent"), orig_categorical_features),
    ],
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Full preprocessing: cleaning -> imputation -> feature engineering -> encoding/scaling -> feature selection.
preprocessor_pipe = Pipeline(
    steps=[
        ("clean", CarDataCleaner(handle_electric="other", set_carid_index=False, use_fuzzy=True)),
        # ("outliers", OutlierHandler(
        #     cols=[c for c in orig_numeric_features if c != "mileage"],      # only original numeric features here, no mileage because of log transform later
        #     methods=("iqr", "mod_z"),                                       # robust voting
        #     min_votes=2,                                                    # outlier if BOTH methods agree
        #     iqr_k=1.5,
        #     z_thresh=3.5,
        #     action="clip",                                                   
        #     verbose=False,
        # )),
        # ("group_imputer", GroupImputer(
        #     group_cols=("brand", "model"),
        #     num_cols=orig_numeric_features,                                 # We have to use the original features here because the others are engineered in the next step
        #     cat_cols=orig_categorical_features,                             # We have to use the original features here because the others are engineered in the next step
        #     fallback="__MISSING__",
        # )),
        ("simple_imputer", simple_imputation_ct),
        ("fe", CarFeatureEngineer(ref_year=2020)),
        ("ct", enc_transf_scale),
        ("fs", fs_pipe),
    ]
)

# Save preprocessor for reuse in DL experiments
# NOTE(review): the pipeline has not been fitted in this cell, so an unfitted pipeline is
# written to disk - confirm that the consuming notebook fits it before transforming.
with open("preprocessor_pipe.pkl", "wb") as f:
    pickle.dump(preprocessor_pipe, f)

In [None]:
# Visualize outputs of each step in the preprocessing pipeline.
# The debug pipeline mirrors preprocessor_pipe and inserts a DebugTransformer after every step.
# Set show_data=True in DebugTransformer to see the data at each step.
show_data = True
y_data_profiling = True
debug_preprocessor_pipe = Pipeline([
    ('debug_start', DebugTransformer('START', show_data=show_data, y_data_profiling=y_data_profiling)),
    ("clean", CarDataCleaner(handle_electric="other", set_carid_index=False, use_fuzzy=True)),
    ('debug_after_clean', DebugTransformer('AFTER CLEANING', show_data=show_data, y_data_profiling=y_data_profiling)),

    # ("outliers", OutlierHandler(
    #     cols=[c for c in orig_numeric_features if c != "mileage"],
    #     methods=("iqr", "mod_z"),
    #     min_votes=2,
    #     iqr_k=1.5,
    #     z_thresh=3.5,
    #     action="clip",
    #     verbose=True,  # useful in debug pipe
    # )),
    # ('debug_after_outliers', DebugTransformer('AFTER OUTLIER HANDLING', show_data=show_data, y_data_profiling=y_data_profiling)),

    # ("group_imputer", GroupImputer(
    #     group_cols=("brand", "model"),
    #     num_cols=orig_numeric_features, # numeric_features + numeric_features_for_log,                      # We have to use the original features here because the others are engineered in the next step
    #     cat_cols=orig_categorical_features, # categorical_features_ohe + categorical_features_te_median,    # We have to use the original features here because the others are engineered in the next step
    #     fallback="__MISSING__",
    # )),
    ("simple_imputer", simple_imputation_ct),
    ('debug_after_impute', DebugTransformer('AFTER IMPUTATION', show_data=show_data, y_data_profiling=y_data_profiling)),

    ("fe", CarFeatureEngineer(ref_year=2020)),
    ('debug_after_fe', DebugTransformer('AFTER FEATURE ENGINEERING', show_data=show_data, y_data_profiling=y_data_profiling)),

    # ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    # ('debug_after_poly', DebugTransformer('AFTER POLYNOMIAL FEATURES', show_data=show_data, y_data_profiling=y_data_profiling)),

    ("ct", enc_transf_scale),
    ('debug_after_ct', DebugTransformer('AFTER COLUMN TRANSFORMER', show_data=show_data, y_data_profiling=y_data_profiling)),

    ("fs", fs_pipe),
    ('debug_after_fs', DebugTransformer('AFTER FEATURE SELECTION', show_data=show_data, y_data_profiling=y_data_profiling))
])

print("Show outputs of each step in the preprocessing pipeline:")
# We call fit_transform here on the entire training data just to visualize the result.
# The insights from here are not used for anything else in model decisions so it's not leakage.
#
# enc_transf_scale and fs_pipe are the same objects that preprocessor_pipe uses, so switching
# them to pandas output is a temporary mutation of shared state. The try/finally guarantees the
# default (numpy) output used for model training is restored even if fit_transform raises.
try:
    # pandas DataFrames are easier to inspect; numpy arrays (default) are used for efficient model training
    enc_transf_scale.set_output(transform="pandas")
    fs_pipe.set_output(transform="pandas")
    X_result = debug_preprocessor_pipe.fit_transform(X_train, y_train)
finally:
    # Reset output to default (numpy arrays) for model training
    enc_transf_scale.set_output(transform="default")
    fs_pipe.set_output(transform="default")

In [None]:
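# # TODO delete this cell before submission - debugging aid to check that the GroupImputer works (auto-generated, unreviewed).
# # NOTE: stale - the "group_imputer" step is currently commented out in preprocessor_pipe, so named_steps["group_imputer"] below would raise a KeyError if this cell were re-enabled.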

# brand = "VW"
# model = "golf"

# # 1) Get the fitted steps from preprocessor_pipe
# preprocessor_pipe.fit(X_train, y_train)
# fe = preprocessor_pipe.named_steps["fe"]              # CarFeatureEngineer
# imp = preprocessor_pipe.named_steps["group_imputer"]  # GroupImputer

# # 2) Inspect GroupImputer internal numeric stats
# pair_table = getattr(imp, "num_pair_", None)    # indexed by (_g0, _g1) = (Brand, model)
# brand_table = getattr(imp, "num_first_", None)  # indexed by _g0 = Brand
# global_med = getattr(imp, "num_global_", None)  # Series of global medians

# print("Has pair-level medians table:",
#       pair_table is not None and not getattr(pair_table, "empty", True))
# print("Has brand-level medians table:",
#       brand_table is not None and not getattr(brand_table, "empty", True))
# print("Has global median:",
#       global_med is not None and not global_med.empty if global_med is not None else False)
# print()

# _g0 = brand
# _g1 = model

# # 2a) Pair-level
# if pair_table is not None and (_g0, _g1) in pair_table.index:
#     print(f"Pair-level median FOUND for ({brand}, {model}):")
#     display(pair_table.loc[(_g0, _g1)])
# else:
#     print(f"No pair-level median for ({brand}, {model}).")
#     if pair_table is not None and not pair_table.empty:
#         print("Sample of pair-level medians (top 5):")
#         display(pair_table.head())

# # 2b) Brand-level
# if brand_table is not None and _g0 in brand_table.index:
#     print(f"\nBrand-level median for {brand}:")
#     display(brand_table.loc[_g0])
# else:
#     print("\nNo brand-level median for", brand)
#     if brand_table is not None and not brand_table.empty:
#         print("Sample of brand-level medians (top 5):")
#         display(brand_table.head())

# # 2c) Global medians
# print("\nGlobal median (fallback):")
# display(global_med)

# # 3) Apply CarFeatureEngineer + GroupImputer to VW Golf rows and compare
# #    (GroupImputer was fitted after CarFeatureEngineer, so we must mimic that order)

# # 3a) Feature engineering on full X_train
# X_train_fe = fe.transform(X_train)

# # 3b) Filter for VW Golf in the feature-engineered space
# vw_golf = X_train_fe[(X_train_fe["Brand"] == brand) & (X_train_fe["model"] == model)].copy()

# if vw_golf.empty:
#     print("\nNo VW Golf rows found in X_train.")
# else:
#     print(f"\nFound {len(vw_golf)} VW Golf rows in X_train.")

#     # 3c) GroupImputer expects the columns it saw at fit time
#     cols_for_imp = imp.feature_names_in_
#     vw_input = vw_golf.loc[:, cols_for_imp]

#     vw_imp = imp.transform(vw_input)
#     vw_imp_df = pd.DataFrame(vw_imp, columns=cols_for_imp, index=vw_golf.index)

#     print("\nImputed data (first 8 rows):")
#     display(vw_imp_df[["mpg", "mileage", "tax"]].head(8))

#     # 4) Build comparison table (original vs imputed, for selected columns)
#     comp = pd.DataFrame(index=vw_golf.index)
#     comp["orig_mpg"] = vw_golf["mpg"]
#     comp["imp_mpg"] = vw_imp_df["mpg"]
#     comp["orig_tax"] = vw_golf["tax"]
#     comp["imp_tax"] = vw_imp_df["tax"]
#     comp["orig_mileage"] = vw_golf["mileage"]
#     comp["imp_mileage"] = vw_imp_df["mileage"]

#     print("\nOriginal vs imputed (first 12 rows):")
#     display(comp.head(12))

#     # 5) Determine imputation source per row
#     def source_of_imputation(col):
#         srcs = []
#         for idx, row in comp.iterrows():
#             val = row[f"imp_{col}"]
#             src = "other"

#             # Pair-level
#             if pair_table is not None and (_g0, _g1) in pair_table.index and col in pair_table.columns:
#                 pair_val = pair_table.loc[(_g0, _g1), col]
#                 if pd.notna(pair_val) and pd.notna(val) and val == pair_val:
#                     src = "pair"

#             # Brand-level
#             if src == "other" and brand_table is not None and _g0 in brand_table.index and col in brand_table.columns:
#                 brand_val = brand_table.loc[_g0, col]
#                 if pd.notna(brand_val) and pd.notna(val) and val == brand_val:
#                     src = "brand"

#             # Global
#             if src == "other" and global_med is not None and col in global_med.index:
#                 glob_val = global_med[col]
#                 if pd.notna(glob_val) and pd.notna(val) and val == glob_val:
#                     src = "global"

#             srcs.append(src)
#         return srcs

#     comp["src_mpg"] = source_of_imputation("mpg")
#     comp["src_tax"] = source_of_imputation("tax")
#     comp["src_mileage"] = source_of_imputation("mileage")

#     print("\nImputation sources for the shown rows:")
#     display(comp.head(12))

#     # 6) Summary counts: NaN before vs after imputation
#     print("\nSummary counts: NaN before -> NaN after")
#     before = vw_golf[["mpg", "mileage", "tax"]].isna().sum()
#     after = pd.DataFrame({
#         "mpg": comp["imp_mpg"],
#         "mileage": comp["imp_mileage"],
#         "tax": comp["imp_tax"],
#     }).isna().sum()
#     display(pd.DataFrame({"na_before": before, "na_after": after}))

### 3. Model Assessment Strategy

**Our approach:**
- Several metrics are used to compare model performance (on train and val data to evaluate overfitting):
    - MAE (Mean Absolute Error):
        - average absolute deviation between predicted and true car prices
        - easy to interpret in pounds, same metric used in Kaggle competition
    - MAE std across folds:
        - Because MAE is our primary metric we also look at the std across folds to see how much the performance varies on different folds
        - For the other metrics this is not necessary because we get a good idea of the variance from this one std
    - RMSE (Root Mean Squared Error):
        - sensitive to outliers, helps identify large prediction errors
    - R²:
        - Coefficient of determination: proportion of variance explained by the model
        - 1.0 = perfect predictions, 0.0 = same as predicting mean, < 0.0 = worse than mean
- First we run multiple models with their default parameters to find the ones that will most likely be top candidates for the best model
- Finally, we run hyperparameter tuning on the top candidates to find the best model for our use case (best performance on primary metric MAE)

### 4. Model Baseline

We log-transform the target (price) because the EDA showed that it is heavily right-skewed. The model predicts the log-price (which handles outliers more easily) and the prediction is automatically converted back to pounds at the end.

#### 4.1 Compare Default Models: Original vs. Optimized Preprocessing

Use **default parameters** to get a first impression of each model's potential and decide which ones to optimize further (hyperparameter tuning). We only set the same random_state for reproducibility and n_jobs to speed up computations.    
Baseline: DummyRegressor using the median price as prediction

The **log-transform** of the target is performed here because the most straightforward implementation is the TransformedTargetRegressor ([docu](https://scikit-learn.org/stable/modules/generated/sklearn.compose.TransformedTargetRegressor.html)). It applies the transformation before fitting and automatically applies the inverse to the predictions.

In [None]:
def _with_log_target(regressor):
    """Wrap ``regressor`` so that it is trained on a log-transformed target.

    Parameters
    ----------
    regressor : estimator
        The regressor to wrap.

    Returns
    -------
    TransformedTargetRegressor
        Uses ``np.log1p`` as the target transform and ``np.expm1`` as the inverse,
        so predictions come back on the original price scale.
    """
    return TransformedTargetRegressor(
        regressor=regressor,
        func=np.log1p,
        inverse_func=np.expm1,
    )


# NOTE(review): each "_orig"/"_adjusted" pair below receives the same estimator object, and all
# "_adjusted" pipelines receive the same preprocessor_pipe object. This is harmless as long as
# create_model_pipe / get_cv_results clone before fitting - TODO confirm.

### Baseline (no target transform) ###
baseline_median_pipe_orig = create_model_pipe(preprocessor_orig, DummyRegressor(strategy="median"))
baseline_median_pipe_adjusted = create_model_pipe(preprocessor_pipe, DummyRegressor(strategy="median"))

### Linear Models ###
linear_reg_default = _with_log_target(LinearRegression())
linear_reg_pipe_orig = create_model_pipe(preprocessor_orig, linear_reg_default)
linear_reg_pipe_adjusted = create_model_pipe(preprocessor_pipe, linear_reg_default)


elasticnet_default = _with_log_target(ElasticNet(random_state=rs))
elastic_pipe_orig = create_model_pipe(preprocessor_orig, elasticnet_default)
elastic_pipe_adjusted = create_model_pipe(preprocessor_pipe, elasticnet_default)
# Long Duration (~30sec)


### Instance-Based ###
knn_default = _with_log_target(KNeighborsRegressor(n_jobs=-3))
knn_pipe_orig = create_model_pipe(preprocessor_orig, knn_default)
knn_pipe_adjusted = create_model_pipe(preprocessor_pipe, knn_default)
# Long Duration (~4min)
# => Better performance than linear models but still worse than tree-based models -> not further optimized


### Neural Networks ###
mlp_default = _with_log_target(MLPRegressor(random_state=rs))  # TODO argue some rule of thumb parameters
mlp_pipe_orig = create_model_pipe(preprocessor_orig, mlp_default)
mlp_pipe_adjusted = create_model_pipe(preprocessor_pipe, mlp_default)
# Long Duration (~4min)
# => Worse performance than KNN and tree-based models (notably, orig better than preprocessed)


### Tree-Based Models ###
hgb_default = _with_log_target(HistGradientBoostingRegressor(random_state=rs, loss='squared_error'))
hgb_pipe_orig = create_model_pipe(preprocessor_orig, hgb_default)
hgb_pipe_adjusted = create_model_pipe(preprocessor_pipe, hgb_default)
# Long Duration (~1mins)


rf_default = _with_log_target(RandomForestRegressor(random_state=rs, n_jobs=-1, criterion='squared_error'))
rf_pipe_orig = create_model_pipe(preprocessor_orig, rf_default)
rf_pipe_adjusted = create_model_pipe(preprocessor_pipe, rf_default)
# Long Duration (~5mins)
# Good performance -> further hyperparameter tuning

et_default = _with_log_target(ExtraTreesRegressor(random_state=rs, n_jobs=-1, criterion='squared_error'))
et_pipe_orig = create_model_pipe(preprocessor_orig, et_default)
et_pipe_adjusted = create_model_pipe(preprocessor_pipe, et_default)
# Long Duration (~7mins)
# Good performance -> further hyperparameter tuning

### Kernel-Based Models ###
svr_default = _with_log_target(SVR())
svr_pipe_orig = create_model_pipe(preprocessor_orig, svr_default)
svr_pipe_adjusted = create_model_pipe(preprocessor_pipe, svr_default)
# Long Duration (~12mins)
# => Much worse performance than other models -> not further optimized


### Ensemble Meta Model ###
# The 'final_estimator' (Meta-Learner) looks at the predictions from the estimators and decides how to combine them.
# The base estimators are the complete adjusted pipelines (preprocessing + log-target model).
stacking_model = StackingRegressor(
    estimators=[
        ('rf_main', rf_pipe_adjusted),
        ('linear_extrapolator', elastic_pipe_adjusted)
    ],
    final_estimator=LinearRegression(), # A linear final estimator allows the prediction to go beyond bounds (extrapolate)
    n_jobs=-1,
    passthrough=False # False = Meta-learner only sees the PREDICTIONS of the base models
)

#### 4.2 Run the models

In [None]:
# Models to cross-validate in this run (name -> pipeline); comment entries in or out to choose.
# Insertion order is the order in which the models are passed to get_cv_results.
default_models = dict(
    # Baseline_Median_orig=baseline_median_pipe_orig,
    # Baseline_Median=baseline_median_pipe_adjusted,
    # ElasticNet_orig=elastic_pipe_orig,
    # ElasticNet=elastic_pipe_adjusted,
    # KNN_orig=knn_pipe_orig,
    # KNN=knn_pipe_adjusted,
    HGB_orig=hgb_pipe_orig,
    HGB=hgb_pipe_adjusted,
    # RF_orig=rf_pipe_orig,
    RF=rf_pipe_adjusted,
    # ET_orig=et_pipe_orig,
    ET=et_pipe_adjusted,
    # SVR_orig=svr_pipe_orig,  # TODO remove comment for final run and submission, (currently too slow)
    # SVR=svr_pipe_adjusted,
    # Stack_orig=stack_pipe_orig,
    # Stack=stack_pipe_adjusted,
)

# Cross-validate every selected model on the training data and show the metrics table.
default_models_results_df = get_cv_results(
    default_models,
    X_train,
    y_train,
    cv=cv,
    rs=rs,
)
display(default_models_results_df)

# Long Duration (~15mins)

### Results
# After setting up everything

# model	preprocessing	val_MAE	std_MAE	val_RMSE	val_R2	train_MAE	train_std_MAE	train_RMSE	train_R2
# 0	RF	    optimized	1337.5711	20.7784	2394.1089	0.9396	511.1459	2.0723	921.3749	0.9910
# 1	ET	    optimized	1352.4122	16.6569	2344.0351	0.9421	15.5697	0.3377	168.3326	0.9997
# 2	ET_orig	original	1442.9137	12.9364	2552.8816	0.9313	4.0813	0.3644	88.9362	0.9999
# 3	RF_orig	original	1474.5770	22.9152	2576.9752	0.9300	547.5251	1.5878	969.5442	0.9901
# 4	HGB	    optimized	    1491.8818	17.1434	2422.1513	0.9382	1443.1274	5.4173	2239.4684	0.9471
# 5	KNN	    optimized	    1563.6059	23.1749	2782.3605	0.9185	1263.3461	1.4033	2249.3184	0.9466
# 6	HGB_orig	original	1721.0859	20.6166	2799.4723	0.9174	1666.1467	4.7953	2624.5468	0.9273
# 7	KNN_orig	original	1767.8346	12.5795	3090.0236	0.8994	1424.8752	3.1977	2475.4926	0.9354
# 8	ElasticNet	    optimized	2738.8684	17.0733	4556.0807	0.7812	2753.6263	5.3896	4557.6904	0.7809
# 9	ElasticNet_orig	original	3674.8651	10.9441	5803.9831	0.6448	3674.0435	4.3854	5801.7026	0.6450
# 10	Baseline_Median_orig	original	6801.3202	15.0554	9976.8558	-0.0499	6801.1833	3.7168	9976.8186	-0.0499
# 11	Baseline_Median	        optimized	6801.3202	15.0554	9976.8558	-0.0499	6801.1833	3.7168	9976.8186	-0.0499

# Replace values with np.NaN instead of pd.NA and use simple-imputer
# model	preprocessing	val_MAE	std_MAE	val_RMSE	val_R2	train_MAE	train_std_MAE	train_RMSE	train_R2
# 0	RF	    optimized	1311.1342	18.5152	2306.1141	0.9440	496.8170	1.6132	880.5758	0.9918
# 1	ET	    optimized	1320.0298	10.5134	2243.3910	0.9470	23.8594	0.6903	223.5786	0.9995
# 2	HGB	    optimized	1463.2612	19.6563	2344.3935	0.9421	1397.8480	4.9193	2155.0675	0.9510
# 3	KNN	    optimized	1528.4661	19.1572	2706.9532	0.9228	1223.5450	5.3128	2167.5167	0.9504
# 4	ET_orig	original	1442.9137	12.9364	2552.8816	0.9313	4.0813	0.3644	88.9362	0.9999
# 5	RF_orig	original	1474.5770	22.9152	2576.9752	0.9300	547.5251	1.5878	969.5442	0.9901
# 6	HGB_orig	original	1721.0859	20.6166	2799.4723	0.9174	1666.1467	4.7953	2624.5468	0.9273
# 7	KNN_orig	original	1767.8346	12.5795	3090.0236	0.8994	1424.8752	3.1977	2475.4926	0.9354


# Log-Transform Target
# 0	RF	optimized	1288.7907	15.2675	2242.0406	0.9470	493.7640	1.6063	954.6323	0.9904
# 1	ET	optimized	1315.4824	11.2401	2253.5793	0.9465	23.7744	0.7060	224.6614	0.9995
# 2	HGB	optimized	1454.5998	10.7600	2443.8428	0.9371	1404.8214	5.5232	2326.7533	0.9429
# 3	ET_orig	original	1442.8195	13.4039	2568.2720	0.9305	4.0362	0.3626	89.2101	0.9999
# 4	RF_orig	original	1464.0531	18.1173	2620.6917	0.9276	551.0627	1.9496	1092.9956	0.9874
# 5	HGB_orig	original	1715.8092	9.9868	2969.5871	0.9070	1677.5670	4.7293	2870.0856	0.9131

# transmission unknown to nan
# model	preprocessing	val_MAE	std_MAE	val_RMSE	val_R2	train_MAE	train_std_MAE	train_RMSE	train_R2
# 0	RF	optimized	1288.1381	15.0811	2239.6206	0.9471	493.6988	1.4552	954.1244	0.9904
# 1	ET	optimized	1316.3009	9.7471	2256.6712	0.9463	23.8059	0.7017	224.7270	0.9995
# 2	HGB	optimized	1453.9956	13.2153	2458.2873	0.9363	1401.2912	4.6736	2328.1083	0.9428
# 3	HGB_orig	original	1715.8092	9.9868	2969.5871	0.9070	1677.5670	4.7293	2870.0856	0.9131


# Fixed GI

#### 4.5 Findings

**Our findings:**
- All models perform better on the adjusted pipeline with the following added components
    - Imputation
    - Feature Engineering
    - Encoding
    - Transforming and Scaling
    - Feature Selection


| **Model** | **Performance** | **Reasoning**  | **Next steps** |
| :--- | :--- | :--- | :--- |
| **ElasticNet** <br> *(Linear)* | ... | ... | Discard |
| **KNN** <br> *(Instance-based)* | ... | ... | ... |
| **Bagging** <br> *(Tree-based)* | ... | ... | ... |
| **RF** <br> *(Tree-based)* | ... | ... | Optimize |
| **ET** <br> *(Tree-based)* | ... | ... | ... |
| **HGB** <br> *(Tree-based)* | ... | ... | ... |
| **SVR** <br> *(Kernel-based)* | ... | ... | ... |

### 5. Hyperparameter Tuning of Preselected Models

**Our approach:**
- After the first runs we only keep the top candidates for further hyperparameter tuning to focus on most promising approaches and not waste computing power.
- After first experiments we decided to skip hyperparameter-tuning for...
- We tune using RandomizedSearchCV ([docu](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)) which calls the pipeline object for consistent preprocessing. An example by sklearn of calling the pipeline similar to this can be found [here](https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html#sphx-glr-auto-examples-compose-plot-compare-reduction-py).

##### 5.2 [Tree-Based] HistGradientBoost

In [None]:
# hgb_param_dist = {
#     "preprocess__fs__vt__threshold": [0.0, 0.005, 0.01],
#     "model__loss": ['absolute_error'],
#     # 'fs__selector__relevance_threshold': [0.01, 0.05],          # TODO 
#     # 'fs__selector__redundancy_threshold': [0.85, 0.95, 1.01],   # TODO If this is 1.01, redundancy filtering is disabled -> hp-tuning will tell whether redundancy selection improves model performance
#     # "preprocess__fs__selector__min_votes": [1, 2, 3],         # TODO try different vote thresholds for MajorityVoteSelectorTransformer in FS pipeline
#     "model__learning_rate": uniform(0.01, 0.15),                
#     "model__max_leaf_nodes": randint(50, 170),         
#     "model__min_samples_leaf": randint(2, 20),         
#     "model__max_iter": randint(200, 900),              
#     "model__l2_regularization": uniform(0.0, 1.0),      
#     "model__early_stopping": [True],
#     "model__validation_fraction": [0.1],
#     "model__n_iter_no_change": [20],
#     "model__random_state":[rs]
# }

# # optimized the parameter distributions based on previous runs to focus search space
# hgb_param_dist = {
#     "preprocess__fs__vt__threshold": [0.0],
#     "model__loss": ['absolute_error'],
#     # 'preprocess__fs__selector__selectors__0__relevance_threshold': [0.01, 0.05],          # TODO 
#     # 'preprocess__fs__selector__selectors__0__redundancy_threshold': [0.85, 0.95, 1.01],   # TODO If this is 1.01, redundancy filtering is disabled -> hp-tuning will tell whether redundancy selection improves model performance
#     # "preprocess__fs__selector__min_votes": [2],                             # TODO try different vote thresholds for MajorityVoteSelectorTransformer in FS pipeline
#     "model__learning_rate": [0.05889383578028271],
#     "model__max_leaf_nodes": [139],
#     "model__min_samples_leaf": [4],
#     "model__max_iter": [602],
#     "model__l2_regularization": [0.8583588048137198],
#     "model__early_stopping": [True],
#     "model__validation_fraction": [0.1],
#     "model__n_iter_no_change": [20],
#     "model__random_state":[rs]
# }

# hgb_tuned_pipe, hgb_random_search_object, hgb_scores_dict = model_hyperparameter_tuning(X_train, y_train, cv, hgb_pipe_adjusted, hgb_param_dist, n_iter=50)
# joblib.dump(hgb_tuned_pipe, "hgb_tuned_pipe.pkl")

# # Results

##### 5.3 [Tree-Based] RandomForest

In [None]:
# Old (wide) parameter distribution from earlier search rounds.
# NOTE: kept for reference only -- `rf_param_dist` is reassigned right below, so this dict
# never reaches `model_hyperparameter_tuning`.
# NOTE(review): the keys here use `model__<param>`, whereas the active dict below uses
# `model__regressor__<param>`; presumably they must be renamed before this dict is reused -- confirm.
rf_param_dist = {
    "preprocess__fs__vt__threshold": [0.0],
    # 'fs__selector__relevance_threshold': [0.01, 0.05],          # TODO 
    # 'fs__selector__redundancy_threshold': [0.85, 0.95, 1.01],   # TODO If this is 1.01, redundancy filtering is disabled -> hp-tuning will tell whether redundancy selection improves model performance
    # "preprocess__fs__selector__min_votes": [1, 2, 3],         # TODO try different vote thresholds for MajorityVoteSelectorTransformer in FS pipeline
    "model__criterion": ["absolute_error"],                 # use MAE as split criterion
    "model__n_estimators": randint(200, 600),               # number of trees
    "model__max_depth": randint(5, 40),                     # depth of each tree
    "model__min_samples_split": randint(2, 10),             # min samples to split an internal node
    "model__min_samples_leaf": randint(1, 12),               # min samples per leaf (increase to reduce overfitting)
    "model__min_weight_fraction_leaf": uniform(0.0, 0.1),   # min weighted fraction per leaf
    "model__max_features": ["sqrt"],                        # feature sampling strategy (sqrt performed better than log2 and None in previous tests)
    # "model__max_leaf_nodes": randint(20, 100),            # max number of leaf nodes
    "model__min_impurity_decrease": uniform(0.0, 0.05),     # min impurity decrease to split
    "model__bootstrap": [True, False],                      # use bootstrapping or not (False performed better than True in previous tests)
    # "model__oob_score": [True, False],                      # whether to use out-of-bag samples to estimate the R² on unseen data
}

# So far best parameter distribution based on previous runs to focus search space.
# Every entry is a single-element list, i.e. the search space contains exactly one candidate.
rf_param_dist = {
    "preprocess__fs__vt__threshold": [0.0],
    "model__regressor__criterion": ['squared_error'], # Use “absolute_error” to optimize for MAE but its significantly slower than when using “squared_error” (~5x) (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
    "model__regressor__n_estimators": [328],
    "model__regressor__max_depth": [20],
    "model__regressor__min_samples_split": [5],
    "model__regressor__min_samples_leaf": [1],
    "model__regressor__max_features": ["sqrt"],
    "model__regressor__bootstrap": [False],
    "model__regressor__oob_score": [False], # TODO try True
}

# n_iter=1 matches the single candidate defined above.
rf_tuned_pipe, rf_random_search_object, rf_scores_dict = model_hyperparameter_tuning(X_train, y_train, cv, rf_pipe_adjusted, rf_param_dist, n_iter=1)
# Persist the tuned pipeline; the file name is reused by `predict_on_test("rf_tuned_pipe.pkl", "RF")` further down.
joblib.dump(rf_tuned_pipe, "rf_tuned_pipe.pkl")

# Long Duration (~6min)

# Set transmission Unknown to NaN
# MAE: 1233.3946
# RMSE: 2170.1968
# R²: 0.9504
# Best Model params: {'preprocess__fs__vt__threshold': 0.0, 'model__regressor__oob_score': False, 'model__regressor__n_estimators': 328, 'model__regressor__min_samples_split': 5, 'model__regressor__min_samples_leaf': 1, 'model__regressor__max_features': 'sqrt', 'model__regressor__max_depth': 20, 'model__regressor__criterion': 'squared_error', 'model__regressor__bootstrap': False}

# Use Median AND Mean TE to let the model decide which to use at what node (probably increases performance but decreases interpretation)


# TODO change to “absolute_error” again for final best performance but for other tests use squared_error to save time

In [None]:
# Deliberate guard: abort "Restart & Run All" at this point.
# An explicit exception (instead of bare words, which are a SyntaxError) keeps the notebook
# parseable for tooling such as linters or script export while still halting execution here.
raise RuntimeError("stop here")

In [None]:
# TODO Remove looking at FI here (not part of tuning) -> just for inspection of feature importances after tuning ~J

# Take the feature names from the tuned pipeline's OWN fitted preprocessing steps (ct -> fs),
# not from a separately fitted debug preprocessor: the importances below belong to the model
# inside `rf_tuned_pipe`, so names and importances must come from the same fitted object.
rf_preprocess = rf_tuned_pipe.named_steps["preprocess"]
feature_names_after_fs = rf_preprocess.named_steps["fs"].get_feature_names_out(
    rf_preprocess.named_steps["ct"].get_feature_names_out()
)
feat_names = feature_names_after_fs
importances = rf_tuned_pipe.named_steps["model"].regressor_.feature_importances_

# Fail with a readable message instead of an opaque pandas length error if the two ever diverge.
if len(feat_names) != len(importances):
    raise ValueError(
        f"Feature names ({len(feat_names)}) and RF importances ({len(importances)}) are not aligned."
    )

df_feat_importance_rf = (
    pd.DataFrame({"feature": feat_names, "importance": importances})
    .sort_values("importance", ascending=False)
)

print("Feature Importances:")
for _, row in df_feat_importance_rf.iterrows():
    print(f"{row['feature']:30s}: {row['importance']:.6f}")

##### 5.4 [Tree-Based] Extra Trees

In [None]:
# Old (wide) parameter distribution from earlier search rounds.
# NOTE: kept for reference only -- `et_param_dist` is reassigned right below, so this dict
# never reaches `model_hyperparameter_tuning`.
et_param_dist = {
    "preprocess__fs__vt__threshold": [0.0],
    "model__criterion": ["absolute_error"],
    "model__n_estimators": randint(200, 600),        # number of trees
    "model__max_depth": randint(5, 40),              # depth of each tree
    "model__min_samples_split": randint(2, 10),      # min samples to split an internal node
    "model__min_samples_leaf": randint(1, 8),        # min samples per leaf
    "model__min_weight_fraction_leaf": [0.0, 0.1],    # min weighted fraction of total sum of weights required at a leaf node
    "model__max_features": ["sqrt"],           # feature sampling strategy (sqrt performed better than log2 and None in previous tests)
    "model__bootstrap": [False]                      # use bootstrapping or not (False performed better than True in previous tests)
}

# So far best parameter distribution based on previous runs to focus search space.
# Every entry is a single-element list, i.e. the search space contains exactly one candidate.
et_param_dist = {
    "preprocess__fs__vt__threshold": [0.0],
    "model__criterion": ["squared_error"],
    "model__n_estimators": [328],
    "model__max_depth": [20],
    "model__min_samples_split": [5],
    "model__min_samples_leaf": [1],
    "model__max_features": ["sqrt"],
    "model__bootstrap": [False],
}

# n_iter=1 (same as the RandomForest cell): with a one-candidate space, asking for 50 iterations
# cannot evaluate more than that single candidate and only triggers a "space smaller than n_iter" warning.
# Raise n_iter again when the lists above are replaced by real distributions.
et_tuned_pipe, et_random_search_object, et_scores_dict = model_hyperparameter_tuning(X_train, y_train, cv, et_pipe_adjusted, et_param_dist, n_iter=1)

joblib.dump(et_tuned_pipe, "et_tuned_pipe.pkl")

# Before HP-tuning
# MAE: 1389.3234
# RMSE: 2471.9891
# R²: 0.9356
# Best Model params: {'preprocess__fs__vt__threshold': 0.0, 'model__n_estimators': 328, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__criterion': 'absolute_error', 'model__bootstrap': False}

# TODO use “absolute_error” for final best performance

# After HP-tuning

##### 5.6 [Combination] StackingRegressor

In [None]:
# Old (wide) parameter distribution for the stacking meta-learner (`final_estimator__*`).
# NOTE: kept for reference only -- `stack_param_dist` is reassigned right below.
stack_param_dist = {
    "final_estimator__learning_rate": uniform(0.02, 0.1),
    "final_estimator__max_depth": randint(3, 10),
    "final_estimator__min_samples_leaf": randint(3, 20),
    "final_estimator__l2_regularization": uniform(0.0, 1.0),
}

# So far best parameter distribution based on previous runs to focus search space
# (single-element lists -> exactly one candidate).
stack_param_dist = {
    "final_estimator__learning_rate": [0.061135390505667866],
    "final_estimator__max_depth": [5],
    "final_estimator__min_samples_leaf": [10],
    "final_estimator__l2_regularization": [0.19438003399487302]
}

# NOTE(review): the RF and ET tuning cells call
#   model_hyperparameter_tuning(X_train, y_train, cv, <pipe>, <param_dist>, n_iter=...)
# whereas this call omits `cv` and passes `splits=3` instead. Presumably it still targets an older
# signature of the helper -- verify against the helper's definition before re-running this cell.
stack_tuned_pipe, stack_random_search_object, stack_scores_dict = model_hyperparameter_tuning(X_train, y_train, stack_pipe_fe, stack_param_dist, splits=3)
# joblib.dump(stack_tuned_pipe, "stack_best.pkl")


# Long Duration (~3mins)

# MAE: 1351.8682
# RMSE: 2498.2822
# R²: 0.9342

# After RandomizedSearchCV:
# MAE: 1350.4717
# RMSE: 2497.0474
# R²: 0.9343
# Best Model params: {'final_estimator__l2_regularization': np.float64(0.978892858275009), 'final_estimator__learning_rate': np.float64(0.06867421529594551), 'final_estimator__max_depth': 6, 'final_estimator__min_samples_leaf': 13}

# Removed ElasticNet from stacking due to poor performance compared to RF and HGB alone
# canceled but the cv scores didnt seem to show much improvement

# Using transmission and fuelType as OHE instead of TE():
# MAE: 1357.4291
# RMSE: 2516.5470
# R²: 0.9333
# Best Model params: {'final_estimator__l2_regularization': np.float64(0.19438003399487302), 'final_estimator__learning_rate': np.float64(0.061135390505667866), 'final_estimator__max_depth': 5, 'final_estimator__min_samples_leaf': 10}


# Removed fillna(0) in feature engineering for a_x_b and model_freq():
# was worse for hgb and rf so not tested for stacking

# ...

# implemented GroupModeImputer
# MAE: 1329.2379
# RMSE: 2453.0239
# R²: 0.9366
# Best Model params: {'final_estimator__min_samples_leaf': 10, 'final_estimator__max_depth': 5, 'final_estimator__learning_rate': 0.061135390505667866, 'final_estimator__l2_regularization': 0.19438003399487302}

# Fixed GroupImputer and added Feature Engineering to pipeline
# MAE: 1369.6876
# RMSE: 2516.2583
# R²: 0.9333


### 6. Comparison of Fine-Tuned Models

**Our approach:**
- The performance metrics are compared on the same data split to ensure a fair comparison (same CV seed)
- We compare the performance of the preselected models based on 3 metrics with 1 primary metric
    - MAE
    - RMSE
    - R2
- We compare the mean results on the training and on the validation data to evaluate overfitting of the model

In [None]:
# Collect the score dicts returned by the tuning cells (one entry per tuned model).
# NOTE(review): within this part of the notebook the only assignment of `hgb_scores_dict` sits in
# the commented-out HGB tuning cell -- presumably it has to be restored for a fresh-kernel run; confirm.
model_scores = {
    "hgb_tuned": hgb_scores_dict,
    "rf_tuned": rf_scores_dict,
    # "et_tuned": et_scores_dict,
    # "stack_tuned": stack_scores_dict,
}

# Models become rows (transpose), columns are put into a fixed order (validation metrics first),
# and the table is ranked by the primary metric val_mae.
df_scores = (
    pd.DataFrame(model_scores)
    .T[['val_mae', 'val_rmse', 'val_r2', 'train_mae', 'train_rmse', 'train_r2']]
    .sort_values(by='val_mae')
)

print("Model Comparison Table:")
display(df_scores)

In [None]:
plot_val_mae_comparison(df_scores)
plot_train_val_comparison(df_scores)

**Our findings:**
- Primary Metric val MAE:
    - RF performs best...
- Train vs. Val Score (Overfitting):
    - RF overfits significantly more...
- Secondary Metrics
    - RMSE:
    - R2:


==> Final decision: Use RF because it has the lowest MAE and our final goal is to minimize the MAE. Therefore, we accept the fact that the RF is overfitting...

**Comparison of optimized model with previous models:**
- Hyperparameter tuning massively improves the performance:

| **Model** | **Performance** | **Biggest Change in HPs compared to default model** |
| :--- | :--- | :--- |
| **HGB** <br> *(Tree-based)* | ... | ... |
| **RF** <br> *(Tree-based)* | ... | ... |
| **ET** <br> *(Tree-based)* | ... | ... |

### 7. Deployment

In [None]:
# TODO add code for deployment here

### 8. Prediction on Test Set (Kaggle Competition)

In [None]:
def predict_on_test(model_pipeline, model_name):
    """
    Load a persisted pipeline, predict prices for the test set and write a submission CSV.

    Parameters
    ----------
    model_pipeline : str or path-like
        Location of the joblib file holding the fitted pipeline (handed to `joblib.load`).
    model_name : str
        Tag inserted into the output file name `Group05_{model_name}_Version12.csv`.

    Side effects
    ------------
    - Stores the predictions in the `price` column of the notebook-level `df_cars_test`
      (overwriting the column if it already exists).
    - Writes the columns `carID` and `price` to the CSV file named above.

    Notes
    -----
    `joblib.load` unpickles arbitrary objects; only load pipeline files created by this project.
    """
    pipe_best = joblib.load(model_pipeline)

    # A previous call leaves its predictions in df_cars_test['price']. Remove that column from the
    # model input so every call predicts on the same raw test features, independent of call order.
    X_test = df_cars_test.drop(columns=["price"], errors="ignore")

    # Predict on test set and export the submission file
    df_cars_test['price'] = pipe_best.predict(X_test)
    df_cars_test[['carID', 'price']].to_csv(f'Group05_{model_name}_Version12.csv', index=False)

In [None]:
predict_on_test("hgb_final_shap_pipe.pkl", "HGB")

In [None]:
predict_on_test("rf_tuned_pipe.pkl", "RF")

In [None]:
# predict_on_test("stack_pipe.pkl", "Stack")

In [None]:
# !kaggle competitions submit -c cars4you -f Group05_RF_Version12.csv -m "Message" # Uncomment to submit to Kaggle (file name must match a CSV written by predict_on_test)

In [None]:
!kaggle competitions submissions -c cars4you

### 9. Visualizations and Analysis of Best Model (Pipeline Processes)

#### 9.7 Feature Selection

The votes of each contributor are shown, resulting in the final decision whether to keep the feature or not.

In [None]:
# Pass the feature names as they are AFTER the VarianceThreshold step: VT runs before the
# majority-vote selector inside `fs` (to remove constant features), so the selector's votes
# refer to the VT output columns.
feature_names_after_vt = debug_preprocessor_pipe.named_steps['fs'].named_steps['vt'].get_feature_names_out()
plot_selector_agreement(
    majority_selector = debug_preprocessor_pipe.named_steps['fs'].named_steps['selector'], 
    feature_names = feature_names_after_vt
)

In [None]:
# TODO we need good argument to justify the final features (notes from lab)

### 10. Findings and Outlook

**Tree-based models performed best:**     
Regarding the baseline models we used in Section 5, it becomes clear that the tree-based models outperform the other models. This is probably due to ...

**Constraints:**     
However, the nature of tree-based models constrains the predictions to never be lower than the lowest or higher than the highest price in the train set. This is because tree-based models return the average price of the cars in the leaf node.

**Outlook:**     
A potential solution to this could be a Stacking Regressor that combines a tree-based model with another model that is better at extrapolation (e.g. RF + Ridge). A final meta-learner (also linear) can combine their predictions: if the RF predicts its maximum value but the linear model predicts a higher value, the meta-learner can follow the linear trend upward.


### 8. Open-Ended-Section

#### 8.1 SHAP Interpretability for Our Final Tree Model (Informative Only)

##### a) Objective and motivation (0.5v)

After building a strong pipeline (data cleaning → imputation → feature engineering → encoding/scaling → VT + majority-vote FS → tuned tree model), we use **SHAP (SHapley Additive exPlanations)** purely for **interpretability**.

Goals:
- Identify the **most influential features** for our final tuned tree model (`hgb_tuned_pipe`).
- Validate whether feature effects are **plausible** (age, mileage, engine, etc.).
- Understand whether **target encodings** dominate and how engineered interactions contribute.

Important: **SHAP does not change the model or feature set.** We do not build a new pipeline based on SHAP.

---

##### b) Difficulty of the task (1v)

This was non-trivial because SHAP must explain the model input **after** our preprocessing:

- The model does not see raw columns. It sees:
  - engineered features,
  - OHE columns,
  - median target-encoded columns,
  - and the reduced subset after **VT + majority voting**.
- We therefore implemented a helper to reconstruct:
  - the exact **post-preprocess feature matrix**, and
  - aligned **feature names** after applying both selection masks (VT support + majority selector mask).
- For `HistGradientBoostingRegressor`, SHAP’s **additivity check** can fail even with correct shapes. We handle this safely by disabling it (`check_additivity=False`) and keeping a robust fallback explainer if needed.
- Runtime: SHAP is expensive, so we compute explanations on a **subsample** (`sample_size=1000`) plus a small background set.

---

##### c) Correctness and efficiency (1v)

We kept the analysis correct and consistent with the production pipeline:

- **No leakage / no optimization loop:** SHAP is computed on the already fitted tuned model and used only to interpret it.
- **Exact alignment:** feature names are derived from the ColumnTransformer output and then filtered by VT + majority voting masks.
- **Global SHAP importance:** we rank features by mean absolute contribution:
  
  $$
  \mathrm{Importance}(\mathrm{feature}_j) = \frac{1}{N}\sum_{i=1}^{N} \left|\mathrm{SHAP}_{i,j}\right|
  $$

- **Efficient computation:** stable ranking via subsampling.

---

##### d) Results and interpretation (1v)

Model context:
- Final tuned model: `hgb_tuned_pipe`
- Total features after preprocessing + FS: **26**

Top drivers (mean |SHAP|), excerpt:

| Feature | Importance | Interpretation |
|---|---:|---|
| `median_te__model_median_te` | 2850.28 | Model-level median target encoding (strong market-value proxy) |
| `num__mpg_x_age` | 1659.43 | Interaction: MPG × age |
| `num__engineSize` | 1367.63 | Engine size (segment/performance proxy) |
| `log__mileage` | 1151.75 | Log mileage (diminishing marginal effect) |
| `num__age_rel_model` | 642.22 | Age relative to typical age within the model |
| `median_te__brand_trans_median_te` | 500.44 | Brand × transmission median target encoding |
| `num__engine_per_mpg` | 497.18 | Performance/efficiency ratio |
| `cat__transmission_Manual` | 406.37 | Manual transmission effect |

Key takeaways:
- **Target encodings dominate** global importance, especially the model-level encoding. This is expected because model identity carries a large fraction of price signal.
- **Engineered interactions matter** (`mpg_x_age`, `engine_per_mpg`, `mpg_x_engine`), confirming that our feature engineering adds useful non-additive structure.
- **Mileage and age appear in strong, intuitive forms** (log mileage, relative age vs model/brand), supporting both predictive performance and interpretability.

Beeswarm plot (distribution of effects):
- **`median_te__model_median_te` dominates** and shows a wide SHAP spread → model identity (via median target encoding) is the strongest pricing signal.
- **Mileage effect is non-linear** (`log__mileage`): low mileage produces strong positive contributions; high mileage pushes predictions down, but with diminishing marginal impact (consistent with log-transform).
- **Engine/performance features matter across many cars** (`num__engineSize`, `num__engine_per_mpg`, `num__mpg_x_age`) and show heterogeneous spreads → effects differ by segment (e.g., sporty vs economy cars).
- **Relative positioning features stabilize predictions** (`num__age_rel_model`, `num__engine_rel_model`): the model compares a car to what is “typical” within its model/segment, not only absolute values.
- **Transmission signal is consistent** (`cat__transmission_Manual`): manual cars tend to shift predictions in one direction (dataset-dependent), but the spread indicates exceptions (model/brand interactions).


---

##### e) Alignment with objectives (0.5v)

This section adds transparency without changing the modeling procedure:

- Feature selection stays **VT + majority voting** (robust, leakage-safe, model-agnostic).
- SHAP is used **only** to explain the final tuned model.
- The resulting drivers (target encodings + age/mileage/engine + interactions) are consistent with domain logic and support trust in the final pipeline.

---


In [None]:
# Get Feature names aligned with X_proc (after preprocess incl. VT + majority voting)
def get_pipeline_feature_matrix(pipe, X):
    """
    Return the model-ready feature matrix of a fitted pipeline together with aligned feature names.

    The pipeline is expected to have the steps 'preprocess' -> 'model', where 'preprocess' is
    itself a Pipeline:
      clean -> group_imputer -> fe -> ct -> fs(vt + selector)

    Parameters
    ----------
    pipe : fitted pipeline
        Must expose `named_steps["preprocess"]`, which in turn exposes `transform` and
        `named_steps` with a "ct" step and, optionally, an "fs" step.
    X : input data
        Passed unchanged to `preprocess.transform`.

    Returns
    -------
    X_proc : matrix returned by `preprocess.transform(X)` (features just before the model step)
    feat_names : 1D np.ndarray (dtype=object) of feature names aligned with the columns of X_proc

    Raises
    ------
    ValueError
        If the reconstructed names do not match the number of columns of X_proc, e.g. because a
        selection step does not expose the mask this function relies on.
    """
    pre = pipe.named_steps["preprocess"]

    # 1) Transform to model-ready matrix
    X_proc = pre.transform(X)

    # 2) Reconstruct feature names: start from the ColumnTransformer output ...
    ct = pre.named_steps["ct"]
    feat_names = np.asarray(ct.get_feature_names_out(), dtype=object)

    # ... then apply the masks of the feature-selection sub-pipeline in execution order.
    fs = pre.named_steps.get("fs", None)
    if fs is not None:
        # VT (dictator) first
        vt = fs.named_steps.get("vt", None)
        if vt is not None and hasattr(vt, "get_support"):
            feat_names = feat_names[vt.get_support()]

        # Majority selector next (skipped when the step or its fitted mask is missing)
        sel = fs.named_steps.get("selector", None)
        sel_mask = getattr(sel, "support_mask_", None)
        if sel_mask is not None:
            feat_names = feat_names[sel_mask]

    # 3) Guard against silent misalignment: one name per column, otherwise every downstream
    #    importance table or plot would be labelled wrongly (or fail with an unrelated error).
    n_columns = X_proc.shape[1]
    if len(feat_names) != n_columns:
        raise ValueError(
            f"Reconstructed {len(feat_names)} feature names for a matrix with {n_columns} columns; "
            "the feature-selection masks could not be applied consistently."
        )

    return X_proc, feat_names


In [None]:
# Compute SHAP Importance
def compute_shap_importance(
    pipe,
    X,
    sample_size=1000,
    seed=rs,
    model_name=None,
):
    """
    Compute global SHAP feature importances for a fitted pipeline (informative only).

    Parameters
    ----------
    pipe : fitted pipeline with the steps 'preprocess' -> 'model'
    X : raw input data; it is preprocessed via `get_pipeline_feature_matrix(pipe, X)`
    sample_size : int, default=1000
        Maximum number of preprocessed rows that are explained (drawn without replacement).
    seed : seed for `np.random.default_rng`, controls row and background sampling
    model_name : str or None
        Label used in the printed report and in warnings; defaults to the model's class name.

    Returns
    -------
    shap_df : DataFrame with columns "feature" and "importance" (mean |SHAP|), sorted descending
    feat_names : feature names aligned with the explained matrix
    shap_values : SHAP explanation object whose `.values` holds the per-row contributions
    X_sample : the subsampled, preprocessed rows that were explained

    Notes
    -----
    - TreeExplainer's additivity check can fail for some sklearn tree implementations (incl. HGB),
      so it is disabled via check_additivity=False.
    - If TreeExplainer fails for any reason, a model-agnostic `shap.Explainer` on `model.predict`
      is used instead. That fallback is announced with a RuntimeWarning (it is much slower and a
      silent switch would hide the original error).
    - NOTE(review): elsewhere in this notebook the RF pipeline's 'model' step is accessed through
      `.regressor_`, i.e. it looks like a wrapper around the forest; presumably TreeExplainer
      rejects such a wrapper and the fallback is what actually runs for RF -- confirm.
    """
    import warnings  # local import: only needed to report the explainer fallback

    # Extract processed feature matrix and names
    X_proc, feat_names = get_pipeline_feature_matrix(pipe, X)

    # Subsample rows for SHAP (for speed)
    rng = np.random.default_rng(seed)
    n = min(sample_size, len(X_proc))
    idx = rng.choice(len(X_proc), n, replace=False)
    X_sample = X_proc[idx]

    # Underlying model (last step in pipeline)
    model = pipe.named_steps["model"]
    tag = model_name or model.__class__.__name__

    # Background for SHAP (small subset of the already subsampled rows)
    bg_n = min(200, len(X_sample))
    bg_idx = rng.choice(len(X_sample), bg_n, replace=False)
    X_bg = X_sample[bg_idx]

    # --- Try TreeExplainer first (fast for tree models) ---
    try:
        explainer = shap.TreeExplainer(model, X_bg)
        shap_vals = explainer.shap_values(X_sample, check_additivity=False)

        # shap_vals can be list-like in some setups; regression should be 2D
        if isinstance(shap_vals, list):
            shap_vals = shap_vals[0]

        base_vals = getattr(explainer, "expected_value", 0.0)
        shap_values = shap.Explanation(
            values=shap_vals,
            # a scalar expected value is broadcast to one base value per explained row
            base_values=np.full((len(X_sample),), base_vals) if np.isscalar(base_vals) else base_vals,
            data=X_sample,
            feature_names=feat_names,
        )

    except Exception as exc:
        # --- Fallback: model-agnostic explainer (slower but robust) ---
        # Deliberately broad: any TreeExplainer failure should lead to the fallback, but the
        # reason is surfaced so an unexpected error does not go unnoticed.
        warnings.warn(
            f"TreeExplainer failed for {tag} ({type(exc).__name__}: {exc}); "
            "falling back to the slower model-agnostic shap.Explainer.",
            RuntimeWarning,
            stacklevel=2,
        )
        explainer = shap.Explainer(model.predict, X_bg, feature_names=feat_names)
        shap_values = explainer(X_sample)

    # Global importance = mean absolute SHAP value per feature (column)
    importance = np.abs(shap_values.values).mean(axis=0)

    shap_df = (
        pd.DataFrame({"feature": feat_names, "importance": importance})
        .sort_values("importance", ascending=False)
        .reset_index(drop=True)
    )

    print(f"Top 20 features by SHAP for {tag}:")
    print(shap_df.head(20).to_string(index=False))

    return shap_df, feat_names, shap_values, X_sample


In [None]:
# SHAP Plots
def plot_top_shap_bar(shap_df, model_name, top_k):
    """
    Draw a horizontal bar chart of the first `top_k` rows of `shap_df`.

    `shap_df` needs the columns "feature" and "importance"; its row order is taken as the
    ranking. `model_name` only appears in the title.
    """
    # barh draws the first row at the bottom, so flip the order to get the first row on top
    bottom_up = shap_df.head(top_k).iloc[::-1]

    _, axis = plt.subplots(figsize=(8, 6))
    axis.barh(bottom_up["feature"], bottom_up["importance"])
    axis.set(
        xlabel="Average |SHAP| value",
        title=f"Top {top_k} features by SHAP – {model_name}",
    )

    plt.tight_layout()
    plt.show()


def plot_shap_beeswarm(shap_values, X_sample, feat_names, model_name, max_display=20):
    """
    Render a SHAP summary (beeswarm) plot showing up to `max_display` features.

    `shap_values.values` and `X_sample` must refer to the same rows and columns; `feat_names`
    labels the columns and `model_name` only appears in the title.
    """
    # Label the explained matrix so SHAP can print feature names on the axis
    sample_frame = pd.DataFrame(X_sample, columns=feat_names)

    # Open the figure ourselves and keep SHAP from showing it, so the title can still be added
    plt.figure(figsize=(10, 6))
    shap.summary_plot(
        shap_values.values,
        sample_frame,
        max_display=max_display,
        show=False,
    )

    plt.title(f"SHAP Beeswarm – {model_name}")
    plt.tight_layout()
    plt.show()


In [None]:
# ARCHIVE: "top-k from SHAP" code commented out
# ----------------------------------------------------------------------------------------
# The following would create a new model/pipeline based on SHAP-top-k features.
# We keep it for reference but do not use it because:
# - We already have a robust, leakage-safe FS pipeline (VT + majority voting).
# - Here we want SHAP to be interpretability only.

# def cv_mae_topk_from_shap(
#     pipe,
#     shap_importance,
#     X,
#     y,
#     n_features_list,
#     folds=5,
#     seed=rs,
#     model_name=None,
# ):
#     """
#     For a fitted pipeline `pipe` and its SHAP importances:
#       - Build X_proc, feat_names from the pipeline.
#       - For each k in n_features_list:
#           * Take top-k features by SHAP.
#           * Run KFold CV on X_proc[:, idx] with the pipeline's final estimator.
#       - Print MAE per k and return the best (k, model, feature list).
#
#     Returns:
#       best_model: fitted estimator on full X_proc restricted to best-k features
#       best_features: list of feature names used
#     """
#     # 1) Get processed features and names
#     X_proc, feat_names = get_pipeline_feature_matrix(pipe, X)
#     feat_names = np.asarray(feat_names, dtype=object)
#
#     # 2) SHAP ranking
#     shap_sorted = shap_importance.sort_values("importance", ascending=False)
#     shap_order = shap_sorted["feature"].tolist()
#
#     # helper: indices of top-k by SHAP
#     def indices_for_topk(k):
#         top_feats = shap_order[:k]
#         return [i for i, fname in enumerate(feat_names) if fname in top_feats]
#
#     kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
#     model_proto = pipe.named_steps["model"]
#     tag = model_name or model_proto.__class__.__name__
#
#     results = []
#
#     for k in n_features_list:
#         idx = indices_for_topk(k)
#         if len(idx) == 0:
#             print(f"Skipping k={k}: no matching feature indices.")
#             continue
#
#         mae_folds = []
#
#         for train_idx, val_idx in kf.split(X_proc):
#             X_tr, X_val = X_proc[train_idx][:, idx], X_proc[val_idx][:, idx]
#             y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
#
#             est = clone(model_proto)
#             est.fit(X_tr, y_tr)
#             y_pred = est.predict(X_val)
#             mae_folds.append(mean_absolute_error(y_val, y_pred))
#
#         mae_mean = float(np.mean(mae_folds))
#         results.append({"k": k, "mae": mae_mean, "idx": idx})
#
#     # pick best k
#     if not results:
#         raise RuntimeError("No valid k in n_features_list produced results.")
#
#     best = min(results, key=lambda r: r["mae"])
#     best_k = best["k"]
#     best_mae = best["mae"]
#     best_idx = best["idx"]
#     best_features = [feat_names[i] for i in best_idx]
#
#     print(f"\nTop-k SHAP feature CV – {tag}")
#     for r in results:
#         print(f"  k={r['k']:3d} | MAE={r['mae']:.2f}")
#     print(f"Best: k={best_k} | MAE={best_mae:.2f}")
#
#     # fit final estimator on full X_proc restricted to best_k features
#     final_est = clone(model_proto)
#     final_est.fit(X_proc[:, best_idx], y)
#
#     return final_est, best_features
#
#
# class ShapTopKColumnSelector(BaseEstimator, TransformerMixin):
#     """
#     Transformer that selects a fixed subset of columns by name.
#
#     Parameters
#     ----------
#     selected_features : list of str
#         Feature names (after preprocessing) to keep.
#
#     all_feature_names : array-like of str
#         Full list of feature names aligned with the columns of X after preprocessing.
#         These are typically obtained from get_pipeline_feature_matrix(...).
#     """
#     def __init__(self, selected_features, all_feature_names):
#         self.selected_features = list(selected_features)
#         self.all_feature_names = np.asarray(all_feature_names, dtype=object)
#
#     def fit(self, X, y=None):
#         # Compute the column indices corresponding to selected_features
#         name_to_idx = {name: i for i, name in enumerate(self.all_feature_names)}
#         self.idx_ = [
#             name_to_idx[name]
#             for name in self.selected_features
#             if name in name_to_idx
#         ]
#         if len(self.idx_) == 0:
#             raise ValueError(
#                 "ShapTopKColumnSelector: none of the selected_features were found "
#                 "in all_feature_names."
#             )
#         return self
#
#     def transform(self, X):
#         # X is the matrix after preprocessing; select only the chosen columns
#         return X[:, self.idx_]
#
#     def get_feature_names_out(self, input_features=None):
#         # For consistency with sklearn's feature-name API
#         return np.asarray(self.selected_features, dtype=object)
#
#
# def build_shap_topk_pipeline(
#     base_pipe,
#     best_features,
#     all_feature_names,
#     step_model_name="model",
# ):
#     """
#     Build a final pipeline that:
#       - reuses the preprocessing (and vt/fs if present) from `base_pipe`
#       - inserts a ShapTopKColumnSelector to keep only `best_features`
#       - uses a fresh clone of the base model as final estimator
#
#     Parameters
#     ----------
#     base_pipe : sklearn Pipeline
#         Fitted pipeline with steps: 'preprocess' -> optional 'vt'/'fs' -> 'model'.
#
#     best_features : list of str
#         Names of the features (after preprocessing) to keep.
#
#     all_feature_names : array-like of str
#         Full list of feature names aligned with the output of preprocessing
#         (and vt/fs if they were applied when computing SHAP).
#
#     step_model_name : str, default="model"
#         Name of the final estimator step in base_pipe.
#
#     Returns
#     -------
#     final_pipe : sklearn Pipeline
#         Unfitted pipeline. Call final_pipe.fit(X, y) to train on full data.
#     """
#     steps = []
#
#     # 1) Preprocess step (clone so we refit on full data)
#     pre = base_pipe.named_steps["preprocess"]
#     steps.append(("preprocess", clone(pre)))
#
#     # 2) Optional VarianceThreshold
#     if "vt" in base_pipe.named_steps and base_pipe.named_steps["vt"] is not None:
#         steps.append(("vt", clone(base_pipe.named_steps["vt"])))
#
#     # 3) SHAP-based column selector
#     shap_selector = ShapTopKColumnSelector(
#         selected_features=best_features,
#         all_feature_names=all_feature_names,
#     )
#     steps.append(("shap_select", shap_selector))
#
#     # 4) Final estimator – fresh clone of the base model
#     base_model = base_pipe.named_steps[step_model_name]
#     steps.append(("model", clone(base_model)))
#
#     final_pipe = Pipeline(steps)
#     return final_pipe


##### HGB SHAP

In [None]:
# HGB tuned-pipeline report: number of features that reach the model
# NOTE(review): within this part of the notebook the only assignment of `hgb_tuned_pipe` sits in the
# commented-out HGB tuning cell -- presumably that cell must be restored (or the pipeline loaded
# from disk) before this cell works on a fresh kernel; confirm.
hgb_pipe = hgb_tuned_pipe

# Feature count after preprocess (clean+impute+fe+ct+fs); the count is reused below as
# top_k / max_display so the HGB plots show every feature.
X_proc_hgb, feat_names_hgb = get_pipeline_feature_matrix(hgb_pipe, X_train)
n_features_total_hgb = X_proc_hgb.shape[1]

print("HGB (tuned pipe) – feature space info:")
print(f"Total features used: {n_features_total_hgb}")


In [None]:
# HGB SHAP
shap_importance_hgb, feat_names_hgb, shap_vals_hgb, X_sample_hgb = compute_shap_importance(
    hgb_pipe,
    X_train,
    sample_size=1000,
    seed=rs,
    model_name="HGB",
)

plot_top_shap_bar(shap_importance_hgb, model_name="HGB", top_k=n_features_total_hgb)
plot_shap_beeswarm(shap_vals_hgb, X_sample_hgb, feat_names_hgb, model_name="HGB", max_display=n_features_total_hgb)

##### RF SHAP

In [None]:
# RandomForest tuned-pipeline report: number of features that reach the model.
# The SHAP computation and plots live in the next cell ("RF SHAP"); they were duplicated here
# verbatim, which ran the expensive SHAP step twice on every full run.
rf_pipe = rf_tuned_pipe

# Feature matrix + names after preprocess (clean+impute+fe+ct+fs)
X_proc_rf, feat_names_rf = get_pipeline_feature_matrix(rf_pipe, X_train)
n_features_total_rf = X_proc_rf.shape[1]

print("RandomForest (tuned pipe) – feature space info:")
print(f"Total features used: {n_features_total_rf}")

In [None]:
# SHAP explanation of the tuned RandomForest pipeline on a 1000-row sample of X_train
(
    shap_importance_rf,
    feat_names_rf,
    shap_vals_rf,
    X_sample_rf,
) = compute_shap_importance(rf_pipe, X_train, sample_size=1000, seed=rs, model_name="RandomForest")

# Only the 20 top-ranked features are shown in each plot
plot_top_shap_bar(
    shap_importance_rf,
    model_name="RandomForest",
    top_k=20,
)
plot_shap_beeswarm(
    shap_vals_rf,
    X_sample_rf,
    feat_names_rf,
    model_name="RandomForest",
    max_display=20,
)

#### 8.2 Global vs Brand- and Model-Specific Models

##### a) Objective and motivation (0.5v)

We investigated how far Cars4You should specialize its pricing models:

1. **Brand level:** Is a single global price model for all brands sufficient, or do separate brand-specific models reduce pricing error?
2. **Brand–model level:** For frequent models (e.g. “Skoda Octavia”, “VW Golf”), does an even more specialized model per (brand, model) segment bring additional improvements, or does it overfit?

Concretely, we started from our final production pipeline `hgb_final_shap_pipe` (full preprocessing + SHAP-based feature selection + HGB regressor) and compared:

- **Global model:** trained on all cars, evaluated only on a given segment.
- **Brand-specific model:** same preprocessing and SHAP selector, but the regressor re-fitted only on cars of a given brand.
- **Brand–model-specific model:** same preprocessing and SHAP selector, but the regressor re-fitted only on cars of a given (brand, model) pair.

We measured mean absolute error (MAE) and root mean squared error (RMSE) per segment. This answers how much performance we gain by moving from:

> one global model → several brand models → many brand–model models.

---

##### b) Difficulty of tasks (1v)

Extending the existing solution to this multi-level comparison was non-trivial:

- **Complex pipeline with a custom SHAP selector**  
  The final pipeline contains a `ShapTopKColumnSelector` that is not clone-compatible. Standard `cross_val_score` + `clone` would fail. We therefore implemented manual cross-validation:
  - reuse the fitted preprocessing + SHAP selector from `hgb_final_shap_pipe`;
  - only re-fit the final regressor for each fold and segment.

- **Consistent and fair evaluation protocol**  
  We reused the same 5-fold KFold strategy (`n_splits`, `shuffle`, `random_state`) and the same target (`price`) as in the main project. For each fold and segment:
  - the global model is trained on all training rows but evaluated only on validation rows belonging to that segment;
  - the segment-specific model is trained and evaluated only on that segment’s rows.

- **Handling data imbalance**  
  Data is unevenly distributed across brands and models. We therefore:
  - restricted the analysis to brands with at least 500 training samples;
  - for brand–model analysis, kept only frequent pairs (e.g. Skoda Octavia, VW Golf) with a minimum sample threshold per segment;
  - enforced additional checks per fold (minimum training size) to avoid fits on a handful of cars.

- **Manual metric computation**  
  Due to an older `sklearn` version (no `squared=` parameter), RMSE had to be computed manually as `sqrt(MSE)` inside the CV loops instead of relying on built-in scorers.

Overall, the task required custom CV logic, careful reuse of the production pipeline, and multiple levels of segment-wise filtering.

---

##### c) Correctness and efficiency of implementation (1v)

To keep the analysis correct and reasonably efficient we:

- **Reused the production pipeline as-is**  
  All preprocessing (imputation, scaling, encoding, price anchors) and SHAP-based feature selection are exactly the same as in the final model used on the test set. Only the last regressor is re-fit for segment-specific models.

- **Used a single CV design for all comparisons**  
  The same KFold splits (`splits = list(KFold(...).split(X_train, y_train))`) are reused for:
  - global per-brand evaluation;
  - brand-specific evaluation;
  - global per (brand, model) evaluation;
  - brand–model-specific evaluation.  
  This removes extra randomness and makes differences directly comparable.

- **Implemented clear separation between global and segment-specific training**  
  - For brands:  
    - global: fit on all brands, compute metrics only on that brand’s validation rows;  
    - brand-specific: use the fixed preprocessor, fit a fresh regressor only on that brand’s transformed data.
  - For (brand, model) pairs:  
    - global: fit on all cars, compute metrics only on that (brand, model) validation subset;  
    - brand–model-specific: fixed preprocessor + fresh regressor only on that pair.

- **Guarded against tiny segments**  
  Only segments with enough rows at dataset level and per fold are evaluated. Otherwise, metrics are set to NaN and those segments are excluded via `dropna`.

This design produces stable segment-wise estimates without changing the core production pipeline.

---

##### d) Discussion of results (1v)

###### Brand-level comparison

For the main brands, the final summary table (MAE in GBP) is:

| Brand    | MAE (global) | MAE (brand) | ΔMAE (brand – global) | n_samples |
|----------|--------------|-------------|------------------------|-----------|
| Ford     | 966.7        | 929.2       | -37.6                  | 16,371    |
| BMW      | 1,828.0      | 1,792.8     | -35.2                  | 7,540     |
| Mercedes | 1,968.7      | 1,934.6     | -34.1                  | 11,899    |
| VW       | 1,299.7      | 1,287.8     | -11.9                  | 10,572    |
| Audi     | 1,806.0      | 1,794.5     | -11.5                  | 7,456     |
| Skoda    | 1,174.6      | 1,165.9     | -8.7                   | 4,380     |
| Toyota   |   926.7      |   920.9     | -5.9                   | 4,714     |
| Opel     |   777.1      |   774.5     | -2.6                   | 9,530     |

Key observations:

- **Ford, BMW and Mercedes benefit the most from brand-specific models.**  
  Their MAE drops by about 34–38 GBP per car (≈ 2–4% relative improvement). This is meaningful at scale and based on large sample sizes (roughly 7,500–16,400 cars per brand).

- **Moderate gains for VW, Audi, Skoda, Toyota.**  
  MAE improvements are smaller (5–12 GBP, typically <1% relative), but still consistent in sign.

- **Minimal benefit for Opel.**  
  The improvement for Opel (≈ 2.6 GBP) is negligible relative to its base MAE. The global model already captures Opel’s pricing patterns.

- **RMSE sometimes increases slightly for brand-specific models.**  
  For some brands, RMSE is marginally higher, indicating that brand-specific models reduce typical errors but can perform worse on rare/extreme cases, hinting at mild overfitting in the tails.

Overall, moving from a global to a brand-specific layer never increased MAE for any of the evaluated brands and clearly helps for some large brands, but the absolute gains are moderate.

###### Brand–model-level comparison

For frequent (brand, model) pairs, the analysis shows a more mixed picture. A selection of results (MAE in GBP):

| Brand   | Model        | MAE global | MAE seg | ΔMAE (seg – global) | n_samples |
|---------|--------------|-----------:|--------:|---------------------:|----------:|
| Skoda   | kamiq        | 1,418.6    | 1,107.1 | -311.5               | 109       |
| VW      | amarok       | 2,988.7    | 2,801.3 | -187.4               | 83        |
| Mercedes| x-class      | 3,592.8    | 3,448.9 | -144.0               | 59        |
| Skoda   | scala        | 1,175.7    | 1,100.5 | -75.2                | 147       |
| Ford    | b-max        |   640.2    |   578.1 | -62.1                | 248       |
| Skoda   | octavia      | 1,089.4    | 1,031.9 | -57.5                | 1,021     |
| Skoda   | fabia        |   845.8    |   795.1 | -50.6                | 1,069     |
| VW      | up           |   645.2    |   608.1 | -37.1                | 608       |
| BMW     | 1 series     | 1,158.2    | 1,130.0 | -28.1                | 1,358     |
| VW      | golf         | 1,151.0    | 1,155.8 |  +4.8                | 3,515     |
| Ford    | fiesta       |   753.3    |   762.7 |  +9.3                | 4,470     |
| Toyota  | aygo         |   557.1    |   576.7 | +19.6                | 1,381     |
| BMW     | 7 series     | 3,146.7    | 4,751.9 | +1,605.2             | 71        |
| Mercedes| gls class    | 3,295.8    | 5,906.4 | +2,610.7             | 54        |

Patterns:

- **Some compact, relatively frequent models benefit from model-level specialization.**  
  Examples: Skoda Kamiq, Scala, Octavia and Fabia; VW up; Ford B-MAX.  
  These segments see large MAE reductions (50–300 GBP), and RMSE also tends to decrease. Here, the model-level regressor can exploit consistent, model-specific patterns.

- **For many common volume models, gains are small or negative.**  
  VW Golf, Ford Fiesta, Opel Corsa, Toyota Yaris, etc. often show small positive ΔMAE and/or higher RMSE. For these, splitting by model does not significantly improve typical error and can worsen extreme cases.

- **For rare, high-priced models, model-specific fits severely overfit.**  
  BMW 7 series, BMW X6, Mercedes GLS/S/SL/CLS class, VW Beetle, Toyota Avensis/Verso and others exhibit very large increases in MAE (hundreds to thousands of GBP) and often huge increases in RMSE.  
  These models have small sample sizes (often <100 cars), so a separate model per (brand, model) is clearly not robust.

In short:

- Moving from **global → brand** is often beneficial and relatively safe for high-volume brands.
- Moving further from **brand → brand–model** brings strong improvements only for a small subset of frequent models; for many others, especially rare premium models, it clearly overfits.

---

##### e) Alignment with objectives (0.5v)

This extended open-ended study:

- Directly addresses and expands a suggested topic (“global vs brand-specific models”), and pushes it one step further to **brand–model** specialization.
- Fully reuses the final production pipeline and a consistent CV protocol, so the conclusions are directly relevant for deployment.
- Provides a **clear design recommendation**:
  - Use a **single global model** as the base.
  - Optionally introduce **brand-level specialization** for a small set of high-volume brands (e.g. Ford, BMW, Mercedes) where MAE improvements are meaningful.
  - Avoid full **brand–model specialization** except potentially for a handful of very frequent models with demonstrated gains; for most models, especially rare and expensive ones, splitting further clearly overfits.

This shows that we not only tuned a strong model, but also explored the trade-off between model complexity and robustness in a structured, data-driven way.



In [None]:
# Handle on the final production pipeline (preprocessing + SHAP + HGB) used as the global model.
# Alternative kept from the original cell, loading a persisted copy instead:
# hgb_final_shap_pipe = load("hgb_final_shap_pipe.pkl")
pipe_global = hgb_final_shap_pipe

# Fail early if the training data is not in the kernel yet
assert all(name in globals() for name in ("X_train", "y_train")), "Define X_train and y_train before proceeding."

# The brand column may be spelled 'Brand' or 'brand'; prefer the capitalised form
if "Brand" in X_train.columns:
    brand_col = "Brand"
else:
    brand_col = "brand"

assert brand_col in X_train.columns, (
    f"Brand column not found in X_train. "
    f"First columns: {X_train.columns[:20].tolist()}"
)

print("Using brand column:", brand_col)

In [None]:
# Selection thresholds for the brand comparison
#    - TOP_K: max number of brands to compare.
#    - MIN_SAMPLES: minimum number of rows per brand.
TOP_K = 8
MIN_SAMPLES = 500  # adjust if needed

# Row count per brand; value_counts() sorts from most to least frequent
brand_counts = X_train[brand_col].value_counts()
print("Top brands by count:")
print(brand_counts.head(15))

# Keep the brands that reach MIN_SAMPLES, then cap the list at the TOP_K most frequent ones
candidate_brands = brand_counts[brand_counts >= MIN_SAMPLES].index[:TOP_K].tolist()

print("\nCandidate brands used in the comparison:")
print(candidate_brands)


In [None]:
# Cross-validation setup: the fold indices are materialised once and the same list is
# handed to every evaluation below, which keeps comparisons fair and removes extra randomness.
cv = KFold(n_splits=5, shuffle=True, random_state=rs)

# The shuffle is seeded through the KFold constructor above; split() takes no random_state.
splits = [fold for fold in cv.split(X_train, y_train)]


def eval_global_for_brand(model, X, y, brand_col, brand, splits):
    """
    Evaluate the global pipeline for a single brand.

    For every (train_idx, val_idx) pair in `splits`, a private copy of `model`
    is fitted on all training rows of the fold (every brand) and predicts the
    whole validation fold. MAE and RMSE are then computed only on the
    validation rows whose `brand_col` equals `brand`. Folds that contain no
    validation row of that brand are skipped.

    Parameters
    ----------
    model : object with `fit(X, y)` and `predict(X)`
        The global pipeline. It is deep-copied once and only the copy is
        fitted, so the object passed in keeps whatever fitted state it had.
    X : pandas.DataFrame
        Features; must contain `brand_col`. Rows are selected with `.iloc`.
    y : pandas.Series
        Target, positionally aligned with `X`.
    brand_col : str
        Name of the column compared against `brand`.
    brand : object
        Brand value to evaluate.
    splits : iterable of (train_idx, val_idx)
        Positional index arrays, one pair per fold.

    Returns
    -------
    dict
        "MAE_mean", "MAE_std", "RMSE_mean", "RMSE_std": mean and population
        standard deviation (np.std default) over the evaluated folds, or NaN
        when no fold was evaluated. "n": number of validation rows scored.

    Raises
    ------
    Whatever `copy.deepcopy(model)` raises if the pipeline cannot be copied.
    """
    import copy  # local import keeps the cell self-contained

    # Fitting `model` itself would silently replace the caller's fitted pipeline
    # with one trained on the last fold's subset; work on a private copy instead.
    fold_model = copy.deepcopy(model)

    maes, rmses = [], []
    n_obs = 0

    for train_idx, val_idx in splits:
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Plain boolean array so it can index numpy predictions positionally
        mask_b = (X_val[brand_col] == brand).to_numpy()
        n_b = int(mask_b.sum())
        if n_b == 0:
            # Nothing to score in this fold, so the fit can be skipped as well
            continue

        # Train global model on ALL brands in this fold
        fold_model.fit(X_tr, y_tr)
        y_pred = np.asarray(fold_model.predict(X_val))

        # Restrict metrics to the target brand in validation
        y_val_b = y_val.to_numpy()[mask_b]
        y_pred_b = y_pred[mask_b]

        mae = mean_absolute_error(y_val_b, y_pred_b)
        mse = mean_squared_error(y_val_b, y_pred_b)
        # RMSE is derived from the MSE by hand
        rmse = float(np.sqrt(mse))

        maes.append(mae)
        rmses.append(rmse)
        n_obs += n_b

    # Same NaN convention as the segment-specific evaluators: no evaluated fold -> NaN
    return {
        "MAE_mean": float(np.mean(maes)) if maes else np.nan,
        "MAE_std":  float(np.std(maes))  if maes else np.nan,
        "RMSE_mean": float(np.mean(rmses)) if rmses else np.nan,
        "RMSE_std":  float(np.std(rmses))  if rmses else np.nan,
        "n": int(n_obs),
    }


def eval_brand_specific(pipe_global, X, y, brand_col, brand, splits,
                        min_train_per_fold=50):
    """
    Cross-validate a regressor that is trained on one brand only.

    `pipe_global[:-1]` (every step before the final estimator) is used purely
    through `transform`, i.e. it is never re-fitted here. `pipe_global[-1]`
    is the template that gets cloned for each fold.

    Per fold:
      1. Keep the training / validation rows whose `brand_col` equals `brand`.
      2. Skip the fold if there is no validation row of the brand, or fewer
         than `min_train_per_fold` training rows.
      3. Transform both subsets, fit a cloned regressor on the training part
         and score MAE / RMSE on the validation part.

    Returns a dict with "MAE_mean", "MAE_std", "RMSE_mean", "RMSE_std"
    (NaN when every fold was skipped) and "n", the number of validation rows
    that were scored.
    """
    fitted_preproc = pipe_global[:-1]
    regressor_template = pipe_global[-1]

    fold_mae, fold_rmse = [], []
    n_scored = 0

    for train_idx, val_idx in splits:
        X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        is_brand_tr = X_tr[brand_col] == brand
        is_brand_val = X_val[brand_col] == brand

        # A fold is usable only with some validation rows and enough training rows
        if is_brand_val.sum() == 0 or is_brand_tr.sum() < min_train_per_fold:
            continue

        y_tr_b = y_tr[is_brand_tr]
        y_val_b = y_val[is_brand_val]

        # transform() only: the preprocessing steps keep their existing fit
        X_tr_b_proc = fitted_preproc.transform(X_tr[is_brand_tr])
        X_val_b_proc = fitted_preproc.transform(X_val[is_brand_val])

        # New, unfitted copy of the final estimator for this fold
        reg = clone(regressor_template)
        reg.fit(X_tr_b_proc, y_tr_b)
        y_pred_b = reg.predict(X_val_b_proc)

        fold_mae.append(mean_absolute_error(y_val_b, y_pred_b))
        fold_rmse.append(float(np.sqrt(mean_squared_error(y_val_b, y_pred_b))))
        n_scored += len(y_val_b)

    if not fold_mae:
        # Every fold was skipped: report NaN so the segment can be dropped later
        return {
            "MAE_mean": np.nan,
            "MAE_std": np.nan,
            "RMSE_mean": np.nan,
            "RMSE_std": np.nan,
            "n": int(n_scored),
        }

    return {
        "MAE_mean": float(np.mean(fold_mae)),
        "MAE_std": float(np.std(fold_mae)),
        "RMSE_mean": float(np.mean(fold_rmse)),
        "RMSE_std": float(np.std(fold_rmse)),
        "n": int(n_scored),
    }


In [None]:
# Per-brand evaluation: global pipeline vs. brand-only regressor on identical folds

pipe_global = hgb_final_shap_pipe

global_results, brand_specific_results = [], []

# NOTE(review): eval_global_for_brand fits the pipeline on the folds each time it is
# called, so the global fits are repeated once per candidate brand.
for brand in candidate_brands:
    print("Evaluating brand:", brand)
    brand_kwargs = dict(brand_col=brand_col, brand=brand, splits=splits)

    # 1) Global: trained on all brands, scored only on this brand's validation rows
    res_g = eval_global_for_brand(pipe_global, X_train, y_train, **brand_kwargs)
    res_g["brand"] = brand
    res_g["model_type"] = "global"
    global_results.append(res_g)

    # 2) Brand-specific: preprocessing kept fixed, regressor trained on this brand only
    res_b = eval_brand_specific(pipe_global, X_train, y_train, **brand_kwargs)
    res_b["brand"] = brand
    res_b["model_type"] = "brand_specific"
    brand_specific_results.append(res_b)

# One row per brand in each table
df_global = pd.DataFrame(global_results)
df_brand = pd.DataFrame(brand_specific_results)

for caption, frame in (
    ("\nGlobal model results per brand:", df_global),
    ("\nBrand-specific model results per brand:", df_brand),
):
    print(caption)
    display(frame)


In [None]:
# Build the per-brand comparison table (global vs brand-specific)

# Brands whose evaluation produced NaN metrics are removed on both sides
required_metrics = ["MAE_mean", "RMSE_mean"]
df_global = df_global.dropna(subset=required_metrics)
df_brand = df_brand.dropna(subset=required_metrics)

# Join on brand and add the differences:
#   delta_MAE  < 0  -> brand-specific has lower MAE (better)
#   delta_RMSE < 0  -> brand-specific has lower RMSE (better)
df_compare = (
    df_global
    .merge(df_brand, on="brand", suffixes=("_global", "_brand"))
    .assign(
        delta_MAE=lambda d: d["MAE_mean_brand"] - d["MAE_mean_global"],
        delta_RMSE=lambda d: d["RMSE_mean_brand"] - d["RMSE_mean_global"],
    )
)

# Ascending delta_MAE puts the largest improvement first
df_compare_sorted = df_compare.sort_values("delta_MAE")

print("Per-brand comparison (head):")
display(df_compare_sorted)


In [None]:
# Visualizations: grouped MAE bars and ΔMAE bars per brand

# 1) Global vs brand-specific MAE, two bars side by side for every brand
fig_mae, ax_mae = plt.subplots(figsize=(8, 4))
x = np.arange(len(df_compare_sorted))
width = 0.35

for offset, column, label in (
    (-width / 2, "MAE_mean_global", "Global model"),
    (+width / 2, "MAE_mean_brand", "Brand-specific model"),
):
    ax_mae.bar(x + offset, df_compare_sorted[column], width, label=label)

ax_mae.set_xticks(x)
ax_mae.set_xticklabels(df_compare_sorted["brand"], rotation=45, ha="right")
ax_mae.set_ylabel("MAE (GBP)")
ax_mae.set_title("Global vs Brand-specific models (MAE per brand)")
ax_mae.legend()
fig_mae.tight_layout()
plt.show()

# 2) ΔMAE per brand (negative = improvement with specialization)
fig_delta, ax_delta = plt.subplots(figsize=(8, 3))
ax_delta.bar(df_compare_sorted["brand"], df_compare_sorted["delta_MAE"])
ax_delta.axhline(0, linestyle="--")
plt.setp(ax_delta.get_xticklabels(), rotation=45, ha="right")
ax_delta.set_ylabel("Δ MAE (brand - global)")
ax_delta.set_title("Effect of model specialization per brand\n(negative = brand-specific MAE is lower)")
fig_delta.tight_layout()
plt.show()


**Brand-Model Segmentation**

In [None]:
# Evaluation helpers for brand–model segments

def eval_global_for_brand_model(model, X, y, brand_col, model_col,
                                brand, model_name, splits):
    """
    Evaluate the global pipeline for a specific (brand, model) pair.

    For every (train_idx, val_idx) pair in `splits`, a private copy of `model`
    is fitted on all training rows of the fold and predicts the whole
    validation fold. MAE and RMSE are then computed only on validation rows
    where `brand_col` equals `brand` AND `model_col` equals `model_name`.
    Folds without any such validation row are skipped.

    Parameters
    ----------
    model : object with `fit(X, y)` and `predict(X)`
        The global pipeline. It is deep-copied once and only the copy is
        fitted, so the object passed in keeps whatever fitted state it had.
    X : pandas.DataFrame
        Features; must contain `brand_col` and `model_col`.
    y : pandas.Series
        Target, positionally aligned with `X`.
    brand_col, model_col : str
        Names of the columns that define the segment.
    brand, model_name : object
        Segment values to evaluate.
    splits : iterable of (train_idx, val_idx)
        Positional index arrays, one pair per fold.

    Returns
    -------
    dict
        "MAE_mean", "MAE_std", "RMSE_mean", "RMSE_std": mean and population
        standard deviation (np.std default) over the evaluated folds, or NaN
        when no fold was evaluated. "n": number of validation rows scored.

    Raises
    ------
    Whatever `copy.deepcopy(model)` raises if the pipeline cannot be copied.
    """
    import copy  # local import keeps the cell self-contained

    # Fitting `model` itself would silently replace the caller's fitted pipeline
    # with one trained on the last fold's subset; work on a private copy instead.
    fold_model = copy.deepcopy(model)

    maes, rmses = [], []
    n_obs = 0

    for train_idx, val_idx in splits:
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Plain boolean array so it can index numpy predictions positionally
        mask_seg = (
            (X_val[brand_col] == brand) &
            (X_val[model_col] == model_name)
        ).to_numpy()
        n_seg = int(mask_seg.sum())
        if n_seg == 0:
            # Nothing to score in this fold, so the fit can be skipped as well
            continue

        # Train global model on ALL brands and models
        fold_model.fit(X_tr, y_tr)
        y_pred = np.asarray(fold_model.predict(X_val))

        # Restrict to this brand–model in validation
        y_val_seg = y_val.to_numpy()[mask_seg]
        y_pred_seg = y_pred[mask_seg]

        mae = mean_absolute_error(y_val_seg, y_pred_seg)
        mse = mean_squared_error(y_val_seg, y_pred_seg)
        # RMSE is derived from the MSE by hand
        rmse = float(np.sqrt(mse))

        maes.append(mae)
        rmses.append(rmse)
        n_obs += n_seg

    # Same NaN convention as the segment-specific evaluators: no evaluated fold -> NaN
    return {
        "MAE_mean": float(np.mean(maes)) if maes else np.nan,
        "MAE_std":  float(np.std(maes))  if maes else np.nan,
        "RMSE_mean": float(np.mean(rmses)) if rmses else np.nan,
        "RMSE_std":  float(np.std(rmses))  if rmses else np.nan,
        "n": int(n_obs),
    }


def eval_brand_model_specific(pipe_global, X, y, brand_col, model_col,
                              brand, model_name, splits,
                              min_train_per_fold=40):
    """
    Cross-validate a regressor trained on a single (brand, model) segment.

    `pipe_global[:-1]` (every step before the final estimator) is only used
    via `transform` and is never re-fitted here. `pipe_global[-1]` is the
    template that gets cloned for each fold.

    Per fold:
      1. Select training / validation rows where `brand_col` equals `brand`
         and `model_col` equals `model_name`.
      2. Skip the fold when the segment has no validation rows, or fewer
         than `min_train_per_fold` training rows.
      3. Transform both subsets, fit a cloned regressor on the training part
         and score MAE / RMSE on the validation part.

    Returns a dict with "MAE_mean", "MAE_std", "RMSE_mean", "RMSE_std"
    (NaN when every fold was skipped) and "n", the number of validation rows
    that were scored.
    """
    fitted_preproc = pipe_global[:-1]
    regressor_template = pipe_global[-1]

    def _in_segment(frame):
        # Boolean Series: True for rows that belong to the requested (brand, model)
        return (frame[brand_col] == brand) & (frame[model_col] == model_name)

    fold_mae, fold_rmse = [], []
    n_scored = 0

    for train_idx, val_idx in splits:
        X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        seg_tr = _in_segment(X_tr)
        seg_val = _in_segment(X_val)

        # A fold is usable only with some validation rows and enough training rows
        if seg_val.sum() == 0 or seg_tr.sum() < min_train_per_fold:
            continue

        y_tr_seg = y_tr[seg_tr]
        y_val_seg = y_val[seg_val]

        # transform() only: the preprocessing steps keep their existing fit
        X_tr_seg_proc = fitted_preproc.transform(X_tr[seg_tr])
        X_val_seg_proc = fitted_preproc.transform(X_val[seg_val])

        # New, unfitted copy of the final estimator for this fold
        reg = clone(regressor_template)
        reg.fit(X_tr_seg_proc, y_tr_seg)
        y_pred_seg = reg.predict(X_val_seg_proc)

        fold_mae.append(mean_absolute_error(y_val_seg, y_pred_seg))
        fold_rmse.append(float(np.sqrt(mean_squared_error(y_val_seg, y_pred_seg))))
        n_scored += len(y_val_seg)

    if not fold_mae:
        # Every fold was skipped: report NaN so the segment can be dropped later
        return {
            "MAE_mean": np.nan,
            "MAE_std": np.nan,
            "RMSE_mean": np.nan,
            "RMSE_std": np.nan,
            "n": int(n_scored),
        }

    return {
        "MAE_mean": float(np.mean(fold_mae)),
        "MAE_std": float(np.std(fold_mae)),
        "RMSE_mean": float(np.mean(fold_rmse)),
        "RMSE_std": float(np.std(fold_rmse)),
        "n": int(n_scored),
    }


In [None]:
# Per-segment evaluation: global pipeline vs. (brand, model)-only regressor on identical folds
# NOTE(review): `candidate_pairs` (iterated as ((brand, model), count) via .items()) and
# `model_col` are not assigned in the cells of this section; presumably they come from an
# earlier cell. Confirm this so the notebook survives Restart & Run All.

bm_global_results, bm_specific_results = [], []

for (brand, model_name), cnt in candidate_pairs.items():
    print(f"Evaluating pair: {brand} / {model_name} (n={cnt})")
    segment_kwargs = dict(
        brand_col=brand_col,
        model_col=model_col,
        brand=brand,
        model_name=model_name,
        splits=splits,
    )

    # Global model, scored on this brand–model segment only
    res_g = eval_global_for_brand_model(pipe_global, X_train, y_train, **segment_kwargs)
    res_g["brand"] = brand
    res_g["model"] = model_name
    res_g["segment_type"] = "global"
    bm_global_results.append(res_g)

    # Regressor trained on this brand–model segment only
    res_bm = eval_brand_model_specific(pipe_global, X_train, y_train, **segment_kwargs)
    res_bm["brand"] = brand
    res_bm["model"] = model_name
    res_bm["segment_type"] = "brand_model_specific"
    bm_specific_results.append(res_bm)

# One row per (brand, model) pair in each table
df_bm_global = pd.DataFrame(bm_global_results)
df_bm_spec = pd.DataFrame(bm_specific_results)

for caption, frame in (
    ("\nGlobal results per (brand, model):", df_bm_global),
    ("\nBrand–model-specific results:", df_bm_spec),
):
    print(caption)
    display(frame)


In [None]:
# Build the (brand, model) comparison table: global vs brand–model-specific

# Segments whose evaluation produced NaN metrics are removed on both sides
required_bm_metrics = ["MAE_mean", "RMSE_mean"]
df_bm_global = df_bm_global.dropna(subset=required_bm_metrics)
df_bm_spec = df_bm_spec.dropna(subset=required_bm_metrics)

# Join on the segment key; a negative delta means the segment model has the lower error
df_bm_compare = (
    df_bm_global
    .merge(df_bm_spec, on=["brand", "model"], suffixes=("_global", "_bm"))
    .assign(
        delta_MAE=lambda d: d["MAE_mean_bm"] - d["MAE_mean_global"],
        delta_RMSE=lambda d: d["RMSE_mean_bm"] - d["RMSE_mean_global"],
    )
)

df_bm_sorted = df_bm_compare.sort_values("delta_MAE")

print("Brand–model comparison (most improvement first):")
display(df_bm_sorted)

# Compact table: key columns only, the global-side row count exposed as n_samples
bm_display_cols = [
    "brand", "model",
    "MAE_mean_global", "MAE_mean_bm", "delta_MAE",
    "RMSE_mean_global", "RMSE_mean_bm", "delta_RMSE",
    "n_global",
]
df_bm_display = df_bm_sorted.loc[:, bm_display_cols].rename(columns={"n_global": "n_samples"})

# Round every error / delta column to one decimal; identifiers and counts stay as they are
bm_rounded_cols = [
    col for col in df_bm_display.columns
    if any(tag in col for tag in ("MAE", "RMSE", "delta"))
]
df_bm_display = df_bm_display.round({col: 1 for col in bm_rounded_cols})

print("\nReadable brand–model summary:")
display(df_bm_display)


In [None]:
# Extra plots for brand–model specialization

# Bar chart input: segments with at least 100 samples (more stable numbers),
# ordered by delta_MAE so the largest improvement comes first
df_bm_plot = (
    df_bm_display
    .loc[lambda d: d["n_samples"] >= 100]
    .sort_values("delta_MAE")
)

# 1) ΔMAE per brand–model segment (filtered)
fig_bm_bar, ax_bm_bar = plt.subplots(figsize=(10, 4))
x = np.arange(len(df_bm_plot))
segment_labels = [f"{row.brand} {row.model}" for row in df_bm_plot.itertuples()]

ax_bm_bar.bar(x, df_bm_plot["delta_MAE"])
ax_bm_bar.axhline(0, linestyle="--")
ax_bm_bar.set_xticks(x)
ax_bm_bar.set_xticklabels(segment_labels, rotation=90, ha="right")
ax_bm_bar.set_ylabel("Δ MAE (brand–model - global)")
ax_bm_bar.set_title("Effect of brand–model specialization\n(negative = lower MAE than global)")
fig_bm_bar.tight_layout()
plt.show()

# 2) Segment size vs ΔMAE on ALL segments, to visualise overfitting at low sample sizes
fig_bm_scatter, ax_bm_scatter = plt.subplots(figsize=(6, 4))
ax_bm_scatter.scatter(df_bm_display["n_samples"], df_bm_display["delta_MAE"])
ax_bm_scatter.axhline(0, linestyle="--")
ax_bm_scatter.set_xlabel("Number of samples per (brand, model)")
ax_bm_scatter.set_ylabel("Δ MAE (brand–model - global)")
ax_bm_scatter.set_title("ΔMAE vs segment size\n(negative = brand–model-specific is better)")
fig_bm_scatter.tight_layout()
plt.show()


### 9. Ablation study

In [None]:
# TODO try different transformations (Box-Cox, Yeo-Johnson)
# TODO run e.g. 3 otherwise identical pipelines in parallel with different scalers to see the difference (input from lab)
# TODO compare different encodings: OHE, target, label, frequency (the frequency encoder is Ricardo's favorite encoder)