### ToDo

In [0]:
# TODO: implement statsmodels and inspect its output (see the linear regression solution lab)

# Project

### Import & load Data

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import re

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from data_cleaning import clean_car_dataframe

# Read the two CSV inputs; the relative paths are resolved against the current working directory.
df_cars_train = pd.read_csv("train.csv")
df_cars_test = pd.read_csv("test.csv")

def print_metrics(y_true, y_pred):
    """Print MAE, RMSE and R² for a set of predictions and return them.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.

    Returns
    -------
    tuple of float
        ``(r2, rmse, mae)`` — the order in which the evaluation cells unpack the result.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # mean_squared_error yields the *squared* error; take the root so the number
    # matches its "RMSE" label and is expressed in the units of the target.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2 = r2_score(y_true, y_pred)
    print(f"MAE: {mae:.4f} | RMSE: {rmse:.4f} | R2: {r2:.4f}")
    return r2, rmse, mae

### EDA & Cleaning

In [0]:
# Summary statistics for all columns (include="all"), transposed so that each column becomes one row.
df_cars_train.describe(include="all").T

In [0]:
# Pass both frames through the project-local cleaner imported from data_cleaning.
# NOTE(review): the raw frames are rebound here, so re-running this cell feeds already-cleaned
# data back into clean_car_dataframe — confirm the helper is idempotent.
df_cars_train = clean_car_dataframe(df_cars_train)
df_cars_test = clean_car_dataframe(df_cars_test)

In [0]:
# Dump the distinct values of every column: training frame first, then the test frame.
for frame_pos, frame in enumerate((df_cars_train, df_cars_test)):
    if frame_pos > 0:
        # visual separator between the two dumps
        print("X"*150)
    for col in frame.columns:
        print(col, frame[col].unique())

### Feature Engineering, Split & Preprocessing

In [0]:
# Add an "age" column derived from the "year" column.
# The reference year is a fixed, named constant (not the current date) so the feature is
# identical on every re-run and only has to be changed in one place.
REFERENCE_YEAR = 2025

df_cars_train['age'] = REFERENCE_YEAR - df_cars_train['year']
df_cars_test['age'] = REFERENCE_YEAR - df_cars_test['year']

In [0]:
# Separate the target column from the predictors.
y = df_cars_train["price"]
X = df_cars_train.drop(columns=["price"])

# Shuffled 70/30 hold-out split with a fixed seed.
# Stratification stays switched off (previously a commented-out `stratify = y`) —
# presumably because price is a continuous target; confirm before enabling it.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [0]:
# Column groups routed through the ColumnTransformer below.
# NOTE(review): "age" is derived in an earlier cell as a constant minus "year", so the two
# columns are perfectly collinear — confirm both are wanted for the linear models.
numeric_features = [
    "year", "mileage", "age", "tax", "mpg", "engineSize",
    "paintQuality", "previousOwners", "hasDamage",
]
categorical_features = ["Brand", "model", "transmission", "fuelType"]

# mean-impute -> log1p -> standardise
log_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("log", FunctionTransformer(func=np.log1p, validate=False)),
    ("scaler", StandardScaler()),
])

# mean-impute -> standardise
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# fill missing categories with the literal "Unknown" -> dense one-hot encoding;
# categories unseen during fit are ignored at transform time (handle_unknown="ignore")
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# NOTE(review): "mileage" is routed through the log branch AND is also listed in
# numeric_features, so it reaches the model twice (log-scaled and linearly scaled) —
# confirm this duplication is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("mileage", log_transformer, ["mileage"]),
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",  # columns not named above are discarded
)

# Stand-alone fit on the training split only.
preprocessor.fit(X_train)

# ColumnTransformer lets you apply different transformations to different feature subsets:
#       > Numeric → impute mean + scale
#       > Categorical → impute "Unknown" + OneHotEncode
#       > Mileage → impute mean + log1p transform + scale
#   This is key, because numeric and categorical data need different math, you can't scale strings or one-hot encode continuous numbers.


# Pipeline bundles preprocessing + model training:
#     > Cross-validation applies preprocessing inside each fold (no data leakage).
#     > The final model object (after .fit()) knows exactly how to preprocess new data.
#     > When saving the pipeline with joblib, everything (scaler, encoder, model) is saved together.

### Models Setup and Baselining

In [0]:
# Every pipeline below has the same two steps — "preprocess" (the shared ColumnTransformer) and
# "model" — so hyper-parameter grids can address the estimator uniformly as "model__<param>".

# --- Linear models -------------------------------------------------------------------------
ridge_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", Ridge())
])

# max_iter raised to 20000 because of convergence warnings
lasso_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", Lasso(max_iter=20000))
])

# max_iter raised to 20000 because of convergence warnings
elastic_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=20000))
])

# --- Tree ensembles ------------------------------------------------------------------------
# GradientBoostingRegressor: baseline that has to beat Ridge/Lasso; if it does not, something is
# wrong with the data preprocessing, not the model.
# Seeded like the other stochastic models so repeated runs give the same result.
gbr_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", GradientBoostingRegressor(random_state=42))
])

# HistGradientBoostingRegressor: modern and very fast; it can handle missing values natively,
# although the shared preprocessor used here already imputes them.
hgb_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", HistGradientBoostingRegressor(
        max_depth=None,
        learning_rate=0.05,
        max_iter=500,
        l2_regularization=1.0,
        random_state=42
    ))
    # possible to grid search over learning_rate, max_leaf_nodes, min_samples_leaf
])

# RandomForestRegressor: general baseline ensemble, handles non-linearities well, can be slow
# on large data.
rf_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        n_jobs=-1,
        random_state=42
    ))
])

# ExtraTreesRegressor: similar to RandomForest but with more randomization.
et_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", ExtraTreesRegressor(
        n_estimators=300,
        max_depth=None,
        n_jobs=-1,
        random_state=42
    ))
])

# --- Kernel model --------------------------------------------------------------------------
# SVR: slow on large data and sensitive to feature scaling (scaling is done in the preprocessor).
svr_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", SVR(C=10, epsilon=0.2, kernel="rbf"))
])

# --- Stacked ensemble ----------------------------------------------------------------------
# StackingRegressor: the four base pipelines feed a HistGradientBoosting meta-learner.
# The meta-learner is seeded as well, matching hgb_pipe, for reproducibility.
stack_pipe = StackingRegressor(
    estimators=[
        ("ridge", ridge_pipe),
        ("lasso", lasso_pipe),
        ("rf", rf_pipe),
        ("gbr", gbr_pipe),
    ],
    final_estimator=HistGradientBoostingRegressor(
        learning_rate=0.05,
        max_depth=5,
        random_state=42
    ),
    n_jobs=-1
)


In [0]:
# Naive reference points: predict one constant (training mean / training median) for every
# validation row. Any real model has to beat these numbers.
mean_pred, median_pred = y_train.mean(), y_train.median()
n_val = len(y_val)

print("baseline mean predictor: ")
print_metrics(y_val, np.full(n_val, mean_pred))
print("-"*150)
print("baseline median predictor: ")
# trailing semicolon keeps the notebook from echoing a return value as cell output
print_metrics(y_val, np.full(n_val, median_pred));

### Hyperparameter Tuning

#### HT: linear Models

In [0]:
# Search spaces, keyed by the same short names as pipes_linear.
param_grids_linear = {
    # ridge
    "ridge": {"model__alpha": [0.1, 1.0, 10.0]},

    # lasso
    "lasso": {"model__alpha": [0.001, 0.01, 0.1, 1.0]},

    # elasticnet
    "elastic": {
        "model__alpha": [0.01, 0.1, 1.0],
        "model__l1_ratio": [0.2, 0.5, 0.8]
    }
}

pipes_linear = {
    "ridge": ridge_pipe,
    "lasso": lasso_pipe,  # iter 20000 because of convergence warning
    "elastic": elastic_pipe  # iter 20000 because of convergence warning
}

# Shared, seeded splitter; the later tuning cells reuse this `cv` object.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Tuned estimator per model name, kept so they are not lost when the loop moves on.
best_linear_models = {}

for name, pipe in pipes_linear.items():
    print(f"\n Results of {name.upper()} : ")

    grid = GridSearchCV(
        pipe, param_grids_linear[name],
        cv=cv, scoring="r2", n_jobs=-1, verbose=1
    )

    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    best_linear_models[name] = best_model

    # Report what the search selected, not only how the winner scores on the hold-out set.
    print(f"best params: {grid.best_params_} | CV R2: {grid.best_score_:.4f}")

    val_pred = best_model.predict(X_val)
    print_metrics(y_val, val_pred)

#### HT: tree Models

In [0]:
rf_param_grid = {
    "model__max_depth": [None, 10, 20],
    "model__max_features": ["sqrt", "log2"]
}

# Use the shared seeded KFold (`cv`, created in the linear-models cell) rather than a bare
# `cv=5`, so all model families are tuned on the same folds and their CV scores are comparable.
rf_grid = GridSearchCV(
    rf_pipe, rf_param_grid,
    cv=cv, scoring="r2",
    n_jobs=-1, verbose=1
)

rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_
rf_val_pred = rf_best.predict(X_val)

print("Random Forest Results: ")
print(f"best params: {rf_grid.best_params_}")
# trailing semicolon keeps the notebook from echoing a return value as cell output
print_metrics(y_val, rf_val_pred);

# Logged from an earlier run (with cv=5): MAE: 1431.4111 | RMSE: 5764825.2457 | R2: 0.9379
# NOTE: the figure labelled RMSE in that log is a mean squared error (no square root was taken);
# its square root is roughly 2401.

In [0]:
et_param_grid = {
    "model__max_depth": [None, 10, 20],
    "model__max_features": ["sqrt", "log2"]
}

# Use the shared seeded KFold (`cv`, created in the linear-models cell) rather than a bare
# `cv=5`, so all model families are tuned on the same folds and their CV scores are comparable.
et_grid = GridSearchCV(
    et_pipe, et_param_grid,
    cv=cv, scoring="r2",
    n_jobs=-1, verbose=1
)

et_grid.fit(X_train, y_train)
et_best = et_grid.best_estimator_
et_val_pred = et_best.predict(X_val)

print("ExtraTrees Results: ")
print(f"best params: {et_grid.best_params_}")
# trailing semicolon keeps the notebook from echoing a return value as cell output
print_metrics(y_val, et_val_pred);

In [0]:
gbr_param_grid = {
    "model__n_estimators": [200, 500],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [3, 5]
}

# Use the shared seeded KFold (`cv`, created in the linear-models cell) rather than a bare
# `cv=5`, so all model families are tuned on the same folds and their CV scores are comparable.
gbr_grid = GridSearchCV(
    gbr_pipe, gbr_param_grid,
    cv=cv, scoring="r2",
    n_jobs=-1, verbose=1
)

gbr_grid.fit(X_train, y_train)
gbr_best = gbr_grid.best_estimator_
gbr_val_pred = gbr_best.predict(X_val)

print("GradientBoosting Results: ")
print(f"best params: {gbr_grid.best_params_}")
# trailing semicolon keeps the notebook from echoing a return value as cell output
print_metrics(y_val, gbr_val_pred);

#### HT: advanced Models

In [0]:
hgb_param_grid = {
    "model__learning_rate": [0.03, 0.05, 0.1],
    "model__max_leaf_nodes": [15, 31, None],
    "model__min_samples_leaf": [10, 20]
}

hgb_grid = GridSearchCV(
    hgb_pipe, hgb_param_grid,
    cv=cv, scoring="r2", n_jobs=-1, verbose=1
)

hgb_grid.fit(X_train, y_train)
hgb_best = hgb_grid.best_estimator_

hgb_val_pred = hgb_best.predict(X_val)
print_metrics(y_val, hgb_val_pred)

# print_metrics is used for display only. The scalars are computed here directly so this cell
# does not depend on what print_metrics returns (unpacking a None result raised TypeError).
r2_hgb = r2_score(y_val, hgb_val_pred)
rmse_hgb = np.sqrt(mean_squared_error(y_val, hgb_val_pred))
mae_hgb = mean_absolute_error(y_val, hgb_val_pred)

In [0]:
svr_param_grid = {
    "model__C": [1, 10, 100],
    "model__epsilon": [0.1, 0.2],
    "model__kernel": ["rbf"]
}

svr_grid = GridSearchCV(
    svr_pipe, svr_param_grid,
    cv=cv, scoring="r2", n_jobs=-1, verbose=1
)

svr_grid.fit(X_train, y_train)
svr_best = svr_grid.best_estimator_

svr_val_pred = svr_best.predict(X_val)
print_metrics(y_val, svr_val_pred)

# print_metrics is used for display only. The scalars are computed here directly so this cell
# does not depend on what print_metrics returns (unpacking a None result raised TypeError).
r2_svr = r2_score(y_val, svr_val_pred)
rmse_svr = np.sqrt(mean_squared_error(y_val, svr_val_pred))
mae_svr = mean_absolute_error(y_val, svr_val_pred)

In [0]:
# StackingRegressor (base learners: ridge, lasso, rf, gbr)

print("\n Training StackingRegressor ...")
stack_pipe.fit(X_train, y_train)
val_pred = stack_pipe.predict(X_val)
print_metrics(y_val, val_pred)

# print_metrics is used for display only. The scalars are computed here directly so this cell
# does not depend on what print_metrics returns (unpacking a None result raised TypeError).
r2 = r2_score(y_val, val_pred)
rmse = np.sqrt(mean_squared_error(y_val, val_pred))
mae = mean_absolute_error(y_val, val_pred)

# Logged from an earlier run: (MAE: 1433.9603 | RMSE: 2536.9591 | R²: 0.9307)

### Evaluation, Logging, Kaggle, Feature Importance, ...

In [0]:
# What/Why: Evaluate each model on the validation set (the data not used for training) and log metrics. Compare models and pick the one with best validation performance.

# Benchmarking note: your professor’s “benchmarking” asks for such a log — always compare to mean baseline and to a simple linear baseline (Ridge/Lasso) before complex models.


In [0]:
# What/Why: Only after you pick the model using the validation set do you evaluate on test — this gives an unbiased final estimate.

# Why: test must never influence tuning; otherwise you leak information.


In [0]:
# What/Why: Different models give different “importance” signals:
# Lasso → coefficients (linear importance); zero → removed feature.
# Tree ensembles (GBR) → feature_importances_ (importance in splits).
# For rigorous interpretation, use SHAP for consistent feature attributions across models.


# Important: yes — each model may select different features. That’s expected. Use the model type that matches your use-case:
# If you need a sparse, interpretable linear model → use Lasso.
# If you need best predictive power on tabular data → use ensemble/boosting and interpret via SHAP.

In [0]:
# What/Why: Save the entire pipeline (preprocessing + model) so new observations are processed consistently.
# NOTE(review): the block below is a string literal, not executed code — it is a template only.
#   - `best_pipeline` is not defined anywhere in this notebook; substitute the chosen fitted pipeline.
#   - The example record uses column names (e.g. "engine_size", "horsepower", "brand") that do not
#     match numeric_features / categorical_features defined earlier; adapt them before use.

"""
# (cell) Save and reload
import joblib, os
os.makedirs("models", exist_ok=True)
joblib.dump(best_pipeline, "models/car_price_model.pkl")

loaded = joblib.load("models/car_price_model.pkl")
# predict on new example
new_car = pd.DataFrame([{
    "mileage": 60000, "year": 2018, "engine_size": 2.0, "horsepower": 150, "doors": 4, "owners": 2,
    "brand": "BMW", "fuel_type": "Petrol", "transmission": "Auto", "color": "Black",
    "region": "Urban", "condition": "Used", "warranty": "No", "dealer_type": "Independent"
}])
print("Predicted price:", loaded.predict(new_car)[0])
"""

#### Iterative Loop Checklist

The loop starts after you look at baseline performance and diagnostics:
- Baseline → Check metrics on validation (and residuals).
- Inspect failures / residual plots / feature importances (did a certain brand get consistently over/under predicted?)
- Hypothesize (e.g. add interaction year * mileage, try log transform for horsepower, create age = current_year - year).
- Implement changes in pipeline (e.g. add FunctionTransformer for log(horsepower) or PolynomialFeatures on a small set).
- Re-run CV/hyperparameter search and evaluate again.
- Log results, repeat.

Note on feature selection: yes — different models will select different subsets. Typical approaches:
- Use Lasso or SelectFromModel as a filter for linear pipelines.
- Use tree-based model importances or SHAP to select features for simpler models.
- Or let the best predictive model use all features (trees are robust to redundancy).

Final notes (recommended best-practices)
- Always fit preprocessing only on training data (pipelines do this automatically if you use them inside CV).
- Start simple: mean baseline → Ridge/Lasso → tree-based. Use the simple models for interpretability and as sanity checks.
- For heavy hyperparameter searches use RandomizedSearchCV or Optuna if the space is big.
- When comparing models, report multiple metrics (R², MAE, RMSE). For price prediction MAE is often most interpretable.
- For reproducibility, store your dataset version, random seed, code, and results log.