In [1]:
# Baseline

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder, TargetEncoder
from Preprocessing.imputation import get_imputation_maps, apply_imputation
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import seaborn as sns

# Eigene Module
from Preprocessing.split_new import split_data

from utils.eval_call import evaluate_model


In [2]:
# Version 0.0 - Baseline: Mean per Brand
def baseline_brand():
    """Evaluate a naive baseline that predicts the mean training target per brand.

    Steps:
      1. Load the train/test split via ``split_data()``.
      2. Build imputation maps from the training data only and apply them to
         both splits (the imputation strategy itself lives in
         ``Preprocessing.imputation`` and is not visible here).
      3. Compute the mean target for every ``brand`` seen in training.
      4. Predict each test row with its brand mean; rows whose brand has no
         usable training mean fall back to the overall training mean.
      5. Report overall metrics via ``evaluate_model`` and print RMSE / MAE / R²
         for every ``fuel_type`` value found in the test split.

    Returns:
        None. All results are printed.
    """
    # The feature-name lists returned by split_data() are not needed for this baseline.
    X_train, X_test, y_train, y_test, _categorical_features, _numeric_features = split_data()

    # (printed label, column name) pairs for the missing-value diagnostics.
    reported_cols = (
        ('fuel_consumption:', 'fuel_consumption_l_100km'),
        ('power_ps:', 'power_ps'),
    )

    def report_missing(heading, train_df, test_df):
        """Print the NaN count of each reported column for both splits."""
        print(heading)
        for label, col in reported_cols:
            print(label)
            print(train_df[col].isna().sum(), "in X_train")
            print(test_df[col].isna().sum(), "in X_test")
        print('-'*30)

    report_missing("Fehlende Werte vor Imputation:", X_train, X_test)

    # Build every imputation map from the *un-imputed* training data first, so
    # imputing one column cannot influence the map of another column and no
    # information from the test split is used.
    imputed_cols = ('fuel_consumption_l_100km', 'power_ps', 'electric_range')
    maps_by_col = {
        col: get_imputation_maps(X_train, target_col=col) for col in imputed_cols
    }

    # Apply the train-derived maps to both splits.
    for col, maps in maps_by_col.items():
        X_train = apply_imputation(X_train, target_col=col, maps=maps)
        X_test = apply_imputation(X_test, target_col=col, maps=maps)

    report_missing("Fehlende Werte nach Imputation:", X_train, X_test)

    # Mean target per brand, computed on the training split only. Only the
    # grouping column is copied instead of the whole feature frame.
    train_targets = X_train[['brand']].assign(target=y_train)
    brand_mean = train_targets.groupby('brand')['target'].mean().to_dict()

    # Vectorised lookup instead of a row-wise DataFrame.apply: brands without a
    # training mean map to NaN and are then filled with the global training mean.
    # astype(object) keeps the lookup independent of a categorical dtype.
    global_mean = y_train.mean()
    y_pred_brand = (
        X_test['brand']
        .astype(object)
        .map(brand_mean)
        .astype(float)
        .fillna(global_mean)
    )

    # Overall evaluation
    evaluate_model(y_test, y_pred_brand, "Durchschnitt pro Marke")

    print("\n--- Evaluation nach fuel_type ---")
    fuel_type = X_test['fuel_type']
    y_test_series = pd.Series(y_test, index=X_test.index)
    y_pred_series = pd.Series(y_pred_brand, index=X_test.index)

    for fuel in fuel_type.unique():
        # NaN never compares equal to itself, so a missing fuel type needs an
        # explicit isna() mask; `== fuel` would select zero rows and the
        # metric functions would raise on empty input.
        mask = fuel_type.isna() if pd.isna(fuel) else fuel_type == fuel
        y_true_fuel = y_test_series[mask]
        y_pred_fuel = y_pred_series[mask]

        mse_fuel = mean_squared_error(y_true_fuel, y_pred_fuel)
        rmse_fuel = mse_fuel ** 0.5
        mae_fuel = mean_absolute_error(y_true_fuel, y_pred_fuel)
        r2_fuel = r2_score(y_true_fuel, y_pred_fuel)

        print(f"\nFuel Type: {fuel}")
        print(f"  RMSE: {rmse_fuel:.2f}")
        print(f"  MAE:  {mae_fuel:.2f}")
        print(f"  R²:   {r2_fuel:.2f}")

# NOTE: baseline_brand() has no return statement, so `best_model` is bound to
# None here; the call is made for its printed evaluation output.
best_model = baseline_brand()


Fehlende Werte vor Imputation:
fuel_consumption:
17935 in X_train
4493 in X_test
power_ps:
102 in X_train
25 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
346 in X_train
72 in X_test
power_ps:
8 in X_train
1 in X_test
------------------------------
Durchschnitt pro Marke Performance Metrics:
MAE: 13029.89
MSE: 1487139792.79
RMSE: 38563.45
R²: 0.29
------------------------------

--- Evaluation nach fuel_type ---

Fuel Type: Diesel
  RMSE: 17368.91
  MAE:  12063.35
  R²:   0.05

Fuel Type: Petrol
  RMSE: 48569.24
  MAE:  13585.06
  R²:   0.29

Fuel Type: Other
  RMSE: 18402.88
  MAE:  12532.68
  R²:   0.12

Fuel Type: Electric
  RMSE: 25564.48
  MAE:  16877.94
  R²:   0.04

Fuel Type: Hybrid
  RMSE: 19535.45
  MAE:  11497.13
  R²:   0.56

Fuel Type: Diesel Hybrid
  RMSE: 24991.10
  MAE:  18164.51
  R²:   -0.62

Fuel Type: LPG
  RMSE: 17761.45
  MAE:  13154.43
  R²:   0.43

Fuel Type: CNG
  RMSE: 9470.81
  MAE:  8253.08
  R²:   -0.94

Fuel Typ

In [3]:
# Version 0.1 - Baseline: Mean price per model

def baseline_model():
    """Evaluate a naive baseline that predicts the mean training target per car model.

    Steps:
      1. Load the train/test split via ``split_data()``.
      2. Build imputation maps from the training data only and apply them to
         both splits (the imputation strategy itself lives in
         ``Preprocessing.imputation`` and is not visible here).
      3. Compute the mean target for every ``model`` seen in training.
      4. Predict each test row with its model mean; rows whose model has no
         usable training mean fall back to the overall training mean.
      5. Report overall metrics via ``evaluate_model`` and print RMSE / MAE / R²
         for every ``fuel_type`` value found in the test split.

    Returns:
        None. All results are printed.
    """
    # The feature-name lists returned by split_data() are not needed for this baseline.
    X_train, X_test, y_train, y_test, _categorical_features, _numeric_features = split_data()

    # (printed label, column name) pairs for the missing-value diagnostics.
    reported_cols = (
        ('fuel_consumption:', 'fuel_consumption_l_100km'),
        ('power_ps:', 'power_ps'),
    )

    def report_missing(heading, train_df, test_df):
        """Print the NaN count of each reported column for both splits."""
        print(heading)
        for label, col in reported_cols:
            print(label)
            print(train_df[col].isna().sum(), "in X_train")
            print(test_df[col].isna().sum(), "in X_test")
        print('-'*30)

    report_missing("Fehlende Werte vor Imputation:", X_train, X_test)

    # Build every imputation map from the *un-imputed* training data first, so
    # imputing one column cannot influence the map of another column and no
    # information from the test split is used.
    imputed_cols = ('fuel_consumption_l_100km', 'power_ps', 'electric_range')
    maps_by_col = {
        col: get_imputation_maps(X_train, target_col=col) for col in imputed_cols
    }

    # Apply the train-derived maps to both splits.
    for col, maps in maps_by_col.items():
        X_train = apply_imputation(X_train, target_col=col, maps=maps)
        X_test = apply_imputation(X_test, target_col=col, maps=maps)

    report_missing("Fehlende Werte nach Imputation:", X_train, X_test)

    # Mean target per car model (not per brand), computed on the training split
    # only. Only the grouping column is copied instead of the whole feature frame.
    train_targets = X_train[['model']].assign(target=y_train)
    model_mean = train_targets.groupby('model')['target'].mean().to_dict()

    # Vectorised lookup instead of a row-wise DataFrame.apply: car models without
    # a training mean map to NaN and are then filled with the global training mean.
    # astype(object) keeps the lookup independent of a categorical dtype.
    global_mean = y_train.mean()
    y_pred_model = (
        X_test['model']
        .astype(object)
        .map(model_mean)
        .astype(float)
        .fillna(global_mean)
    )

    # Overall evaluation
    evaluate_model(y_test, y_pred_model, "Durchschnitt pro Model")

    print("\n--- Evaluation nach fuel_type ---")
    fuel_type = X_test['fuel_type']
    y_test_series = pd.Series(y_test, index=X_test.index)
    y_pred_series = pd.Series(y_pred_model, index=X_test.index)

    for fuel in fuel_type.unique():
        # NaN never compares equal to itself, so a missing fuel type needs an
        # explicit isna() mask; `== fuel` would select zero rows and the
        # metric functions would raise on empty input.
        mask = fuel_type.isna() if pd.isna(fuel) else fuel_type == fuel
        y_true_fuel = y_test_series[mask]
        y_pred_fuel = y_pred_series[mask]

        mse_fuel = mean_squared_error(y_true_fuel, y_pred_fuel)
        rmse_fuel = mse_fuel ** 0.5
        mae_fuel = mean_absolute_error(y_true_fuel, y_pred_fuel)
        r2_fuel = r2_score(y_true_fuel, y_pred_fuel)

        print(f"\nFuel Type: {fuel}")
        print(f"  RMSE: {rmse_fuel:.2f}")
        print(f"  MAE:  {mae_fuel:.2f}")
        print(f"  R²:   {r2_fuel:.2f}")

# NOTE: baseline_model() has no return statement, so `best_model` is bound to
# None here; the call is made for its printed evaluation output.
best_model = baseline_model()


Fehlende Werte vor Imputation:
fuel_consumption:
17935 in X_train
4493 in X_test
power_ps:
102 in X_train
25 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
346 in X_train
72 in X_test
power_ps:
8 in X_train
1 in X_test
------------------------------
Durchschnitt pro Model Performance Metrics:
MAE: 8986.81
MSE: 1134671951.33
RMSE: 33684.89
R²: 0.46
------------------------------

--- Evaluation nach fuel_type ---

Fuel Type: Diesel
  RMSE: 13223.35
  MAE:  8838.85
  R²:   0.45

Fuel Type: Petrol
  RMSE: 42032.79
  MAE:  9027.31
  R²:   0.47

Fuel Type: Other
  RMSE: 10237.15
  MAE:  8498.86
  R²:   0.73

Fuel Type: Electric
  RMSE: 12411.36
  MAE:  7930.10
  R²:   0.77

Fuel Type: Hybrid
  RMSE: 35480.07
  MAE:  10013.03
  R²:   -0.44

Fuel Type: Diesel Hybrid
  RMSE: 16574.72
  MAE:  10335.60
  R²:   0.29

Fuel Type: LPG
  RMSE: 14607.32
  MAE:  10119.63
  R²:   0.61

Fuel Type: CNG
  RMSE: 7606.33
  MAE:  6253.75
  R²:   -0.25

Fuel Type: Hy