In [None]:
# Baseline

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder, TargetEncoder
from Preprocessing.imputation import get_imputation_maps, apply_imputation
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import seaborn as sns

# Eigene Module
from Preprocessing.split_new import split_data

from utils.eval_call import evaluate_model


In [6]:
def baseline_brand():
    """Evaluate a naive baseline that predicts the mean training target per brand.

    Steps:
      1. Load the train/test split via ``split_data()``.
      2. Impute ``fuel_consumption_l_100km``, ``power_ps`` and ``electric_range``
         using maps built from the training frame only, and print the number of
         missing values before and after.
      3. Predict, for every test row, the mean training target of its ``brand``;
         brands without a usable training mean fall back to the overall
         training mean.
      4. Report overall metrics through ``evaluate_model`` and print RMSE, MAE
         and R² for every ``fuel_type`` present in the test frame.

    Returns:
        None. All results are printed.

    NOTE(review): ``baseline_model`` in this notebook is a near-copy that groups
    by ``model``; a shared, parameterised helper would remove the duplication.
    """
    # The feature-name lists are part of split_data()'s return value but are not
    # needed here; the underscore prefix marks them as intentionally unused.
    X_train, X_test, y_train, y_test, _categorical_features, _numeric_features = split_data()

    def report_missing(title, train_df, test_df):
        # Print the NaN counts of the two monitored columns for both splits.
        print(title)
        for label, col in (('fuel_consumption:', 'fuel_consumption_l_100km'),
                           ('power_ps:', 'power_ps')):
            print(label)
            print(train_df[col].isna().sum(), "in X_train")
            print(test_df[col].isna().sum(), "in X_test")
        print('-'*30)

    report_missing("Fehlende Werte vor Imputation:", X_train, X_test)

    # Build every imputation map from the untouched training frame *before* any
    # column is filled, so that no map is derived from already-imputed values.
    impute_cols = ('fuel_consumption_l_100km', 'power_ps', 'electric_range')
    imputation_maps = {
        col: get_imputation_maps(X_train, target_col=col) for col in impute_cols
    }

    # Apply the training-derived maps to both splits (dicts keep insertion order).
    for col, maps in imputation_maps.items():
        X_train = apply_imputation(X_train, target_col=col, maps=maps)
        X_test = apply_imputation(X_test, target_col=col, maps=maps)

    report_missing("Fehlende Werte nach Imputation:", X_train, X_test)

    # Mean training target per brand. Only the grouping column is copied instead
    # of the whole training frame.
    brand_mean = (
        X_train[['brand']]
        .assign(target=y_train)
        .groupby('brand')['target']
        .mean()
        .to_dict()
    )
    overall_mean = y_train.mean()  # fallback, computed once

    # Vectorised lookup instead of a row-wise DataFrame.apply(axis=1).
    # astype(object) keeps the lookup independent of a categorical dtype;
    # fillna covers unseen brands as well as brands whose training mean is NaN.
    y_pred_brand = (
        X_test['brand']
        .astype(object)
        .map(brand_mean)
        .fillna(overall_mean)
        .astype(float)
    )

    # Overall evaluation
    evaluate_model(y_test, y_pred_brand, "Durchschnitt pro Marke")

    print("\n--- Evaluation nach fuel_type ---")
    fuel_types = X_test['fuel_type']
    y_test_series = pd.Series(y_test, index=X_test.index)

    # Missing fuel types are skipped: `== NaN` selects no rows and the metric
    # functions cannot be evaluated on an empty selection.
    for fuel in fuel_types.dropna().unique():
        mask = fuel_types == fuel
        y_true_fuel = y_test_series[mask]
        y_pred_fuel = y_pred_brand[mask]

        mse_fuel = mean_squared_error(y_true_fuel, y_pred_fuel)
        rmse_fuel = mse_fuel ** 0.5
        mae_fuel = mean_absolute_error(y_true_fuel, y_pred_fuel)
        r2_fuel = r2_score(y_true_fuel, y_pred_fuel)

        print(f"\nFuel Type: {fuel}")
        print(f"  RMSE: {rmse_fuel:.2f}")
        print(f"  MAE:  {mae_fuel:.2f}")
        print(f"  R²:   {r2_fuel:.2f}")

# Run the brand-mean baseline; its results are printed below.
# NOTE(review): baseline_brand() has no return statement, so `best_model`
# is bound to None here - the name does not refer to a fitted model.
best_model = baseline_brand()


Fehlende Werte vor Imputation:
fuel_consumption:
18133 in X_train
4497 in X_test
power_ps:
101 in X_train
27 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
348 in X_train
69 in X_test
power_ps:
8 in X_train
2 in X_test
------------------------------
Durchschnitt pro Marke Performance Metrics:
MAE: 12726.38
MSE: 727355888.07
RMSE: 26969.54
R²: 0.44
------------------------------

--- Evaluation nach fuel_type ---

Fuel Type: Diesel
  RMSE: 16786.45
  MAE:  12052.61
  R²:   0.07

Fuel Type: Petrol
  RMSE: 27326.17
  MAE:  13018.21
  R²:   0.53

Fuel Type: Electric
  RMSE: 26029.19
  MAE:  17075.99
  R²:   -0.03

Fuel Type: Hybrid
  RMSE: 60656.20
  MAE:  12143.30
  R²:   0.24

Fuel Type: Hydrogen
  RMSE: 19260.57
  MAE:  14915.35
  R²:   -0.55

Fuel Type: LPG
  RMSE: 14542.12
  MAE:  10698.75
  R²:   0.48

Fuel Type: CNG
  RMSE: 10732.13
  MAE:  8330.88
  R²:   0.35

Fuel Type: Other
  RMSE: 14246.32
  MAE:  11402.40
  R²:   0.02

Fuel Type: Di

In [7]:
# Version 0.1 - Baseline: Durchschnittspreis pro Modell

def baseline_model():
    """Evaluate a naive baseline that predicts the mean training target per car model.

    Steps:
      1. Load the train/test split via ``split_data()``.
      2. Impute ``fuel_consumption_l_100km``, ``power_ps`` and ``electric_range``
         using maps built from the training frame only, and print the number of
         missing values before and after.
      3. Predict, for every test row, the mean training target of its ``model``
         column value; models without a usable training mean fall back to the
         overall training mean.
      4. Report overall metrics through ``evaluate_model`` and print RMSE, MAE
         and R² for every ``fuel_type`` present in the test frame.

    Returns:
        None. All results are printed.

    NOTE(review): ``baseline_brand`` in this notebook is a near-copy that groups
    by ``brand``; a shared, parameterised helper would remove the duplication.
    """
    # The feature-name lists are part of split_data()'s return value but are not
    # needed here; the underscore prefix marks them as intentionally unused.
    X_train, X_test, y_train, y_test, _categorical_features, _numeric_features = split_data()

    def report_missing(title, train_df, test_df):
        # Print the NaN counts of the two monitored columns for both splits.
        print(title)
        for label, col in (('fuel_consumption:', 'fuel_consumption_l_100km'),
                           ('power_ps:', 'power_ps')):
            print(label)
            print(train_df[col].isna().sum(), "in X_train")
            print(test_df[col].isna().sum(), "in X_test")
        print('-'*30)

    report_missing("Fehlende Werte vor Imputation:", X_train, X_test)

    # Build every imputation map from the untouched training frame *before* any
    # column is filled, so that no map is derived from already-imputed values.
    impute_cols = ('fuel_consumption_l_100km', 'power_ps', 'electric_range')
    imputation_maps = {
        col: get_imputation_maps(X_train, target_col=col) for col in impute_cols
    }

    # Apply the training-derived maps to both splits (dicts keep insertion order).
    for col, maps in imputation_maps.items():
        X_train = apply_imputation(X_train, target_col=col, maps=maps)
        X_test = apply_imputation(X_test, target_col=col, maps=maps)

    report_missing("Fehlende Werte nach Imputation:", X_train, X_test)

    # Mean training target per car model. Only the grouping column is copied
    # instead of the whole training frame.
    model_mean = (
        X_train[['model']]
        .assign(target=y_train)
        .groupby('model')['target']
        .mean()
        .to_dict()
    )
    overall_mean = y_train.mean()  # fallback, computed once

    # Vectorised lookup instead of a row-wise DataFrame.apply(axis=1).
    # astype(object) keeps the lookup independent of a categorical dtype;
    # fillna covers unseen car models as well as models whose training mean is NaN.
    y_pred_model = (
        X_test['model']
        .astype(object)
        .map(model_mean)
        .fillna(overall_mean)
        .astype(float)
    )

    # Overall evaluation
    evaluate_model(y_test, y_pred_model, "Durchschnitt pro Model")

    print("\n--- Evaluation nach fuel_type ---")
    fuel_types = X_test['fuel_type']
    y_test_series = pd.Series(y_test, index=X_test.index)

    # Missing fuel types are skipped: `== NaN` selects no rows and the metric
    # functions cannot be evaluated on an empty selection.
    for fuel in fuel_types.dropna().unique():
        mask = fuel_types == fuel
        y_true_fuel = y_test_series[mask]
        y_pred_fuel = y_pred_model[mask]

        mse_fuel = mean_squared_error(y_true_fuel, y_pred_fuel)
        rmse_fuel = mse_fuel ** 0.5
        mae_fuel = mean_absolute_error(y_true_fuel, y_pred_fuel)
        r2_fuel = r2_score(y_true_fuel, y_pred_fuel)

        print(f"\nFuel Type: {fuel}")
        print(f"  RMSE: {rmse_fuel:.2f}")
        print(f"  MAE:  {mae_fuel:.2f}")
        print(f"  R²:   {r2_fuel:.2f}")

# Run the per-car-model mean baseline; its results are printed below.
# NOTE(review): baseline_model() has no return statement, so `best_model`
# is bound to None here - the name does not refer to a fitted model.
best_model = baseline_model()


Fehlende Werte vor Imputation:
fuel_consumption:
18133 in X_train
4497 in X_test
power_ps:
101 in X_train
27 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
348 in X_train
69 in X_test
power_ps:
8 in X_train
2 in X_test
------------------------------
Durchschnitt pro Model Performance Metrics:
MAE: 8843.73
MSE: 471087154.09
RMSE: 21704.54
R²: 0.64
------------------------------

--- Evaluation nach fuel_type ---

Fuel Type: Diesel
  RMSE: 12432.29
  MAE:  8832.35
  R²:   0.49

Fuel Type: Petrol
  RMSE: 21673.52
  MAE:  8737.58
  R²:   0.71

Fuel Type: Electric
  RMSE: 13158.52
  MAE:  8268.87
  R²:   0.74

Fuel Type: Hybrid
  RMSE: 53558.78
  MAE:  10514.78
  R²:   0.41

Fuel Type: Hydrogen
  RMSE: 13696.35
  MAE:  10874.37
  R²:   0.21

Fuel Type: LPG
  RMSE: 10755.85
  MAE:  7452.55
  R²:   0.72

Fuel Type: CNG
  RMSE: 8085.63
  MAE:  6225.04
  R²:   0.63

Fuel Type: Other
  RMSE: 12799.43
  MAE:  9287.48
  R²:   0.21

Fuel Type: Diesel Hybr