In [2]:
# Baseline

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder, TargetEncoder
from Preprocessing.imputation import get_imputation_maps, apply_imputation
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import seaborn as sns

# Eigene Module
from Preprocessing.split import split_data
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline
from Preprocessing.preprocessing_pipeline_offerdesc import preprocessing_pipeline_offerdesc
from Preprocessing.preprocessing_pipeline_impute import preprocessing_pipeline_impute
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment


In [3]:
# Version 0 - Baseline: Durchschnittspreis pro Automarke

def average_price_per_brand(df, price_column="price_in_euro", brand_column="brand", test_size=0.2, random_state=42):
    """
    Baseline-Modell mit Stratification:
    - Durchschnittspreis pro Marke im Training
    - Vorhersage auf Testdaten
    - Evaluation mit MSE, RMSE, MAE, R²
    """

    # Entferne Marken, die weniger als 2x vorkommen
    brand_counts = df[brand_column].value_counts()
    valid_brands = brand_counts[brand_counts >= 2].index
    df = df[df[brand_column].isin(valid_brands)]
    
    # Train-Test-Split mit Stratifikation auf Marke
    df_train, df_test = train_test_split(
        df,
        test_size=test_size,
        stratify=df[brand_column],
        random_state=random_state
    )

    # Durchschnittspreis pro Marke im Training
    brand_avg = df_train.groupby(brand_column)[price_column].mean()

    # Vorhersage: Testdaten bekommen den Markenmittelwert
    df_test = df_test.copy()
    df_test["predicted_price"] = df_test[brand_column].map(brand_avg)

    # Fallback: Wenn Marke nicht im Training war, setze Gesamtmittelwert
    overall_mean = df_train[price_column].mean()
    df_test["predicted_price"].fillna(overall_mean, inplace=True)

    # Evaluation
    y_true = df_test[price_column]
    y_pred = df_test["predicted_price"]

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print("Baseline-Modell: Durchschnittspreis pro Marke")
    print(f"MSE:  {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE:  {mae:.2f}")
    print(f"R²:   {r2:.2f}")

    return df_test[[brand_column, price_column, "predicted_price"]]

# Beispiel-Anwendung:
df = preprocessing_pipeline_impute(path='data.csv') 
average_price_per_brand(df)


Baseline-Modell: Durchschnittspreis pro Marke
MSE:  738802142.01
RMSE: 27180.92
MAE:  12561.53
R²:   0.41


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test["predicted_price"].fillna(overall_mean, inplace=True)


Unnamed: 0,brand,price_in_euro,predicted_price
14228,audi,57444,27229.199473
8456,audi,19990,27229.199473
210103,volkswagen,17790,18346.889412
70454,hyundai,8490,17996.688452
89888,mazda,26900,22297.727865
...,...,...,...
196446,volkswagen,4990,18346.889412
2558,audi,6000,27229.199473
136926,opel,17650,13991.840324
111266,mercedes-benz,36999,29615.595736


In [4]:
# Version 0.1 - Baseline: Durchschnittspreis pro Modell

def average_price_per_model(df, price_column="price_in_euro", model_column="model", test_size=0.2, random_state=42):
    """
    Baseline-Modell mit Stratification:
    - Durchschnittspreis pro Modell im Training
    - Vorhersage auf Testdaten
    - Evaluation mit MSE, RMSE, MAE, R²
    """

    # Entferne Marken, die weniger als 2x vorkommen
    model_counts = df[model_column].value_counts()
    valid_models = model_counts[model_counts >= 2].index
    df = df[df[model_column].isin(valid_models)]
    
    # Train-Test-Split mit Stratifikation auf Marke
    df_train, df_test = train_test_split(
        df,
        test_size=test_size,
        stratify=df[model_column],
        random_state=random_state
    )

    # Durchschnittspreis pro Marke im Training
    brand_avg = df_train.groupby(model_column)[price_column].mean()

    # Vorhersage: Testdaten bekommen den Modellmittelwert
    df_test = df_test.copy()
    df_test["predicted_price"] = df_test[model_column].map(brand_avg)

    # Fallback: Wenn Modell nicht im Training war, setze Gesamtmittelwert
    overall_mean = df_train[price_column].mean()
    df_test["predicted_price"].fillna(overall_mean, inplace=True)

    # Evaluation
    y_true = df_test[price_column]
    y_pred = df_test["predicted_price"]

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print("Baseline-Modell: Durchschnittspreis pro Modell")
    print(f"MSE:  {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE:  {mae:.2f}")
    print(f"R²:   {r2:.2f}")

    return df_test[[model_column, price_column, "predicted_price"]]

# Beispiel-Anwendung:
df = preprocessing_pipeline_impute(path='data.csv') 
average_price_per_model(df)


Baseline-Modell: Durchschnittspreis pro Modell
MSE:  320008645.18
RMSE: 17888.79
MAE:  8573.58
R²:   0.68


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test["predicted_price"].fillna(overall_mean, inplace=True)


Unnamed: 0,model,price_in_euro,predicted_price
193804,Volkswagen Passat,13490,13229.065455
6023,Audi A3,6399,17508.785036
99564,Mercedes-Benz Sprinter,14399,23509.586957
6989,Audi A8,23999,35288.578947
134795,Opel Grandland,17490,28746.865979
...,...,...,...
150931,Porsche 992,229900,193534.076503
171633,Skoda Superb,17498,25391.120719
145559,Peugeot 2008,19790,19285.600000
14395,Audi A6,38770,24385.093285
