In [2]:
# Local Ridge Regression

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder, TargetEncoder
from Preprocessing.imputation import get_imputation_maps, apply_imputation, ContextImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import seaborn as sns

# Eigene Module
from Preprocessing.split import split_data
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline
from Preprocessing.preprocessing_pipeline_offerdesc import preprocessing_pipeline_offerdesc
from Preprocessing.preprocessing_pipeline_impute import preprocessing_pipeline_impute
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment


In [None]:
# Version 0 - Baseline: Durchschnittspreis pro Automarke

def average_price_per_brand(df, price_column="price_in_euro", brand_column="brand", test_size=0.2, random_state=42):
    """
    Baseline-Modell mit Stratification:
    - Durchschnittspreis pro Marke im Training
    - Vorhersage auf Testdaten
    - Evaluation mit MSE, RMSE, MAE, R²
    """

    # Entferne Marken, die weniger als 2x vorkommen
    brand_counts = df[brand_column].value_counts()
    valid_brands = brand_counts[brand_counts >= 2].index
    df = df[df[brand_column].isin(valid_brands)]
    
    # Train-Test-Split mit Stratifikation auf Marke
    df_train, df_test = train_test_split(
        df,
        test_size=test_size,
        stratify=df[brand_column],
        random_state=random_state
    )

    # Durchschnittspreis pro Marke im Training
    brand_avg = df_train.groupby(brand_column)[price_column].mean()

    # Vorhersage: Testdaten bekommen den Markenmittelwert
    df_test = df_test.copy()
    df_test["predicted_price"] = df_test[brand_column].map(brand_avg)

    # Fallback: Wenn Marke nicht im Training war, setze Gesamtmittelwert
    overall_mean = df_train[price_column].mean()
    df_test["predicted_price"].fillna(overall_mean, inplace=True)

    # Evaluation
    y_true = df_test[price_column]
    y_pred = df_test["predicted_price"]

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print("Baseline-Modell: Durchschnittspreis pro Marke")
    print(f"MSE:  {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE:  {mae:.2f}")
    print(f"R²:   {r2:.2f}")

    return df_test[[brand_column, price_column, "predicted_price"]]

# Beispiel-Anwendung:
df = preprocessing_pipeline_impute(path='data.csv') 
average_price_per_brand(df)


Baseline-Modell: Durchschnittspreis pro Marke
MSE:  738802142.01
RMSE: 27180.92
MAE:  12561.53
R²:   0.41


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test["predicted_price"].fillna(overall_mean, inplace=True)


Unnamed: 0,brand,price_in_euro,predicted_price
14228,audi,57444,27229.199473
8456,audi,19990,27229.199473
210103,volkswagen,17790,18346.889412
70454,hyundai,8490,17996.688452
89888,mazda,26900,22297.727865
...,...,...,...
196446,volkswagen,4990,18346.889412
2558,audi,6000,27229.199473
136926,opel,17650,13991.840324
111266,mercedes-benz,36999,29615.595736


In [7]:
# Version 0.1 - Baseline: Durchschnittspreis pro Modell

def average_price_per_model(df, price_column="price_in_euro", model_column="model", test_size=0.2, random_state=42):
    """
    Baseline-Modell mit Stratification:
    - Durchschnittspreis pro Modell im Training
    - Vorhersage auf Testdaten
    - Evaluation mit MSE, RMSE, MAE, R²
    """

    # Entferne Marken, die weniger als 2x vorkommen
    model_counts = df[model_column].value_counts()
    valid_models = model_counts[model_counts >= 2].index
    df = df[df[model_column].isin(valid_models)]
    
    # Train-Test-Split mit Stratifikation auf Marke
    df_train, df_test = train_test_split(
        df,
        test_size=test_size,
        stratify=df[model_column],
        random_state=random_state
    )

    # Durchschnittspreis pro Marke im Training
    brand_avg = df_train.groupby(model_column)[price_column].mean()

    # Vorhersage: Testdaten bekommen den Modellmittelwert
    df_test = df_test.copy()
    df_test["predicted_price"] = df_test[model_column].map(brand_avg)

    # Fallback: Wenn Modell nicht im Training war, setze Gesamtmittelwert
    overall_mean = df_train[price_column].mean()
    df_test["predicted_price"].fillna(overall_mean, inplace=True)

    # Evaluation
    y_true = df_test[price_column]
    y_pred = df_test["predicted_price"]

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print("Baseline-Modell: Durchschnittspreis pro Modell")
    print(f"MSE:  {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE:  {mae:.2f}")
    print(f"R²:   {r2:.2f}")

    return df_test[[model_column, price_column, "predicted_price"]]

# Beispiel-Anwendung:
df = preprocessing_pipeline_impute(path='data.csv') 
average_price_per_model(df)


Baseline-Modell: Durchschnittspreis pro Modell
MSE:  320008645.18
RMSE: 17888.79
MAE:  8573.58
R²:   0.68


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test["predicted_price"].fillna(overall_mean, inplace=True)


Unnamed: 0,model,price_in_euro,predicted_price
193804,Volkswagen Passat,13490,13229.065455
6023,Audi A3,6399,17508.785036
99564,Mercedes-Benz Sprinter,14399,23509.586957
6989,Audi A8,23999,35288.578947
134795,Opel Grandland,17490,28746.865979
...,...,...,...
150931,Porsche 992,229900,193534.076503
171633,Skoda Superb,17498,25391.120719
145559,Peugeot 2008,19790,19285.600000
14395,Audi A6,38770,24385.093285


In [None]:
# Version 1.0 - Local Linear Regression: Grid Search + Label Encoding, k = 3 / 30

class LocalLinearRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10):
        self.n_neighbors = n_neighbors
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)  # Falls X ein Numpy-Array ist
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = LinearRegression()
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Eigentliche Funktion
def main_local_linear_regression_with_gridsearch():
    # Preprocessing 
    df = preprocessing_pipeline(path = 'data.csv')  
    # Nimm ersten 10.000 Zeilen zufällig
    df = df.sample(n=10000, random_state=42).reset_index(drop=True)
    # Train - Test Split
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df) 


    numeric_transformer = Pipeline(steps=[
        # NaN Werte
        ('imputer', SimpleImputer(strategy='median')),
        # Alle anderen Werte
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalLinearRegressor()  

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [3,30]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    # Quasi intern erledigt von scikit learn an dieser Stelle:
    # X_train_transformed = preprocessor.fit_transform(X_train)
    # model.fit(X_train_transformed, y_train)
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors:", grid_search.best_params_['model__n_neighbors'])

    # Printe alle Ergebnisse

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewerte
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Berechnungen
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Ausgabe
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model


# Aufruf der Funktion
best_model = main_local_linear_regression_with_gridsearch()




Starte Grid Search...
Fitting 3 folds for each of 2 candidates, totalling 6 fits

 Bestes n_neighbors: 30

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'mileage_in_km']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   Sim



In [None]:
# Version 1.0.1 - Local Linear Regression: Grid Search + Label Encoding

class LocalLinearRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10):
        self.n_neighbors = n_neighbors
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)  # Falls X ein Numpy-Array ist
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = LinearRegression()
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Eigentliche Funktion
def main_local_linear_regression_with_gridsearch_0():
    # Preprocessing 
    df = preprocessing_pipeline(path = 'data.csv')  
    
    
    # Train - Test Split
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df) 


    numeric_transformer = Pipeline(steps=[
        # NaN Werte
        ('imputer', SimpleImputer(strategy='median')),
        # Alle anderen Werte
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalLinearRegressor()  

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [10, 25, 50, 100, 200, 500]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    # Quasi intern erledigt von scikit learn an dieser Stelle:
    # X_train_transformed = preprocessor.fit_transform(X_train)
    # model.fit(X_train_transformed, y_train)
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors:", grid_search.best_params_['model__n_neighbors'])

    # Printe alle Ergebnisse

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewerte
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Berechnungen
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Ausgabe
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model


# Aufruf der Funktion
best_model = main_local_linear_regression_with_gridsearch_0()




Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

 Bestes n_neighbors: 200

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'mileage_in_km']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   S



In [2]:
# Version 1.0.2 - Local Linear Regression: Grid Search + Label Encoding + offer-decrrption

class LocalLinearRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10):
        self.n_neighbors = n_neighbors
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)  # Falls X ein Numpy-Array ist
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = LinearRegression()
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Eigentliche Funktion
def main_local_linear_regression_with_gridsearch_0():
    # Preprocessing 
    df = preprocessing_pipeline(path = 'data.csv')  
    df = preprocessing_pipeline_offerdesc(df)

    # Train - Test Split
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df) 


    numeric_transformer = Pipeline(steps=[
        # NaN Werte
        ('imputer', SimpleImputer(strategy='median')),
        # Alle anderen Werte
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalLinearRegressor()  

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [200]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    # Quasi intern erledigt von scikit learn an dieser Stelle:
    # X_train_transformed = preprocessor.fit_transform(X_train)
    # model.fit(X_train_transformed, y_train)
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors:", grid_search.best_params_['model__n_neighbors'])

    # Printe alle Ergebnisse

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewerte
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Berechnungen
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Ausgabe
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model


# Aufruf der Funktion
best_model = main_local_linear_regression_with_gridsearch_0()




Starte Grid Search...
Fitting 3 folds for each of 1 candidates, totalling 3 fits

 Bestes n_neighbors: 200

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'mileage_in_km']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   Si



In [14]:
# Version 1.1 - Local Linear Regression: Grid Search + Target Encoding

class LocalLinearRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10):
        self.n_neighbors = n_neighbors
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)  # Falls X ein Numpy-Array ist
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = LinearRegression()
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Eigentliche Funktion
def main_local_linear_regression_with_gridsearch_1():
    # Preprocessing 
    df = preprocessing_pipeline(path = 'data.csv')  
    # Train - Test Split
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df) 


    numeric_transformer = Pipeline(steps=[
        # NaN Werte
        ('imputer', SimpleImputer(strategy='median')),
        # Alle anderen Werte
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target', TargetEncoder())
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalLinearRegressor()  

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [200]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors:", grid_search.best_params_['model__n_neighbors'])

    # Printe alle Ergebnisse

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewerte
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Berechnungen
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Ausgabe
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_linear_regression_with_gridsearch_1()

Starte Grid Search...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ~~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        params=step_params,
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1001, in fit_transform
    result = self._call_func_on_transformers(
        X,
    ...<3 lines>...
        routed_params=routed_params,
    )
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\compose\_column_transformer.py", line 910, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ~~~~^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 730, in fit_transform
    return last_step.fit_transform(
           ~~~~~~~~~~~~~~~~~~~~~~~^
        Xt, y, **last_step_params["fit_transform"]
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\preprocessing\_target_encoder.py", line 273, in fit_transform
    X_out = np.empty(
        (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
        dtype=np.float64,
    )
numpy._core._exceptions._ArrayMemoryError: Unable to allocate 48.2 GiB for an array with shape (109969, 58795) and data type float64

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ~~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        params=step_params,
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1001, in fit_transform
    result = self._call_func_on_transformers(
        X,
    ...<3 lines>...
        routed_params=routed_params,
    )
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\compose\_column_transformer.py", line 910, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ~~~~^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 730, in fit_transform
    return last_step.fit_transform(
           ~~~~~~~~~~~~~~~~~~~~~~~^
        Xt, y, **last_step_params["fit_transform"]
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\preprocessing\_target_encoder.py", line 273, in fit_transform
    X_out = np.empty(
        (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
        dtype=np.float64,
    )
numpy._core._exceptions._ArrayMemoryError: Unable to allocate 48.1 GiB for an array with shape (109969, 58675) and data type float64

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ~~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        params=step_params,
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1001, in fit_transform
    result = self._call_func_on_transformers(
        X,
    ...<3 lines>...
        routed_params=routed_params,
    )
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\compose\_column_transformer.py", line 910, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ~~~~^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 730, in fit_transform
    return last_step.fit_transform(
           ~~~~~~~~~~~~~~~~~~~~~~~^
        Xt, y, **last_step_params["fit_transform"]
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\preprocessing\_target_encoder.py", line 273, in fit_transform
    X_out = np.empty(
        (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
        dtype=np.float64,
    )
numpy._core._exceptions._ArrayMemoryError: Unable to allocate 48.0 GiB for an array with shape (109970, 58585) and data type float64


In [19]:
# Version 1.2 - Local Linear Regression: Grid Search + One-Hot Encoding

class LocalLinearRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10):
        self.n_neighbors = n_neighbors
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)  # Falls X ein Numpy-Array ist
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = LinearRegression()
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Eigentliche Funktion
def main_local_linear_regression_with_gridsearch_2():
    # Preprocessing 
    df = preprocessing_pipeline(path = 'data.csv')  
    # Train - Test Split
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df) 


    numeric_transformer = Pipeline(steps=[
        # NaN Werte
        ('imputer', SimpleImputer(strategy='median')),
        # Alle anderen Werte
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalLinearRegressor()  

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [100,200]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors:", grid_search.best_params_['model__n_neighbors'])

    # Printe alle Ergebnisse

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewerte
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Berechnungen
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Ausgabe
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_linear_regression_with_gridsearch_2()

Starte Grid Search...
Fitting 3 folds for each of 2 candidates, totalling 6 fits


KeyboardInterrupt: 

In [18]:
# Version 1.3 - Local Linear Regression: Grid Search + Label Encoding + Robust Scaling

class LocalLinearRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10):
        self.n_neighbors = n_neighbors
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)  # Falls X ein Numpy-Array ist
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = LinearRegression()
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Eigentliche Funktion
def main_local_linear_regression_with_gridsearch_3():
    # Preprocessing 
    df = preprocessing_pipeline(path = 'data.csv')  
    # Train - Test Split
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df) 


    numeric_transformer = Pipeline(steps=[
        # NaN Werte
        ('imputer', SimpleImputer(strategy='median')),
        # Alle anderen Werte
        ('scaler', RobustScaler())
    ])

    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalLinearRegressor()  

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [100,200]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors:", grid_search.best_params_['model__n_neighbors'])

    # Printe alle Ergebnisse

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewerte
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Berechnungen
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Ausgabe
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_linear_regression_with_gridsearch_3()

Starte Grid Search...
Fitting 3 folds for each of 2 candidates, totalling 6 fits

 Bestes n_neighbors: 200

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'mileage_in_km']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   Simp



In [26]:
# Version 2 - Local Lasso Regression: Grid Search + Label + Standard

class LocalLassoRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Lasso(alpha=self.alpha, max_iter=10000)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_lasso_regression_with_gridsearch():
    df = preprocessing_pipeline(path = 'data.csv')
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    model = LocalLassoRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    param_grid = {
        'model__n_neighbors': [100,200],
        'model__alpha': [0.01, 0.1, 1.0]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\nBestes Modell:")
    print(grid_search.best_estimator_)
    print("\nBeste Hyperparameter:")
    print(grid_search.best_params_)
    print("\nBestes Cross-Validation Ergebnis (neg_root_mse):")
    print(grid_search.best_score_)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_lasso_regression_with_gridsearch()


Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

Bestes Modell:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'mileage_in_km']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
     



In [28]:
# Version 3.0 - Local Ridge Regression: Grid Search + Label + Robust

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch():
    # Preprocessing
    df = preprocessing_pipeline(path = 'data.csv')
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [100, 200],
        'model__alpha': [0.1, 1.0, 10.0]  # Ridge Regularisierung (klein/mittel/stark)
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewertung
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch()


Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

 Bestes n_neighbors und alpha: {'model__alpha': 0.1, 'model__n_neighbors': 100}

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'mileage_in_km']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
               



In [30]:
# Version 3.1 - Local Ridge Regression: Grid Search + Label + Standard

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch_1():
    # Preprocessing
    df = preprocessing_pipeline(path = 'data.csv')
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [100, 200],
        'model__alpha': [0.1, 1.0, 10.0]  # Ridge Regularisierung (klein/mittel/stark)
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewertung
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch_1()


Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

 Bestes n_neighbors und alpha: {'model__alpha': 1.0, 'model__n_neighbors': 100}

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'mileage_in_km']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
             



In [3]:
# Version 3.2 - Local Ridge Regression: Grid Search + Label + Standard + Imputation (k = 50 / 100)

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch_2():
    # Preprocessing
    df = preprocessing_pipeline_impute(path = 'data.csv')
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
        ('imp_ps', ContextImputer('power_ps')),
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [50, 100],
        'model__alpha': [0.1, 1.0, 10.0]  # Ridge Regularisierung (klein/mittel/stark)
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewertung
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    print("\n--- Evaluation nach fuel_type ---")
    X_test_with_fuel = X_test.copy()
    X_test_with_fuel['fuel_type'] = df.loc[X_test.index, 'fuel_type']
    y_test_series = pd.Series(y_test, index=X_test.index)
    y_pred_series = pd.Series(y_pred, index=X_test.index)

    for fuel in X_test_with_fuel['fuel_type'].unique():
        mask = X_test_with_fuel['fuel_type'] == fuel
        y_true_fuel = y_test_series[mask]
        y_pred_fuel = y_pred_series[mask]

        mse_fuel = mean_squared_error(y_true_fuel, y_pred_fuel)
        rmse_fuel = mse_fuel ** 0.5
        mae_fuel = mean_absolute_error(y_true_fuel, y_pred_fuel)
        r2_fuel = r2_score(y_true_fuel, y_pred_fuel)

        print(f"\nFuel Type: {fuel}")
        print(f"  RMSE: {rmse_fuel:.2f}")
        print(f"  MAE:  {mae_fuel:.2f}")
        print(f"  R²:   {r2_fuel:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch_2()


Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

 Bestes n_neighbors und alpha: {'model__alpha': 1.0, 'model__n_neighbors': 50}

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('imp_fc',
                 ContextImputer(target_col='fuel_consumption_l_100km')),
                ('imp_ps', ContextImputer(target_col='power_ps')),
                ('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                                                   '



In [5]:
# Version 3.3 - Local Ridge Regression: Grid Search + Label + Standard + PCA

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch_3():
    # Preprocessing
    df = preprocessing_pipeline_impute(path = 'data.csv')
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    print("Fehlende Werte vor Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)
    
    # Mapping aus dem Trainingsset lernen
    fuel_maps = get_imputation_maps(X_train, target_col='fuel_consumption_l_100km')
    ps_maps = get_imputation_maps(X_train, target_col='power_ps')

    
    # Mapping auf Trainings- und Testdaten anwenden
    X_train = apply_imputation(X_train, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_test = apply_imputation(X_test, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_train = apply_imputation(X_train, target_col='power_ps', maps=ps_maps)
    X_test = apply_imputation(X_test, target_col='power_ps', maps=ps_maps)


    print("Fehlende Werte nach Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)


    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline inklusive PCA
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.9)),
        ('model', model)
    ])

    param_grid = {
        'model__n_neighbors': [100, 200],
        'model__alpha': [0.1, 1.0, 10.0]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\nBestes Modell:")
    print(grid_search.best_estimator_)
    print("\nBeste Hyperparameter:")
    print(grid_search.best_params_)
    print("\nBestes CV-Ergebnis (neg RMSE):")
    print(grid_search.best_score_)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Funktion starten
best_model = main_local_ridge_regression_with_gridsearch_3()


Fehlende Werte vor Imputation:
fuel_consumption:
12967 in X_train
3233 in X_test
power_ps:
61 in X_train
11 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
88 in X_train
26 in X_test
power_ps:
1 in X_train
0 in X_test
------------------------------
Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

Bestes Modell:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                         



In [7]:
# Version 3.4 - Local Ridge Regression: Grid Search + Label + Standard + Log Transformation

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch():
   # Preprocessing
    df = preprocessing_pipeline_impute(path = 'data.csv')
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    print("Fehlende Werte vor Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)
    
    # Mapping aus dem Trainingsset lernen
    fuel_maps = get_imputation_maps(X_train, target_col='fuel_consumption_l_100km')
    ps_maps = get_imputation_maps(X_train, target_col='power_ps')

    
    # Mapping auf Trainings- und Testdaten anwenden
    X_train = apply_imputation(X_train, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_test = apply_imputation(X_test, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_train = apply_imputation(X_train, target_col='power_ps', maps=ps_maps)
    X_test = apply_imputation(X_test, target_col='power_ps', maps=ps_maps)


    print("Fehlende Werte nach Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)

    # Log-Transformation der Zielvariable
    y_train_log = np.log1p(y_train)  # log(1 + y)
    y_test_log = np.log1p(y_test)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [100, 200],
        'model__alpha': [0.1, 1.0, 10.0]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train_log)  # <-- Trainiere auf log(y)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    best_model = grid_search.best_estimator_

    # Bewertung
    y_pred_log = best_model.predict(X_test)  # Vorhersage im Log-Raum
    y_pred = np.expm1(y_pred_log)  # Rücktransformation zu echtem Preis
    y_test_real = np.expm1(y_test_log)

    mse = mean_squared_error(y_test_real, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test_real, y_pred)
    r2 = r2_score(y_test_real, y_pred)

    print(f"\nTest MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch()


Fehlende Werte vor Imputation:
fuel_consumption:
12967 in X_train
3233 in X_test
power_ps:
61 in X_train
11 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
88 in X_train
26 in X_test
power_ps:
1 in X_train
0 in X_test
------------------------------
Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

 Bestes n_neighbors und alpha: {'model__alpha': 1.0, 'model__n_neighbors': 100}

Test MSE: 123675406.54
Test RMSE: 11120.94
Test MAE: 3264.85
Test R²: 0.89




In [None]:
#RFE Funktion

def run_rfe_on_preprocessed(df, target_col, preprocessor):
    """
    Führt RFE auf einem DataFrame mit Zielspalte durch.
    Erwartet, dass der Preprocessor bereits korrekt definiert wurde.

    Ergebnis:
    pd.DataFrame: Feature-reduziertes DataFrame
    list: Liste der ausgewählten Features
    """
    from sklearn.linear_model import Ridge
    from sklearn.feature_selection import RFECV
    import pandas as pd

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Preprocessing anwenden
    X_proc = preprocessor.fit_transform(X)
    feature_names = preprocessor.get_feature_names_out()

    # RFE mit Ridge
    selector = RFECV(estimator=Ridge(), cv=3, scoring='neg_root_mean_squared_error')
    selector.fit(X_proc, y)

    selected_features = feature_names[selector.support_]
    removed_features = feature_names[~selector.support_]

    X_reduced = selector.transform(X_proc)
    df_reduced = pd.DataFrame(X_reduced, columns=selected_features)
    df_reduced[target_col] = y.reset_index(drop=True)

    print(f"RFE ausgewählt: {len(selected_features)} Features")
    print("Behaltene Features:")
    print(selected_features)

    print("Entfernte Features:")
    print(removed_features)

    return df_reduced, selected_features


In [None]:
# Version 4.1. - Version 3.2. + RFE

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch():
   # Preprocessing
    df = preprocessing_pipeline_impute(path = 'data.csv')
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    print("Fehlende Werte vor Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)
    
    # Mapping aus dem Trainingsset lernen
    fuel_maps = get_imputation_maps(X_train, target_col='fuel_consumption_l_100km')
    ps_maps = get_imputation_maps(X_train, target_col='power_ps')

    
    # Mapping auf Trainings- und Testdaten anwenden
    X_train = apply_imputation(X_train, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_test = apply_imputation(X_test, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_train = apply_imputation(X_train, target_col='power_ps', maps=ps_maps)
    X_test = apply_imputation(X_test, target_col='power_ps', maps=ps_maps)


    print("Fehlende Werte nach Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('label_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # RFE vorbereiten
    X_train_df = X_train.copy()
    X_train_df['target'] = y_train
    df = run_rfe_on_preprocessed(X_train_df, 'target', preprocessor)

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [100],
        'model__alpha': [0.1]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    best_model = grid_search.best_estimator_

    # Bewertung
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\nTest MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch()


Fehlende Werte vor Imputation:
fuel_consumption:
12967 in X_train
3233 in X_test
power_ps:
61 in X_train
11 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
88 in X_train
26 in X_test
power_ps:
1 in X_train
0 in X_test
------------------------------
✅ RFE ausgewählt: 9 Features

🟢 Behaltene Features:
['num__year' 'num__power_ps' 'num__fuel_consumption_l_100km'
 'num__mileage_in_km' 'cat__brand' 'cat__model' 'cat__color'
 'cat__transmission_type' 'cat__fuel_type']

🔴 Entfernte Features:
[]
Starte Grid Search...
Fitting 3 folds for each of 1 candidates, totalling 3 fits

 Bestes n_neighbors und alpha: {'model__alpha': 0.1, 'model__n_neighbors': 100}

Test MSE: 129057839.81
Test RMSE: 11360.36
Test MAE: 3714.54
Test R²: 0.88




In [3]:
# Version 4.2 - Local Ridge Regression: Best Combo from 3 -> 3.2 + Offer Description

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch_2():
    # Preprocessing
    df = preprocessing_pipeline_impute(path = 'data.csv')
    df = preprocessing_pipeline_offerdesc(df)
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    print("Fehlende Werte vor Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)
    
    # Mapping aus dem Trainingsset lernen
    fuel_maps = get_imputation_maps(X_train, target_col='fuel_consumption_l_100km')
    ps_maps = get_imputation_maps(X_train, target_col='power_ps')

    
    # Mapping auf Trainings- und Testdaten anwenden
    X_train = apply_imputation(X_train, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_test = apply_imputation(X_test, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_train = apply_imputation(X_train, target_col='power_ps', maps=ps_maps)
    X_test = apply_imputation(X_test, target_col='power_ps', maps=ps_maps)


    print("Fehlende Werte nach Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [300],
        'model__alpha': [1]  # Ridge Regularisierung (klein/mittel/stark)
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewertung
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch_2()


Fehlende Werte vor Imputation:
fuel_consumption:
12967 in X_train
3233 in X_test
power_ps:
61 in X_train
11 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
88 in X_train
26 in X_test
power_ps:
1 in X_train
0 in X_test
------------------------------
Starte Grid Search...
Fitting 3 folds for each of 1 candidates, totalling 3 fits

 Bestes n_neighbors und alpha: {'model__alpha': 1, 'model__n_neighbors': 300}

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year', 'power_ps',
        

found 0 physical cores < 1
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Test MSE: 767821508.42
Test RMSE: 27709.59
Test MAE: 16833.36
Test R²: 0.29




In [11]:
df = preprocessing_pipeline_impute(path = 'data.csv')
df = preprocessing_pipeline_offerdesc(df)
df.head()

Unnamed: 0,brand,model,color,year,price_in_euro,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,mileage_in_km,...,has_heated_seats,has_leather_interior,has_navigation,has_panoramic_roof,has_sport_features,has_trailer_hitch,has_tuev,is_cabrio,is_combi,is_coupe
0,alfa-romeo,Alfa Romeo GTV,red,1995,1300,201.0,Manual,Petrol,10.9,160500.0,...,False,False,False,False,False,False,False,False,False,False
1,alfa-romeo,Alfa Romeo 164,black,1995,24900,260.0,Manual,Petrol,,190000.0,...,False,False,False,False,False,False,False,False,False,False
2,alfa-romeo,Alfa Romeo Spider,black,1995,5900,150.0,Unknown,Petrol,,129000.0,...,False,False,False,False,False,False,False,False,False,False
3,alfa-romeo,Alfa Romeo Spider,black,1995,4900,150.0,Manual,Petrol,9.5,189500.0,...,False,False,False,False,False,False,False,False,False,False
4,alfa-romeo,Alfa Romeo 164,red,1996,17950,179.0,Manual,Petrol,7.2,96127.0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Version 4.3 - Local Ridge Regression: Version 3.2 + Offer Description + RFE (Label Encoding)

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch():
   # Preprocessing
    df = preprocessing_pipeline_impute(path = 'data.csv')
    df = preprocessing_pipeline_offerdesc(df)
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    print("Fehlende Werte vor Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)
    
    # Mapping aus dem Trainingsset lernen
    fuel_maps = get_imputation_maps(X_train, target_col='fuel_consumption_l_100km')
    ps_maps = get_imputation_maps(X_train, target_col='power_ps')

    
    # Mapping auf Trainings- und Testdaten anwenden
    X_train = apply_imputation(X_train, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_test = apply_imputation(X_test, target_col='fuel_consumption_l_100km', maps=fuel_maps)
    X_train = apply_imputation(X_train, target_col='power_ps', maps=ps_maps)
    X_test = apply_imputation(X_test, target_col='power_ps', maps=ps_maps)


    print("Fehlende Werte nach Imputation:")
    print('fuel_consumption:')
    print(X_train['fuel_consumption_l_100km'].isna().sum(), "in X_train")
    print(X_test['fuel_consumption_l_100km'].isna().sum(), "in X_test")
    print('power_ps:')
    print(X_train['power_ps'].isna().sum(), "in X_train")
    print(X_test['power_ps'].isna().sum(), "in X_test")
    print('-'*30)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('label_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # RFE vorbereiten
    X_train_df = X_train.copy()
    X_train_df['target'] = y_train
    df = run_rfe_on_preprocessed(X_train_df, 'target', preprocessor)

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [100],
        'model__alpha': [0.1]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    best_model = grid_search.best_estimator_

    # Bewertung
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\nTest MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch()

   

Fehlende Werte vor Imputation:
fuel_consumption:
12967 in X_train
3233 in X_test
power_ps:
61 in X_train
11 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
88 in X_train
26 in X_test
power_ps:
1 in X_train
0 in X_test
------------------------------
✅ RFE ausgewählt: 28 Features

🟢 Behaltene Features:
['num__year' 'num__power_ps' 'num__fuel_consumption_l_100km'
 'num__mileage_in_km' 'num__has_acc' 'num__has_additional_motortechnology'
 'num__has_all_wheel_drive' 'num__has_assistence_systems'
 'num__has_dab_radio' 'num__has_diesel_particel_feature'
 'num__has_doubleclutch_transmission' 'num__has_heated_seats'
 'num__has_leather_interior' 'num__has_navigation'
 'num__has_panoramic_roof' 'num__has_sport_features'
 'num__has_trailer_hitch' 'num__has_tuev' 'num__is_cabrio' 'num__is_combi'
 'num__is_coupe' 'cat__brand' 'cat__model' 'cat__color'
 'cat__transmission_type' 'cat__fuel_type' 'cat__offer_description'
 'cat__offer_description_cleaned']

🔴 E



In [None]:
# Version 4.4 - Local Ridge Regression: Version 3.2 + Segments

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch_2():
    # Preprocessing
    df = preprocessing_pipeline_impute(path = 'data.csv')
    df = preprocessing_pipeline_segment(df)
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [100],
        'model__alpha': [0.1]  # Ridge Regularisierung (klein/mittel/stark)
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewertung
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch_2()


Fehlende Werte vor Imputation:
fuel_consumption:
12967 in X_train
3233 in X_test
power_ps:
61 in X_train
11 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
88 in X_train
26 in X_test
power_ps:
1 in X_train
0 in X_test
------------------------------
Starte Grid Search...
Fitting 3 folds for each of 1 candidates, totalling 3 fits

 Bestes n_neighbors und alpha: {'model__alpha': 0.1, 'model__n_neighbors': 100}

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year', 'power_ps',
      



In [4]:
# Version 4.4.1 - Evaluierung nach Segmenten

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]
            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Funktion zur Bewertung des Modells nach einzelnen Segmenten
def evaluate_by_segment(X_test, y_test, y_pred, segments):
    segment_results = []
    for segment in segments.unique():
        mask = segments == segment
        mse = mean_squared_error(y_test[mask], y_pred[mask])
        rmse = mse ** 0.5
        mae = mean_absolute_error(y_test[mask], y_pred[mask])
        r2 = r2_score(y_test[mask], y_pred[mask])
        segment_results.append({
            'segment': segment,
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'R2': r2
        })
    return pd.DataFrame(segment_results)

def main_local_ridge_regression_with_gridsearch_segments():
    df = preprocessing_pipeline_impute(path='data.csv')
    df = preprocessing_pipeline_segment(df)
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    # Transformer
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline und GridSearch
    model = LocalRidgeRegressor()
    full_pipeline = Pipeline([
        ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
        ('imp_ps', ContextImputer('power_ps')),
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    param_grid = {
        'model__n_neighbors': [100],
        'model__alpha': [0.1]
    }
    grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                               scoring='neg_root_mean_squared_error',
                               n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)

    # Segmentbewertung
    segment_results = evaluate_by_segment(X_test, y_test, y_pred, X_test['segment'])

    print("Segmentweise Modellbewertung:")
    print(segment_results)

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch_segments()


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Segmentweise Modellbewertung:
              segment           MSE           RMSE           MAE        R2
0                 Van  6.069469e+07    7790.679959   3795.135486  0.780824
1       Kompaktklasse  4.745657e+07    6888.872806   2125.211073  0.656281
2                 SUV  8.347351e+07    9136.383769   4033.842748  0.895114
3          Kleinwagen  9.152619e+06    3025.329623   1666.242241  0.844893
4          Sportwagen  4.517932e+08   21255.428223  10648.432927  0.886246
5        Mittelklasse  2.931806e+07    5414.615795   2907.484854  0.852424
6  Obere Mittelklasse  8.816683e+07    9389.719335   4524.039191  0.767138
7         Luxusklasse  4.380596e+08   20929.872529  13233.819486  0.904198
8     Supersportwagen  1.091026e+10  104452.192198  75953.586472  0.754539




In [None]:
# Version 4.5. - Local Ridge Regression: Best Combo from Versions 3 + Offer Description + Segments

# Macht tatsächlich irgendwie keinen Sinn, weil Offer_Description die Vorhersagen so schlecht macht