In [1]:
# Local Ridge Regression

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder, TargetEncoder
from Preprocessing.imputation import get_imputation_maps, apply_imputation, ContextImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
import seaborn as sns

# Eigene Module
from Preprocessing.split_new import split_data
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline
from Preprocessing.preprocessing_pipeline_offerdesc import preprocessing_pipeline_offerdesc
from Preprocessing.preprocessing_pipeline_impute_fuel_types_no_drop import preprocessing_pipeline
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment


In [None]:
# Version 3.2.ND - Local Ridge Regression: Grid Search + Label + Standard + Imputation, k = 100 / 200

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    """Locally fitted ridge regression.

    For every sample to predict, the nearest training samples are looked up
    with ``NearestNeighbors`` and a fresh ``Ridge`` model is fitted on just
    those neighbours; that local model then predicts the sample.

    Parameters
    ----------
    n_neighbors : int, default=10
        Number of training samples used for each local fit. Capped at the
        number of training samples during ``fit``.
    alpha : float, default=1.0
        Regularisation strength passed to every local ``Ridge`` model.
    """

    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        # Fitted state, populated by fit().
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training data and build the neighbour index.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have a different number of rows.
        """
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        if len(self.X_train) != len(self.y_train):
            raise ValueError(
                f"X and y have inconsistent lengths: "
                f"{len(self.X_train)} != {len(self.y_train)}"
            )

        # A local model cannot use more neighbours than there are training
        # samples; cap instead of failing later inside kneighbors().
        effective_k = min(self.n_neighbors, len(self.X_train))
        self.nn = NearestNeighbors(n_neighbors=effective_k)
        # Fit on the raw values so that querying with plain arrays never
        # triggers feature-name mismatch warnings.
        self.nn.fit(self.X_train.to_numpy())
        return self

    def predict(self, X):
        """Predict each row of ``X`` with its own locally fitted Ridge model.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
        """
        X_query = pd.DataFrame(X).to_numpy()
        if len(X_query) == 0:
            return np.array([])

        # Positional arrays: the neighbour indices are row positions.
        X_fit = self.X_train.to_numpy()
        y_fit = self.y_train.to_numpy()

        # One batched neighbour query for all rows instead of one query per
        # row; only the indices are needed, so distances are not computed
        # into a result array.
        neighbor_idx = self.nn.kneighbors(X_query, return_distance=False)

        predictions = np.empty(len(X_query), dtype=float)
        for row, (x, idx) in enumerate(zip(X_query, neighbor_idx)):
            local_model = Ridge(alpha=self.alpha)
            local_model.fit(X_fit[idx], y_fit[idx])
            predictions[row] = local_model.predict(x.reshape(1, -1))[0]
        return predictions

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch_2():
    """Grid-search a LocalRidgeRegressor pipeline and report test metrics.

    Loads the train/test split via ``split_data()``, builds a pipeline
    (context imputation -> column-wise preprocessing -> local ridge model),
    tunes it with a 3-fold ``GridSearchCV`` scored by negative RMSE, prints
    the search results and the test-set MSE/RMSE/MAE/R², and finally prints
    the same metrics separately for every value of ``X_test['fuel_type']``.

    Returns
    -------
    The best pipeline found by the grid search (``best_estimator_``).
    """
    # Preprocessing
    X_train, X_test, y_train, y_test, categorical_features, numeric_features = split_data()

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
        ('imp_ps', ContextImputer('power_ps')),
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Grid search
    param_grid = {
        'model__n_neighbors': [50],
        'model__alpha': [0.1, 1.0, 10.0]  # ridge regularisation (weak/medium/strong)
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Evaluation on the held-out test set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    print("\n--- Evaluation nach fuel_type ---")
    # Work positionally: y_pred is positional with respect to X_test, and the
    # overall metrics above also pair y_test/y_pred by position. Dropping the
    # indices avoids silent NaNs from index re-alignment and problems with
    # duplicated index labels.
    fuel_types = X_test['fuel_type'].reset_index(drop=True)
    y_true_values = np.asarray(y_test)
    y_pred_values = np.asarray(y_pred)

    for fuel in fuel_types.unique():
        # NaN never compares equal to itself, so a missing fuel type needs an
        # explicit isna() mask; otherwise the group would be empty and the
        # metric functions would raise.
        if pd.isna(fuel):
            mask = fuel_types.isna().to_numpy()
        else:
            mask = (fuel_types == fuel).to_numpy(dtype=bool)

        y_true_fuel = y_true_values[mask]
        y_pred_fuel = y_pred_values[mask]

        mse_fuel = mean_squared_error(y_true_fuel, y_pred_fuel)
        rmse_fuel = mse_fuel ** 0.5
        mae_fuel = mean_absolute_error(y_true_fuel, y_pred_fuel)
        r2_fuel = r2_score(y_true_fuel, y_pred_fuel)

        print(f"\nFuel Type: {fuel}")
        print(f"  RMSE: {rmse_fuel:.2f}")
        print(f"  MAE:  {mae_fuel:.2f}")
        print(f"  R²:   {r2_fuel:.2f}")

    return best_model

# Run the grid search / evaluation and keep the returned best pipeline
best_model = main_local_ridge_regression_with_gridsearch_2()


Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

 Bestes n_neighbors und alpha: {'model__alpha': 1.0, 'model__n_neighbors': 50}

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('imp_fc',
                 ContextImputer(target_col='fuel_consumption_l_100km')),
                ('imp_ps', ContextImputer(target_col='power_ps')),
                ('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['year', 'power_ps',
                                                   'fuel_consumption_l_100km',
                                                   '



In [2]:
# Version 4.4.1.ND - Version 3.2 + Segmente + Evaluation nach Segmenten

class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    """Locally fitted ridge regression.

    For every sample to predict, the nearest training samples are looked up
    with ``NearestNeighbors`` and a fresh ``Ridge`` model is fitted on just
    those neighbours; that local model then predicts the sample.

    Parameters
    ----------
    n_neighbors : int, default=10
        Number of training samples used for each local fit. Capped at the
        number of training samples during ``fit``.
    alpha : float, default=1.0
        Regularisation strength passed to every local ``Ridge`` model.
    """

    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        # Fitted state, populated by fit().
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training data and build the neighbour index.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have a different number of rows.
        """
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        if len(self.X_train) != len(self.y_train):
            raise ValueError(
                f"X and y have inconsistent lengths: "
                f"{len(self.X_train)} != {len(self.y_train)}"
            )

        # A local model cannot use more neighbours than there are training
        # samples; cap instead of failing later inside kneighbors().
        effective_k = min(self.n_neighbors, len(self.X_train))
        self.nn = NearestNeighbors(n_neighbors=effective_k)
        # Fit on the raw values so that querying with plain arrays never
        # triggers feature-name mismatch warnings.
        self.nn.fit(self.X_train.to_numpy())
        return self

    def predict(self, X):
        """Predict each row of ``X`` with its own locally fitted Ridge model.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
        """
        X_query = pd.DataFrame(X).to_numpy()
        if len(X_query) == 0:
            return np.array([])

        # Positional arrays: the neighbour indices are row positions.
        X_fit = self.X_train.to_numpy()
        y_fit = self.y_train.to_numpy()

        # One batched neighbour query for all rows instead of one query per
        # row; only the indices are needed, so distances are not computed
        # into a result array.
        neighbor_idx = self.nn.kneighbors(X_query, return_distance=False)

        predictions = np.empty(len(X_query), dtype=float)
        for row, (x, idx) in enumerate(zip(X_query, neighbor_idx)):
            local_model = Ridge(alpha=self.alpha)
            local_model.fit(X_fit[idx], y_fit[idx])
            predictions[row] = local_model.predict(x.reshape(1, -1))[0]
        return predictions

# Funktion zur Bewertung des Modells nach einzelnen Segmenten
def evaluate_by_segment(X_test, y_test, y_pred, segments):
    """Compute MSE, RMSE, MAE and R² separately for every segment.

    All inputs are paired by position (row ``i`` of ``segments`` belongs to
    element ``i`` of ``y_test`` and ``y_pred``); any pandas index they carry
    is ignored, so mismatching indices cannot cause alignment errors.

    Parameters
    ----------
    X_test : any
        Not used by the computation; kept for interface compatibility.
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values.
    segments : array-like
        Segment label per sample. Missing labels form their own group.

    Returns
    -------
    pandas.DataFrame
        One row per distinct segment (in order of first appearance) with the
        columns ``segment``, ``MSE``, ``RMSE``, ``MAE`` and ``R2``.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    labels = pd.Series(np.asarray(segments))
    y_true_values = np.asarray(y_test)
    y_pred_values = np.asarray(y_pred)

    if not (len(labels) == len(y_true_values) == len(y_pred_values)):
        raise ValueError(
            "y_test, y_pred and segments must have the same length, got "
            f"{len(y_true_values)}, {len(y_pred_values)} and {len(labels)}"
        )

    segment_results = []
    for segment in labels.unique():
        # NaN never compares equal to itself; without the explicit isna()
        # mask a missing segment would select no rows and the metrics would
        # raise on empty input.
        if pd.isna(segment):
            mask = labels.isna().to_numpy()
        else:
            mask = (labels == segment).to_numpy(dtype=bool)

        y_true_seg = y_true_values[mask]
        y_pred_seg = y_pred_values[mask]

        mse = mean_squared_error(y_true_seg, y_pred_seg)
        segment_results.append({
            'segment': segment,
            'MSE': mse,
            'RMSE': mse ** 0.5,
            'MAE': mean_absolute_error(y_true_seg, y_pred_seg),
            'R2': r2_score(y_true_seg, y_pred_seg)
        })
    return pd.DataFrame(segment_results, columns=['segment', 'MSE', 'RMSE', 'MAE', 'R2'])

def main_local_ridge_regression_with_gridsearch_segments():
    """Grid-search a LocalRidgeRegressor pipeline and evaluate it per segment.

    Loads the train/test split via ``split_data()``, builds a pipeline
    (context imputation -> column-wise preprocessing -> local ridge model),
    tunes ``n_neighbors`` with a 3-fold ``GridSearchCV`` scored by negative
    RMSE, prints the test-set MSE/RMSE/MAE/R² and then the same metrics per
    value of ``X_test['segment']``.

    Returns
    -------
    The best pipeline found by the grid search (``best_estimator_``).
    """
    # Preprocessing
    X_train, X_test, y_train, y_test, categorical_features, numeric_features = split_data()

    # Transformers
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline and grid search
    model = LocalRidgeRegressor()
    full_pipeline = Pipeline([
        ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
        ('imp_ps', ContextImputer('power_ps')),
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    param_grid = {
        'model__n_neighbors': [50, 100],
        'model__alpha': [1.0]
    }
    grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                               scoring='neg_root_mean_squared_error',
                               n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Evaluation: predict exactly once. Every prediction fits one Ridge
    # model per test row, so a second identical predict() call would only
    # double the runtime.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    # Per-segment evaluation
    segment_results = evaluate_by_segment(X_test, y_test, y_pred, X_test['segment'])

    print("Segmentweise Modellbewertung:")
    print(segment_results)

    return best_model

# Run the grid search / segment evaluation and keep the returned best pipeline
best_model = main_local_ridge_regression_with_gridsearch_segments()


Fitting 3 folds for each of 2 candidates, totalling 6 fits




Test MSE: 951108682.69
Test RMSE: 30840.05
Test MAE: 3956.80
Test R²: 0.55
Segmentweise Modellbewertung:
              segment           MSE           RMSE            MAE        R2
0                 SUV  7.208928e+07    8490.540499    3961.415800  0.901574
1          Kleinwagen  1.460295e+07    3821.380139    1880.273265  0.809833
2       Kompaktklasse  4.584738e+07    6771.069051    2215.189855  0.678823
3                 Van  3.722763e+07    6101.445175    3550.543318  0.869072
4  Obere Mittelklasse  1.317454e+08   11478.040406    4564.835281  0.697212
5          Sportwagen  8.828313e+08   29712.477314   10903.449370  0.802874
6        Mittelklasse  3.510492e+07    5924.940814    2975.749909  0.848630
7         Luxusklasse  7.520931e+08   27424.316077   13563.840200  0.856024
8     Supersportwagen  2.686990e+11  518361.811394  147909.163954  0.100124


