In [14]:

from Preprocessing.split import split_data
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn.base import BaseEstimator, RegressorMixin



In [15]:

# Version 1 Local Linear Regression mit Grid Search

class LocalLinearRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10):
        self.n_neighbors = n_neighbors
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)  # Falls X ein Numpy-Array ist
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = LinearRegression()
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Eigentliche Funktion
def main_local_linear_regression_with_gridsearch():
    # Preprocessing 
    df = preprocessing_pipeline()  
    # Nimm ersten 10.000 Zeilen zufällig
    df = df.sample(n=10000, random_state=42).reset_index(drop=True)
    # Train - Test Split
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df) 


    numeric_transformer = Pipeline(steps=[
        # NaN Werte
        ('imputer', SimpleImputer(strategy='median')),
        # Alle anderen Werte
        ('scaler', RobustScaler())
    ])

    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalLinearRegressor()  

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [3,30]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors:", grid_search.best_params_['model__n_neighbors'])

    # Printe alle Ergebnisse

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewerte
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Berechnungen
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Ausgabe
    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model



In [16]:
best_model = main_local_linear_regression_with_gridsearch()


Starte Grid Search...
Fitting 3 folds for each of 2 candidates, totalling 6 fits

 Bestes n_neighbors: 3

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'fuel_consumption_g_km',
                                                   'mileage_in_km']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
     



In [20]:
# Version 1.2 Local Linear Regression mit Grid Search und PCA

from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

class LocalLinearRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10):
        self.n_neighbors = n_neighbors
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)  # Falls X ein Numpy-Array ist
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = LinearRegression()
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Eigentliche Funktion
def main_local_linear_regression_with_gridsearch_pca():
    # Preprocessing 
    df = preprocessing_pipeline()  
    # Nimm ersten 10.000 Zeilen zufällig
    df = df.sample(n=10000, random_state=42).reset_index(drop=True)
    # Train - Test Split
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df) 

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline inklusive PCA
    model = LocalLinearRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.9)),  # Hier wird PCA Schritt eingefügt
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [3, 30]
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors:", grid_search.best_params_['model__n_neighbors'])

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewertung
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_linear_regression_with_gridsearch_pca()


Starte Grid Search...
Fitting 3 folds for each of 2 candidates, totalling 6 fits

 Bestes n_neighbors: 30

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'fuel_consumption_g_km',
                                                   'mileage_in_km']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
    



In [None]:
# Version 1.3. Local Ridge Regression mit Grid Search 

from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np

# Local Ridge Regression mit Grid Search
class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch():
    # Preprocessing
    df = preprocessing_pipeline()
    df = df.sample(n=10000, random_state=42).reset_index(drop=True)
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [30, 40],
        'model__alpha': [0.1, 1.0, 10.0]  # Ridge Regularisierung (klein/mittel/stark)
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewertung
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch()


Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

 Bestes n_neighbors und alpha: {'model__alpha': 10.0, 'model__n_neighbors': 40}

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'fuel_consumption_g_km',
                                                   'mileage_in_km']),
                                                 ('cat',
                       



In [24]:
# Version 1.4. Local Ridge Regression mit Grid Search & Log Transformation d. Targets

from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np

# Local Ridge Regression
class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_log_target():
    # Preprocessing
    df = preprocessing_pipeline()
    df = df.sample(n=10000, random_state=42).reset_index(drop=True)
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    # Transformer Pipelines
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Gesamte Pipeline (ohne PCA!)
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [15, 30],
        'model__alpha': [0.1, 1.0, 10.0]
    }

    # Log-Transformation des Targets
    y_train_log = np.log1p(y_train)

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train_log)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewertung auf Testdaten
    best_model = grid_search.best_estimator_
    y_pred_log = best_model.predict(X_test)
    y_pred = np.expm1(y_pred_log)  # Rücktransformation

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_log_target()


Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

 Bestes n_neighbors und alpha: {'model__alpha': 1.0, 'model__n_neighbors': 30}

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'fuel_consumption_g_km',
                                                   'mileage_in_km']),
                                                 ('cat',
                          



In [25]:
# Version 1.5. Local Ridge Regression mit Grid Search (Label Encoding)

from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np

# Local Ridge Regression mit Grid Search
class LocalRidgeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=10, alpha=1.0):
        self.n_neighbors = n_neighbors
        self.alpha = alpha
        self.nn = None
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = pd.DataFrame(X)
        self.y_train = pd.Series(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train)
        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        predictions = []
        for x in X.values:
            distances, indices = self.nn.kneighbors([x])
            X_neighbors = self.X_train.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            model = Ridge(alpha=self.alpha)
            model.fit(X_neighbors, y_neighbors)
            pred = model.predict([x])
            predictions.append(pred[0])
        return np.array(predictions)

# Hauptfunktion
def main_local_ridge_regression_with_gridsearch_label():
    # Preprocessing
    df = preprocessing_pipeline()
    df = df.sample(n=10000, random_state=42).reset_index(drop=True)
    X_train, X_test, y_train, y_test, X, y, categorical_features, numeric_features = split_data(df)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('label', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Pipeline
    model = LocalRidgeRegressor()

    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Gridsearch
    param_grid = {
        'model__n_neighbors': [30, 40],
        'model__alpha': [0.1, 1.0, 10.0]  # Ridge Regularisierung (klein/mittel/stark)
    }

    grid_search = GridSearchCV(
        full_pipeline,
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )

    print("Starte Grid Search...")
    grid_search.fit(X_train, y_train)

    print("\n Bestes n_neighbors und alpha:", grid_search.best_params_)

    print("\n Bestes Modell (beste Pipeline):")
    print(grid_search.best_estimator_)

    print("\n Beste Hyperparameter:")
    print(grid_search.best_params_)

    print("\n Bestes Cross-Validation Ergebnis (neg_root_mean_squared_error):")
    print(grid_search.best_score_)

    # Bewertung
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {mse:.2f}")
    print(f"Test RMSE: {rmse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.2f}")

    return best_model

# Aufruf der Funktion
best_model = main_local_ridge_regression_with_gridsearch_label()


Starte Grid Search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits

 Bestes n_neighbors und alpha: {'model__alpha': 10.0, 'model__n_neighbors': 30}

 Bestes Modell (beste Pipeline):
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['power_ps',
                                                   'fuel_consumption_l_100km',
                                                   'fuel_consumption_g_km',
                                                   'mileage_in_km']),
                                                 ('cat',
                       



In [None]:
# Version 2 Local Linear Regression ohne Grid Search

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
import joblib

# Hier versuche ich, für KNN nur die numerischen Features zu verwenden
# Als Versuch, dass mein RMSE besser wird 

class LocalLinearRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_neighbors=30):
        self.n_neighbors = n_neighbors
        self.nn = None
        self.X_train_full = None
        self.X_train_numeric = None
        self.y_train = None
        self.numeric_features = None

    def fit(self, X, y):
        X = pd.DataFrame(X)
        self.X_train_full = X.copy()
        self.y_train = pd.Series(y)

        # Nur numerische Spalten für NearestNeighbors
        self.numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
        X_numeric = X[self.numeric_features]

        self.X_train_numeric = X_numeric.copy()

        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(self.X_train_numeric)

        return self

    def predict(self, X):
        X = pd.DataFrame(X)
        X_numeric = X[self.numeric_features]

        predictions = []
        for idx, x_num in enumerate(X_numeric.values):
            distances, indices = self.nn.kneighbors([x_num])
            X_neighbors = self.X_train_full.iloc[indices[0]]
            y_neighbors = self.y_train.iloc[indices[0]]

            # Falls noch ein NaN gefunden wird, soll der Mean verwendet werdem
            if np.isnan(X_neighbors.values).any() or np.isinf(X_neighbors.values).any():
                pred = np.mean(y_neighbors)
            else:
                model = LinearRegression()
                model.fit(X_neighbors, y_neighbors)
                pred = model.predict([self.X_train_full.iloc[idx].values])[0]

            predictions.append(pred)

        return np.array(predictions)

# Eigentliche Funktion

def train_final_local_linear_regression_with_target_transform():
    # Preprocessing
    df = preprocessing_pipeline() 
    # Train - Test Split
    X_train, X_test, y_train, y_test, X, y, _, _ = split_data(df)  

    # Nur Stichprobe fürs Training verwenden
    X_train = X_train.sample(n=10000, random_state=42)
    y_train = y_train.loc[X_train.index]

    X_test = X_test.sample(n=2000, random_state=42)
    y_test = y_test.loc[X_test.index]

    # Auflistung Features
    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
    print("\n Erkannte numerische Features:", numeric_features)
    print("\n Erkannte kategorische Features:", categorical_features)
   
    # Encoding
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    if np.isnan(X_train_transformed).any() or np.isinf(X_train_transformed).any():
        print("Bereinige...")
        X_train_transformed = np.nan_to_num(X_train_transformed, nan=0.0, posinf=1e10, neginf=-1e10)

    if np.isnan(X_test_transformed).any() or np.isinf(X_test_transformed).any():
        print("Bereinige...")
        X_test_transformed = np.nan_to_num(X_test_transformed, nan=0.0, posinf=1e10, neginf=-1e10)

    # Modell + log transformation auf ziel variable
    # Hier mit k = 30

    local_model = LocalLinearRegressor(n_neighbors=30)

    model_with_target_transform = TransformedTargetRegressor(
        regressor=local_model,
        func=np.log1p,
        inverse_func=np.expm1
    )

    # Pipeline
    full_pipeline = Pipeline(steps=[
        ('model', model_with_target_transform)
    ])

    # Trainieren des Modells
    print("Trainiere Local Linear Regression (k=30) mit log-transformiertem Target...")
    full_pipeline.fit(X_train_transformed, y_train)

    # Bewerten
    y_pred = full_pipeline.predict(X_test_transformed)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    print(f"Test RMSE auf 2.000 Testpunkten (mit log-Transformation): {rmse:.2f}")

    print("Wahre vs. Vorhergesagte Preise (erste 10 Beispiele):")
    for true_val, pred_val in zip(y_test[:10], y_pred[:10]):
        print(f"Echter Preis: {true_val:.2f} - Vorhergesagt: {pred_val:.2f}")

    return full_pipeline

# Speichern des Modells in final_model
final_model = train_final_local_linear_regression_with_target_transform()



 Erkannte numerische Features: ['power_ps', 'fuel_consumption_l_100km', 'fuel_consumption_g_km', 'mileage_in_km']

 Erkannte kategorische Features: ['brand', 'model', 'color', 'transmission_type', 'fuel_type']
Trainiere Local Linear Regression (k=30) mit log-transformiertem Target...


found 0 physical cores < 1
  File "c:\Users\Bibbe\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Test RMSE auf 2.000 Testpunkten (mit log-Transformation): 424075979668909194233466181441700476702567471567881568360857600.00
Wahre vs. Vorhergesagte Preise (erste 10 Beispiele):
Echter Preis: 7499.00 - Vorhergesagt: 118.91
Echter Preis: 4299.00 - Vorhergesagt: 51600.29
Echter Preis: 2000.00 - Vorhergesagt: 224776.75
Echter Preis: 28440.00 - Vorhergesagt: 26139.31
Echter Preis: 15988.00 - Vorhergesagt: 32869.37
Echter Preis: 18890.00 - Vorhergesagt: 59579.85
Echter Preis: 98500.00 - Vorhergesagt: 21290.45
Echter Preis: 11500.00 - Vorhergesagt: 26218.24
Echter Preis: 25890.00 - Vorhergesagt: 129022.62
Echter Preis: 16950.00 - Vorhergesagt: 16955.66
