In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor


import sys
import os

import sys
import os
sys.path.append(os.path.abspath("../../.."))

from utils.scatter_plot import scatter_prediction
from utils.eval_call import evaluate_model

from Preprocessing.imputation import get_imputation_maps, apply_imputation, ContextImputer

from Preprocessing.split_new import split_data
from utils.eval_call import evaluate_model

In [2]:
from sklearn.model_selection import KFold, cross_validate
import numpy as np

# Load the train/test partitions plus the two feature-name lists returned by
# the project helper split_data.
X_train, X_test, y_train, y_test, categorical_features, numeric_features = split_data(
    '../../../data.csv'
)

# Build the preprocessing pipelines.
# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: three ContextImputer steps, the column preprocessing, and a
# distance-weighted 4-nearest-neighbour regressor (p=2, all cores).
# NOTE(review): ContextImputer comes from Preprocessing.imputation; presumably
# it imputes the named column from related rows - confirm against that module.
knn_pipeline = Pipeline([
    ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
    ('imp_ps', ContextImputer('power_ps')),
    ('imp_er', ContextImputer('electric_range')),
    ('preprocessor', preprocessor),
    (
        'model',
        KNeighborsRegressor(
            n_neighbors=4,
            weights='distance',
            algorithm='auto',
            p=2,
            n_jobs=-1,
        ),
    ),
])

print("Performing k-fold cross-validation...")

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
k_folds = 5
cv = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# scikit-learn scorers follow "greater is better", hence the negated errors.
scoring = dict(
    mae='neg_mean_absolute_error',
    mse='neg_mean_squared_error',
    r2='r2',
)

cv_results = cross_validate(
    knn_pipeline,
    X_train,
    y_train,
    cv=cv,
    scoring=scoring,
    verbose=1,
)

# Flip the sign of the negated error scores back to positive error values.
mae_scores, mse_scores = (
    -cv_results[result_key] for result_key in ('test_mae', 'test_mse')
)
# RMSE is taken per fold, so the reported RMSE is the mean of the fold RMSEs.
rmse_scores = np.sqrt(mse_scores)
r2_scores = cv_results['test_r2']

print(
    f"Cross-validation MAE scores: {mae_scores}",
    f"Mean MAE: {mae_scores.mean():.2f}, Std: {mae_scores.std():.2f}",
    sep="\n",
)

print(
    "\nDetailed metrics:",
    f"MAE: {mae_scores.mean():.2f}",
    f"MSE: {mse_scores.mean():.2f}",
    f"RMSE: {rmse_scores.mean():.2f}",
    f"R²: {r2_scores.mean():.4f}",
    sep="\n",
)




Performing k-fold cross-validation...
Cross-validation MAE scores: [3990.82985808 3954.9337927  4088.17098563 3861.14577895 4020.4662196 ]
Mean MAE: 3983.11, Std: 75.05

Detailed metrics:
MAE: 3983.11
MSE: 233972760.34
RMSE: 15149.31
R²: 0.8116
