In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression


import sys
import os

import sys
import os
sys.path.append(os.path.abspath("../../.."))

from Preprocessing.imputation import get_imputation_maps, apply_imputation, ContextImputer

from Preprocessing.split_new import split_data
from utils.eval_call import evaluate_model

In [5]:
def main():


    X_train, X_test, y_train, y_test, categorical_features , numeric_features = split_data('../../../data.csv')

    # Preprocessing-Pipelines erstellen
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    transformed_model = TransformedTargetRegressor(
        regressor=LinearRegression(),
        func=np.log1p,
        inverse_func=np.expm1
    )


    linear_regression_pipeline = Pipeline(steps=[
        ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
        ('imp_ps', ContextImputer('power_ps')),
        ('imp_er', ContextImputer('electric_range')),
        ('preprocessor', preprocessor),
        ('model', transformed_model)
    ])

    # Modell trainieren
    linear_regression_pipeline.fit(X_train, y_train)

    # Vorhersagen treffen
    y_pred_lr = linear_regression_pipeline.predict(X_test)



    evaluate_model(y_test, y_pred_lr, "Linear Regression")

        # Modell trainieren
    linear_regression_pipeline.fit(X_train, y_train)

    # Vorhersagen treffen
    y_pred_lr = linear_regression_pipeline.predict(X_test)

    # Modell evaluieren (gesamt)
    evaluate_model(y_test, y_pred_lr, "Linear Regression")

    # --------------------------------------------
    # üîç Fehleranalyse nach fuel_type
    # --------------------------------------------

    # Pr√ºfe, ob fuel_type in X_test enthalten ist
    if 'fuel_type' in X_test.columns:
        results_df = pd.DataFrame({
            'fuel_type': X_test['fuel_type'].values,
            'y_true': y_test.values,
            'y_pred': y_pred_lr
        })

        from sklearn.metrics import mean_absolute_error, mean_squared_error

        print("\nFehleranalyse nach Fuel Type:")
        for fuel in results_df['fuel_type'].unique():
            subset = results_df[results_df['fuel_type'] == fuel]
            mae = mean_absolute_error(subset['y_true'], subset['y_pred'])
            print(f"{fuel:10s} ‚Äì MAE: {mae:.2f}, N = {len(subset)}")
    else:
        print("\n‚ö†Ô∏è 'fuel_type' nicht in X_test enthalten ‚Äì Fuelanalyse nicht m√∂glich.")


if __name__ == "__main__":
    main()

Linear Regression Performance Metrics:
MAE: 3813.86
MSE: 300838466.66
RMSE: 17344.70
R¬≤: 0.77
------------------------------
Linear Regression Performance Metrics:
MAE: 3813.86
MSE: 300838466.66
RMSE: 17344.70
R¬≤: 0.77
------------------------------

Fehleranalyse nach Fuel Type:
Diesel     ‚Äì MAE: 3384.80, N = 17238
Petrol     ‚Äì MAE: 3828.41, N = 28698
Electric   ‚Äì MAE: 5028.12, N = 1182
Hybrid     ‚Äì MAE: 6069.75, N = 2509
Hydrogen   ‚Äì MAE: 7883.07, N = 19
LPG        ‚Äì MAE: 2768.04, N = 240
CNG        ‚Äì MAE: 2276.79, N = 119
Other      ‚Äì MAE: 4312.49, N = 40
Diesel Hybrid ‚Äì MAE: 5897.03, N = 90
Unknown    ‚Äì MAE: 6412.84, N = 15
Ethanol    ‚Äì MAE: 549.75, N = 2
