In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import GridSearchCV

import sys
import os

import sys
import os
# Put the directory two levels above the current working directory on the
# import path so the project-local imports below (Preprocessing.*, eval_call)
# can be resolved. NOTE(review): "../.." is resolved against the working
# directory, not this file's location -- presumably the repository root when
# the notebook is launched from its own folder; confirm.
sys.path.append(os.path.abspath("../.."))

from Preprocessing.imputation import get_imputation_maps, apply_imputation, ContextImputer
from Preprocessing.preprocessing_pipeline_impute import preprocessing_pipeline
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment
from Preprocessing.split import split_data
from eval_call import evaluate_model

In [4]:
def _build_ridge_pipeline(numeric_features, categorical_features):
    """Assemble the unfitted imputation -> preprocessing -> Ridge pipeline.

    Parameters
    ----------
    numeric_features : list
        Column selectors routed to the median-impute + standard-scale branch.
    categorical_features : list
        Column selectors routed to the most-frequent-impute + one-hot branch.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps, in order: ``imp_fc``, ``imp_ps``, ``preprocessor``, ``model``.
        The step names are part of the contract because the grid-search
        parameter key ``model__regressor__alpha`` depends on them.
    """
    numeric_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen during fit are encoded as all-zero rows instead of
        # raising at predict time.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_branch, numeric_features),
            ('cat', categorical_branch, categorical_features)
        ]
    )

    # Fit Ridge on log1p(y) and map predictions back with expm1, so predictions
    # come out on the original target scale.
    # NOTE(review): log1p assumes the target is > -1 (presumably a
    # non-negative quantity) -- confirm against the data.
    log_target_ridge = TransformedTargetRegressor(
        regressor=Ridge(),
        func=np.log1p,
        inverse_func=np.expm1
    )

    return Pipeline(steps=[
        # Project-specific imputers for two named columns; their exact
        # strategy is defined in Preprocessing.imputation (not visible here).
        ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
        ('imp_ps', ContextImputer('power_ps')),
        ('preprocessor', preprocessor),
        ('model', log_target_ridge)
    ])


def main(data_path='../../data.csv', cv=5, alphas=None):
    """Tune a Ridge regressor with grid search and report test-set metrics.

    Loads the data through ``preprocessing_pipeline``, splits it with
    ``split_data``, grid-searches the Ridge ``alpha`` on the training split,
    then predicts on the test split and hands the predictions to
    ``evaluate_model``.

    Parameters
    ----------
    data_path : str, optional
        Path passed to ``preprocessing_pipeline``. Defaults to the location
        that was previously hard-coded, so ``main()`` behaves as before.
    cv : int, optional
        Number of cross-validation folds for the grid search (default 5).
    alphas : array-like, optional
        Candidate Ridge ``alpha`` values. Defaults to 20 log-spaced values
        between 1e-4 and 1e3.

    Returns
    -------
    None
        The best alpha is printed and the metrics are reported via
        ``evaluate_model``.
    """
    if alphas is None:
        alphas = np.logspace(-4, 3, 20)

    df = preprocessing_pipeline(data_path)
    # split_data also returns the unsplit X and y; they are not needed here.
    (X_train, X_test, y_train, y_test,
     _, _, categorical_features, numeric_features) = split_data(df)

    linear_regression_pipeline = _build_ridge_pipeline(
        numeric_features, categorical_features
    )

    param_grid = {
        'model__regressor__alpha': alphas
    }

    # Candidates are ranked by negative MSE, computed on the original target
    # scale because TransformedTargetRegressor inverts the log transform.
    grid_search = GridSearchCV(
        linear_regression_pipeline,
        param_grid,
        cv=cv,
        scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)

    # best_estimator_ is available because GridSearchCV refits by default.
    best_model = grid_search.best_estimator_
    y_pred_lr = best_model.predict(X_test)

    print("Bestes Alpha:", grid_search.best_params_['model__regressor__alpha'])
    evaluate_model(y_test, y_pred_lr, "Linear Regression (Ridge mit GridSearch)")

# Run the experiment only when this code executes as the top-level module.
# In a notebook kernel __name__ is "__main__", so running the cell triggers it;
# importing this code from another module does not.
if __name__ == "__main__":
    main()

Bestes Alpha: 1.1288378916846884
Linear Regression (Ridge mit GridSearch) Performance Metrics:
MAE: 3594.93
MSE: 141174545.25
RMSE: 11881.69
R²: 0.87
------------------------------
