In [44]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV


import gc  # Garbage Collector zur Speicherverwaltung
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment
from Preprocessing.split import split_data
from eval_call import evaluate_model


In [63]:
def main_transformed(model=None, use_gridsearch=False):
    """Fit a regressor on a log1p-transformed target and report test-set metrics.

    The data is produced by ``preprocessing_pipeline`` ->
    ``preprocessing_pipeline_segment`` -> ``split_data``. Numeric columns are
    median-imputed and standardised; categorical columns are imputed with the
    most frequent value and one-hot encoded. The regressor is wrapped in a
    ``TransformedTargetRegressor`` so it is trained on ``log1p(y)`` and its
    predictions are mapped back with ``expm1`` before they are evaluated.

    Parameters
    ----------
    model : estimator, optional
        Unfitted scikit-learn style regressor. Defaults to
        ``LinearRegression()`` so that ``main_transformed()`` can be called
        without arguments.
    use_gridsearch : bool, default False
        If True, tune the regressor's ``alpha`` with a 5-fold ``GridSearchCV``
        over 30 log-spaced values in [1e-3, 1e3] and evaluate the best
        estimator; otherwise fit the pipeline once with ``model`` as given.

    Returns
    -------
    None
        Results are reported through ``evaluate_model``.

    Raises
    ------
    ValueError
        If ``use_gridsearch`` is True and ``model`` has no ``alpha`` parameter.
    """
    if model is None:
        model = LinearRegression()

    # The evaluation label is derived from the estimator actually used, so
    # Ridge/Lasso runs are no longer reported as "Linear Regression".
    model_name = type(model).__name__

    # Fail fast, before the dataset is built: the grid below only tunes `alpha`.
    if use_gridsearch and 'alpha' not in model.get_params(deep=False):
        raise ValueError(
            f"{model_name} has no 'alpha' hyperparameter; "
            "use_gridsearch=True requires a regularised model such as Ridge or Lasso."
        )

    df = preprocessing_pipeline()
    df = preprocessing_pipeline_segment(df)
    # split_data also returns the unsplit X and y, which are not needed here.
    X_train, X_test, y_train, y_test, _X, _y, categorical_features, numeric_features = split_data(df)

    # Build the preprocessing pipelines.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Model + target transformation.
    # NOTE(review): log1p needs y > -1; presumably the target is a
    # non-negative price - confirm against split_data.
    transformed_model = TransformedTargetRegressor(
        regressor=model,
        func=np.log1p,
        inverse_func=np.expm1
    )

    # Assemble the full pipeline.
    linear_regression_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', transformed_model)
    ])

    if use_gridsearch:
        # Parameter path: pipeline step 'model' -> wrapped 'regressor' -> 'alpha'.
        param_grid = {
            'model__regressor__alpha': np.logspace(-3, 3, 30)
        }

        grid_search = GridSearchCV(
            linear_regression_pipeline,
            param_grid,
            cv=5,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            verbose=2
        )

        print("Starte Grid Search...")
        grid_search.fit(X_train, y_train)
        print("Bestes Alpha:", grid_search.best_params_)

        fitted_pipeline = grid_search.best_estimator_
        label = f"{model_name} mit GridSearch"
    else:
        # Plain training without a grid search.
        print("Trainiere Modell ohne GridSearch...")
        linear_regression_pipeline.fit(X_train, y_train)

        fitted_pipeline = linear_regression_pipeline
        label = f"{model_name} (ohne GridSearch)"

    y_pred = fitted_pipeline.predict(X_test)
    evaluate_model(y_test, y_pred, label)

In [13]:
def main():
    """Baseline run: fit a plain LinearRegression on the untransformed target.

    Builds the dataset with ``preprocessing_pipeline`` and
    ``preprocessing_pipeline_segment``, splits it with ``split_data``, trains a
    preprocessing + ``LinearRegression`` pipeline on the training split and
    passes the test-set predictions to ``evaluate_model``. Returns ``None``.
    """
    dataset = preprocessing_pipeline_segment(preprocessing_pipeline())
    # split_data also returns the unsplit X and y; they are not used here.
    (X_train, X_test, y_train, y_test,
     _X_all, _y_all, categorical_features, numeric_features) = split_data(dataset)

    # Numeric columns: median imputation followed by standardisation.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    # Categorical columns: most-frequent imputation followed by one-hot encoding.
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    column_preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numeric_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ])

    baseline_pipeline = Pipeline(steps=[
        ('preprocessor', column_preprocessor),
        ('model', LinearRegression()),
    ])

    # Train on the training split, then score the held-out predictions.
    baseline_pipeline.fit(X_train, y_train)
    test_predictions = baseline_pipeline.predict(X_test)
    evaluate_model(y_test, test_predictions, "Linear Regression")

In [31]:
# Compare training on a log1p-transformed target against the raw target.
# The estimator is passed explicitly so both runs use LinearRegression.
print('model performance price log: ')
main_transformed(LinearRegression())
print('model performance without price log: ')
main()

model performance price log: 
Linear Regression Performance Metrics:
MAE: 4510.89
MSE: 115380032.71
RMSE: 10741.51
R²: 0.77
------------------------------
model performance without price log: 
Linear Regression Performance Metrics:
MAE: 5082.45
MSE: 142297787.83
RMSE: 11928.86
R²: 0.71
------------------------------


In [66]:
# Evaluate each candidate regressor on the log-transformed target, one run per model.
for model in (
    LinearRegression(),
    Ridge(alpha=8.53),
    Lasso(alpha=0.001),
):
    print(f'Performance for Model:{model}, y scaled using log: \n')
    main_transformed(model)

Performance for Model:LinearRegression(), y scaled using log: 

Trainiere Modell ohne GridSearch...
Linear Regression (ohne GridSearch) Performance Metrics:
MAE: 4510.89
MSE: 115380032.71
RMSE: 10741.51
R²: 0.77
------------------------------
Performance for Model:Ridge(alpha=8.53), y scaled using log: 

Trainiere Modell ohne GridSearch...
Linear Regression (ohne GridSearch) Performance Metrics:
MAE: 4504.59
MSE: 115057203.77
RMSE: 10726.47
R²: 0.77
------------------------------
Performance for Model:Lasso(alpha=0.001), y scaled using log: 

Trainiere Modell ohne GridSearch...
Linear Regression (ohne GridSearch) Performance Metrics:
MAE: 5132.50
MSE: 137692102.77
RMSE: 11734.23
R²: 0.72
------------------------------


In [64]:
# Tune Ridge's alpha via the grid search built into main_transformed.
main_transformed(model=Ridge(), use_gridsearch=True)

Starte Grid Search...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END ......................model__regressor__alpha=0.001; total time=   0.5s
[CV] END ......................model__regressor__alpha=0.001; total time=   0.5s
[CV] END ......................model__regressor__alpha=0.001; total time=   0.5s
[CV] END ......................model__regressor__alpha=0.001; total time=   0.5s
[CV] END ......................model__regressor__alpha=0.001; total time=   0.4s
[CV] END ......model__regressor__alpha=0.0016102620275609393; total time=   0.4s
[CV] END ......model__regressor__alpha=0.0016102620275609393; total time=   0.4s
[CV] END ......model__regressor__alpha=0.0016102620275609393; total time=   0.4s
[CV] END ......model__regressor__alpha=0.0016102620275609393; total time=   0.4s
[CV] END ......model__regressor__alpha=0.0016102620275609393; total time=   0.3s
[CV] END .......model__regressor__alpha=0.002592943797404667; total time=   0.4s
[CV] END .......model__re