In [3]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression


import sys
import os

import sys
import os
# Append the directory two levels above the current working directory to the
# module search path so the project-local imports below can be resolved.
# NOTE(review): presumably this is the repository root that contains the
# `Preprocessing` package — confirm; the path depends on where the kernel runs.
sys.path.append(os.path.abspath("../.."))

from Preprocessing.imputation import get_imputation_maps, apply_imputation
from Preprocessing.preprocessing_pipeline_impute import preprocessing_pipeline
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment
from Preprocessing.split import split_data
from eval_call import evaluate_model

In [4]:
def _report_missing(title, X_train, X_test, labelled_columns):
    """Print the NaN count of selected columns for the train and test frames.

    Args:
        title: Heading line printed before the per-column counts.
        X_train: Training feature frame (indexable by column name).
        X_test: Test feature frame (indexable by column name).
        labelled_columns: Iterable of ``(label, column_name)`` pairs; ``label``
            is printed as-is, ``column_name`` selects the column to inspect.
    """
    print(title)
    for label, column in labelled_columns:
        print(label)
        print(X_train[column].isna().sum(), "in X_train")
        print(X_test[column].isna().sum(), "in X_test")
    print('-'*30)


def _build_linear_regression_pipeline(numeric_features, categorical_features):
    """Assemble the unfitted preprocessing + LinearRegression pipeline.

    Numeric columns are median-imputed and standardised; categorical columns
    are imputed with the most frequent value and one-hot encoded, ignoring
    categories that were not seen during fitting.

    Args:
        numeric_features: Column selection passed to the numeric branch.
        categorical_features: Column selection passed to the categorical branch.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline`` with the steps
        ``'preprocessor'`` and ``'model'``.
    """
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])


def main(data_path='../../data.csv'):
    """Train and evaluate a linear-regression baseline on the prepared data.

    Steps: load the data through ``preprocessing_pipeline``, split it with
    ``split_data``, impute two columns using maps learned on the training
    split only, fit the scikit-learn pipeline on the training split, predict
    on the test split and pass the predictions to ``evaluate_model``.

    Args:
        data_path: Path of the CSV file handed to ``preprocessing_pipeline``.
            Defaults to the location the notebook originally hard-coded.
    """
    df = preprocessing_pipeline(data_path)
    # The optional segment step (preprocessing_pipeline_segment) is
    # intentionally not applied here.

    # split_data also returns the unsplit X / y, which this workflow does not need.
    (X_train, X_test, y_train, y_test,
     _X, _y, categorical_features, numeric_features) = split_data(df)

    # (printed label, column name) for every column that gets map-based imputation.
    imputed_columns = [
        ('fuel_consumption:', 'fuel_consumption_l_100km'),
        ('power_ps:', 'power_ps'),
    ]

    _report_missing("Fehlende Werte vor Imputation:", X_train, X_test, imputed_columns)

    # Learn every imputation map from the untouched training split first, so
    # no map is derived from already-imputed values or from test data.
    imputation_maps = {
        column: get_imputation_maps(X_train, target_col=column)
        for _, column in imputed_columns
    }

    # Apply the learned maps to both splits, one column at a time.
    for _, column in imputed_columns:
        X_train = apply_imputation(X_train, target_col=column, maps=imputation_maps[column])
        X_test = apply_imputation(X_test, target_col=column, maps=imputation_maps[column])

    # Values the maps could not fill are left for the SimpleImputer steps
    # inside the pipeline.
    _report_missing("Fehlende Werte nach Imputation:", X_train, X_test, imputed_columns)

    linear_regression_pipeline = _build_linear_regression_pipeline(
        numeric_features, categorical_features
    )

    # Fit on the training split only, then predict the held-out test split.
    linear_regression_pipeline.fit(X_train, y_train)
    y_pred_lr = linear_regression_pipeline.predict(X_test)

    evaluate_model(y_test, y_pred_lr, "Linear Regression")



# Entry-point guard: run the workflow only when this code executes as the
# top-level module (which is the case when the notebook cell is run), not when
# it is imported from elsewhere.
if __name__ == "__main__":
    main()

Fehlende Werte vor Imputation:
fuel_consumption:
14620 in X_train
3642 in X_test
power_ps:
61 in X_train
11 in X_test
------------------------------
Fehlende Werte nach Imputation:
fuel_consumption:
100 in X_train
29 in X_test
power_ps:
1 in X_train
0 in X_test
------------------------------
Linear Regression Performance Metrics:
MAE: 5405.22
MSE: 199688824.89
RMSE: 14131.13
R²: 0.82
------------------------------
