In [1]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV, KFold


import xgboost as xgb
import pandas as pd
import gc
import sys
import os
sys.path.append(os.path.abspath("../../.."))


from Preprocessing.preprocessing_pipeline_impute import preprocessing_pipeline
from Preprocessing.imputation import get_imputation_maps, apply_imputation,ContextImputer
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment
from Preprocessing.split_new import split_data
from utils.eval_call import evaluate_model

In [2]:
def main():
    """Train the baseline XGBoost pipeline, print feature importances and evaluate it.

    The data is loaded and split via ``split_data``. Numeric columns are
    median-imputed and standardised, categorical columns are imputed with the
    most frequent value and one-hot encoded. Three ``ContextImputer`` steps run
    ahead of that preprocessing, followed by an ``XGBRegressor`` with fixed
    hyper-parameters. After fitting, the 30 highest-ranked feature importances
    and the total number of features are printed, and the test-set predictions
    are handed to ``evaluate_model``.
    """
    # 1) Load the data and split it into train / test parts
    X_train, X_test, y_train, y_test, cat_feats, num_feats = split_data('../../../data.csv')

    # 2) Column-wise preprocessing: one sub-pipeline per feature type
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
    preprocessor = ColumnTransformer([
        ('num', Pipeline(numeric_steps), num_feats),
        ('cat', Pipeline(categorical_steps), cat_feats),
    ])

    # 3) Full XGBoost pipeline: context imputers -> preprocessing -> regressor
    regressor = xgb.XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )
    pipeline_steps = [
        ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
        ('imp_ps', ContextImputer('power_ps')),
        ('imp_er', ContextImputer('electric_range')),
        ('preprocessor', preprocessor),
        ('model', regressor),
    ]
    xgb_pipeline = Pipeline(pipeline_steps)

    print("Training XGBoost model...")
    xgb_pipeline.fit(X_train, y_train)

    # --- after the fit: pull the fitted steps back out of the pipeline ---
    fitted_regressor = xgb_pipeline.named_steps['model']
    fitted_preprocessor = xgb_pipeline.named_steps['preprocessor']

    # Pair every transformed feature name with the importance reported by
    # XGBoost. The column is labelled 'gain'; the actual importance type follows
    # the regressor's importance_type setting, which is not set explicitly here.
    importance_table = pd.DataFrame({
        'feature': fitted_preprocessor.get_feature_names_out(),
        'gain': fitted_regressor.feature_importances_,
    })
    importance_table = importance_table.sort_values('gain', ascending=False)
    importance_table = importance_table.reset_index(drop=True)

    print(importance_table.head(30))
    print(len(importance_table))

    # Score the fitted pipeline on the held-out test split
    y_pred = xgb_pipeline.predict(X_test)
    evaluate_model(y_test, y_pred, "XGBoost")

# Entry point: run the baseline training when this code is executed as the
# main program (the guard keeps main() from running on a plain import).
if __name__ == "__main__":
    main()

Training XGBoost model...
                              feature      gain
0              cat__brand_lamborghini  0.069793
1                  cat__brand_ferrari  0.059224
2             cat__fuel_type_Electric  0.051922
3                       num__power_ps  0.044058
4              cat__model_Lamborghini  0.035620
5    cat__model_Lamborghini Aventador  0.029109
6              cat__model_Porsche 918  0.029104
7                  cat__brand_bentley  0.027363
8                    cat__brand_dodge  0.026340
9               cat__fuel_type_Hybrid  0.022349
10            cat__brand_aston-martin  0.018603
11                 cat__brand_porsche  0.017868
12    cat__model_Aston Martin Vantage  0.017273
13   cat__transmission_type_Automatic  0.017071
14             cat__model_Porsche 991  0.016391
15                 cat__fuel_type_CNG  0.014157
16      cat__model_Porsche Carrera GT  0.013637
17                    cat__brand_jeep  0.013022
18       cat__model_Ferrari Portofino  0.011738
19            

In [None]:
def main():
    """Tune the XGBoost pipeline with a randomized search and evaluate the best model.

    Steps:
      1. Load and split the data via ``split_data``.
      2. Build the preprocessing (median imputation + scaling for numeric
         columns, most-frequent imputation + one-hot encoding for categorical
         columns) and the full pipeline with three ``ContextImputer`` steps
         and an ``XGBRegressor``.
      3. Run ``RandomizedSearchCV`` (20 candidates, 3-fold shuffled CV) scored
         with the negative mean absolute error.
      4. Print the best cross-validated MAE and parameters, then evaluate the
         refitted best pipeline on the test split via ``evaluate_model``.
    """
    # 1) Load the data and split it into train / test parts
    X_train, X_test, y_train, y_test, cat_feats, num_feats = split_data('../../../data.csv')

    # 2) Transformers: one sub-pipeline per feature type
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, num_feats),
        ('cat', categorical_transformer, cat_feats)
    ])

    # 3) XGBoost pipeline; the model hyper-parameters given here are only the
    #    starting values, the search below overrides the ones it samples.
    # NOTE(review): both the regressor and the search use n_jobs=-1, which may
    # oversubscribe the CPU — consider n_jobs=1 on the model while searching.
    xgb_pipeline = Pipeline([
        ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
        ('imp_ps', ContextImputer('power_ps')),
        ('imp_er', ContextImputer('electric_range')),
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=7,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1
        ))
    ])

    # No separate fit of xgb_pipeline here: RandomizedSearchCV clones the
    # estimator for every candidate and refits the best one on the full
    # training data, so a fit before the search would be discarded work.
    print("Training XGBoost model...")

    # ------------------------
    # 1) Hyper-parameter space
    #    scipy's uniform(loc, scale) samples from [loc, loc + scale];
    #    randint(low, high) samples integers from [low, high).
    # ------------------------
    param_dist = {
        'model__n_estimators'    : randint(300, 1000),        # 300 – 999
        'model__learning_rate'   : uniform(0.01, 0.15),       # 0.01 – 0.16
        'model__max_depth'       : randint(5, 10),            # 5 – 9
        'model__subsample'       : uniform(0.6, 0.4),         # 0.6 – 1.0
        'model__colsample_bytree': uniform(0.6, 0.4),         # 0.6 – 1.0
        'model__min_child_weight': randint(1, 10),            # 1 – 9
        'model__gamma'           : uniform(0, 5),             # 0 – 5
    }

    # ------------------------
    # 2) Cross-validation setup
    # ------------------------
    cv = KFold(n_splits=3, shuffle=True, random_state=42)

    rs = RandomizedSearchCV(
        estimator = xgb_pipeline,
        param_distributions = param_dist,
        n_iter = 20,
        scoring = 'neg_mean_absolute_error',
        cv = cv,
        n_jobs = -1,                   # use all cores
        verbose = 2,
        random_state = 42,
        return_train_score = False
    )

    # model__verbose is routed to the 'model' step's fit() to silence XGBoost
    rs.fit(X_train, y_train, model__verbose = False)

    # The search is scored with the negated MAE, so flipping the sign of
    # best_score_ yields the mean cross-validated MAE (not an RMSE).
    print("Best CV MAE :", -rs.best_score_)
    print("Best params :")
    for k, v in rs.best_params_.items():
        print(f"   {k}: {v}")

    # best_estimator_ is the best candidate refitted on the whole training set
    best_pipeline = rs.best_estimator_
    y_pred = best_pipeline.predict(X_test)
    evaluate_model(y_test, y_pred, "XGBoost Tuned")

# Entry point: run the hyper-parameter search when this code is executed as
# the main program (the guard keeps main() from running on a plain import).
if __name__ == "__main__":
    main()

Training XGBoost model...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END model__colsample_bytree=0.6399899663272012, model__gamma=2.2962444598293357, model__learning_rate=0.06005629167085327, model__max_depth=7, model__min_child_weight=6, model__n_estimators=608, model__subsample=0.9879639408647978; total time=  13.5s
[CV] END model__colsample_bytree=0.6399899663272012, model__gamma=2.2962444598293357, model__learning_rate=0.06005629167085327, model__max_depth=7, model__min_child_weight=6, model__n_estimators=608, model__subsample=0.9879639408647978; total time=  13.5s
[CV] END model__colsample_bytree=0.6399899663272012, model__gamma=2.2962444598293357, model__learning_rate=0.06005629167085327, model__max_depth=7, model__min_child_weight=6, model__n_estimators=608, model__subsample=0.9879639408647978; total time=  13.6s
[CV] END model__colsample_bytree=0.749816047538945, model__gamma=4.75357153204958, model__learning_rate=0.11979909127171076, model__max_depth=9, 