In [2]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV, KFold


import xgboost as xgb
import pandas as pd
import gc
import sys
import os
sys.path.append(os.path.abspath("../../.."))


from Preprocessing.preprocessing_pipeline_impute import preprocessing_pipeline
from Preprocessing.imputation import get_imputation_maps, apply_imputation,ContextImputer
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment
from Preprocessing.split_new import split_data
from utils.eval_call import evaluate_model

In [3]:
def main():
    """Tune an XGBoost regression pipeline with randomized search and evaluate it.

    Steps:
      1. Load the train/test split and the categorical / numeric feature
         lists from ``split_data``.
      2. Build a ``ColumnTransformer`` that median-imputes and scales the
         numeric columns and mode-imputes and one-hot encodes the
         categorical columns.
      3. Chain three ``ContextImputer`` steps, the preprocessor and an
         ``XGBRegressor`` into a single pipeline.
      4. Run a 20-candidate ``RandomizedSearchCV`` with 10-fold CV, scored
         by (negated) mean absolute percentage error.
      5. Print the best CV score and parameters, then evaluate the refitted
         best pipeline on the held-out test set via ``evaluate_model``.

    Returns:
        None. Results are reported through ``print`` and ``evaluate_model``.
    """
    # 1) Load / split data
    X_train, X_test, y_train, y_test, cat_feats, num_feats = split_data(
        '../../../data.csv', segment=True
    )

    # 2) Column-wise transformers
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen during fit are encoded as all-zero rows instead
        # of raising at predict time.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, num_feats),
        ('cat', categorical_transformer, cat_feats)
    ])

    # 3) XGBoost pipeline. The hyper-parameters given here are only the
    #    starting values; the search below overrides the sampled ones.
    # NOTE(review): ContextImputer is project-local; presumably each step
    # imputes the named column from related rows -- confirm in
    # Preprocessing.imputation.
    xgb_pipeline = Pipeline([
        ('imp_fc', ContextImputer('fuel_consumption_l_100km')),
        ('imp_ps', ContextImputer('power_ps')),
        ('imp_er', ContextImputer('electric_range')),
        ('preprocessor', preprocessor),
        ('model', xgb.XGBRegressor(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=7,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            # NOTE(review): combined with n_jobs=-1 on the search below this
            # can oversubscribe the CPU; consider parallelising only one level.
            n_jobs=-1
        ))
    ])

    # No stand-alone xgb_pipeline.fit() here: RandomizedSearchCV clones the
    # estimator for every candidate and refits the winner on the full
    # training set, so a prior fit would be pure wasted compute.
    print("Training XGBoost model...")

    # 4) Search space. scipy's uniform(loc, scale) samples from
    #    [loc, loc + scale]; randint(low, high) excludes ``high``.
    param_dist = {
        'model__n_estimators'    : randint(300, 1000),
        'model__learning_rate'   : uniform(0.01, 0.15),   # 0.01 .. 0.16
        'model__max_depth'       : randint(5, 10),        # 5 .. 9
        'model__subsample'       : uniform(0.6, 0.4),     # 0.6 .. 1.0
        'model__colsample_bytree': uniform(0.6, 0.4),     # 0.6 .. 1.0
        'model__min_child_weight': randint(1, 10),        # 1 .. 9
        'model__gamma'           : uniform(0, 5),         # 0 .. 5
    }

    cv = KFold(n_splits=10, shuffle=True, random_state=42)

    rs = RandomizedSearchCV(
        estimator = xgb_pipeline,
        param_distributions = param_dist,
        n_iter = 20,
        scoring = 'neg_mean_absolute_percentage_error',
        cv = cv,
        n_jobs = -1,                   # use all cores
        verbose = 2,
        random_state = 42,
        return_train_score = False
    )

    # ``model__verbose`` is routed to the fit() of the pipeline's 'model' step.
    rs.fit(X_train, y_train, model__verbose = False)

    # The scorer is *negated* MAPE, so flip the sign to report the error
    # itself. (The label previously said "RMSE", which did not match the
    # scoring metric.)
    print("Best MAPE   :", -rs.best_score_)
    print("Best params :")
    for k, v in rs.best_params_.items():
        print(f"   {k}: {v}")

    # 5) Evaluate the refitted best pipeline on the held-out test set.
    best_pipeline = rs.best_estimator_
    y_pred = best_pipeline.predict(X_test)
    evaluate_model(y_test, y_pred, "XGBoost Tuned")

# Entry guard: run the full tuning workflow only when this code executes as
# the main module (not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

Training XGBoost model...
Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV] END model__colsample_bytree=0.749816047538945, model__gamma=4.75357153204958, model__learning_rate=0.11979909127171076, model__max_depth=9, model__min_child_weight=5, model__n_estimators=914, model__subsample=0.7783331011414365; total time=  19.0s
[CV] END model__colsample_bytree=0.749816047538945, model__gamma=4.75357153204958, model__learning_rate=0.11979909127171076, model__max_depth=9, model__min_child_weight=5, model__n_estimators=914, model__subsample=0.7783331011414365; total time=  19.0s
[CV] END model__colsample_bytree=0.749816047538945, model__gamma=4.75357153204958, model__learning_rate=0.11979909127171076, model__max_depth=9, model__min_child_weight=5, model__n_estimators=914, model__subsample=0.7783331011414365; total time=  19.2s
[CV] END model__colsample_bytree=0.749816047538945, model__gamma=4.75357153204958, model__learning_rate=0.11979909127171076, model__max_depth=9, model__