In [None]:
#Version 1 XGBoost + One-Hot Encoding

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

from Preprocessing.split import split_data
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline

def train_final_xgboost_regression():
    """Train a baseline XGBoost regressor on one-hot encoded features.

    Steps performed:
      1. Load the data with ``preprocessing_pipeline()`` and split it with
         ``split_data()``.
      2. Draw a fixed-seed sample of 10,000 training and 2,000 test rows.
      3. Fit a ``ColumnTransformer`` on the training sample: median
         imputation + standard scaling for int64/float64 columns,
         most-frequent imputation + one-hot encoding for object/category
         columns.
      4. Fit an ``XGBRegressor`` with fixed hyperparameters.
      5. Print the test RMSE and the first ten true/predicted value pairs.

    Returns:
        tuple: ``(model, preprocessor)`` - the fitted ``XGBRegressor`` and the
        fitted ``ColumnTransformer``.
    """
    # Local import: this cell's import block does not bring in
    # mean_squared_error, so relying on a module-level name would raise
    # NameError when the notebook is run top to bottom on a fresh kernel.
    from sklearn.metrics import mean_squared_error

    df = preprocessing_pipeline()

    # Train/test split; split_data returns eight values, only the first four are needed here
    X_train, X_test, y_train, y_test, _, _, _, _ = split_data(df)

    # Draw a sample; targets are re-aligned through the sampled index labels
    print("\nVerwende nur Stichprobe (10.000 Train, 2.000 Test)")
    X_train = X_train.sample(n=10000, random_state=42)
    y_train = y_train.loc[X_train.index]

    X_test = X_test.sample(n=2000, random_state=42)
    y_test = y_test.loc[X_test.index]

    # List the detected feature groups once (useful for debugging)
    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    print("Erkannte numerische Features:", numeric_features)
    print("Erkannte kategorische Features:", categorical_features)

    # Numeric columns: median imputation followed by standard scaling
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: most-frequent imputation followed by one-hot encoding;
    # categories unseen during fit are ignored at transform time
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Fit the preprocessing on the training sample only, then apply it to the test sample
    print("Preprocessing wird angewendet...")
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)

    # XGBoost model with fixed hyperparameters
    print("Trainiere XGBoost Regression...")
    model = XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbosity=1
    )

    # Train the model
    model.fit(X_train_transformed, y_train)

    # Evaluate on the test sample
    y_pred = model.predict(X_test_transformed)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    print(f"Test RMSE auf 2.000 Testpunkten: {rmse:.2f}")
    print("Wahre vs. Vorhergesagte Preise (erste 10 Beispiele):")
    for true_val, pred_val in zip(y_test[:10], y_pred[:10]):
        print(f"Echter Preis: {true_val:.2f} - Vorhergesagt: {pred_val:.2f}")

    return model, preprocessor

# Run the one-hot baseline training; binds the fitted model and preprocessor at notebook level
final_model, preprocessor = train_final_xgboost_regression()

In [None]:
# Version 2 - XGBoost mit Label Encoding

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

from Preprocessing.split import split_data
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline

def train_final_xgboost_regression_label_encoding():
    """Train an XGBoost regressor with ordinally encoded categorical features.

    Loads the data via ``preprocessing_pipeline()``, splits it with
    ``split_data()``, keeps a fixed-seed sample of 10,000 training and 2,000
    test rows, fits the preprocessing (median imputation + scaling for
    int64/float64 columns, most-frequent imputation + ``OrdinalEncoder`` for
    object/category columns) on the training sample, trains an
    ``XGBRegressor`` with fixed hyperparameters and prints the test RMSE plus
    the first ten true/predicted pairs.

    Returns:
        tuple: ``(model, preprocessor)`` - the fitted regressor and the fitted
        ``ColumnTransformer``.
    """
    # Load and split; the last four return values of split_data are not used here
    X_train, X_test, y_train, y_test, _, _, _, _ = split_data(preprocessing_pipeline())

    # Subsample both splits; targets follow through the sampled index labels
    print("Verwende nur Stichprobe (10.000 Train, 2.000 Test)")
    train_X = X_train.sample(n=10000, random_state=42)
    train_y = y_train.loc[train_X.index]
    test_X = X_test.sample(n=2000, random_state=42)
    test_y = y_test.loc[test_X.index]

    # Detect the feature groups by dtype and report them (handy for debugging)
    num_cols = train_X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = train_X.select_dtypes(include=['object', 'category']).columns.tolist()
    print("Erkannte numerische Features:", num_cols)
    print("Erkannte kategorische Features:", cat_cols)

    # Numeric branch: median imputation, then standard scaling
    num_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    # Categorical branch: most-frequent imputation, then ordinal codes;
    # categories unseen during fit are mapped to -1
    cat_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ]
    column_encoder = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=num_steps), num_cols),
        ('cat', Pipeline(steps=cat_steps), cat_cols),
    ])

    # Fit on the training sample only; the test sample is transformed with the fitted state
    print("Preprocessing wird angewendet...")
    train_matrix = column_encoder.fit_transform(train_X)
    test_matrix = column_encoder.transform(test_X)

    # Fixed hyperparameters for this variant
    print("Trainiere XGBoost Regression...")
    xgb_settings = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 1,
    }
    regressor = XGBRegressor(**xgb_settings)
    regressor.fit(train_matrix, train_y)

    # Evaluate: RMSE is the square root of the test MSE
    predictions = regressor.predict(test_matrix)
    rmse = mean_squared_error(test_y, predictions) ** 0.5
    print(f"Test RMSE auf 2.000 Testpunkten: {rmse:.2f}")
    print("Wahre vs. Vorhergesagte Preise (erste 10 Beispiele):")
    preview = zip(test_y[:10], predictions[:10])
    for true_val, pred_val in preview:
        print(f"Echter Preis: {true_val:.2f} - Vorhergesagt: {pred_val:.2f}")

    return regressor, column_encoder

# Run the ordinal-encoding variant; rebinds final_model and preprocessor at notebook level
final_model, preprocessor = train_final_xgboost_regression_label_encoding()




In [None]:
# Version 3 - XGBoost with label (ordinal) encoding & hyperparameter optimization via RandomizedSearchCV

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

from Preprocessing.split import split_data
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline


def train_final_xgboost_with_hyperparameter_tuning():
    """Tune and train an XGBoost regressor on ordinally encoded features.

    Loads and splits the data (``preprocessing_pipeline()`` /
    ``split_data()``), keeps a fixed-seed sample of 10,000 training and 2,000
    test rows, fits the preprocessing (median imputation + scaling for
    int64/float64 columns, most-frequent imputation + ``OrdinalEncoder`` for
    object/category columns) on the training sample, runs a
    ``RandomizedSearchCV`` (30 candidates, 3-fold CV, negative RMSE scoring)
    over the XGBoost hyperparameters and prints the best parameters, the test
    RMSE and the first ten true/predicted pairs.

    Returns:
        tuple: ``(best_model, preprocessor)`` - the search's best estimator
        and the fitted ``ColumnTransformer``.
    """
    # Load and split; the last four return values of split_data are not used here
    X_train, X_test, y_train, y_test, _, _, _, _ = split_data(preprocessing_pipeline())

    # Subsample both splits; targets follow through the sampled index labels
    print("Verwende nur Stichprobe (10.000 Train, 2.000 Test)")
    train_X = X_train.sample(n=10000, random_state=42)
    train_y = y_train.loc[train_X.index]
    test_X = X_test.sample(n=2000, random_state=42)
    test_y = y_test.loc[test_X.index]

    # Feature groups by dtype
    num_cols = train_X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = train_X.select_dtypes(include=['object', 'category']).columns.tolist()
    print("Erkannte numerische Features:", num_cols)
    print("Erkannte kategorische Features:", cat_cols)

    # Numeric branch: median imputation + scaling
    numeric_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    # Categorical branch: most-frequent imputation + ordinal codes (-1 for unseen categories)
    categorical_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])
    column_encoder = ColumnTransformer(transformers=[
        ('num', numeric_branch, num_cols),
        ('cat', categorical_branch, cat_cols),
    ])

    # Fit on the training sample, reuse the fitted state for the test sample
    print("Preprocessing wird angewendet...")
    train_matrix = column_encoder.fit_transform(train_X)
    test_matrix = column_encoder.transform(test_X)

    print("Starte RandomizedSearchCV für Hyperparameteroptimierung...")

    # Discrete candidate values sampled by the randomized search
    search_space = dict(
        n_estimators=[100, 200, 300],
        max_depth=[3, 4, 5, 6, 7],
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
        colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
        gamma=[0, 0.1, 0.2, 0.3],
        reg_alpha=[0, 0.01, 0.1],
        reg_lambda=[1, 1.5, 2],
    )

    search = RandomizedSearchCV(
        estimator=XGBRegressor(random_state=42, n_jobs=-1, verbosity=1),
        param_distributions=search_space,
        n_iter=30,  # evaluate 30 parameter combinations
        scoring='neg_root_mean_squared_error',
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    search.fit(train_matrix, train_y)
    tuned_model = search.best_estimator_

    print("Beste Hyperparameter:")
    print(search.best_params_)

    # Evaluate the tuned model on the test sample
    predictions = tuned_model.predict(test_matrix)
    rmse = mean_squared_error(test_y, predictions) ** 0.5
    print(f"Test RMSE auf 2.000 Testpunkten: {rmse:.2f}")
    print("\n✅ Wahre vs. Vorhergesagte Preise (erste 10 Beispiele):")
    preview = zip(test_y[:10], predictions[:10])
    for true_val, pred_val in preview:
        print(f"Echter Preis: {true_val:.2f} - Vorhergesagt: {pred_val:.2f}")

    return tuned_model, column_encoder


# Run the tuned ordinal-encoding variant; rebinds final_model and preprocessor at notebook level
final_model, preprocessor = train_final_xgboost_with_hyperparameter_tuning()



In [None]:
# Version 4 - XGBoost with one-hot encoding & hyperparameter optimization via RandomizedSearchCV

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from Preprocessing.split import split_data
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline


def train_final_xgboost_with_hyperparameter_tuning_onehot():
    """Tune and train an XGBoost regressor on one-hot encoded features.

    Loads and splits the data (``preprocessing_pipeline()`` /
    ``split_data()``), keeps a fixed-seed sample of 10,000 training and 2,000
    test rows, fits the preprocessing (median imputation + scaling for
    int64/float64 columns, most-frequent imputation + ``OneHotEncoder`` for
    object/category columns) on the training sample, runs a
    ``RandomizedSearchCV`` (30 candidates, 3-fold CV, negative RMSE scoring)
    and prints the best parameters, RMSE, MSE, MAE, R² and the first ten
    true/predicted pairs for the test sample.

    Returns:
        tuple: ``(best_model, preprocessor)`` - the search's best estimator
        and the fitted ``ColumnTransformer``.
    """
    # Load and split; the last four return values of split_data are not used here
    X_train, X_test, y_train, y_test, _, _, _, _ = split_data(preprocessing_pipeline())

    # Subsample both splits; targets follow through the sampled index labels
    print("Verwende nur Stichprobe (10.000 Train, 2.000 Test) für schnelleren Lauf!")
    train_X = X_train.sample(n=10000, random_state=42)
    train_y = y_train.loc[train_X.index]
    test_X = X_test.sample(n=2000, random_state=42)
    test_y = y_test.loc[test_X.index]

    # Feature groups by dtype
    num_cols = train_X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = train_X.select_dtypes(include=['object', 'category']).columns.tolist()
    print("Erkannte numerische Features:", num_cols)
    print("Erkannte kategorische Features:", cat_cols)

    # Numeric branch: median imputation + scaling
    numeric_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    # Categorical branch: most-frequent imputation + dense one-hot columns;
    # categories unseen during fit are ignored at transform time
    categorical_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    column_encoder = ColumnTransformer(transformers=[
        ('num', numeric_branch, num_cols),
        ('cat', categorical_branch, cat_cols),
    ])

    # Fit on the training sample, reuse the fitted state for the test sample
    print("Preprocessing wird angewendet...")
    train_matrix = column_encoder.fit_transform(train_X)
    test_matrix = column_encoder.transform(test_X)

    print("Starte RandomizedSearchCV für Hyperparameteroptimierung...")

    # Discrete candidate values sampled by the randomized search
    search_space = dict(
        n_estimators=[100, 200, 300],
        max_depth=[3, 4, 5, 6, 7],
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
        colsample_bytree=[0.6, 0.7, 0.8, 0.9, 1.0],
        gamma=[0, 0.1, 0.2, 0.3],
        reg_alpha=[0, 0.01, 0.1],
        reg_lambda=[1, 1.5, 2],
    )

    search = RandomizedSearchCV(
        estimator=XGBRegressor(random_state=42, n_jobs=-1, verbosity=1),
        param_distributions=search_space,
        n_iter=30,  # evaluate 30 parameter combinations
        scoring='neg_root_mean_squared_error',
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    search.fit(train_matrix, train_y)
    tuned_model = search.best_estimator_

    print("Beste Hyperparameter:")
    print(search.best_params_)

    # Evaluate the tuned model on the test sample
    predictions = tuned_model.predict(test_matrix)
    mse = mean_squared_error(test_y, predictions)
    rmse = mse ** 0.5
    mae = mean_absolute_error(test_y, predictions)
    r2 = r2_score(test_y, predictions)

    print(f"Test RMSE auf 2.000 Testpunkten: {rmse:.2f}")
    print(f"Test MSE: {mse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.4f}")
    print("Wahre vs. Vorhergesagte Preise (erste 10 Beispiele):")
    preview = zip(test_y[:10], predictions[:10])
    for true_val, pred_val in preview:
        print(f"Echter Preis: {true_val:.2f} - Vorhergesagt: {pred_val:.2f}")

    return tuned_model, column_encoder

# Run the tuned one-hot variant; rebinds final_model and preprocessor at notebook level
final_model, preprocessor = train_final_xgboost_with_hyperparameter_tuning_onehot()




In [None]:
# Version 5 - XGBoost with target encoding & hyperparameter optimization via RandomizedSearchCV

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import joblib
from category_encoders import TargetEncoder

from Preprocessing.split import split_data
from Preprocessing.preprocessing_pipeline_initial import preprocessing_pipeline


def train_final_xgboost_with_target_encoding():
    """Tune and train an XGBoost regressor on target-encoded features.

    Loads and splits the data (``preprocessing_pipeline()`` /
    ``split_data()``), keeps a fixed-seed sample of 10,000 training and 2,000
    test rows and runs a ``RandomizedSearchCV`` (30 candidates, 3-fold CV,
    negative RMSE scoring) over a Pipeline made of the preprocessing and an
    ``XGBRegressor``.

    The preprocessing (median imputation + scaling for int64/float64 columns,
    most-frequent imputation + ``TargetEncoder`` for object/category columns)
    is part of the searched Pipeline on purpose: the target encoder is fitted
    with ``y``, so fitting it once on the whole training sample before
    cross-validation would encode each validation fold using that fold's own
    targets and bias the CV scores used to select hyperparameters. Inside the
    Pipeline it is refit on the training part of every fold.

    Prints the best hyperparameters (without the pipeline step prefix), RMSE,
    MSE, MAE, R² and the first ten true/predicted pairs for the test sample.

    Returns:
        tuple: ``(best_model, preprocessor)`` - the fitted ``XGBRegressor``
        and the fitted ``ColumnTransformer`` taken from the best pipeline,
        which the search refits on the full training sample.
    """
    df = preprocessing_pipeline()
    # Train/test split; split_data returns eight values, only the first four are needed here
    X_train, X_test, y_train, y_test, _, _, _, _ = split_data(df)

    # Draw a sample; targets are re-aligned through the sampled index labels
    print("Verwende nur Stichprobe")
    X_train = X_train.sample(n=10000, random_state=42)
    y_train = y_train.loc[X_train.index]

    X_test = X_test.sample(n=2000, random_state=42)
    y_test = y_test.loc[X_test.index]

    # Feature groups by dtype
    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    print("\n✅ Erkannte numerische Features:", numeric_features)
    print("\n✅ Erkannte kategorische Features:", categorical_features)

    # Preprocessing for numeric features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical features (target encoding)
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('target', TargetEncoder())
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Preprocessing and model are searched as one Pipeline so the target
    # encoder only ever sees the targets of the rows it is fitted on.
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(random_state=42, n_jobs=-1, verbosity=1))
    ])

    # Candidate values; the 'model__' prefix routes them to the XGBRegressor step
    param_dist = {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [3, 4, 5, 6, 7],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'model__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'model__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'model__gamma': [0, 0.1, 0.2, 0.3],
        'model__reg_alpha': [0, 0.01, 0.1],
        'model__reg_lambda': [1, 1.5, 2]
    }

    random_search = RandomizedSearchCV(
        estimator=full_pipeline,
        param_distributions=param_dist,
        n_iter=30,  # evaluate 30 parameter combinations
        scoring='neg_root_mean_squared_error',
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1
    )

    print("Preprocessing wird auf Trainingsdaten angewendet...")
    print("Starte RandomizedSearchCV für Hyperparameteroptimierung...")

    # Run the search on the raw training sample; y_train reaches the target
    # encoder through the Pipeline's fit.
    random_search.fit(X_train, y_train)

    # The best pipeline is refit on the full training sample by the search
    best_pipeline = random_search.best_estimator_
    preprocessor = best_pipeline.named_steps['preprocessor']
    best_model = best_pipeline.named_steps['model']

    print("Beste Hyperparameter:")
    # Strip the pipeline step prefix so the printed names are the plain XGBoost parameter names
    print({name.split('__', 1)[1]: value for name, value in random_search.best_params_.items()})

    # Evaluate the tuned pipeline on the test sample
    y_pred = best_pipeline.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test RMSE auf 2.000 Testpunkten: {rmse:.2f}")
    print(f"Test MSE: {mse:.2f}")
    print(f"Test MAE: {mae:.2f}")
    print(f"Test R²: {r2:.4f}")
    print(" Wahre vs. Vorhergesagte Preise (erste 10 Beispiele):")
    for true_val, pred_val in zip(y_test[:10], y_pred[:10]):
        print(f"Echter Preis: {true_val:.2f} - Vorhergesagt: {pred_val:.2f}")

    return best_model, preprocessor

# Run the tuned target-encoding variant; rebinds final_model and preprocessor at notebook level
final_model, preprocessor = train_final_xgboost_with_target_encoding()

