# XGBoost Regression für ENTSCHEIDUNGSDATUM

Dieses Notebook erstellt ein vollständiges, reproduzierbares Regressions-Setup für die Zielvariable **ENTSCHEIDUNGSDATUM**.
Es enthält EDA, sauberes Preprocessing, Baselines, XGBoost mit Early Stopping, Hyperparameter-Tuning und finale Evaluation inklusive Artefakt-Export.

In [None]:
# Setup & Reproduzierbarkeit
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge

from xgboost import XGBRegressor

# One shared seed for NumPy's global RNG and for every estimator/splitter
# below that accepts a random_state.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print(f'Imports geladen. RANDOM_STATE gesetzt auf {RANDOM_STATE}')

## Daten laden & Quick-EDA

Wir laden die CSV-Datei vom fest vorgegebenen Pfad. Falls die Datei fehlt, bricht das Notebook kontrolliert ab.

In [None]:
# Hard-coded location of the input CSV.
DATA_PATH = Path('/home/dsrg/Data/df_encoded_no_user_encode_original_Datum.csv')

# Fail early with an explicit message before any further cell runs.
if not DATA_PATH.exists():
    raise FileNotFoundError(f'Datei nicht gefunden: {DATA_PATH}. Bitte Pfad prüfen.')

print('Lade Daten von:', DATA_PATH)
df = pd.read_csv(DATA_PATH)

# Quick look: size, first rows, column dtypes and summary statistics.
print('Shape:', df.shape)
display(df.head())
display(df.dtypes)
display(df.describe(include='all'))

In [None]:
target = 'ENTSCHEIDUNGSDATUM'

# Missing-value overview: per column (largest count first) and in total.
missing_per_col = df.isna().sum().sort_values(ascending=False)
missing_total = df.isna().sum().sum()
print('Missing Values pro Spalte:')
display(missing_per_col)
print('Missing Values total:', missing_total)

# Range check of the target against the interval [0, 365].
y = df[target]
print('Zielvariable Min/Max:', y.min(), y.max())
outside_range = ((y < 0) | (y > 365)).mean()
print('Anteil außerhalb [0, 365]:', round(outside_range * 100, 2), '%')

plt.figure(figsize=(8, 4))
plt.hist(y, bins=30, color='#5b8def', edgecolor='white')
plt.title('Histogramm ENTSCHEIDUNGSDATUM')
plt.xlabel('ENTSCHEIDUNGSDATUM')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

# Decide whether the target only holds whole numbers; the flag later controls
# whether rounded predictions are evaluated as well.
is_int_like = False
if pd.api.types.is_integer_dtype(y):
    is_int_like = True
elif pd.api.types.is_float_dtype(y):
    y_non_null = y.dropna()
    # rtol=0 makes atol the only tolerance. With NumPy's default rtol=1e-5 the
    # allowed deviation grows with the value (about 3.6e-3 at 365), so clearly
    # fractional targets could still be classified as integer-like.
    is_int_like = np.allclose(y_non_null, np.round(y_non_null), rtol=0, atol=1e-6)
print('Zielvariable integer-ähnlich:', is_int_like)

## Split in Train/Val/Test

Wir verwenden einen 70/15/15 Split mit Shuffle und Random State 42.

In [None]:
y = df[target]
X = df.drop(columns=[target])

# An identifier column is kept aside for the output files and removed from
# the feature matrix.
has_id = 'ID' in X.columns
id_col = X['ID'].copy() if has_id else None
if has_id:
    X = X.drop(columns=['ID'])
    print('ID-Spalte entfernt aus Features, wird für Outputs behalten.')

# Two-stage split: 70 % train, then the remaining 30 % is halved into
# validation and test.
split_options = dict(random_state=RANDOM_STATE, shuffle=True)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, **split_options)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, **split_options)

print('Train/Val/Test Shapes:', X_train.shape, X_val.shape, X_test.shape)

## Data Quality Checks

Wir prüfen NaNs in den Splits und zeigen die Zielverteilungen.

In [None]:
def report_missing(name, X_split, y_split):
    """Print the number of missing entries in one split.

    Args:
        name: Label of the split, used in the printed line.
        X_split: Feature table; missing cells are counted over all columns.
        y_split: Target values; missing entries are counted.
    """
    missing_in_features = X_split.isna().sum().sum()
    missing_in_target = y_split.isna().sum()
    print(f'Missing in {name}: X={missing_in_features}, y={missing_in_target}')

# Report missing values for each split in turn.
for split_name, X_part, y_part in (
    ('Train', X_train, y_train),
    ('Val', X_val, y_val),
    ('Test', X_test, y_test),
):
    report_missing(split_name, X_part, y_part)

# All three splits are expected to show the same number of feature columns.
n_features = [part.shape[1] for part in (X_train, X_val, X_test)]
print('Feature counts:', *n_features)

def describe_target(name, y_split):
    """Show summary statistics of the target for one split.

    Args:
        name: Label of the split, printed as a header line.
        y_split: Target values of the split.
    """
    quantiles = [0.1, 0.25, 0.5, 0.75, 0.9]
    print(f'{name} target stats:')
    stats = y_split.describe(percentiles=quantiles)
    display(stats)

# Compare the target distribution across the three splits.
for split_name, y_part in (('Train', y_train), ('Val', y_val), ('Test', y_test)):
    describe_target(split_name, y_part)

## Preprocessing (leakage-frei)

Wir erkennen Feature-Typen automatisch und bauen eine Pipeline mit Imputern und One-Hot-Encoding.

In [None]:
# Feature types are derived from the training split only.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

print('Numeric features:', len(numeric_features))
print('Categorical features:', len(categorical_features))

# Numeric columns: fill gaps with the median.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
# The comma between the two steps is required: without it Python tries to
# call the first tuple with the second one and raises a TypeError.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# NOTE(review): columns whose dtype matches neither selector above are not
# listed in any transformer — confirm that this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

## Baselines

Wir vergleichen gegen Dummy-Regressoren (Mean/Median) sowie ein Ridge-Modell als lineare Baseline.

In [None]:
def evaluate_model(name, y_true, y_pred):
    """Compute MAE, RMSE and R2 for one set of predictions.

    Args:
        name: Label stored under the 'model' key of the result.
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        Dict with the keys 'model', 'MAE', 'RMSE' and 'R2'.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is taken as the square root of the MSE. The former shortcut
    # ``mean_squared_error(..., squared=False)`` is deprecated and no longer
    # accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {'model': name, 'MAE': mae, 'RMSE': rmse, 'R2': r2}

# Reference models, evaluated in this order on the validation split.
baseline_estimators = {
    'DummyMean': DummyRegressor(strategy='mean'),
    'DummyMedian': DummyRegressor(strategy='median'),
    'Ridge': Ridge(random_state=RANDOM_STATE),
}

baseline_results = []
fitted_baselines = {}
for label, estimator in baseline_estimators.items():
    # Every baseline runs behind the same preprocessing step as the main model.
    baseline_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])
    baseline_pipeline.fit(X_train, y_train)
    pred_val = baseline_pipeline.predict(X_val)
    baseline_results.append(evaluate_model(label, y_val, pred_val))
    fitted_baselines[label] = baseline_pipeline

# Keep the fitted pipelines available under their individual names.
dummy_mean = fitted_baselines['DummyMean']
dummy_median = fitted_baselines['DummyMedian']
ridge = fitted_baselines['Ridge']

baseline_df = pd.DataFrame(baseline_results)
display(baseline_df)

## XGBoost Regression mit Early Stopping

Wir verwenden XGBRegressor mit Early Stopping. Für maximale Kompatibilität wird ein Fallback mit Callbacks genutzt, falls der direkte Fit-Parameter nicht verfügbar ist.

In [None]:
# Starting configuration: a generous tree budget that early stopping trims.
xgb_params = dict(
    n_estimators=5000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    objective='reg:squarederror',
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
xgb_model = XGBRegressor(**xgb_params)

# The preprocessor is fitted on the training split only and then applied
# unchanged to the validation split.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

def fit_xgb_with_early_stopping(model, X_tr, y_tr, X_va, y_va, rounds=100):
    """Fit an XGBoost estimator with early stopping on a validation set.

    Early stopping is configured in a way that works across XGBoost versions:
    releases that expose ``early_stopping_rounds`` as an estimator parameter
    get it via ``set_params`` (``fit()`` no longer accepts early-stopping
    arguments in XGBoost 2.x), older releases get it as a ``fit()`` keyword.

    Args:
        model: Unfitted estimator offering ``get_params``, ``set_params`` and
            ``fit`` (e.g. ``XGBRegressor``).
        X_tr: Training features.
        y_tr: Training targets.
        X_va: Validation features monitored for early stopping.
        y_va: Validation targets monitored for early stopping.
        rounds: Number of rounds without improvement on the validation set
            after which training stops. Defaults to 100.

    Returns:
        The same ``model`` object, fitted. On the modern path the estimator
        keeps ``early_stopping_rounds`` set, so any later refit of it needs
        an ``eval_set`` again.
    """
    eval_set = [(X_va, y_va)]

    if 'early_stopping_rounds' in model.get_params():
        # Modern API: early stopping is an estimator parameter.
        model.set_params(early_stopping_rounds=rounds)
        model.fit(X_tr, y_tr, eval_set=eval_set, verbose=False)
    else:
        # Legacy API: early stopping is only available as a fit() keyword.
        model.fit(
            X_tr,
            y_tr,
            eval_set=eval_set,
            early_stopping_rounds=rounds,
            verbose=False
        )
    return model

# Train the untuned model and score it on the validation split.
xgb_model = fit_xgb_with_early_stopping(
    xgb_model, X_train_proc, y_train, X_val_proc, y_val
)

val_pred = xgb_model.predict(X_val_proc)
val_metrics = evaluate_model('XGB-Baseline', y_val, val_pred)
print(f'XGB baseline metrics: {val_metrics}')

# best_iteration may be absent on the fitted estimator; fall back to None.
best_iter_baseline = getattr(xgb_model, 'best_iteration', None)
print(f'Best iteration: {best_iter_baseline}')

## Hyperparameter-Tuning (RandomizedSearchCV)

Wir nutzen RandomizedSearchCV mit MAE-Scoring und 5-facher CV.

In [None]:
# Candidate values per hyperparameter; the 'model__' prefix addresses the
# 'model' step of the pipeline defined below.
param_distributions = {
    'model__max_depth': [3, 4, 5, 6, 7, 8],
    'model__learning_rate': [0.03, 0.05, 0.08],
    'model__subsample': [0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'model__reg_alpha': [0.0, 0.1, 0.5],
    'model__reg_lambda': [0.5, 1.0, 1.5],
}

# The search model uses a fixed number of trees; no early stopping is applied
# inside the cross-validation.
# NOTE(review): n_jobs=1 here is presumably meant to avoid oversubscribing
# cores while the search itself runs with n_jobs=-1 — confirm.
search_model = XGBRegressor(
    n_estimators=2000,
    objective='reg:squarederror',
    random_state=RANDOM_STATE,
    n_jobs=1
)

# Preprocessing is part of the pipeline, so the search fits it together with
# the model on the data it is given.
search_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', search_model)
])

cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# 40 sampled configurations, scored by (negated) mean absolute error.
random_search = RandomizedSearchCV(
    search_pipeline,
    param_distributions=param_distributions,
    n_iter=40,
    scoring='neg_mean_absolute_error',
    cv=cv,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1
)

print('Starte RandomizedSearchCV...')
random_search.fit(X_train, y_train)
print('Best params:', random_search.best_params_)
# The score is a negated MAE, so flip the sign for reporting.
print('Best CV MAE:', -random_search.best_score_)

## Finales XGBoost-Modell mit Early Stopping

Wir trainieren ein finales Modell mit den besten Parametern und Early Stopping auf der Validation.

In [None]:
# Strip the pipeline step prefix so the tuned values can be passed straight
# to the XGBRegressor constructor.
tuned_params = random_search.best_params_
best_params = {
    param_name.replace('model__', ''): param_value
    for param_name, param_value in tuned_params.items()
}

# Fixed settings first, tuned values merged on top.
fixed_params = {
    'n_estimators': 5000,
    'objective': 'reg:squarederror',
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}
final_xgb_params = {**fixed_params, **best_params}

# Same preprocessor object as before, refitted on the training split.
preprocessor_final = preprocessor

X_train_proc = preprocessor_final.fit_transform(X_train)
X_val_proc = preprocessor_final.transform(X_val)

final_xgb = fit_xgb_with_early_stopping(
    XGBRegressor(**final_xgb_params), X_train_proc, y_train, X_val_proc, y_val
)

best_iteration = getattr(final_xgb, 'best_iteration', None)
print(f'Final model best_iteration: {best_iteration}')

## Finale Evaluation auf Test

Wir evaluieren das Modell auf dem Testset und berechnen Metriken vor/nach Clipping (und optionalem Runden).

In [None]:
# Apply the already fitted preprocessor to the test split (transform only)
# and predict with the final model.
X_test_proc = preprocessor_final.transform(X_test)
y_pred = final_xgb.predict(X_test_proc)

def clip_predictions(preds, min_val=0, max_val=365):
    """Limit predictions to the closed interval [min_val, max_val].

    Args:
        preds: Predicted values (array-like).
        min_val: Lower bound; smaller predictions are raised to it.
        max_val: Upper bound; larger predictions are lowered to it.

    Returns:
        The clipped predictions as returned by ``np.clip``.
    """
    bounded = np.clip(preds, min_val, max_val)
    return bounded

def round_predictions(preds):
    """Round predictions to whole numbers using ``np.round``.

    Args:
        preds: Predicted values (array-like).

    Returns:
        The predictions rounded to zero decimals.
    """
    whole_numbers = np.round(preds, decimals=0)
    return whole_numbers

# Metrics on the raw predictions and on predictions limited to the default
# clipping range.
metrics_raw = evaluate_model('XGB-Test', y_test, y_pred)

y_pred_clipped = clip_predictions(y_pred)
metrics_clipped = evaluate_model('XGB-Test-Clipped', y_test, y_pred_clipped)

# Rounded predictions are only evaluated for an integer-like target.
metrics_rounded = None
if is_int_like:
    y_pred_rounded = round_predictions(y_pred_clipped)
    metrics_rounded = evaluate_model('XGB-Test-Rounded', y_test, y_pred_rounded)

metric_rows = [metrics_raw, metrics_clipped]
if metrics_rounded:
    metric_rows.append(metrics_rounded)
display(pd.DataFrame(metric_rows))

In [None]:
plots_dir = Path('./artifacts/plots')
plots_dir.mkdir(parents=True, exist_ok=True)

# Predicted vs. actual scatter with the identity line as reference.
diagonal = [y_test.min(), y_test.max()]
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='#7c4dff')
plt.plot(diagonal, diagonal, color='black', linestyle='--')
plt.title('Predicted vs Actual (Test)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.tight_layout()
plt.savefig(plots_dir / 'pred_vs_actual.png')
plt.show()

# Histogram of residuals (actual minus predicted).
residuals = y_test - y_pred
plt.figure(figsize=(8, 4))
plt.hist(residuals, bins=30, color='#26c6da', edgecolor='white')
plt.title('Residuals Histogram (Test)')
plt.xlabel('Residual')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig(plots_dir / 'residuals_hist.png')
plt.show()

# MAE per target bin. Values at or beyond the outer edges are assigned to the
# first or last bin by the clipping step.
bins = [0, 30, 60, 90, 120, 180, 240, 300, 365]
bin_labels = [f'{lower}-{upper}' for lower, upper in zip(bins[:-1], bins[1:])]
bin_idx = np.clip(np.digitize(y_test, bins, right=True) - 1, 0, len(bin_labels) - 1)

bin_mae = []
for bin_number in range(len(bin_labels)):
    in_bin = bin_idx == bin_number
    # Empty bins get NaN so that the bar positions stay aligned with the labels.
    bin_mae.append(
        mean_absolute_error(y_test[in_bin], y_pred[in_bin]) if in_bin.sum() != 0 else np.nan
    )

plt.figure(figsize=(10, 4))
plt.bar(bin_labels, bin_mae, color='#ff7043')
plt.title('MAE nach Ziel-Bins (Test)')
plt.xlabel('Ziel-Bin')
plt.ylabel('MAE')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(plots_dir / 'mae_by_bins.png')
plt.show()

## Modell & Outputs speichern

Wir speichern die Pipeline sowie eine Ergebnis-CSV für das Testset.

In [None]:
artifacts_dir = Path('./artifacts')
artifacts_dir.mkdir(parents=True, exist_ok=True)

# Bundle the fitted preprocessor and the fitted final model into one pipeline
# so that raw feature tables can be passed to predict() after loading.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_final),
    ('model', final_xgb)
])

pipeline_path = artifacts_dir / 'xgb_reg_pipeline.joblib'
joblib.dump(final_pipeline, pipeline_path)
print('Pipeline gespeichert unter:', pipeline_path)

# Test predictions as CSV. The frame is built from plain arrays, so it gets a
# fresh 0..n-1 index in the row order of the test split.
results = pd.DataFrame({
    'y_true': y_test.values,
    'y_pred': y_pred,
    'y_pred_clipped': y_pred_clipped,
})

if id_col is not None:
    # Look the IDs up via the original row labels of the test split.
    # Using results.index here would select the first n rows of the full
    # dataset and attach IDs that do not belong to the test rows.
    results.insert(0, 'ID', id_col.loc[X_test.index].values)

# Errors are reported on the clipped predictions.
results['error'] = results['y_pred_clipped'] - results['y_true']
results['abs_error'] = np.abs(results['error'])

preds_path = artifacts_dir / 'test_predictions.csv'
results.to_csv(preds_path, index=False)
print('Test-Predictions gespeichert unter:', preds_path)

# Compact run summary; the rounded metrics are only present for an
# integer-like target.
summary = {
    'best_params': best_params,
    'best_iteration': best_iteration,
    'test_metrics_raw': metrics_raw,
    'test_metrics_clipped': metrics_clipped,
}
if metrics_rounded:
    summary['test_metrics_rounded'] = metrics_rounded

print('Summary:')
print(summary)