In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_splits(X_train_path, X_test_path, y_train_path, y_test_path):
    """Load the persisted train/test feature and target CSV files.

    The feature files are returned as DataFrames. The target files are
    flattened to 1-D arrays with ``.values.ravel()``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    feature_sources = (X_train_path, X_test_path)
    target_sources = (y_train_path, y_test_path)

    # Features stay as DataFrames so their column names are kept.
    train_features, test_features = (
        pd.read_csv(source) for source in feature_sources
    )
    # Targets are collapsed from a DataFrame to a flat 1-D array.
    train_target, test_target = (
        pd.read_csv(source).values.ravel() for source in target_sources
    )
    return train_features, test_features, train_target, test_target

def feature_engineering(data, lag=7):
    """Add lagged features and an interaction term to a copy of ``data``.

    Three columns are appended:

    * ``calls_per_duty_lag{lag}`` - ``calls_per_duty`` shifted by ``lag`` rows
    * ``avg_sick_last_30_days_lag{lag}`` - ``avg_sick_last_30_days`` shifted
      by ``lag`` rows
    * ``interaction_term`` - product of ``calls_per_duty`` and
      ``avg_sick_last_30_days``

    Args:
        data: DataFrame containing the columns ``calls_per_duty`` and
            ``avg_sick_last_30_days``. It is not modified.
            NOTE(review): the shift is positional, so rows are presumably
            expected in chronological order - confirm against the caller.
        lag: Number of rows to shift by. The default of 7 yields the
            original ``*_lag7`` column names.

    Returns:
        A new DataFrame with the extra columns. Every row containing a NaN
        in *any* column is dropped, which includes the first ``lag`` rows
        (their lagged values are undefined).

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy so the caller's DataFrame is not silently extended.
    enriched = data.copy()

    enriched[f'calls_per_duty_lag{lag}'] = enriched['calls_per_duty'].shift(lag)
    enriched[f'avg_sick_last_30_days_lag{lag}'] = (
        enriched['avg_sick_last_30_days'].shift(lag)
    )
    enriched['interaction_term'] = (
        enriched['calls_per_duty'] * enriched['avg_sick_last_30_days']
    )
    return enriched.dropna()

def custom_loss(y_true, y_pred, under_weight=3.0):
    """Return the summed asymmetric absolute error of a prediction.

    Under-estimation (``y_true > y_pred``) is penalised ``under_weight``
    times as heavily as over-estimation, which is counted with weight 1.

    Args:
        y_true: Observed values (list, array or Series).
        y_pred: Predicted values, same length as ``y_true``.
        under_weight: Penalty factor for under-estimation. Defaults to 3,
            the factor used for under-coverage.

    Returns:
        The total (not mean) weighted absolute error as a float, so the
        value grows with the number of samples.
    """
    # Coerce to plain float arrays: lists would otherwise fail on the
    # subtraction, and pandas Series with different indexes cannot be
    # compared element-wise.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    shortfall = actual - predicted  # > 0 means the need was under-estimated
    loss = np.where(shortfall > 0, under_weight * shortfall, np.abs(shortfall))
    return float(loss.sum())

def train_random_forest(X_train, y_train, param_dist, *, n_iter=50, n_splits=5,
                        random_state=42, n_jobs=-1, verbose=2):
    """Tune and fit a random-forest regressor with a randomized search.

    Candidates are scored with ``custom_loss`` on the folds of a
    ``TimeSeriesSplit``. The scorer is created with
    ``greater_is_better=False``, so scikit-learn maximises the negated loss.

    Args:
        X_train: Training features.
        y_train: Training target.
        param_dist: Hyper-parameter search space passed to
            ``RandomizedSearchCV`` as ``param_distributions``.
        n_iter: Number of parameter settings to sample.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        random_state: Seed used for both the forest and the sampler.
        n_jobs: Parallel jobs for the search (``-1`` uses all cores).
        verbose: Verbosity level of the search.

    Returns:
        Tuple ``(best_estimator, best_params)`` from the fitted search.
    """
    custom_scorer = make_scorer(custom_loss, greater_is_better=False)
    tscv = TimeSeriesSplit(n_splits=n_splits)

    rf_random = RandomizedSearchCV(
        estimator=RandomForestRegressor(random_state=random_state),
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=tscv,
        verbose=verbose,
        random_state=random_state,
        n_jobs=n_jobs,
        scoring=custom_scorer,
    )

    rf_random.fit(X_train, y_train)
    return rf_random.best_estimator_, rf_random.best_params_

def evaluate_model(model, X_test, y_test):
    """Score a fitted model on the held-out test set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True target values for ``X_test``.

    Returns:
        Tuple ``(mae, rmse, r2, custom_loss_value)``.
    """
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    # ``squared=False`` was deprecated in scikit-learn 1.4 and removed in 1.6.
    # Taking the root of the MSE works on every version and gives the same
    # value for a single-output target.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    custom_loss_value = custom_loss(y_test, predictions)
    return mae, rmse, r2, custom_loss_value

def save_model(model, path):
    """Persist the trained model to ``path`` with joblib."""
    joblib.dump(value=model, filename=path)

def main():
    """Tune, evaluate, report on and persist the random-forest model.

    Steps:
        1. Load the persisted train/test splits.
        2. Run the randomized hyper-parameter search on the training data.
        3. Print MAE, RMSE, R² and the custom loss on the test set.
        4. Print time-series cross-validation scores for the best model.
        5. Print the feature importances and save the model.
    """
    # Function-scope import keeps this block self-contained.
    from pathlib import Path

    # Build paths from components so they resolve on every operating system;
    # backslash-separated literals only work on Windows.
    data_dir = Path("..") / "data"
    model_save_path = Path("..") / "models" / "fortgeschrittenes_model.pkl"

    # Paths to the persisted training and test sets
    X_train_path = data_dir / "X_train.csv"
    X_test_path = data_dir / "X_test.csv"
    y_train_path = data_dir / "y_train.csv"
    y_test_path = data_dir / "y_test.csv"

    # Training uses only the persisted splits, so the preprocessed source
    # file is not read here.
    X_train, X_test, y_train, y_test = load_splits(X_train_path, X_test_path, y_train_path, y_test_path)

    # Search space for RandomizedSearchCV
    param_dist = {
        'n_estimators': [100, 200, 500, 1000],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    # Train the random-forest model
    best_rf, best_params = train_random_forest(X_train, y_train, param_dist)
    print(f'Best Parameters: {best_params}')

    # Predict on the test set and compute the metrics
    mae, rmse, r2, custom_loss_value = evaluate_model(best_rf, X_test, y_test)
    print(f'Optimiertes Model MAE: {mae:.2f}')
    print(f'Optimiertes Model RMSE: {rmse:.2f}')
    print(f'Optimiertes R²: {r2:.2f}')
    print(f'Custom Loss on Test Set: {custom_loss_value:.2f}')

    # Cross-validation with TimeSeriesSplit and the custom loss.
    # The scorer is built with greater_is_better=False, so the reported
    # scores are negated losses: values closer to zero are better.
    cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=TimeSeriesSplit(n_splits=5), scoring=make_scorer(custom_loss, greater_is_better=False))
    print(f'Custom Loss Cross-Validation Scores: {cross_val_scores}')
    print(f'Mean Custom Loss: {np.mean(cross_val_scores):.2f}')

    # Feature importances, labelled with the columns the model was actually
    # trained on so a differing column order cannot mislabel them.
    importances = best_rf.feature_importances_
    feature_importances = pd.DataFrame({
        'feature': list(X_train.columns),
        'importance': importances
    }).sort_values(by='importance', ascending=False)
    print("Feature Importances:")
    print(feature_importances)

    # Persist the model
    save_model(best_rf, model_save_path)

if __name__ == "__main__":
    main()


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': True}
Optimiertes Model MAE: 4.47
Optimiertes Model RMSE: 14.19
Optimiertes R²: 0.79
Custom Loss on Test Set: 1862.52
Custom Loss Cross-Validation Scores: [-1386.642      -2256.834      -1732.987      -1890.776
  -977.25442857]
Mean Custom Loss: -1648.90
Feature Importances:
          feature  importance
0  calls_per_duty    0.909906
1       month_sin    0.031155
3            year    0.029035
4     day_of_week    0.015975
2       month_cos    0.013930
