In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file; handed unchanged to
            ``pandas.read_csv``.

    Returns:
        The parsed file contents as a ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path)
    return frame

def split_data(data, features, target, test_size=0.2, random_state=37):
    """Split a dataset into training and test portions.

    Args:
        data: Table that is indexed with ``features`` and ``target``.
        features: Column selection used as model inputs.
        target: Column selection used as the prediction target.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        Whatever ``train_test_split`` returns for the selected inputs and
        target; the caller unpacks it as
        ``X_train, X_test, y_train, y_test``.
    """
    feature_frame, target_values = data[features], data[target]
    return train_test_split(
        feature_frame,
        target_values,
        test_size=test_size,
        random_state=random_state,
    )

def save_splits(X_train, X_test, y_train, y_test, output_dir):
    """Write the train and test splits to CSV files.

    Each object is written via its ``to_csv`` method (without the index) to
    ``<output_dir>/<name>.csv``, where ``<name>`` is ``X_train``, ``X_test``,
    ``y_train`` or ``y_test``.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        output_dir: Directory prefix the four files are written under.
    """
    named_splits = (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    )
    for split_name, split in named_splits:
        split.to_csv(f'{output_dir}/{split_name}.csv', index=False)

def evaluate_model(model, X_test, y_test):
    """Evaluate a fitted model and return MAE, RMSE and R².

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        A tuple ``(mae, rmse, r2)``.
    """
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    # The ``squared=False`` argument of ``mean_squared_error`` was deprecated
    # in scikit-learn 1.4 and removed in 1.6, so the RMSE is derived here
    # instead: square root of the per-output MSE, then the mean across
    # outputs. This is the same quantity ``squared=False`` used to produce.
    per_output_mse = mean_squared_error(
        y_test, predictions, multioutput='raw_values'
    )
    rmse = np.sqrt(per_output_mse).mean()
    r2 = r2_score(y_test, predictions)
    return mae, rmse, r2

def train_baseline_model(X_train, y_train):
    """Fit the baseline model and return it.

    The baseline is a ``DummyRegressor`` configured with
    ``strategy='median'``.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training target passed to ``fit``.

    Returns:
        The fitted ``DummyRegressor``.
    """
    # ``fit`` returns the estimator itself, so the call can be chained.
    return DummyRegressor(strategy='median').fit(X_train, y_train)

def save_model(model, path):
    """Persist a trained model to disk with ``joblib.dump``.

    Args:
        model: The object to serialize.
        path: Destination file the model is written to.
    """
    joblib.dump(value=model, filename=path)

def main():
    """Run the baseline pipeline end to end.

    Loads the preprocessed data, splits it into train and test sets, writes
    the splits to CSV, fits the median ``DummyRegressor`` baseline, prints its
    MAE, RMSE and R² on the test set, and saves the fitted model.

    All paths are relative to the current working directory.
    """
    # Imported locally so this function stays self-contained.
    from pathlib import Path

    # File paths. Built from components instead of backslash-separated
    # literals so they resolve on POSIX systems as well as on Windows.
    data_dir = Path('..') / 'data'
    input_file_path = data_dir / 'processed_data' / 'preprocessed_data.csv'
    baseline_model_path = Path('..') / 'models' / 'baseline_model.pkl'
    output_dir = data_dir

    # Read the data.
    data = load_data(input_file_path)

    # Feature and target columns.
    features = ['calls_per_duty', 'month_sin', 'month_cos', 'year', 'day_of_week']
    target = 'adjusted_need_cleaned'

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = split_data(data, features, target)

    # Save the splits; make sure the destination directory exists first.
    output_dir.mkdir(parents=True, exist_ok=True)
    save_splits(X_train, X_test, y_train, y_test, output_dir)

    # Train and evaluate the baseline model.
    baseline_model = train_baseline_model(X_train, y_train)
    mae_baseline, rmse_baseline, r2_baseline = evaluate_model(baseline_model, X_test, y_test)

    # Report the results.
    print(f'Baseline MAE: {mae_baseline:.2f}')
    print(f'Baseline RMSE: {rmse_baseline:.2f}')
    print(f'Baseline R²: {r2_baseline:.2f}')

    # Save the baseline model; create the models directory if it is missing.
    baseline_model_path.parent.mkdir(parents=True, exist_ok=True)
    save_model(baseline_model, baseline_model_path)


if __name__ == "__main__":
    main()


Baseline MAE: 11.28
Baseline RMSE: 31.06
Baseline R²: -0.01
