In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, cross_validate

In [2]:
# Read the dataset from the Parquet file next to the notebook.
df = pd.read_parquet(path="data.parquet")

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


def model_training_tuning_extended(df_ml, features: list, target: str, top_n: int = 10):
    """Tune and cross-validate three regressors and return the fitted winners.

    The selected feature columns are one-hot encoded. LinearRegression has an
    empty parameter grid and is only cross-validated; RandomForest and
    GradientBoosting are searched with GridSearchCV. Every candidate is scored
    with R2, MAE and RMSE using cv=5, and the grid search refits on the best R2.
    Progress and score tables are printed along the way.

    Args:
        df_ml: DataFrame holding the feature and target columns.
        features: Names of the columns used as predictors.
        target: Name of the column to predict.
        top_n: Number of best-ranked grid combinations printed per model.
            Pass None to print every tested combination.

    Returns:
        dict mapping each model name to an estimator fitted on all rows
        (the grid search's best estimator, or the plain LinearRegression).
    """
    X = df_ml[features]
    y = df_ml[target]

    # Encode only the categorical columns that were actually selected:
    # get_dummies raises a KeyError for a column that is missing from X.
    known_categoricals = ['sub_category', 'region', 'segment', 'order_month']
    categorical_features = [col for col in known_categoricals if col in X.columns]

    # One-hot encoding (the first level of each category is dropped)
    if categorical_features:
        X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)
    else:
        X_encoded = X

    models = {
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(random_state=42),
        "GradientBoosting": GradientBoostingRegressor(random_state=42)
    }

    # Extended parameter space; an empty grid means "nothing to tune"
    param_grids = {
        "LinearRegression": {},

        "RandomForest": {
            'n_estimators': [200, 500],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        },

        "GradientBoosting": {
            'n_estimators': [200, 500],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 1.0]
        }
    }

    # Metrics; the error scorers are negated by scikit-learn ("neg_" prefix)
    scoring_metrics = {
        'R2': 'r2',
        'MAE': 'neg_mean_absolute_error',
        'RMSE': 'neg_root_mean_squared_error'
    }

    # The header only announces an excerpt when the listing is truncated.
    excerpt_note = f" (Top {top_n} Auszug)" if top_n is not None else ""

    results_summary = {}

    for name, model in models.items():
        print(f"\n{'='*80}")
        print(f"Starte Tuning für: {name}")
        print(f"{'='*80}")

        if not param_grids[name]:
            # Fallback for models without a grid: plain cross-validation
            scores = cross_validate(model, X_encoded, y, cv=5, scoring=scoring_metrics)
            print("Keine Parameter zu tunen. Standard-Ergebnisse:")
            print(f"R2: {scores['test_R2'].mean():.4f} | MAE: {-scores['test_MAE'].mean():.2f} | RMSE: {-scores['test_RMSE'].mean():.2f}")
            results_summary[name] = model.fit(X_encoded, y)
            continue

        # GridSearchCV setup; refit='R2' selects and refits the best-R2 candidate
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            cv=5,
            scoring=scoring_metrics,
            refit='R2',
            n_jobs=-1,
            verbose=1
        )

        grid_search.fit(X_encoded, y)

        # Collect the mean scores of every tested combination
        results_df = pd.DataFrame(grid_search.cv_results_)
        cols_to_keep = ['params', 'mean_test_R2', 'mean_test_MAE', 'mean_test_RMSE']
        view_df = results_df[cols_to_keep].copy()

        # Flip the negated error scores back to positive values
        view_df['mean_test_MAE'] = -view_df['mean_test_MAE']
        view_df['mean_test_RMSE'] = -view_df['mean_test_RMSE']

        # Best R2 first, then keep only the announced excerpt
        view_df = view_df.sort_values(by='mean_test_R2', ascending=False)
        if top_n is not None:
            view_df = view_df.head(top_n)

        print(f"\nAlle getesteten Kombinationen für {name}{excerpt_note}:")
        print("-" * 100)
        print(f"{'R2':<10} | {'MAE':<10} | {'RMSE':<10} | {'Parameters'}")
        print("-" * 100)

        for row in view_df.itertuples(index=False):
            print(f"{row.mean_test_R2:<10.4f} | {row.mean_test_MAE:<10.2f} | {row.mean_test_RMSE:<10.2f} | {row.params}")

        print("-" * 100)
        print(f"BESTER {name}: R2={grid_search.best_score_:.4f}")
        print(f"Beste Params: {grid_search.best_params_}")

        results_summary[name] = grid_search.best_estimator_

    return results_summary

In [None]:
# Target column and predictor columns handed to the tuning routine.
target = 'profit'
features = [
    'sales',
    'quantity',
    'original_price_per_unit',
    'markdown_amount',
    'sub_category',
    'region',
    'segment',
    'order_month',
]

# Run the search and show the returned mapping of model name -> estimator.
best_models = model_training_tuning_extended(df, features, target)
print(best_models)

NameError: name 'model_training_tuning_extended' is not defined

## Train

In [4]:
def model_trianing(df_ml, features: list, target: str):
    """Cross-validate three pre-configured regressors and print a score table.

    The selected feature columns are one-hot encoded, then LinearRegression,
    RandomForest and GradientBoosting (with fixed hyperparameters) are each
    evaluated with cv=5. The mean R2 and mean MAE per model are printed;
    nothing is returned.

    NOTE: the function name is misspelled ("trianing") but is kept unchanged
    because existing notebook cells call it under this name.

    Args:
        df_ml: DataFrame holding the feature and target columns.
        features: Names of the columns used as predictors.
        target: Name of the column to predict.
    """
    X = df_ml[features]
    y = df_ml[target]

    # Encode only the categorical columns that were actually selected:
    # get_dummies raises a KeyError for a column that is missing from X.
    known_categoricals = ['sub_category', 'region', 'segment', 'order_month']
    categorical_features = [col for col in known_categoricals if col in X.columns]

    if categorical_features:
        X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)
    else:
        X_encoded = X

    models = {
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(
            max_depth=20,
            min_samples_leaf=2,
            min_samples_split=5,
            n_estimators=500,
            random_state=42,
        ),
        "GradientBoosting": GradientBoostingRegressor(
            n_estimators=500,
            random_state=42,
            subsample=0.8
        ),
    }

    # The MAE scorer is negated by scikit-learn and flipped back when printing.
    scoring = {'R2': 'r2', 'MAE': 'neg_mean_absolute_error'}

    print(f"{'Model':<20} | {'R2 Score (Mean)':<15} | {'MAE (Mean USD)':<15}")
    print("-" * 60)

    for name, model in models.items():
        # A single multi-metric pass fits each fold once instead of once per
        # metric, halving the training work with the same folds and seeds.
        scores = cross_validate(model, X_encoded, y, cv=5, scoring=scoring)
        print(f"{name:<20} | {scores['test_R2'].mean():<15.4f} | {-scores['test_MAE'].mean():<15.2f}")

Beste Feature-Wahl: features = ['sales', 'quantity', 'original_price_per_unit', 'markdown_amount', 'sub_category', 'region', 'segment', 'order_month']

Beste Parameter RF: max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=500, random_state=42

Beste Parameter GB: n_estimators=500, random_state=42, subsample=0.8

Bestes Modell bei dieser Kombination: GB mit R2-Score 0.8648 und MAE 16.59

Problem: Der MAE ist fast doppelt so groß wie der Median des Profits.

In [5]:
# Target column and predictor columns handed to the evaluation routine.
target = 'profit'
features = [
    'sales',
    'quantity',
    'original_price_per_unit',
    'markdown_amount',
    'sub_category',
    'region',
    'segment',
    'order_month',
]

# Prints the cross-validated score table for the pre-configured models.
model_trianing(df, features, target)

Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.7270          | 37.15          
RandomForest         | 0.7142          | 19.33          
GradientBoosting     | 0.8648          | 16.59          
