In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [3]:
df = pd.read_parquet("data.parquet")

In [4]:
print(df["category"].unique())

['Furniture' 'Office Supplies' 'Technology']


In [37]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

def model_training_tuning_extended(df_ml, features: list, target: str):
    X = df_ml[features]
    y = df_ml[target]
    categorical_features = ['sub_category', 'region', 'segment']

    # One-Hot Encoding
    X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

    models = {
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(random_state=42),
        "GradientBoosting": GradientBoostingRegressor(random_state=42)
    }

    # Erweiterter Parameterraum
    param_grids = {
        "LinearRegression": {}, # LR hat keine nennenswerten Hyperparameter
        
        "RandomForest": {
            'n_estimators': [200, 500],        # Mehr Bäume
            'max_depth': [10, 20, 30, None],   # Tiefere Bäume erlaubt
            'min_samples_split': [2, 5, 10],   # Verhindert Overfitting
            'min_samples_leaf': [1, 2, 4]      # Wie viele Datenpunkte min. pro Blatt?
        },
        
        "GradientBoosting": {
            'n_estimators': [200, 500],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7],            # GB profitiert oft von flacheren Bäumen als RF
            'subsample': [0.8, 1.0]            # Trainiert jeden Baum nur auf 80% der Daten (Stochastik)
        }
    }

    # Wir definieren mehrere Metriken, die getrackt werden sollen
    scoring_metrics = {
        'R2': 'r2',
        'MAE': 'neg_mean_absolute_error',
        'RMSE': 'neg_root_mean_squared_error'
    }

    results_summary = {}

    for name, model in models.items():
        print(f"\n{'='*80}")
        print(f"Starte Tuning für: {name}")
        print(f"{'='*80}")

        if not param_grids[name]:
            # Fallback für LinearRegression ohne Grid
            from sklearn.model_selection import cross_validate
            scores = cross_validate(model, X_encoded, y, cv=5, scoring=scoring_metrics)
            print(f"Keine Parameter zu tunen. Standard-Ergebnisse:")
            print(f"R2: {scores['test_R2'].mean():.4f} | MAE: {-scores['test_MAE'].mean():.2f} | RMSE: {-scores['test_RMSE'].mean():.2f}")
            results_summary[name] = model.fit(X_encoded, y)
            continue

        # GridSearchCV Setup
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            cv=5,
            scoring=scoring_metrics,
            refit='R2', # Welcher Wert entscheidet über den "Sieger"?
            n_jobs=-1,
            verbose=1   # Zeigt Fortschrittsbalken/Logs an
        )
        
        grid_search.fit(X_encoded, y)
        
        # --- Auswertung aller Kombinationen ---
        results_df = pd.DataFrame(grid_search.cv_results_)
        
        # Wir wählen nur die relevanten Spalten für den Print
        cols_to_keep = ['params', 'mean_test_R2', 'mean_test_MAE', 'mean_test_RMSE']
        view_df = results_df[cols_to_keep].copy()
        
        # Scikit-Learn gibt Fehler als negative Werte zurück (damit "max" besser ist).
        # Wir machen sie für die Anzeige wieder positiv:
        view_df['mean_test_MAE'] = -view_df['mean_test_MAE']
        view_df['mean_test_RMSE'] = -view_df['mean_test_RMSE']
        
        # Sortieren nach R2 (bestes oben)
        view_df = view_df.sort_values(by='mean_test_R2', ascending=False)

        print(f"\nAlle getesteten Kombinationen für {name} (Top 10 Auszug):")
        print("-" * 100)
        # Formatierte Ausgabe der Tabelle
        print(f"{'R2':<10} | {'MAE':<10} | {'RMSE':<10} | {'Parameters'}")
        print("-" * 100)
        
        # Iterieren und printen (hier alle, oder slicing [0:20] nutzen wenn es zu viele sind)
        for index, row in view_df.iterrows():
            params_str = str(row['params'])
            print(f"{row['mean_test_R2']:<10.4f} | {row['mean_test_MAE']:<10.2f} | {row['mean_test_RMSE']:<10.2f} | {params_str}")

        print("-" * 100)
        print(f"BESTER {name}: R2={grid_search.best_score_:.4f}")
        print(f"Beste Params: {grid_search.best_params_}")
        
        results_summary[name] = grid_search.best_estimator_

    return results_summary

In [38]:
features = [
    'sales', 'quantity', "original_price_per_unit", "markdown_amount",
    'sub_category', 'region', 'segment'
]
target = 'profit'
best_models = model_training_tuning_extended(df, features, target)
print(best_models)


Starte Tuning für: LinearRegression
Keine Parameter zu tunen. Standard-Ergebnisse:
R2: 0.7277 | MAE: 36.91 | RMSE: 109.32

Starte Tuning für: RandomForest
Fitting 5 folds for each of 72 candidates, totalling 360 fits

Alle getesteten Kombinationen für RandomForest (Top 10 Auszug):
----------------------------------------------------------------------------------------------------
R2         | MAE        | RMSE       | Parameters
----------------------------------------------------------------------------------------------------
0.7213     | 18.88      | 106.03     | {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}
0.7211     | 19.09      | 107.43     | {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}
0.7209     | 19.00      | 107.35     | {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}
0.7206     | 18.86      | 106.10     | {'max_depth': 30, 'min_samples_leaf': 2, 'min_sam

## Train

In [15]:
def model_trianing(df_ml, features : list, target : str):
    X = df_ml[features]
    y = df_ml[target]
    categorical_features = ['sub_category', 'region', 'segment','order_month', 'order_week_of_year']

    X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)
    
    models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(
        max_depth=20,        
        min_samples_leaf=2, 
        min_samples_split=5,
        n_estimators=500, 
        random_state=42, 
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=500, 
        random_state=42, 
        subsample=0.8
    ),
    }

    print(f"{'Model':<20} | {'R2 Score (Mean)':<15} | {'MAE (Mean USD)':<15}")
    print("-" * 60)

    for name, model in models.items():
        cv_r2 = cross_val_score(model, X_encoded, y, cv=5, scoring='r2')
        cv_mae = cross_val_score(model, X_encoded, y, cv=5, scoring='neg_mean_absolute_error')
        print(f"{name:<20} | {cv_r2.mean():<15.4f} | {-cv_mae.mean():<15.2f}")

Beste Feature-Wahl: features = 'sales', 'quantity', "original_price_per_unit", "markdown_amount", 'sub_category', 'region', 'segment', 'order_month'

Beste Parameter RF:  max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=500, random_state=42, 

Beste Parameter GB:  n_estimators=500, random_state=42, subsample=0.8

Bestes Modell bei dieser Kombination: GB mit R2-SCore 0.8648 und MAE 16.59

Problem: MAE ist fast doppelt so groß wie Median Profit

In [13]:
features = [
    'sales', 'quantity', "original_price_per_unit", "markdown_amount",
    'sub_category', 'region', 'segment', 'order_month'
]
target = 'profit'
model_trianing(df, features, target)

Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.7270          | 37.15          
RandomForest         | 0.7142          | 19.33          
GradientBoosting     | 0.8648          | 16.59          


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

def model_training_refined(df_ml, features: list, target: str):
    # 1. Datenvorbereitung
    X = df_ml[features]
    y = df_ml[target]
    categorical_features = [f for f in ['sub_category', 'region', 'segment', 'order_month', 'order_week_of_year'] if f in features]

    # One-Hot-Encoding
    X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)
    
    # 2. Train-Test-Split (80% Training, 20% Test)
    # Das Testset bleibt völlig unangetastet bis zur finalen Bewertung
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    models = {
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=500, random_state=42),
        "GradientBoosting": GradientBoostingRegressor(n_estimators=500, random_state=42, subsample=0.8),
    }

    print(f"{'Model':<20} | {'R2 (CV)':<10} | {'MAE':<10} | {'RMSE':<10} | {'MAPE (%)':<10}")
    print("-" * 75)

    results = {}

    for name, model in models.items():
        # 3. Cross-Validation auf den Trainingsdaten
        # Wir nutzen cross_validate für mehrere Metriken gleichzeitig
        scoring = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
        cv_results = cross_validate(model, X_train, y_train, cv=5, scoring=scoring)
        
        # 4. Finales Training auf dem kompletten Trainingsset & Test auf Hold-out-Daten
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # Metriken berechnen
        mae = -cv_results['test_neg_mean_absolute_error'].mean()
        rmse = -cv_results['test_neg_root_mean_squared_error'].mean()
        r2 = cv_results['test_r2'].mean()
        # MAPE (Mean Absolute Percentage Error) gibt Aufschluss über relative Abweichungen
        mape = mean_absolute_percentage_error(y_test, y_pred) * 100

        print(f"{name:<20} | {r2:<10.3f} | {mae:<10.2f} | {rmse:<10.2f} | {mape:<10.2f}%")
        
        results[name] = model

    return results, X_test, y_test

In [6]:
features = [
    'sales', 'quantity', "original_price_per_unit", "markdown_amount",
    'sub_category', 'region', 'segment', 'order_month'
]
target = 'profit'
model_training_refined(df, features, target)

Model                | R2 (CV)    | MAE        | RMSE       | MAPE (%)  
---------------------------------------------------------------------------
LinearRegression     | 0.831      | 35.29      | 92.76      | 244323318668277536.00%
RandomForest         | 0.764      | 19.40      | 101.30     | 194182599459704768.00%
GradientBoosting     | 0.853      | 16.66      | 82.01      | 102308756232426032.00%


({'LinearRegression': LinearRegression(),
  'RandomForest': RandomForestRegressor(max_depth=20, min_samples_leaf=2, min_samples_split=5,
                        n_estimators=500, random_state=42),
  'GradientBoosting': GradientBoostingRegressor(n_estimators=500, random_state=42, subsample=0.8)},
         sales  quantity  original_price_per_unit  markdown_amount  \
 3125  563.808         4                   176.19     1.409520e+02   
 1441   36.672         2                    22.92     9.168000e+00   
 4510   37.300         2                    18.65     0.000000e+00   
 39    212.058         3                   100.98     9.088200e+01   
 4509  171.288         3                    71.37     4.282200e+01   
 ...       ...       ...                      ...              ...   
 9956   46.350         5                     9.27    -7.105427e-15   
 1561    2.780         1                     2.78     0.000000e+00   
 1670   16.680         3                     6.95     4.170000e+00   
 69