In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [29]:
# Load the dataset from the Parquet file "data.parquet" (path is relative to the
# kernel's working directory) into the notebook-wide DataFrame `df`.
df = pd.read_parquet("data.parquet")

In [30]:
# Show the distinct values of the `category` column.
print(df["category"].unique())

['Furniture' 'Office Supplies' 'Technology']


In [37]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

def model_training_tuning_extended(df_ml, features: list, target: str, top_n: int = 10):
    """Grid-search two tree ensembles and cross-validate a linear baseline.

    The selected feature columns are one-hot encoded (first level of every
    encoded column dropped). Afterwards:

    * ``LinearRegression`` has an empty parameter grid, so it is only scored
      with 5-fold ``cross_validate`` and then fitted on all rows.
    * ``RandomForest`` and ``GradientBoosting`` are tuned with a 5-fold
      ``GridSearchCV`` that tracks R2, MAE and RMSE and refits the candidate
      with the best mean R2.

    Args:
        df_ml: DataFrame holding the feature columns and the target column.
        features: Names of the columns used as predictors.
        target: Name of the column to predict.
        top_n: Number of best-ranked parameter combinations printed per tuned
            model. Defaults to 10, matching the "Top 10" heading; pass a
            larger number to see more rows.

    Returns:
        dict mapping each model name to a fitted estimator. For the tuned
        models this is the grid search's ``best_estimator_``.
    """
    # Function-scope import: the notebook's import cells do not provide it.
    from sklearn.model_selection import cross_validate

    X = df_ml[features]
    y = df_ml[target]

    # Columns that are always treated as categorical when they are selected.
    # Any other non-numeric feature (e.g. 'category') is encoded as well, so
    # that no raw string column is handed to the estimators.
    default_categoricals = ['sub_category', 'region', 'segment']
    categorical_features = [col for col in default_categoricals if col in X.columns]
    categorical_features += [
        col for col in X.columns
        if col not in categorical_features
        and not pd.api.types.is_numeric_dtype(X[col])
    ]

    # One-hot encoding (skipped when there is nothing to encode).
    if categorical_features:
        X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)
    else:
        X_encoded = X.copy()

    models = {
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(random_state=42),
        "GradientBoosting": GradientBoostingRegressor(random_state=42)
    }

    # Extended parameter space.
    param_grids = {
        "LinearRegression": {},  # no hyperparameters worth tuning

        "RandomForest": {
            'n_estimators': [200, 500],        # more trees
            'max_depth': [10, 20, 30, None],   # allow deeper trees
            'min_samples_split': [2, 5, 10],   # guards against overfitting
            'min_samples_leaf': [1, 2, 4]      # minimum samples per leaf
        },

        "GradientBoosting": {
            'n_estimators': [200, 500],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7],            # GB often prefers shallower trees than RF
            'subsample': [0.8, 1.0]            # 0.8 fits each tree on 80% of the rows
        }
    }

    # Metrics tracked for every candidate.
    scoring_metrics = {
        'R2': 'r2',
        'MAE': 'neg_mean_absolute_error',
        'RMSE': 'neg_root_mean_squared_error'
    }

    results_summary = {}

    for name, model in models.items():
        print(f"\n{'='*80}")
        print(f"Starte Tuning für: {name}")
        print(f"{'='*80}")

        if not param_grids[name]:
            # Nothing to tune: report plain cross-validation scores instead.
            scores = cross_validate(model, X_encoded, y, cv=5, scoring=scoring_metrics)
            print("Keine Parameter zu tunen. Standard-Ergebnisse:")
            print(f"R2: {scores['test_R2'].mean():.4f} | MAE: {-scores['test_MAE'].mean():.2f} | RMSE: {-scores['test_RMSE'].mean():.2f}")
            results_summary[name] = model.fit(X_encoded, y)
            continue

        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            cv=5,
            scoring=scoring_metrics,
            refit='R2',  # metric that decides the winning candidate
            n_jobs=-1,
            verbose=1    # log progress while fitting
        )

        grid_search.fit(X_encoded, y)

        # --- Evaluation of the tested combinations ---
        results_df = pd.DataFrame(grid_search.cv_results_)
        cols_to_keep = ['params', 'mean_test_R2', 'mean_test_MAE', 'mean_test_RMSE']
        view_df = results_df[cols_to_keep].copy()

        # The error scorers are negated (the 'neg_' prefix) so that larger is
        # better; flip the sign back for display.
        view_df['mean_test_MAE'] = -view_df['mean_test_MAE']
        view_df['mean_test_RMSE'] = -view_df['mean_test_RMSE']

        # Best R2 first, then keep only the rows announced in the heading.
        view_df = view_df.sort_values(by='mean_test_R2', ascending=False).head(top_n)

        print(f"\nAlle getesteten Kombinationen für {name} (Top {top_n} Auszug):")
        print("-" * 100)
        print(f"{'R2':<10} | {'MAE':<10} | {'RMSE':<10} | {'Parameters'}")
        print("-" * 100)

        for row in view_df.itertuples(index=False):
            params_str = str(row.params)
            print(f"{row.mean_test_R2:<10.4f} | {row.mean_test_MAE:<10.2f} | {row.mean_test_RMSE:<10.2f} | {params_str}")

        print("-" * 100)
        print(f"BESTER {name}: R2={grid_search.best_score_:.4f}")
        print(f"Beste Params: {grid_search.best_params_}")

        results_summary[name] = grid_search.best_estimator_

    return results_summary

In [None]:
# Earlier feature set, kept for reference:
#   sales, quantity, discount, category, sub_category, region, segment
target = "profit"
features = [
    "sales",
    "quantity",
    "original_price_per_unit",
    "markdown_amount",
    "sub_category",
    "region",
    "segment",
]

# Run the extended tuning and show the returned name -> estimator mapping.
best_models = model_training_tuning_extended(df_ml=df, features=features, target=target)
print(best_models)


Starte Tuning für: LinearRegression
Keine Parameter zu tunen. Standard-Ergebnisse:
R2: 0.7277 | MAE: 36.91 | RMSE: 109.32

Starte Tuning für: RandomForest
Fitting 5 folds for each of 72 candidates, totalling 360 fits


## Train

In [34]:
def model_trianing(df_ml, features : list, target : str):
    """Cross-validate three fixed regressors and print mean R2 and MAE per model.

    The selected feature columns are one-hot encoded (first level of every
    encoded column dropped). Each model is then evaluated with 5-fold
    cross-validation and one table row is printed per model.

    Args:
        df_ml: DataFrame holding the feature columns and the target column.
        features: Names of the columns used as predictors.
        target: Name of the column to predict.

    Returns:
        None. The results are only printed.
    """
    # Function-scope import: the notebook's import cells do not provide it.
    from sklearn.model_selection import cross_validate

    X = df_ml[features]
    y = df_ml[target]

    # Columns that are always treated as categorical when they are selected.
    # Any other non-numeric feature (e.g. 'category') is encoded as well, so
    # that no raw string column is handed to the estimators.
    default_categoricals = ['sub_category', 'region', 'segment']
    categorical_features = [col for col in default_categoricals if col in X.columns]
    categorical_features += [
        col for col in X.columns
        if col not in categorical_features
        and not pd.api.types.is_numeric_dtype(X[col])
    ]

    if categorical_features:
        X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)
    else:
        X_encoded = X.copy()

    models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(
        max_depth=20,
        min_samples_leaf=2,
        min_samples_split=5,
        n_estimators=500,
        random_state=42,
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=500,
        random_state=42,
        subsample=0.8
    ),
    }

    print(f"{'Model':<20} | {'R2 Score (Mean)':<15} | {'MAE (Mean USD)':<15}")
    print("-" * 60)

    # Both metrics come from one cross-validation pass, so every model is
    # fitted 5 times instead of 10.
    scoring = {'R2': 'r2', 'MAE': 'neg_mean_absolute_error'}
    for name, model in models.items():
        scores = cross_validate(model, X_encoded, y, cv=5, scoring=scoring)
        # The MAE scorer is negated ('neg_' prefix); flip the sign for display.
        print(f"{name:<20} | {scores['test_R2'].mean():<15.4f} | {-scores['test_MAE'].mean():<15.2f}")

In [35]:
# Feature set that includes the `discount` column.
features = [
    "sales",
    "quantity",
    "discount",
    "original_price_per_unit",
    "markdown_amount",
    "sub_category",
    "region",
    "segment",
]
model_trianing(df_ml=df, features=features, target=target)

Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.7284          | 36.86          
RandomForest         | 0.7216          | 18.84          
GradientBoosting     | 0.8341          | 16.83          


In [36]:
# Same feature set as the previous run, but without the `discount` column.
features = [
    "sales",
    "quantity",
    "original_price_per_unit",
    "markdown_amount",
    "sub_category",
    "region",
    "segment",
]
model_trianing(df_ml=df, features=features, target=target)

Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.7277          | 36.91          
RandomForest         | 0.7211          | 19.09          
GradientBoosting     | 0.8587          | 16.47          


In [15]:
# One sub-frame per product category.
df_1, df_2, df_3 = (
    df[df["category"] == label]
    for label in ("Furniture", "Office Supplies", "Technology")
)

target = "profit"
features = [
    "sales",
    "quantity",
    "discount",
    "category",
    "sub_category",
    "region",
    "segment",
]

# Evaluate the models separately on every category subset.
for category_df in (df_1, df_2, df_3):
    model_trianing(category_df, features, target)

Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.3028          | 63.32          
RandomForest         | 0.8013          | 26.62          
GradientBoosting     | 0.8056          | 27.20          
Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.2980          | 38.09          
RandomForest         | 0.6778          | 10.67          
GradientBoosting     | 0.7332          | 9.73           
Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.1687          | 122.02         
RandomForest         | 0.4963          | 48.85          
GradientBoosting     | 0.6314          | 45.61          


In [16]:
# Keep only rows whose ship_mode is First, Second or Standard Class.
# This rebinds the notebook-wide `df`, so later cells see the filtered frame.
df = df.loc[df["ship_mode"].isin(["First Class", "Second Class", "Standard Class"])]

# One sub-frame per product category, taken from the filtered data.
df_1, df_2, df_3 = (
    df[df["category"] == label]
    for label in ("Furniture", "Office Supplies", "Technology")
)

target = "profit"
features = [
    "sales",
    "quantity",
    "discount",
    "category",
    "sub_category",
    "region",
    "segment",
]

# Evaluate the models separately on every category subset.
for category_df in (df_1, df_2, df_3):
    model_trianing(category_df, features, target)

Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.3035          | 63.82          
RandomForest         | 0.8098          | 26.65          
GradientBoosting     | 0.8093          | 27.54          
Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.2928          | 39.28          
RandomForest         | 0.6705          | 11.00          
GradientBoosting     | 0.7500          | 9.73           
Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.2007          | 116.20         
RandomForest         | 0.5386          | 47.64          
GradientBoosting     | 0.6515          | 42.12          


In [17]:
# First run on `df` as it currently stands. `features` and `target` are only
# assigned further down in this cell, so this call uses whatever values an
# earlier cell left in the kernel.
model_trianing(df, features, target)

# Keep only rows whose ship_mode is First, Second or Standard Class.
# NOTE(review): the In [16] cell already applies this same filter to `df`; if it
# ran first, this line is a no-op, which would explain why the two result tables
# printed by this cell are identical. Confirm which comparison was intended.
df = df[df['ship_mode'].isin(['First Class','Second Class','Standard Class'])]

features = [
    'sales', 'quantity', 'discount',     
    'category', 'sub_category', 'region', 'segment'
]
target = 'profit'

# Second run, on the filtered frame with the feature list defined above.
model_trianing(df, features, target)

Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.2143          | 59.94          
RandomForest         | 0.6503          | 19.78          
GradientBoosting     | 0.7774          | 19.81          
Model                | R2 Score (Mean) | MAE (Mean USD) 
------------------------------------------------------------
LinearRegression     | 0.2143          | 59.94          
RandomForest         | 0.6503          | 19.78          
GradientBoosting     | 0.7774          | 19.81          
