In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


df = pd.read_parquet("data.parquet")

if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

new_columns_names = ["row_id", "order_id", "ship_mode", "customer_id", "customer_name", "segment", "country", "city", "state", "postal_code", "region", "product_id", 
    "category", "sub_category", "product_name", "sales", "quantity", "discount", "profit", "order_year", "order_month", "order_day", "ship_year", "ship_month", "ship_day"]

if len(df.columns) == len(new_columns_names):
    df.columns = new_columns_names
else: 
    pass 

if "order_year" in df.columns:
    df["quantity"] = df["quantity"].astype(int)

    df["order_date"] = pd.to_datetime(
        df["order_year"].astype(str) + '-' +
        df["order_month"].astype(str) + '-' +
        df["order_day"].astype(str)             
    )

    df["order_weekday_index"] = df["order_date"].dt.weekday
    df["order_weekday"] = df["order_date"].dt.day_name()
    df["order_week_of_year"] = df["order_date"].dt.isocalendar().week
    df["order_is_weekend"] = df["order_weekday_index"] >= 5

    df["ship_date"] = pd.to_datetime(
        df["ship_year"].astype(str) + '-' +
        df["ship_month"].astype(str) + '-' +
        df["ship_day"].astype(str)             
    )
    
    df["original_price_per_unit"] = df["sales"] / (df["quantity"] * (1- df["discount"]))
    df["markdown_amount"] = (df["original_price_per_unit"] * df["quantity"]) - df["sales"]

target_col = "profit"

categorical_cols = [
    "sub_category",
    "region",
    "segment",
    "order_month"
]

numeric_cols = [
    "sales",
    "quantity",
    "original_price_per_unit",
    "markdown_amount"
]

X = df[categorical_cols + numeric_cols]
y = df[target_col]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_cols),
        ("cat", OneHotEncoder(drop="first", sparse_output=False, dtype=int, handle_unknown="error"), categorical_cols),
    ],
    verbose_feature_names_out=False
)


def train_models_lol(random_seed):
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=random_seed
    )

    print("Starte Preprocessing...")
    X_train_proc = preprocessor.fit_transform(X_train)
    X_val_proc = preprocessor.transform(X_val)
    print("Preprocessing abgeschlossen.")

    models = {
        "LinearRegression": LinearRegression(),
        "RandomForest": RandomForestRegressor(
            max_depth=20,
            min_samples_leaf = 2,
            min_samples_split = 5,
            n_estimators =500,
            random_state = 42,
            n_jobs=-1,
        ),
        "GradientBoosting": GradientBoostingRegressor(
            n_estimators=500,
            subsample=0.8,
            random_state=42,
        ),
    }

    print("\n" + "="*60)
    print(f"Random Seed: {random_seed}")
    print(f"{'Model':<20} | {'R2 Score':<10} | {'MAE':<10} | {'RMSE':<10}")
    print("="*60)

    for model_name, model in models.items():
        model.fit(X_train_proc, y_train)
        preds = model.predict(X_val_proc)

        mse = mean_squared_error(y_val, preds)
        rmse = np.sqrt(mse)
        
        mae = mean_absolute_error(y_val, preds)
        r2 = r2_score(y_val, preds)
        
        print(f"{model_name:<20} | {r2:<10.4f} | {mae:<10.2f} | {rmse:<10.2f}")

    print("="*60)

In [3]:
for element in [39, 40, 41, 42, 43, 44, 45, 187]:
   train_models_lol(element)    

Starte Preprocessing...
Preprocessing abgeschlossen.

Random Seed: 39
Model                | R2 Score   | MAE        | RMSE      
LinearRegression     | 0.8446     | 36.03      | 99.83     
RandomForest         | 0.9003     | 18.72      | 79.95     
GradientBoosting     | 0.9568     | 15.74      | 52.66     
Starte Preprocessing...
Preprocessing abgeschlossen.

Random Seed: 40
Model                | R2 Score   | MAE        | RMSE      
LinearRegression     | 0.8729     | 36.46      | 89.53     
RandomForest         | 0.9339     | 16.53      | 64.54     
GradientBoosting     | 0.9381     | 15.22      | 62.48     
Starte Preprocessing...
Preprocessing abgeschlossen.

Random Seed: 41
Model                | R2 Score   | MAE        | RMSE      
LinearRegression     | 0.8238     | 36.40      | 86.42     
RandomForest         | 0.9070     | 16.58      | 62.77     
GradientBoosting     | 0.9280     | 15.41      | 55.25     
Starte Preprocessing...
Preprocessing abgeschlossen.

Random Seed: 42


In [4]:
zahlen = [random.randint(1, 500) for _ in range(20)]
print(zahlen)
for element in zahlen:
   train_models_lol(element)

[368, 438, 331, 239, 457, 380, 380, 47, 394, 251, 51, 218, 185, 7, 44, 258, 220, 425, 112, 405]
Starte Preprocessing...
Preprocessing abgeschlossen.

Random Seed: 368
Model                | R2 Score   | MAE        | RMSE      
LinearRegression     | 0.8514     | 35.66      | 116.53    
RandomForest         | 0.8035     | 20.08      | 134.01    
GradientBoosting     | 0.8959     | 17.06      | 97.56     
Starte Preprocessing...
Preprocessing abgeschlossen.

Random Seed: 438
Model                | R2 Score   | MAE        | RMSE      
LinearRegression     | 0.7863     | 35.65      | 80.27     
RandomForest         | 0.8891     | 15.75      | 57.82     
GradientBoosting     | 0.9169     | 14.73      | 50.04     
Starte Preprocessing...
Preprocessing abgeschlossen.

Random Seed: 331
Model                | R2 Score   | MAE        | RMSE      
LinearRegression     | 0.8451     | 38.20      | 102.92    
RandomForest         | 0.7812     | 20.57      | 122.33    
GradientBoosting     | 0.9290  