In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

# === Load data ===
train = pd.read_csv("/workspaces/bakery_sales_prediction/train.csv", parse_dates=["Datum"])
test = pd.read_csv("/workspaces/bakery_sales_prediction/test.csv", parse_dates=["Datum"])
wetter = pd.read_csv("/workspaces/bakery_sales_prediction/wetter.csv", parse_dates=["Datum"])
kiwo = pd.read_csv("/workspaces/bakery_sales_prediction/kiwo.csv", parse_dates=["Datum"])

# === Merge weather and event data ===
train = train.merge(wetter, on="Datum", how="left").merge(kiwo, on="Datum", how="left")
test = test.merge(wetter, on="Datum", how="left").merge(kiwo, on="Datum", how="left")

# === Date features ===
for df in [train, test]:
    df["dayofweek"] = df["Datum"].dt.dayofweek
    df["month"] = df["Datum"].dt.month
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
    df["is_monday"] = (df["dayofweek"] == 0).astype(int)
    df["is_friday"] = (df["dayofweek"] == 4).astype(int)
    df["month_weekend"] = df["month"] * df["is_weekend"]

# === Fill missing values ===
for col in ["Bewoelkung", "Temperatur", "Windgeschwindigkeit", "Wettercode"]:
    median = train[col].median()
    train[col] = train[col].fillna(median)
    test[col] = test[col].fillna(median)

# === Features to use ===
features = [
    "Bewoelkung", "Temperatur", "Windgeschwindigkeit", "Wettercode",
    "dayofweek", "month", "is_weekend", "is_monday", "is_friday", "month_weekend"
]

categorical = ["dayofweek", "month"]

# === Train a model per Warengruppe ===
submission = pd.DataFrame()
mape_scores = []

for wg in sorted(train["Warengruppe"].unique()):
    print(f" Training Warengruppe {wg}")

    train_wg = train[train["Warengruppe"] == wg]
    test_wg = test[test["Warengruppe"] == wg]

    X = train_wg[features]
    y = train_wg["Umsatz"]
    X_test = test_wg[features]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.03,
        num_leaves=64,
        max_depth=8,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="regression",
        random_state=42
    )

    model.fit(
        X_train, y_train,
        categorical_feature=categorical
    )

    y_val_pred = model.predict(X_val)
    mape = mean_absolute_percentage_error(y_val, y_val_pred)
    mape_scores.append(mape)
    print(f" MAPE: {mape:.4f}")

    preds = model.predict(X_test)
    sub_df = pd.DataFrame({"id": test_wg["id"], "Umsatz": preds})
    submission = pd.concat([submission, sub_df], axis=0)

# === Final output ===
submission = submission.sort_values("id")
submission.to_csv("/workspaces/bakery_sales_prediction/lightgbm4_submission.csv", index=False)

print(f"\n Avg MAPE: {sum(mape_scores) / len(mape_scores):.4f}")
print(" Submission saved to lightgbm4_submission.csv")

 Training Warengruppe 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 348
[LightGBM] [Info] Number of data points in the train set: 1455, number of used features: 10
[LightGBM] [Info] Start training from score 122.847943
 MAPE: 0.2153
 Training Warengruppe 2
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 348
[LightGBM] [Info] Number of data points in the train set: 1455, number of used features: 10
[LightGBM] [Info] Start training from score 404.065288
 MAPE: 0.1472
 Training Warengruppe 3
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the 