In [7]:
pip install lightgbm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

# === Load data ===
train = pd.read_csv("/workspaces/bakery_sales_prediction/train.csv", parse_dates=["Datum"])
test = pd.read_csv("/workspaces/bakery_sales_prediction/test.csv", parse_dates=["Datum"])
wetter = pd.read_csv("/workspaces/bakery_sales_prediction/wetter.csv", parse_dates=["Datum"])
kiwo = pd.read_csv("/workspaces/bakery_sales_prediction/kiwo.csv", parse_dates=["Datum"])

# === Merge external data ===
train = train.merge(wetter, on="Datum", how="left").merge(kiwo, on="Datum", how="left")
test = test.merge(wetter, on="Datum", how="left").merge(kiwo, on="Datum", how="left")

# === Date features ===
for df in [train, test]:
    df["dayofweek"] = df["Datum"].dt.dayofweek
    df["month"] = df["Datum"].dt.month
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
    df["is_monday"] = (df["dayofweek"] == 0).astype(int)
    df["is_friday"] = (df["dayofweek"] == 4).astype(int)

# === Fill missing weather values ===
for col in ["Bewoelkung", "Temperatur", "Windgeschwindigkeit", "Wettercode"]:
    median = train[col].median()
    train[col] = train[col].fillna(median)
    test[col] = test[col].fillna(median)

# === Lag feature (previous day's Umsatz) ===
train.sort_values(["Warengruppe", "Datum"], inplace=True)
train["Umsatz_lag1"] = train.groupby("Warengruppe")["Umsatz"].shift(1)
train = train.dropna(subset=["Umsatz_lag1"])  # drop first rows where lag is NA

# === Log transform target to reduce skew ===
train["log_Umsatz"] = np.log1p(train["Umsatz"])

# === Features used for training ===
features = [
    "Bewoelkung", "Temperatur", "Windgeschwindigkeit", "Wettercode",
    "KielerWoche", "dayofweek", "month", "is_weekend", "is_monday", "is_friday",
    "Umsatz_lag1"
]

# === Training ===
submission = pd.DataFrame()
mape_scores = []

for wg in sorted(train["Warengruppe"].unique()):
    print(f"Training Warengruppe {wg}")

    train_wg = train[train["Warengruppe"] == wg].copy()
    test_wg = test[test["Warengruppe"] == wg].copy()

    X = train_wg[features]
    y = train_wg["log_Umsatz"]  # log-transformed target

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = lgb.LGBMRegressor(
        n_estimators=200,
        objective="regression",
        random_state=42
    )

    model.fit(X_train, y_train)

    y_val_pred = model.predict(X_val)
    val_pred_original = np.expm1(y_val_pred)
    y_val_original = np.expm1(y_val)

    mape = mean_absolute_percentage_error(y_val_original, val_pred_original)
    mape_scores.append(mape)
    print(f" MAPE: {mape:.4f}")

    # Handle lag feature in test set by merging last training entry
    if "Umsatz_lag1" not in test_wg.columns:
        last_umsatz = train_wg.iloc[-1]["Umsatz"]
        test_wg["Umsatz_lag1"] = last_umsatz

    X_test = test_wg[features]
    y_pred_log = model.predict(X_test)
    y_pred = np.expm1(y_pred_log)

    sub_df = pd.DataFrame({
        "id": test_wg["id"],
        "Umsatz": y_pred
    })
    submission = pd.concat([submission, sub_df], axis=0)

# === Export final predictions ===
submission = submission.sort_values("id")
submission.to_csv("/workspaces/bakery_sales_prediction/lightgbm2_submission.csv", index=False)

print(f"\n Avg MAPE: {sum(mape_scores) / len(mape_scores):.4f}")
print("Submission saved to lightgbm2_submission.csv")

Training Warengruppe 1
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 592
[LightGBM] [Info] Number of data points in the train set: 1454, number of used features: 11
[LightGBM] [Info] Start training from score 4.757708
 MAPE: 0.2160
Training Warengruppe 2
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 592
[LightGBM] [Info] Number of data points in the train set: 1454, number of used features: 11
[LightGBM] [Info] Start training from score 5.946601
 MAPE: 0.1008
Training Warengruppe 3
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000037 