In [8]:
!pip install lightgbm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

# === Load data ===
train = pd.read_csv("/workspaces/bakery_sales_prediction/train.csv", parse_dates=["Datum"])
test = pd.read_csv("/workspaces/bakery_sales_prediction/test.csv", parse_dates=["Datum"])
wetter = pd.read_csv("/workspaces/bakery_sales_prediction/wetter.csv", parse_dates=["Datum"])
kiwo = pd.read_csv("/workspaces/bakery_sales_prediction/kiwo.csv", parse_dates=["Datum"])

# === Merge weather and Kieler Woche data ===
train = train.merge(wetter, on="Datum", how="left").merge(kiwo, on="Datum", how="left")
test = test.merge(wetter, on="Datum", how="left").merge(kiwo, on="Datum", how="left")

# === Date features ===
for df in [train, test]:
    df["dayofweek"] = df["Datum"].dt.dayofweek
    df["month"] = df["Datum"].dt.month
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)

# === Fill missing values ===
for col in ["Bewoelkung", "Temperatur", "Windgeschwindigkeit", "Wettercode"]:
    median = train[col].median()
    train[col] = train[col].fillna(median)
    test[col].fillna(median, inplace=True)

# === Features ===
features = [
    "Bewoelkung", "Temperatur", "Windgeschwindigkeit", "Wettercode",
    "KielerWoche", "dayofweek", "month", "is_weekend"
]

# === Training per Warengruppe ===
submission = pd.DataFrame()
mape_scores = []

for wg in sorted(train["Warengruppe"].unique()):
    print(f"Training Warengruppe {wg}")
    
    train_wg = train[train["Warengruppe"] == wg]
    test_wg = test[test["Warengruppe"] == wg]
    
    X = train_wg[features]
    y = train_wg["Umsatz"]
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = lgb.LGBMRegressor(
        n_estimators=200,
        objective="regression",
        random_state=42
    )

    model.fit(X_train, y_train)

    y_val_pred = model.predict(X_val)
    mape = mean_absolute_percentage_error(y_val, y_val_pred)
    mape_scores.append(mape)
    print(f"   MAPE: {mape:.4f}")

    preds = model.predict(test_wg[features])
    sub_df = pd.DataFrame({"id": test_wg["id"], "Umsatz": preds})
    submission = pd.concat([submission, sub_df], axis=0)

# === Final export ===
submission = submission.sort_values("id")
submission.to_csv("/workspaces/bakery_sales_prediction/lightgbm_submission.csv", index=False)
print(f"\n Avg MAPE: {sum(mape_scores) / len(mape_scores):.4f}")
print("Submission saved to lightgbm_submission.csv")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves 

Training Warengruppe 1
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 333
[LightGBM] [Info] Number of data points in the train set: 1455, number of used features: 8
[LightGBM] [Info] Start training from score 122.847943
   MAPE: 0.2160
Training Warengruppe 2
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 333
[LightGBM] [Info] Number of data points in the train set: 1455, number of used features: 8
[LightGBM] [Info] Start training from score 404.065288
   MAPE: 0.1525
Training Warengruppe 3
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.0