In [22]:
!pip install --upgrade lightgbm



In [23]:
!pip install tensorflow scikit-learn



In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import mlflow
import mlflow.lightgbm
from pathlib import Path
# Pick the project root: the current working directory when it contains a
# "src" folder, otherwise its parent directory.
_cwd = Path.cwd()
if (_cwd / "src").exists():
    _root = _cwd
else:
    _root = _cwd.parent

# Point MLflow at the SQLite file "mlflow.db" under that root, select the
# experiment, and enable LightGBM autologging (model logging included).
mlflow.set_tracking_uri(f"sqlite:///{(_root / 'mlflow.db').resolve().as_posix()}")
mlflow.set_experiment("experiments-lightgbm")
mlflow.lightgbm.autolog(log_models=True)

import os
# Locate the project's data folder: prefer "../data" relative to the working
# directory and fall back to "data" when that folder does not exist
# (covers running from experiments/ or from the project root).
if os.path.isdir("../data"):
    DATA_DIR = os.path.abspath("../data")
else:
    DATA_DIR = os.path.abspath("data")

# Print the full path of every file found under DATA_DIR.
for folder, _, names in os.walk(DATA_DIR):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Build both CSV paths under DATA_DIR, then load the sales and calendar tables.
sales_csv = os.path.join(DATA_DIR, "sales_train_validation.csv")
calendar_csv = os.path.join(DATA_DIR, "calendar.csv")

sales_train_validation = pd.read_csv(sales_csv)
calendar = pd.read_csv(calendar_csv)

In [None]:
# Preview the first rows of the sales table (id columns followed by one d_* column per day label).
sales_train_validation.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


## Pre-processing data for single-store sales prediction

In [None]:
# Per-store totals of every numeric column: keep the numeric columns, group
# the rows by their store_id, and sum within each group.
numerical_columns = sales_train_validation.select_dtypes(include=['number'])
store_ids = sales_train_validation['store_id']
numerical_columns.groupby(store_ids).sum()

Unnamed: 0_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA_1,4337,4155,2816,3051,2630,3276,3450,5437,4340,3157,...,3982,5437,5954,4345,3793,3722,3709,4387,5577,6113
CA_2,3494,3046,2121,2324,1942,2288,2629,3729,2957,2218,...,4440,5352,5760,3830,3631,3691,3303,4457,5884,6082
CA_3,4739,4827,3785,4232,3817,4369,4703,5456,5581,4912,...,5337,6936,8271,6068,5683,5235,5018,5623,7419,7721
CA_4,1625,1777,1386,1440,1536,1389,1469,1988,1818,1535,...,2496,2839,3047,2809,2677,2500,2458,2628,2954,3271
TX_1,2556,2687,1822,2258,1694,2734,1691,2820,2887,2174,...,3084,3724,4192,3410,3257,2901,2776,3022,3700,4033
TX_2,3852,3937,2731,2954,2492,3439,2588,3772,3657,2932,...,3897,4475,4998,3311,3727,3384,3446,3902,4483,4292
TX_3,3030,3006,2225,2169,1726,2833,1947,2848,2832,2213,...,3819,4261,4519,3147,3938,3315,3380,3691,4083,3957
WI_1,2704,2194,1562,1251,2,2049,2815,3248,1674,1355,...,3862,4862,4812,3236,3069,3242,3324,3991,4772,4874
WI_2,2256,1922,2018,2522,1175,2244,2232,2643,2140,1836,...,6259,5579,5566,4347,4464,4194,4393,4988,5404,5127
WI_3,4038,4198,3317,3211,2132,4590,4486,5991,4850,3240,...,4613,4897,4521,3556,3331,3159,3226,3828,4686,4325


In [None]:
# Aggregate to the store level for now: sum every numeric column per store_id.
numerical_columns = sales_train_validation.select_dtypes(include=['number'])
store_level = numerical_columns.groupby(sales_train_validation['store_id']).sum()

# Flip the table so each row is one d_* label and each column one store, and
# copy the row labels into a regular 'd' column.
store_levelt = store_level.T.assign(d=lambda frame: frame.index)
store_levelt

store_id,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,WI_1,WI_2,WI_3,d
d_1,4337,3494,4739,1625,2556,3852,3030,2704,2256,4038,d_1
d_2,4155,3046,4827,1777,2687,3937,3006,2194,1922,4198,d_2
d_3,2816,2121,3785,1386,1822,2731,2225,1562,2018,3317,d_3
d_4,3051,2324,4232,1440,2258,2954,2169,1251,2522,3211,d_4
d_5,2630,1942,3817,1536,1694,2492,1726,2,1175,2132,d_5
...,...,...,...,...,...,...,...,...,...,...,...
d_1909,3722,3691,5235,2500,2901,3384,3315,3242,4194,3159,d_1909
d_1910,3709,3303,5018,2458,2776,3446,3380,3324,4393,3226,d_1910
d_1911,4387,4457,5623,2628,3022,3902,3691,3991,4988,3828,d_1911
d_1912,5577,5884,7419,2954,3700,4483,4083,4772,5404,4686,d_1912


In [None]:
# Merge the store-level table with the calendar on the shared 'd' key so every
# row carries its calendar attributes (used to look at trends).
store_level_final = store_levelt.merge(calendar, on='d')

# merge() defaults to an inner join, which silently drops rows whose 'd' label
# is missing from the calendar (and duplicates rows on repeated keys).
assert len(store_level_final) == len(store_levelt), "calendar merge changed the number of rows"

# No longer needed by this cell; kept so any code that expects `datetime`
# to be in scope keeps working.
from datetime import datetime

# Parse the 'YYYY-MM-DD' strings in one vectorized call instead of a
# row-by-row strptime.
store_level_final['date'] = pd.to_datetime(store_level_final['date'], format='%Y-%m-%d')
store_level_final

Unnamed: 0,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,WI_1,WI_2,WI_3,...,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,4337,3494,4739,1625,2556,3852,3030,2704,2256,4038,...,1,1,2011,,,,,0,0,0
1,4155,3046,4827,1777,2687,3937,3006,2194,1922,4198,...,2,1,2011,,,,,0,0,0
2,2816,2121,3785,1386,1822,2731,2225,1562,2018,3317,...,3,1,2011,,,,,0,0,0
3,3051,2324,4232,1440,2258,2954,2169,1251,2522,3211,...,4,2,2011,,,,,1,1,0
4,2630,1942,3817,1536,1694,2492,1726,2,1175,2132,...,5,2,2011,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1908,3722,3691,5235,2500,2901,3384,3315,3242,4194,3159,...,5,4,2016,,,,,0,0,0
1909,3709,3303,5018,2458,2776,3446,3380,3324,4393,3226,...,6,4,2016,,,,,0,0,0
1910,4387,4457,5623,2628,3022,3902,3691,3991,4988,3828,...,7,4,2016,,,,,0,0,0
1911,5577,5884,7419,2954,3700,4483,4083,4772,5404,4686,...,1,4,2016,,,,,0,0,0


In [None]:
# Preparing the data with a window size
def create_dataset(data, window_size):
    """Turn a sequence into sliding-window samples for one-step-ahead forecasting.

    Sample ``i`` uses the ``window_size`` consecutive values starting at
    position ``i`` as features and the value right after them as the target.

    Args:
        data: Ordered observations (list, NumPy array or pandas Series).
            Values are always taken by position, so a Series index is ignored.
        window_size: Number of consecutive values per sample; must be >= 0.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` has one row of
        ``window_size`` values per sample and ``y`` holds the matching next
        value. When ``len(data) <= window_size`` both arrays contain zero
        samples and ``X`` still has ``window_size`` columns.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    # Work on a plain array: an integer lookup on a pandas Series goes by
    # label, which breaks for any index other than 0..n-1, whereas slicing
    # goes by position. An ndarray is positional for both.
    values = np.asarray(data)
    n_samples = max(len(values) - window_size, 0)

    # Row i of `positions` is [i, i + 1, ..., i + window_size - 1].
    positions = np.arange(n_samples)[:, None] + np.arange(window_size)[None, :]
    X = values[positions]
    # Copy so the targets never alias the caller's data.
    y = values[window_size:].copy()
    return X, y

In [None]:
# Each sample uses 4 consecutive CA_1 values as features and the value that
# follows them as the target.
window_size = 4
ca1_sales = store_level_final['CA_1']
X, y = create_dataset(ca1_sales, window_size)

In [None]:
# Sanity check: shapes of the windowed feature matrix and the target vector.
X.shape, y.shape

((1909, 4), (1909,))

In [None]:
# Chronological split without shuffling: the first 80% of the samples are used
# for training and the remaining 20% for testing.
train_size = int(len(X) * 0.8)

X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

In [None]:
# Sanity check: shapes of the train and test feature/target arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1527, 4), (1527,), (382, 4), (382,))

In [None]:
import lightgbm as lgb

# Wrap the chronological train/test splits in LightGBM datasets.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test)

# Gradient-boosted regression trees, evaluated with RMSE.
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'n_jobs': -1,
    'seed': 236,
    'learning_rate': 0.01,
    # Bagging: draw 75% of the rows, re-drawn every 5 iterations.
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    # Use 90% of the features for each tree.
    'colsample_bytree': 0.9,
}

# LightGBM 4.x: early stopping and periodic logging are configured through
# callbacks; the number of boosting rounds is passed as an argument.
# NOTE(review): early stopping monitors lgb_eval, which is built from the same
# X_test/y_test that is scored afterwards, so the reported test RMSE is
# optimistic. Consider a separate validation split.
callbacks = [
    lgb.early_stopping(stopping_rounds=50, verbose=True),
    lgb.log_evaluation(period=100),
]

model_lgb = lgb.train(
    params,
    lgb_train,
    num_boost_round=2500,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=callbacks,
)

2026/02/28 11:48:31 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'eeb7e8fc754249b399b4cef0f3efdbf8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 1527, number of used features: 4
[LightGBM] [Info] Start training from score 3920.870334
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 582.807	valid_1's rmse: 657.111
[200]	training's rmse: 468.399	valid_1's rmse: 561.889
[300]	training's rmse: 426.915	valid_1's rmse: 541.766
[400]	training's rmse: 401.789	valid_1's rmse: 535.811
[500]	training's rmse: 381.193	valid_1's rmse: 535.766
Early stopping, best iteration is:
[460]	training's rmse: 388.616	valid_1's rmse: 534.5




In [None]:
import math

# Score the LightGBM model on the held-out test rows.
pred = model_lgb.predict(X_test)

# `mse` holds the mean squared error and `mae` the mean absolute error, so
# each name matches the metric it stores.
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)

# RMSE is the square root of the MSE; it is this cell's displayed result.
rmse = math.sqrt(mse)
rmse

534.5004702933962

In [None]:
import xgboost as xgb

# XGBoost's native API consumes DMatrix objects; wrap both splits.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Squared-error regression evaluated with RMSE.
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.01,
    'subsample': 0.75,
    'colsample_bytree': 0.9,
    'max_depth': 6,
    'seed': 236
}

# Upper bound on boosting rounds and the early-stopping patience.
num_boost_round = 2500
early_stopping_rounds = 50

# Datasets evaluated after each round.
# NOTE(review): the 'eval' set is the test split, so early stopping is tuned
# on the same data used to judge the model.
watchlist = [(dtrain, 'train'), (dtest, 'eval')]

# Train with early stopping, logging the metrics every 100 rounds.
model_xgb = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=100,
)

[0]	train-rmse:977.77072	eval-rmse:1042.75079
[100]	train-rmse:582.12591	eval-rmse:679.12374
[200]	train-rmse:434.58325	eval-rmse:572.08847
[300]	train-rmse:368.81097	eval-rmse:542.20813
[400]	train-rmse:332.07062	eval-rmse:535.64659
[500]	train-rmse:309.75887	eval-rmse:534.28687
[513]	train-rmse:307.19469	eval-rmse:534.31306


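XGBoost is overfitting: its training RMSE keeps falling (about 307 at round 513) while the evaluation RMSE plateaus at about 534, so the gap between the two keeps widening.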