In [1]:
!pip install lightgbm





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb 
import joblib

In [3]:
df = pd.read_csv("../data/processed/model_input.csv")
df.head(), df.shape


(   Store  Promo  Promo2  CompetitionDistance  HasCompetition  DayOfWeek  \
 0      2      1       1                570.0               1          2   
 1      2      1       1                570.0               1          3   
 2      2      1       1                570.0               1          4   
 3      2      0       1                570.0               1          5   
 4      2      0       1                570.0               1          0   
 
    IsWeekend  WeekOfYear  Quarter  Sales_Lag_1  Sales_Lag_7  Sales_Lag_30  \
 0          0           6        1       6269.0       3461.0        4422.0   
 1          0           6        1       6438.0       4446.0        4159.0   
 2          0           6        1       5575.0       4340.0        4484.0   
 3          1           6        1       4203.0       4545.0        2342.0   
 4          0           7        1       3031.0       2956.0        6775.0   
 
    Sales_Rolling_7  Sales_Rolling_30  Sales  
 0      4642.428571      

In [4]:
X = df.drop('Sales', axis=1)
y = df['Sales']


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((205470, 14), (51368, 14))

In [6]:
def evaluate_model(y_true, y_pred, name="Model"):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    print(f"--- {name} ---")
    print(f"MAE  : {mae:.2f}")
    print(f"RMSE : {rmse:.2f}")
    print(f"MAPE : {mape:.2f}%\n")

    return mae, rmse, mape


In [7]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

evaluate_model(y_test, y_pred_lr, "Linear Regression")


--- Linear Regression ---
MAE  : 1038.17
RMSE : 1447.17
MAPE : 17.35%



(1038.1689678984983,
 np.float64(1447.1697194985136),
 np.float64(17.35371794532323))

In [8]:
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

evaluate_model(y_test, y_pred_rf, "Random Forest")


--- Random Forest ---
MAE  : 625.06
RMSE : 924.74
MAPE : 10.27%



(625.0574347843016,
 np.float64(924.7434884008946),
 np.float64(10.26570705114108))

In [11]:
from lightgbm import early_stopping, log_evaluation

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

lgb_model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    num_boost_round=500,
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50)
    ]
)

y_pred_lgb = lgb_model.predict(X_test)

evaluate_model(y_test, y_pred_lgb, "LightGBM")


Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 1170.84	valid_1's rmse: 1185.38
[100]	training's rmse: 1049.24	valid_1's rmse: 1070.63
[150]	training's rmse: 998.269	valid_1's rmse: 1024.15
[200]	training's rmse: 960.782	valid_1's rmse: 991.536
[250]	training's rmse: 938.134	valid_1's rmse: 972.52
[300]	training's rmse: 921.753	valid_1's rmse: 959.79
[350]	training's rmse: 908.644	valid_1's rmse: 950.108
[400]	training's rmse: 896.895	valid_1's rmse: 941.464
[450]	training's rmse: 884.228	valid_1's rmse: 931.623
[500]	training's rmse: 873.869	valid_1's rmse: 923.631
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 873.869	valid_1's rmse: 923.631
--- LightGBM ---
MAE  : 636.25
RMSE : 923.63
MAPE : 10.60%



(636.2499122557631,
 np.float64(923.6306717253959),
 np.float64(10.600592612928068))

In [12]:
models = ["Linear Regression", "Random Forest", "LightGBM"]
# just a note to compare printed scores manually
print("Check scores above ðŸ‘† and pick the best model.")


Check scores above ðŸ‘† and pick the best model.


In [14]:
joblib.dump(lgb_model, "../model/sales_forecast_model.joblib")
print("Model saved in models/ folder")


Model saved in models/ folder


In [15]:
joblib.dump(list(X.columns), "../model/feature_columns.joblib")


['../model/feature_columns.joblib']