In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.3


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import numpy as np

In [6]:
data = pd.read_excel('data/online_retail.xlsx', parse_dates=['InvoiceDate'])

In [7]:
data['TotalSales'] = data['Quantity'] * data['UnitPrice']
data['Date'] = data['InvoiceDate'].dt.date

In [8]:
daily_sales = data.groupby(['Date', 'StockCode'])['TotalSales'].sum().reset_index()

In [9]:
daily_sales = daily_sales.sort_values(by=['StockCode', 'Date'])  # Ensure sorted for lag calculation
daily_sales['Lag_1'] = daily_sales.groupby('StockCode')['TotalSales'].shift(1)
daily_sales['Lag_7'] = daily_sales.groupby('StockCode')['TotalSales'].shift(7)

In [10]:
daily_sales = daily_sales.dropna()

In [11]:
daily_sales['Day'] = pd.to_datetime(daily_sales['Date']).dt.day
daily_sales['Month'] = pd.to_datetime(daily_sales['Date']).dt.month
daily_sales['Week'] = pd.to_datetime(daily_sales['Date']).dt.isocalendar().week

In [12]:
X = daily_sales[['Lag_1', 'Lag_7', 'Day', 'Month', 'Week']]
y = daily_sales['TotalSales']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [14]:
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Model Mean Absolute Error: {mae}")

Model Mean Absolute Error: 41.18291281087989


In [23]:
from datetime import datetime

# Get today's date
today = datetime.now().date()

# Get the most recent date in the dataset
latest_date = daily_sales['Date'].max()

# Check if we have sufficient data to compute lag features for prediction
if (latest_date - pd.Timedelta(days=1)) in daily_sales['Date'].values and \
   (latest_date - pd.Timedelta(days=7)) in daily_sales['Date'].values:

    # Get the most recent sales values for lag features
    lag_1_value = daily_sales.loc[daily_sales['Date'] == (latest_date - pd.Timedelta(days=1)), 'TotalSales'].values[0]
    lag_7_value = daily_sales.loc[daily_sales['Date'] == (latest_date - pd.Timedelta(days=7)), 'TotalSales'].values[0]

    # Extract today's additional features
    day_value = today.day
    month_value = today.month
    week_value = today.isocalendar()[1]  # Week number

    # Prepare input for prediction
    future_lag_features = pd.DataFrame([{
        'Lag_1': lag_1_value,
        'Lag_7': lag_7_value,
        'Day': day_value,
        'Month': month_value,
        'Week': week_value
    }])

    # Ensure the columns are in the same order as X_train
    future_lag_features = future_lag_features[X_train.columns]

    # Predict today's sales
    today_prediction = model.predict(future_lag_features)

    print(f"Predicted sales for {today}: {today_prediction[0]}")
else:
    print("Insufficient data to compute lag features for today's prediction.")


Predicted sales for 2024-12-24: 20.02800178527832
