In [1]:
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
import numpy as np

# Load the data
data_path = 'C:/Users/User/PycharmProjects/M5 Forecasting - Accuracy/data/'
sales_train_validation = pd.read_csv(data_path + 'sales_train_validation.csv')
calendar = pd.read_csv(data_path + 'calendar.csv')
sell_prices = pd.read_csv(data_path + 'sell_prices.csv')

# Find the maximum day in the training data
max_day = sales_train_validation.columns[-1]
max_day_num = int(max_day.split('_')[1])

# Create columns for the next 28 days
future_days = [f'd_{i}' for i in range(max_day_num + 1, max_day_num + 29)]
for day in future_days:
    sales_train_validation[day] = None

# Melt the extended sales data to long format
sales_train_validation = sales_train_validation.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='day',
    value_name='sales'
)

In [2]:
# Merge with the calendar data
extended_data = sales_train_validation.merge(calendar, left_on='day', right_on='d', how='left')

# Merge with the sell prices data
extended_data = extended_data.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')


In [3]:
# Convert date column to datetime
extended_data['date'] = pd.to_datetime(extended_data['date'])

# Date features
extended_data['day'] = extended_data['date'].dt.day
extended_data['weekday'] = extended_data['date'].dt.weekday
extended_data['month'] = extended_data['date'].dt.month
extended_data['year'] = extended_data['date'].dt.year

# Rolling window features
extended_data['rolling_mean_7'] = extended_data.groupby(['id'])['sales'].shift(1).rolling(window=7).mean()
extended_data['rolling_std_7'] = extended_data.groupby(['id'])['sales'].shift(1).rolling(window=7).std()
extended_data['rolling_mean_30'] = extended_data.groupby(['id'])['sales'].shift(1).rolling(window=30).mean()
extended_data['rolling_std_30'] = extended_data.groupby(['id'])['sales'].shift(1).rolling(window=30).std()

# Handle NaN values in sell_price column by filling with the mean price of that item
extended_data['sell_price'] = extended_data.groupby('item_id')['sell_price'].transform(lambda x: x.fillna(x.mean()))


In [4]:
print(extended_data.head())


                              id        item_id    dept_id   cat_id store_id  \
0  HOBBIES_1_001_CA_1_validation  HOBBIES_1_001  HOBBIES_1  HOBBIES     CA_1   
1  HOBBIES_1_002_CA_1_validation  HOBBIES_1_002  HOBBIES_1  HOBBIES     CA_1   
2  HOBBIES_1_003_CA_1_validation  HOBBIES_1_003  HOBBIES_1  HOBBIES     CA_1   
3  HOBBIES_1_004_CA_1_validation  HOBBIES_1_004  HOBBIES_1  HOBBIES     CA_1   
4  HOBBIES_1_005_CA_1_validation  HOBBIES_1_005  HOBBIES_1  HOBBIES     CA_1   

  state_id  day sales       date  wm_yr_wk  ...  event_name_2  event_type_2  \
0       CA   29     0 2011-01-29     11101  ...           NaN           NaN   
1       CA   29     0 2011-01-29     11101  ...           NaN           NaN   
2       CA   29     0 2011-01-29     11101  ...           NaN           NaN   
3       CA   29     0 2011-01-29     11101  ...           NaN           NaN   
4       CA   29     0 2011-01-29     11101  ...           NaN           NaN   

   snap_CA  snap_TX snap_WI sell_price rolli

In [5]:
# Verify calendar dates and cutoff date
cutoff_date_str = calendar[calendar['d'] == max_day]['date'].values[0]
print(f"Cutoff date from calendar: {cutoff_date_str}")
cutoff_date = pd.to_datetime(cutoff_date_str)
print(f"Parsed cutoff date: {cutoff_date}")

# Filter the training data up to the date corresponding to the cutoff date
train_data = extended_data[extended_data['date'] <= cutoff_date]
print(f"Training data shape: {train_data.shape}")

Cutoff date from calendar: 2016-04-24
Parsed cutoff date: 2016-04-24 00:00:00
Training data shape: (58327370, 27)


In [6]:
# Selected features
selected_features = [
    'id', 'rolling_mean_7', 'rolling_std_7', 'rolling_mean_30', 'rolling_std_30',
    'sell_price', 'wm_yr_wk', 'weekday', 'day', 'month', 'year'
]

# Prepare the training data
X_train = train_data[selected_features]
y_train = train_data['sales']
print(X_train.shape)

(58327370, 11)


In [7]:

# Optimize memory usage for training data
def optimize_memory(df):
    for col in df.columns:
        if df[col].dtype == "float64":
            df[col] = df[col].astype(np.float32)
        elif df[col].dtype == "int64":
            df[col] = df[col].astype(np.int32)
    return df

X_train = optimize_memory(X_train)

# Train the model
model = HistGradientBoostingRegressor()
model.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,

MemoryError: Unable to allocate 4.78 GiB for an array with shape (58327370, 11) and data type float64