In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv


In [2]:
import pandas as pd

# Define the common path to the files
data_path = '/kaggle/input/m5-forecasting-accuracy/'

# Load datasets using the common path
sales_train_validation = pd.read_csv(f'{data_path}sales_train_validation.csv')
sales_train_evaluation = pd.read_csv(f'{data_path}sales_train_evaluation.csv')
calendar = pd.read_csv(f'{data_path}calendar.csv')
sell_prices = pd.read_csv(f'{data_path}sell_prices.csv')
sample_submission = pd.read_csv(f'{data_path}sample_submission.csv')


In [3]:
# Melt the sales_train_validation data
sales_train_validation_melted = sales_train_validation.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='sales'
)

# Display the first few rows of the melted DataFrame
print("Melted Sales Train Validation Data:")
print(sales_train_validation_melted.head())

Melted Sales Train Validation Data:
                              id        item_id    dept_id   cat_id store_id  \
0  HOBBIES_1_001_CA_1_validation  HOBBIES_1_001  HOBBIES_1  HOBBIES     CA_1   
1  HOBBIES_1_002_CA_1_validation  HOBBIES_1_002  HOBBIES_1  HOBBIES     CA_1   
2  HOBBIES_1_003_CA_1_validation  HOBBIES_1_003  HOBBIES_1  HOBBIES     CA_1   
3  HOBBIES_1_004_CA_1_validation  HOBBIES_1_004  HOBBIES_1  HOBBIES     CA_1   
4  HOBBIES_1_005_CA_1_validation  HOBBIES_1_005  HOBBIES_1  HOBBIES     CA_1   

  state_id    d  sales  
0       CA  d_1      0  
1       CA  d_1      0  
2       CA  d_1      0  
3       CA  d_1      0  
4       CA  d_1      0  


In [4]:
def merge_in_chunks(df, calendar, prices, chunk_size=1500000):
    # List to hold the merged chunks
    merged_chunks = []
    
    # Number of chunks
    num_chunks = len(df) // chunk_size + 1
    
    for i in range(num_chunks):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        chunk = df[start:end]
        chunk = chunk.merge(calendar, on='d', how='left')
        chunk = chunk.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
              
        merged_chunks.append(chunk)
        
        print(f"Chunk {i+1}/{num_chunks} merged")
    
    # Concatenate all chunks
    merged_df = pd.concat(merged_chunks, ignore_index=True)
    
    return merged_df

# Merge the melted DataFrame in chunks
sales_train_validation_merged = merge_in_chunks(sales_train_validation_melted, calendar, sell_prices)

# Display the first few rows of the merged DataFrame
print("Merged Sales Train Validation Data:")
print(sales_train_validation_merged.head())

Chunk 1/39 merged
Chunk 2/39 merged
Chunk 3/39 merged
Chunk 4/39 merged
Chunk 5/39 merged
Chunk 6/39 merged
Chunk 7/39 merged
Chunk 8/39 merged
Chunk 9/39 merged
Chunk 10/39 merged
Chunk 11/39 merged
Chunk 12/39 merged
Chunk 13/39 merged
Chunk 14/39 merged
Chunk 15/39 merged
Chunk 16/39 merged
Chunk 17/39 merged
Chunk 18/39 merged
Chunk 19/39 merged
Chunk 20/39 merged
Chunk 21/39 merged
Chunk 22/39 merged
Chunk 23/39 merged
Chunk 24/39 merged
Chunk 25/39 merged
Chunk 26/39 merged
Chunk 27/39 merged
Chunk 28/39 merged
Chunk 29/39 merged
Chunk 30/39 merged
Chunk 31/39 merged
Chunk 32/39 merged
Chunk 33/39 merged
Chunk 34/39 merged
Chunk 35/39 merged
Chunk 36/39 merged
Chunk 37/39 merged
Chunk 38/39 merged
Chunk 39/39 merged
Merged Sales Train Validation Data:
                              id        item_id    dept_id   cat_id store_id  \
0  HOBBIES_1_001_CA_1_validation  HOBBIES_1_001  HOBBIES_1  HOBBIES     CA_1   
1  HOBBIES_1_002_CA_1_validation  HOBBIES_1_002  HOBBIES_1  HOBBIES     

In [5]:
# Fill missing sell prices with the last available price
sales_train_validation_merged['sell_price'] = sales_train_validation_merged['sell_price'].ffill()

def create_rolling_features(df, window_sizes):
    for window in window_sizes:
        df[f'rolling_mean_{window}'] = df.groupby(['id'])['sales'].shift(1).rolling(window=window).mean()
        df[f'rolling_std_{window}'] = df.groupby(['id'])['sales'].shift(1).rolling(window=window).std()
    return df

# Apply rolling features
window_sizes = [7, 30]
sales_train_validation_merged = create_rolling_features(sales_train_validation_merged, window_sizes)

In [6]:
# Handle NaNs in rolling features
rolling_mean_cols = [f'rolling_mean_{window}' for window in window_sizes]
rolling_std_cols = [f'rolling_std_{window}' for window in window_sizes]

# Fill NaNs in rolling features with the overall mean and std (assuming that initial periods can use overall stats)
sales_train_validation_merged[rolling_mean_cols] = sales_train_validation_merged[rolling_mean_cols].fillna(sales_train_validation_merged['sales'].mean())
sales_train_validation_merged[rolling_std_cols] = sales_train_validation_merged[rolling_std_cols].fillna(sales_train_validation_merged['sales'].std())

Reduce Memory Usage by Downcasting
Here we convert data types to use less memory. For example, converting float64 to float32 and int64 to int32 can save memory.

In [7]:
import numpy as np

def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        
        # Check if the column is a date and skip if it is
        if np.issubdtype(col_type, np.datetime64):
            continue

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

        else:
            df[col] = df[col].astype('category')

    return df

sales_train_validation_merged = reduce_memory_usage(sales_train_validation_merged)

In [8]:
# Convert to string first, then to datetime
sales_train_validation_merged['date'] = sales_train_validation_merged['date'].astype(str)
sales_train_validation_merged['date'] = pd.to_datetime(sales_train_validation_merged['date'], errors='coerce')

# Check the result
print(sales_train_validation_merged['date'].dtype)
print(sales_train_validation_merged['date'].head(10))

datetime64[ns]
0   2011-01-29
1   2011-01-29
2   2011-01-29
3   2011-01-29
4   2011-01-29
5   2011-01-29
6   2011-01-29
7   2011-01-29
8   2011-01-29
9   2011-01-29
Name: date, dtype: datetime64[ns]


# List of columns to remove
columns_to_remove = [
    'weekday', 'wday', 'month', 'year', 'event_type_1', 'event_name_2', 'event_type_2'
]
Remove Unnecessary Columns
Drop any columns that aren't essential for the modeling process.

In [9]:
# List of columns to remove
columns_to_remove = [
    'dept_id', 'cat_id', 'state_id', 'weekday', 
    'wday', 'month', 'year', 'event_type_1', 
    'event_name_2', 'event_type_2'
]

# Remove the unwanted columns
sales_train_validation_merged = sales_train_validation_merged.drop(columns=columns_to_remove)

# Binary encoding: 1 if there's any event, 0 otherwise
sales_train_validation_merged['event_occurred'] = sales_train_validation_merged['event_name_1'].notna().astype(int)

# Drop the original 'event_name_1' column
sales_train_validation_merged = sales_train_validation_merged.drop('event_name_1', axis=1)

# Verify the final DataFrame
print(sales_train_validation_merged.head())
print(sales_train_validation_merged.columns)

                              id        item_id store_id    d  sales  \
0  HOBBIES_1_001_CA_1_validation  HOBBIES_1_001     CA_1  d_1      0   
1  HOBBIES_1_002_CA_1_validation  HOBBIES_1_002     CA_1  d_1      0   
2  HOBBIES_1_003_CA_1_validation  HOBBIES_1_003     CA_1  d_1      0   
3  HOBBIES_1_004_CA_1_validation  HOBBIES_1_004     CA_1  d_1      0   
4  HOBBIES_1_005_CA_1_validation  HOBBIES_1_005     CA_1  d_1      0   

        date  wm_yr_wk  snap_CA  snap_TX  snap_WI  sell_price  rolling_mean_7  \
0 2011-01-29     11101        0        0        0         NaN        1.125977   
1 2011-01-29     11101        0        0        0         NaN        1.125977   
2 2011-01-29     11101        0        0        0         NaN        1.125977   
3 2011-01-29     11101        0        0        0         NaN        1.125977   
4 2011-01-29     11101        0        0        0         NaN        1.125977   

   rolling_std_7  rolling_mean_30  rolling_std_30  event_occurred  
0       3.87

In [10]:
# Define the features and target
features = ['sell_price', 'wm_yr_wk', 'snap_CA', 'snap_TX', 'snap_WI','rolling_mean_7',
            'rolling_mean_30', 'rolling_std_7', 'rolling_std_30', 'event_occurred']
target = 'sales'

In [11]:
models = {}  # Dictionary to store models and scalers for each department
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Get unique ids
id = sales_train_validation_merged['id'].unique()

# Calculate total number of items
total_items = len(id)

for idx, items in enumerate(id, start=1):
    # Filter data for the current department
    items_data = sales_train_validation_merged[sales_train_validation_merged['id'] == items]
    
    # Prepare features and target
    X = items_data[features]
    y = items_data[target]
    
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Standardize the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    # Handle NaNs
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=features)
    X_train_scaled.fillna(X_train_scaled.mean(), inplace=True)
    X_val_scaled = pd.DataFrame(X_val_scaled, columns=features)
    X_val_scaled.fillna(X_val_scaled.mean(), inplace=True)
    X_train_scaled = X_train_scaled.values
    X_val_scaled = X_val_scaled.values
    
    # Initialize the model
    model = SGDRegressor(max_iter=1000, tol=1e-3)
    
    # Train the model on the entire dataset
    model.fit(X_train_scaled, y_train)
    
    # Store the model and scaler for the current department
    models[items] = (model, scaler)
    
    # Evaluate the model
    y_pred = model.predict(X_val_scaled)
    mse = mean_squared_error(y_val, y_pred)
    
    # Print progress every 5,000 items
    if idx % 5000 == 0 or idx == total_items:
        progress = (idx / total_items) * 100
        print(f"Progress: {progress:.2f}% ({idx} /{total_items} items processed)")

print("All items processed and models stored.")

Progress: 16.40% (5000 /30490 items processed)
Progress: 32.80% (10000 /30490 items processed)
Progress: 49.20% (15000 /30490 items processed)
Progress: 65.60% (20000 /30490 items processed)
Progress: 81.99% (25000 /30490 items processed)
Progress: 98.39% (30000 /30490 items processed)
Progress: 100.00% (30490 /30490 items processed)
All items processed and models stored.


In [101]:
# Melt the sample submission data frame
sample_submission = pd.read_csv(f'{data_path}sample_submission.csv')

sample_submission_melted = sample_submission.melt(
    id_vars=['id'],
    value_vars=['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
       'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19', 'F20',
       'F21', 'F22', 'F23', 'F24', 'F25', 'F26', 'F27', 'F28'],
    value_name='sales')
print(sample_submission_melted.dtypes)
print(sample_submission_melted.shape)

id          object
variable    object
sales        int64
dtype: object
(1707440, 3)


In [102]:
# Define the mapping of variables to dates
variable_to_date = {
    'F1': '2016-04-25',
    'F2': '2016-04-26',
    'F3': '2016-04-27',
    'F4': '2016-04-28',
    'F5': '2016-04-29',
    'F6': '2016-04-30',
    'F7': '2016-05-1',
    'F8': '2016-05-2',
    'F9': '2016-05-3',
    'F10': '2016-05-4',
    'F11': '2016-05-5',
    'F12': '2016-05-6',
    'F13': '2016-05-7',
    'F14': '2016-05-8',
    'F15': '2016-05-9',
    'F16': '2016-05-10',
    'F17': '2016-05-11',
    'F18': '2016-05-12',
    'F19': '2016-05-13',
    'F20': '2016-05-14',
    'F21': '2016-05-15',
    'F22': '2016-05-16',
    'F23': '2016-05-17',
    'F24': '2016-05-18',
    'F25': '2016-05-19',
    'F26': '2016-05-20',
    'F27': '2016-05-21',
    'F28': '2016-05-22',  
   }
# Add a new column for dates
sample_submission_melted['date'] = sample_submission_melted['variable'].map(variable_to_date)
sample_submission_melted['date'] = pd.to_datetime(sample_submission_melted['date'])

print("\nDataFrame with Dates:")
print(sample_submission_melted.dtypes)
print(sample_submission_melted.shape)


DataFrame with Dates:
id                  object
variable            object
sales                int64
date        datetime64[ns]
dtype: object
(1707440, 4)


In [103]:
# Merge the sample_submission_melted with calendar to get relevant features.
calendar = pd.read_csv(f'{data_path}calendar.csv')
calendar['date'] = pd.to_datetime(calendar['date'])
sample_submission_melted = pd.merge(sample_submission_melted, calendar, on='date', how='left')

In [104]:
print(sample_submission_melted.head)

<bound method NDFrame.head of                                     id variable  sales       date  wm_yr_wk  \
0        HOBBIES_1_001_CA_1_validation       F1      0 2016-04-25     11613   
1        HOBBIES_1_002_CA_1_validation       F1      0 2016-04-25     11613   
2        HOBBIES_1_003_CA_1_validation       F1      0 2016-04-25     11613   
3        HOBBIES_1_004_CA_1_validation       F1      0 2016-04-25     11613   
4        HOBBIES_1_005_CA_1_validation       F1      0 2016-04-25     11613   
...                                ...      ...    ...        ...       ...   
1707435    FOODS_3_823_WI_3_evaluation      F28      0 2016-05-22     11617   
1707436    FOODS_3_824_WI_3_evaluation      F28      0 2016-05-22     11617   
1707437    FOODS_3_825_WI_3_evaluation      F28      0 2016-05-22     11617   
1707438    FOODS_3_826_WI_3_evaluation      F28      0 2016-05-22     11617   
1707439    FOODS_3_827_WI_3_evaluation      F28      0 2016-05-22     11617   

        weekday  wday

In [105]:
# Merge the sample_submission_melted with sell prices to get relevant features. 
# We loop through the sales_validation_document.

# Prepare the sales_train_validation_merged.
sales_train_validation = pd.read_csv(f'{data_path}sales_train_validation.csv')

# Select only the 'id' and 'item_id' columns
id_subset_df = sales_train_validation[['id', 'item_id','store_id']].drop_duplicates()
id_subset_df['id'] = id_subset_df['id'].astype('category')
id_subset_df['item_id'] = id_subset_df['item_id'].astype('category')
id_subset_df['store_id'] = id_subset_df['store_id'].astype('category')
sample_submission_melted['id'] = sample_submission_melted['id'].astype('category')

sample_submission_melted = pd.merge(
    sample_submission_melted,     # Original dataframe
    id_subset_df,                         # Dataframe containing 'sell_price'
    on='id',  # Columns to match on
    how='left'                           # Use 'left' to maintain all rows from the original dataframe
)

print(sample_submission_melted.shape)

(1707440, 19)


In [106]:
# Merge 'sample_submission_melted_merged' with 'sell_prices' based on 'store_id', 'item_id', and 'wm_yr_wk'
sample_submission_melted = pd.merge(
    sample_submission_melted,     # Original dataframe
    sell_prices,                         # Dataframe containing 'sell_price'
    on=['store_id', 'item_id', 'wm_yr_wk'],  # Columns to match on
    how='left'                           # Use 'left' to maintain all rows from the original dataframe
)

# Verify the merge
print(sample_submission_melted.head())

                              id variable  sales       date  wm_yr_wk weekday  \
0  HOBBIES_1_001_CA_1_validation       F1      0 2016-04-25     11613  Monday   
1  HOBBIES_1_002_CA_1_validation       F1      0 2016-04-25     11613  Monday   
2  HOBBIES_1_003_CA_1_validation       F1      0 2016-04-25     11613  Monday   
3  HOBBIES_1_004_CA_1_validation       F1      0 2016-04-25     11613  Monday   
4  HOBBIES_1_005_CA_1_validation       F1      0 2016-04-25     11613  Monday   

   wday  month  year       d event_name_1 event_type_1 event_name_2  \
0     3      4  2016  d_1914          NaN          NaN          NaN   
1     3      4  2016  d_1914          NaN          NaN          NaN   
2     3      4  2016  d_1914          NaN          NaN          NaN   
3     3      4  2016  d_1914          NaN          NaN          NaN   
4     3      4  2016  d_1914          NaN          NaN          NaN   

  event_type_2  snap_CA  snap_TX  snap_WI        item_id store_id  sell_price  
0     

In [107]:
submission_merged = sample_submission_melted
print(submission_merged.head)

<bound method NDFrame.head of                                     id variable  sales       date  wm_yr_wk  \
0        HOBBIES_1_001_CA_1_validation       F1      0 2016-04-25     11613   
1        HOBBIES_1_002_CA_1_validation       F1      0 2016-04-25     11613   
2        HOBBIES_1_003_CA_1_validation       F1      0 2016-04-25     11613   
3        HOBBIES_1_004_CA_1_validation       F1      0 2016-04-25     11613   
4        HOBBIES_1_005_CA_1_validation       F1      0 2016-04-25     11613   
...                                ...      ...    ...        ...       ...   
1707435    FOODS_3_823_WI_3_evaluation      F28      0 2016-05-22     11617   
1707436    FOODS_3_824_WI_3_evaluation      F28      0 2016-05-22     11617   
1707437    FOODS_3_825_WI_3_evaluation      F28      0 2016-05-22     11617   
1707438    FOODS_3_826_WI_3_evaluation      F28      0 2016-05-22     11617   
1707439    FOODS_3_827_WI_3_evaluation      F28      0 2016-05-22     11617   

        weekday  wday

In [108]:
# List of columns to remove
columns_to_remove = [
    'weekday', 'wday', 'month', 'year', 'event_type_1', 'event_name_2', 'event_type_2'
]
# Remove the unwanted columns
submission_merged = submission_merged.drop(columns=columns_to_remove)

# Binary encoding: 1 if there's any event, 0 otherwise
submission_merged['event_occurred'] = submission_merged['event_name_1'].notna().astype(int)

# Drop the original 'event_name_1' column
submission_merged = submission_merged.drop('event_name_1', axis=1)

# Select relevant columns from sales_train_validation_merged
rolling_features = sales_train_validation_merged[['id', 'date', 'rolling_mean_7', 'rolling_std_7', 'rolling_mean_30', 'rolling_std_30']]

# Merge rolling features into next_28_days based on 'id' and 'date'
submission_merged = submission_merged.merge(rolling_features, how='left', on=['id', 'date'])

# Align data types in next_28_days with those in X_train
submission_merged['sell_price'] = submission_merged['sell_price'].astype('float16')
submission_merged['wm_yr_wk'] = submission_merged['wm_yr_wk'].astype('int16')
submission_merged['snap_CA'] = submission_merged['snap_CA'].astype('int8')
submission_merged['snap_TX'] = submission_merged['snap_TX'].astype('int8')
submission_merged['snap_WI'] = submission_merged['snap_WI'].astype('int8')

# Fill missing values in rolling features
submission_merged[['rolling_mean_7', 'rolling_std_7', 'rolling_mean_30', 'rolling_std_30']] = submission_merged[['rolling_mean_7', 'rolling_std_7', 'rolling_mean_30', 'rolling_std_30']].fillna(0)

# Verify the merge
print(submission_merged.head())

                              id variable  sales       date  wm_yr_wk       d  \
0  HOBBIES_1_001_CA_1_validation       F1      0 2016-04-25     11613  d_1914   
1  HOBBIES_1_002_CA_1_validation       F1      0 2016-04-25     11613  d_1914   
2  HOBBIES_1_003_CA_1_validation       F1      0 2016-04-25     11613  d_1914   
3  HOBBIES_1_004_CA_1_validation       F1      0 2016-04-25     11613  d_1914   
4  HOBBIES_1_005_CA_1_validation       F1      0 2016-04-25     11613  d_1914   

   snap_CA  snap_TX  snap_WI        item_id store_id  sell_price  \
0        0        0        0  HOBBIES_1_001     CA_1    8.382812   
1        0        0        0  HOBBIES_1_002     CA_1    3.970703   
2        0        0        0  HOBBIES_1_003     CA_1    2.970703   
3        0        0        0  HOBBIES_1_004     CA_1    4.640625   
4        0        0        0  HOBBIES_1_005     CA_1    2.880859   

   event_occurred  rolling_mean_7  rolling_std_7  rolling_mean_30  \
0               0             0.0  

In [None]:
next_28_days = submission_merged

import numpy as np
import pandas as pd

# Assuming features used in the model
features = ['sell_price', 'wm_yr_wk', 'snap_CA', 'snap_TX', 'snap_WI', 
            'rolling_mean_7', 'rolling_mean_30', 'rolling_std_7', 'rolling_std_30', 'event_occurred']

# Initialize an empty dictionary to store predictions
predictions_dict = {}

# Loop through each unique item id in next_28_days data
for idx, items in enumerate(next_28_days['id'].unique(), start=1):
    
    # Filter data for the current item
    items_data = next_28_days[next_28_days['id'] == items]
    
    # Extract the features
    X_test = items_data[features]
    
    # Load the model and scaler for the current item
    models[items] = (model, scaler)
    
    if model and scaler:
        # Standardize the features using the scaler
        X_test_scaled = scaler.transform(X_test)
        
        # Handle NaNs (if any) by filling them with the mean of the column
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=features)
        X_test_scaled.fillna(X_test_scaled.mean(), inplace=True)
        X_test_scaled = X_test_scaled.values
        
        # Make predictions for the current item
        predictions = model.predict(X_test_scaled)
        
        # Store predictions in a dictionary
        predictions_dict[items] = predictions
    
    # Print progress every 5,000 items
    if idx % 5000 == 0 or idx == len(next_28_days['id'].unique()):
        progress = (idx / len(next_28_days['id'].unique())) * 100
        print(f"Progress: {progress:.2f}% ({idx} / {len(next_28_days['id'].unique())} items processed)")

print("All predictions generated.")

Progress: 8.20% (5000 / 60980 items processed)
Progress: 16.40% (10000 / 60980 items processed)
