In [22]:
%cd ..

/Users/tantri/Documents/My Research Material/UTSSpring23/AdvancedML/AT2/Retail_Analytics/American_Retail_Sales/notebooks


In [26]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load the preprocessed data from a CSV file
data = pd.read_csv('../data/processed/final_merged_events.csv', low_memory=False)

# Pseudocode explanation:
# 1. Import the required libraries (pandas and numpy).
# 2. Load the preprocessed data from the 'final_merged_events.csv' file.
# 3. The 'low_memory=False' parameter is used to read the CSV file efficiently.


In [None]:
#not storing this in a data_cleaned df variable as the dataset is very large. 

In [27]:
# Import the required library
from sklearn.pipeline import Pipeline

# Pseudocode explanation:
# 1. Import the Pipeline class from scikit-learn.
# 2. A pipeline is a sequence of data processing steps.

In [28]:
# List of store names
store_names = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']

# Define the groups
group1 = ['CA_1', 'CA_2', 'CA_3', 'CA_4']
group2 = ['TX_1', 'TX_2', 'TX_3']
group3 = ['WI_1', 'WI_2', 'WI_3']

In [36]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
import numpy as np
import os
import joblib


# Define a custom transformer for feature engineering
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['date'] = pd.to_datetime(X['date'])

        # Extract date components and create new features
        X['day_of_week'] = X['date'].dt.dayofweek
        X['month'] = X['date'].dt.month
        X['year'] = X['date'].dt.year

        # Drop the original 'date' column if needed
        # X = X.drop(columns=['date'])

        return X

# Define the LightGBM model
model = lgb.LGBMRegressor()

# Define the categorical and date feature groups
date_features = ['day_of_week', 'month', 'year']
categorical_features = ['store_id', 'item_id']

# Define a pipeline for target encoding

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('feature_engineer', FeatureEngineer(), []),
        ('target_encoder', TargetEncoder(),categorical_features)  # Apply target encoding
    ] 
)

# Create the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocessing steps
    ('model', model)  # LightGBM model
])

# Initialize lists to store RMSE for each group
rmse_scores = []

# Iterate over each group and filter the data accordingly
for group_idx, group in enumerate([group1, group2, group3]):
    # Filter the data for the current group
    group_data = data[data['store_id'].isin(group)]

    # Define the target variable
    target = 'sales'

    # Split the data into training, validation, and test sets
    train_data, test_data = train_test_split(group_data, test_size=0.2, random_state=42)
    train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

    # Extract features and target variables for training, validation, and test sets
    X_train, y_train = train_data.drop(columns=[target]), train_data[target]
    X_val, y_val = val_data.drop(columns=[target]), val_data[target]
    X_test, y_test = test_data.drop(columns=[target]), test_data[target]

    # Fit the pipeline to the training data
    pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    test_predictions = pipeline.predict(X_test)
    encoder = pipeline.named_steps['preprocessor'].named_transformers_['target_encoder']
    #encoder = pipeline.named_steps['target_encoder']

    print (encoder)
    model_and_encoders = {
        'model': pipeline,
        'encoders': encoder
    }

    models_dir = "../models/Predictive"  # Update the directory name
    os.makedirs(models_dir, exist_ok=True)  # Create the directory if it doesn't exist

    # Define the model file path for saving
    model_file_path = os.path.join(models_dir, f'model_group_{group_idx + 1}.joblib')

    # Save the model to the specified file path using joblib
    joblib.dump(model_and_encoders, model_file_path)


    # Calculate RMSE on the test set
    rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
    print(f'Test RMSE for the group{group_idx + 1}: {rmse}')

    # Append the RMSE score to the list
    rmse_scores.append(rmse)

# Calculate the average RMSE across groups
average_rmse = np.mean(rmse_scores)
print(f'Average RMSE across groups: {average_rmse}')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 260
[LightGBM] [Info] Number of data points in the train set: 12028182, number of used features: 2
[LightGBM] [Info] Start training from score 3.410668
TargetEncoder(cols=['store_id', 'item_id'])
Test RMSE for the group1: 8.023665513740387
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 259
[LightGBM] [Info] Number of data points in the train set: 9021136, number of used features: 2
[LightGBM] [Info] Start training from score 2.929159
TargetEncoder(cols=['store_id', 'item_id'])
Test RMSE for the group2: 7.089358911886424
[LightGBM] 

In [37]:
import pandas as pd
import numpy as np
test_data = pd.read_csv('../data/processed/test_final_merged.csv', low_memory=False)

test_data['date'] = pd.to_datetime(test_data['date'])

# Now you can use .dt accessor to extract date components
test_data['day_of_week'] = test_data['date'].dt.dayofweek
test_data['month'] = test_data['date'].dt.month
test_data['year'] = test_data['date'].dt.year
test_data_2016 = test_data[test_data['year'] == 2016]


In [51]:
import pandas as pd
import joblib
from sklearn.metrics import mean_squared_error
import numpy as np
import random

# Define the store names and groups
store_names = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
group1 = ['CA_1', 'CA_2', 'CA_3', 'CA_4']
group2 = ['TX_1', 'TX_2', 'TX_3']
group3 = ['WI_1', 'WI_2', 'WI_3']

# Filter the data for the year 2016
test_data_2016 = test_data[test_data['year'] == 2016]

# Define the number of random days to select
num_random_days = 30

# Get a random sample of day indices from the 2016 data
random_day_indices = random.sample(test_data_2016.index.tolist(), num_random_days)

# Iterate through each randomly selected day
for index in random_day_indices:
    row = test_data_2016.loc[index]
    
    # Determine the model_group based on the store_id
    store_id = row['store_id']
    if store_id in group1:
        model_group = 1
    elif store_id in group2:
        model_group = 2
    elif store_id in group3:
        model_group = 3
    else:
        # Handle the case when the store_id doesn't match any group
        model_group = None

    if model_group is not None:
        # Load the trained model for the determined model_group
        model_file_path = f"../models/LightGBM/model_group_{model_group}.joblib"
        loaded_model = joblib.load(model_file_path)
        print(loaded_model['encoders'])
        # Prepare the input data for batch prediction
        input_data = pd.DataFrame({
            'day_of_week': [row['day_of_week']],
            'month': [row['month']],
            'year': [row['year']],
            'store_id': [row['store_id']],
            'item_id': [row['item_id']]
        })

        # Perform target encoding on categorical features using the loaded encoders
        for feature, encoder in loaded_model['encoders'].items():
            input_data[feature] = encoder.transform(input_data[feature])

        # Make batch predictions using the loaded model
        predicted_sales_batch = loaded_model['model'].predict(input_data)

        # Calculate the squared difference for the batch
        squared_diff_batch = (row['sales'] - predicted_sales_batch[0]) ** 2

        # Calculate the RMSE for the selected random day
        rmse_batch = np.sqrt(squared_diff_batch)

        # Print the RMSE for the selected random day
        print(f"LightGBM RMSE for {row['date']} in {row['store_id']}: {rmse_batch}")


{'store_id': TargetEncoder(cols=['store_id']), 'item_id': TargetEncoder(cols=['item_id'])}
LightGBM RMSE for 2016-01-08 00:00:00 in CA_2: 6.325721139266072
{'store_id': TargetEncoder(cols=['store_id']), 'item_id': TargetEncoder(cols=['item_id'])}
LightGBM RMSE for 2016-05-19 00:00:00 in WI_3: 7.385783287405409
{'store_id': TargetEncoder(cols=['store_id']), 'item_id': TargetEncoder(cols=['item_id'])}
LightGBM RMSE for 2016-03-12 00:00:00 in CA_2: 1.057343191444116
{'store_id': TargetEncoder(cols=['store_id']), 'item_id': TargetEncoder(cols=['item_id'])}
LightGBM RMSE for 2016-01-22 00:00:00 in TX_2: 2.666413111623605
{'store_id': TargetEncoder(cols=['store_id']), 'item_id': TargetEncoder(cols=['item_id'])}
LightGBM RMSE for 2016-04-21 00:00:00 in WI_3: 1.9363434518271796
{'store_id': TargetEncoder(cols=['store_id']), 'item_id': TargetEncoder(cols=['item_id'])}
LightGBM RMSE for 2016-03-30 00:00:00 in TX_3: 17.133841770993694
{'store_id': TargetEncoder(cols=['store_id']), 'item_id': Targ