In [1]:
import pandas as pd
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Embedding, Dense, Dropout, Flatten, concatenate, Input, LSTM
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

2024-02-19 17:24:41.846222: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-19 17:24:41.846354: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-19 17:24:41.867938: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-19 17:24:41.925595: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('/depot/lanhamm/project-smith/Test/sales1CA2_Melt_Calendar_OutTreat_LabelEnc.csv')

In [3]:
df.columns

Index(['day', 'wday', 'month', 'year', 'item_id_enc', 'dept_id_enc',
       'store_id_enc', 'event_name_1_enc', 'event_type_1_enc',
       'event_name_1_enc.1', 'snap_CA', 'sales'],
      dtype='object')

In [4]:
# LSTM
sequence_length = 31
sequences = []
categorical_data = []
numerical_data = []
targets = []

In [5]:
# Ensure the data is sorted if it's not already
df.sort_values(by=['day', 'year', 'month', 'wday', 'store_id_enc', 'dept_id_enc', 'item_id_enc', 'event_type_1_enc', 'event_name_1_enc', 'snap_CA'], inplace=True)

# Group by the categorical variables
grouped = df.groupby(['store_id_enc', 'dept_id_enc', 'item_id_enc', 'event_type_1_enc', 'event_name_1_enc'])

In [6]:
for _, group in grouped:
    sales = group['sales'].values
    
    # Convert necessary columns to numpy arrays
    store_ids = group['store_id_enc'].values
    dept_ids = group['dept_id_enc'].values
    item_ids = group['item_id_enc'].values
    events = group['event_name_1_enc'].values
    evtype = group['event_type_1_enc'].values
    
    # New: Numerical features
    days = group['day'].values
    years = group['year'].values
    months = group['month'].values
    wd = group['wday'].values
    snap_CA = group['snap_CA'].values
    
    for i in range(len(sales) - sequence_length):
        sequences.append(sales[i:i+sequence_length])
        targets.append(sales[i+sequence_length])
        categorical_data.append([store_ids[i], dept_ids[i], item_ids[i], events[i], evtype[i]])
        # Include numerical features
        numerical_data.append([days[i], years[i], months[i], wd[i], snap_CA[i]])

In [7]:
# Convert to numpy arrays
sequences = np.array(sequences)
targets = np.array(targets)
categorical_data = np.array(categorical_data)
numerical_data = np.array(numerical_data)

# Split data into train and test sets
X_train_sequences, X_test_sequences, y_train, y_test, X_train_categorical, X_test_categorical, X_train_numerical, X_test_numerical = train_test_split(sequences, targets, categorical_data, numerical_data, test_size=0.2, random_state=42)


In [8]:
# Reshape input for LSTM
X_train_sequences = X_train_sequences.reshape((X_train_sequences.shape[0], X_train_sequences.shape[1], 1))
X_test_sequences = X_test_sequences.reshape((X_test_sequences.shape[0], X_test_sequences.shape[1], 1))

In [9]:
# Define model inputs for categorical variables

embedding_dim = 10  # Reduced embedding dimension

input_item = Input(shape=(1,))
embed_item = Embedding(input_dim=np.max(df['item_id_enc'])+1, output_dim=embedding_dim)(input_item)
flat_item = Flatten()(embed_item)

input_dept = Input(shape=(1,))
embed_dept = Embedding(input_dim=np.max(df['dept_id_enc'])+1, output_dim=embedding_dim)(input_dept)
flat_dept = Flatten()(embed_dept)

input_store = Input(shape=(1,))
embed_store = Embedding(input_dim=np.max(df['store_id_enc'])+1, output_dim=embedding_dim)(input_store)
flat_store = Flatten()(embed_store)

input_event = Input(shape=(1,))
embed_event = Embedding(input_dim=np.max(df['event_name_1_enc'])+1, output_dim=embedding_dim)(input_event)
flat_event = Flatten()(embed_event)

input_evtype = Input(shape=(1,))
embed_evtype = Embedding(input_dim=np.max(df['event_type_1_enc'])+1, output_dim=embedding_dim)(input_evtype)
flat_evtype = Flatten()(embed_evtype)

input_numerical = Input(shape=(5,))  # Adjust shape based on the number of numerical features

2024-02-19 17:26:27.494892: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [10]:
# Define model input for the sales sequences
input_sequences = Input(shape=(sequence_length, 1))

# GRU layer for processing sequences
lstm_out = LSTM(50)(input_sequences)  #

# Concatenate GRU output with flattened embeddings
concat = concatenate([flat_store, flat_dept, flat_item, flat_event, flat_evtype, flat_evtype, input_numerical])

# Output layer
output = Dense(1)(concat)

In [11]:
# Create and compile model
model = Model(inputs=[input_store, input_dept, input_item, input_event, input_evtype, input_sequences, input_numerical], outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error')

In [12]:
model.fit(
    [X_train_categorical[:, 0], X_train_categorical[:, 1], X_train_categorical[:, 2], X_train_categorical[:, 3], X_train_categorical[:, 4], X_train_sequences, X_train_numerical],
    y_train,
    epochs=40,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 21/40

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x2b1acf49b850>

In [13]:
# Predict on the test set
y_pred = model.predict([X_test_categorical[:, 0], X_test_categorical[:, 1], X_test_categorical[:, 2], X_test_categorical[:, 3], X_test_categorical[:, 4], X_test_sequences, X_test_numerical])

# Evaluate the model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate Root Mean Squared Error
rmse = math.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Squared Error (MSE): 1.8446230595788107
Root Mean Squared Error (RMSE): 1.3581690099464097


In [14]:
# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 0.8175005627942337
R-squared (R²): 0.3129957418386349
