# Imports and Constants

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorboard
import os

2023-06-09 23:07:35.850161: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-09 23:07:36.704629: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-06-09 23:07:36.704740: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/

In [2]:
WINDOW = 20
BATCH_SIZE = 32
BUFFER = 100

# Load data

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')
# %pwd

In [4]:
# filepath = 'sales_quantity.csv' #for local imports
filepath = '/home/mariefloco/sales_quantity.csv' #colab
data = pd.read_csv(filepath,names=['date','item_code','quantity'],header = 0 )
data.head()

Unnamed: 0,date,item_code,quantity
0,2022-08-26,1000,15
1,2022-08-26,500,14
2,2023-01-01,8991102380706,13
3,2023-01-01,8991102381017,13
4,2023-01-01,8886008101053,20


In [5]:
# Transform data

In [6]:
#extract date features from date column
data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day
data['day_of_week'] = data['date'].dt.dayofweek
data['day_of_year'] = data['date'].dt.dayofyear



In [7]:
#stack dataframe based on item_code
item_sales = data.groupby(['item_code','date','year','month','day','day_of_week','day_of_year'])['quantity'].sum().unstack(level=0)
#turn each NaN value to 0
item_sales = item_sales.fillna(0)
item_sales.reset_index(inplace=True)
item_sales.head()

item_code,date,year,month,day,day_of_week,day_of_year,(90)NA18210500154(91)2403,(90)NA18211207820(91)2410,00000001,00000002,...,CL000448327,CL000450943,COS LT,COSLT-228,EC0102190002,EC0102191301,EC0103190002,EC0106190101,MP-2203,SLM0958266
0,2022-01-03,2022,1,3,0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-01-04,2022,1,4,1,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-01-05,2022,1,5,2,5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022-01-06,2022,1,6,3,6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-01-07,2022,1,7,4,7,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
import tensorflow as tf

def create_tokenizer(item_code):
    """
    Create a tokenizer to tokenize item codes.

    Args:
        item_code (list or Series): List or Series containing item codes.

    Returns:
        tf.keras.preprocessing.text.Tokenizer: Tokenizer object fitted on item codes.
    """

    # Create a tokenizer with no filters and case-sensitive tokenization
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', lower=False)

    # Fit the tokenizer on the item codes
    tokenizer.fit_on_texts(item_code)

    return tokenizer

# Create a tokenizer using item codes from item_sales dataframe columns
tokenizer = create_tokenizer(item_sales.columns[6:].str.replace(' ', ''))

# Get the length of the tokenizer's word index
tokenizer_word_count = len(tokenizer.word_index)

print(f'Tokenizer has {tokenizer_word_count} tokens')

Tokenizer has 13917 tokens


In [9]:
# Extract date values from item_sales dataframe columns
dates = np.array(item_sales[['month', 'day', 'day_of_week', 'day_of_year']])

# Perform cyclic encoding on the date values
dates_cyclic = np.sin(dates) + np.cos(dates)

prefix_features = []

# Iterate over each product in item_sales columns
for product in item_sales.columns[6:].str.replace(' ', ''):
    # Create prefix features for the product
    prefix_feature = np.array([
        [
            tokenizer.word_index[product],
            dates_cyclic[j][0],
            dates_cyclic[j][1],
            dates_cyclic[j][2],
            dates_cyclic[j][3]
        ]
        for j in range(WINDOW, len(item_sales))
    ], dtype=np.float64)
    
    prefix_features.append(prefix_feature)

# Get the total number of prefix features arrays and the shape of the first array
prefix_features_count = len(prefix_features)
prefix_features_shape = prefix_features[0].shape

print(f"A total of {prefix_features_count} numpy arrays with each one having shape {prefix_features_shape}")

print(prefix_features[0])

A total of 13917 numpy arrays with each one having shape (431, 5)
[[ 1.          1.38177329 -1.37905342  0.68075479 -1.37905342]
 [ 1.          1.38177329 -0.48139935  1.         -0.48139935]
 [ 1.          1.38177329  0.85885106  1.38177329  0.85885106]
 ...
 [ 1.         -0.84887249 -0.83378017 -0.84887249  1.37024645]
 [ 1.         -0.84887249  0.51070471 -1.41044612  0.44592305]
 [ 1.         -1.41044612  1.38177329 -0.67526209 -0.88837995]]


In [10]:
#convert each item sales to a tensorflow dataset
sales_datasets = [tf.data.Dataset.from_tensor_slices(item_sales[column]) for column in item_sales.columns[7:]]
sales_datasets[0].element_spec

2023-06-09 23:07:48.155864: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-09 23:07:48.247473: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-09 23:07:48.249175: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-09 23:07:48.254898: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild

TensorSpec(shape=(), dtype=tf.float64, name=None)

In [11]:
def window_dataset(token_time_ds, sales_ds, window_size):
    """
    Create a windowed dataset by combining token_time_ds and sales_ds.

    Args:
        token_time_ds (tf.data.Dataset): Dataset containing token and time information.
        sales_ds (tf.data.Dataset): Dataset containing sales information.
        window_size (int): Size of the window for creating sequences.

    Returns:
        tf.data.Dataset: Windowed dataset with input-output pairs.
    """

    # Create windows of size window_size+1, shifting by 1, and dropping any incomplete windows
    sales_ds = sales_ds.window(window_size+1, shift=1, drop_remainder=True)

    # Flatten the windows into individual datasets and combine them into a single dataset
    sales_ds = sales_ds.flat_map(lambda w: w.batch(window_size+1))

    # Concatenate token_time_ds and sales_ds tensors along the last axis
    windowed_tensors = tf.concat((list(token_time_ds), list(sales_ds)), axis=-1)

    # Create a new dataset from the concatenated tensors
    ds = tf.data.Dataset.from_tensor_slices(windowed_tensors)

    # Map each element of the dataset to input-output pairs
    ds = ds.map(lambda x: (x[:-1], x[-1]))

    return ds


In [12]:
#window the dataset in batches
ds = [window_dataset(prefix_features[i],sales_datasets[i], WINDOW) for i in range(len(sales_datasets))]
del data
del prefix_features
del sales_datasets
del item_sales

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [13]:
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=False, shuffle_size=1000):
    """
    Splits a TensorFlow dataset into training, validation, and test partitions.

    Args:
        ds (tf.data.Dataset): The input dataset.
        ds_size (int): The total size of the input dataset.
        train_split (float, optional): The fraction of data to allocate for training. Defaults to 0.8.
        val_split (float, optional): The fraction of data to allocate for validation. Defaults to 0.1.
        test_split (float, optional): The fraction of data to allocate for testing. Defaults to 0.1.
        shuffle (bool, optional): Whether to shuffle the training dataset. Defaults to True.
        shuffle_size (int, optional): The buffer size used for shuffling. Defaults to 10000.

    Returns:
        tuple: A tuple containing the training, validation, and test partitions of the dataset.
    """
    assert (train_split + test_split + val_split) == 1, "The sum of train_split, val_split, and test_split must be 1."
    
    
    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    
    if shuffle:
      # Specify seed to always have the same split distribution between runs
      train_ds = ds.take(train_size).shuffle(shuffle_size, seed=12)
    else:
      train_ds = ds.take(train_size)
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    
    return train_ds, val_ds, test_ds


In [14]:
#split the datasets to train, val, test
ds = [get_dataset_partitions_tf(items, 431, shuffle_size=BUFFER) for items in ds]

In [67]:
#split into individual sets
train_set = [ds[i][0] for i in range(len(ds))]
val_set = [ds[i][1] for i in range(len(ds))]
test_set = [ds[i][2]for i in range(len(ds))]

In [None]:
#turn into tensors
train_set = tf.data.experimental.from_list(train_set, name = 'train').flat_map(lambda x: x)
val_set = tf.data.experimental.from_list(val_set, name = 'val').flat_map(lambda x: x)
test_set = tf.data.experimental.from_list(test_set, name = 'test').flat_map(lambda x: x)
#find the length of each set using map
train_len = train_set.reduce(0, lambda x, _: x + 1).numpy()
val_len = val_set.reduce(0, lambda x, _: x + 1).numpy()
test_len = test_set.reduce(0, lambda x, _: x + 1).numpy()

print(f"Training set length: {train_len}")
print(f"Validation set length: {val_len}")
print(f"Test set length: {test_len}")


In [19]:
#batch and prefetch the datasets
train_set = train_set.shuffle(BUFFER).batch(BATCH_SIZE).prefetch(1)
val_set = val_set.batch(BUFFER).prefetch(1)
test_set = test_set.batch(BUFFER).prefetch(1)

In [None]:
#save the datasets
tf.data.Dataset.save(train_set, 'train_set')
tf.data.Dataset.save(val_set, 'val_set')
tf.data.Dataset.save(test_set, 'test_set')

# Modelling

In [32]:
#create a list of models based on given learning rates and optimizers
def create_model():
    """
    Creates a list of models based on the learning rates and optimizers given.

    Args:
        learning_rate_array (list): A list of learning rates and optimizers to use for each model.

    Returns:
        list: A list of models.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(filters=64, kernel_size=3,
                               strides=1,
                               activation="relu", padding="causal",
                               input_shape=[25, 1]),
        tf.keras.layers.LSTM(64, return_sequences=True),
        tf.keras.layers.LSTM(64),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    return model


In [43]:
model = create_model()
train_sample = train_set.shuffle(BUFFER).take(train_len//10).batch(BATCH_SIZE).prefetch(1)
val_sample = val_set.shuffle(BUFFER).take(val_len//10).batch(BATCH_SIZE).prefetch(1)

Cause: could not parse the source code of <function <lambda> at 0x7f09588dd990>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x, _: x + 1

Match 1:
lambda x, _: 1



Cause: could not parse the source code of <function <lambda> at 0x7f09588dd990>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x, _: x + 1

Match 1:
lambda x, _: 1



Cause: could not parse the source code of <function <lambda> at 0x7f09588dd990>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x, _: x + 1

Match 1:
lambda x, _: 1

Cause: could not parse the source code of <function <lambda> at 0x7f0958dc5cf0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x, _: x + 1

Match 1:
lambda x, _: 1



Cause: could not parse the source code of <function <lambda> at 0x7f0958dc5cf0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x, _: x + 1

Match 1:
lambda x, _: 1



Cause: could not parse the source code of <function <lambda> at 0x7f0958dc5cf0>: found multiple definitions with identical signatures at the location. This error may be avoided by defining each lambda on a single line and with unique argument names. The matching definitions were:
Match 0:
lambda x, _: x + 1

Match 1:
lambda x, _: 1



KeyboardInterrupt: 

In [36]:
#callback to tune the learning rate
lr_schedule = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-8 * 10**(epoch / 20))

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-8)

model.compile(loss=tf.keras.losses.Huber(),
              optimizer=optimizer,
              metrics=["mae"])

history = model.fit(train_sample, epochs=100, callbacks=[lr_schedule],validation_data = val_sample, verbose=1)


Epoch 1/100


2023-06-10 00:00:50.857716: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8200
2023-06-10 00:00:54.748355: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f03bbffce80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-06-10 00:00:54.748411: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2023-06-10 00:00:54.797046: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-06-10 00:00:55.209424: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


   5639/Unknown - 51s 7ms/step - loss: 0.3214 - mae: 0.3813

KeyboardInterrupt: 