# Imports and Constants

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorboard
import random as rand

In [2]:
# Number of consecutive sales values used as inputs for one sample; the
# windowing code below builds windows of WINDOW inputs plus one target value.
WINDOW = 20
# Not referenced in the visible cells — presumably the training mini-batch size (TODO confirm).
BATCH_SIZE = 32
# Not referenced in the visible cells — presumably a tf.data shuffle buffer size (TODO confirm).
BUFFER = 100

# Load data

In [3]:
# Colab-only step: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#filepath = 'sale_quantity.csv' #for local imports
filepath = '/content/drive/MyDrive/Documents/uni_work/Bangkit2023/batch1/capstone/repo/ml_modeling/sales_quantity.csv' #colab

# Item codes are identifiers, not numbers (some carry leading zeros such as
# 00000001, others are alphanumeric). Read them explicitly as strings so that
# dtype inference can neither strip leading zeros nor produce a mixed int/str
# column, which would break the later `.str` operations on the item columns.
data = pd.read_csv(
    filepath,
    names=['date', 'item_code', 'quantity'],
    header=0,
    dtype={'item_code': str},
)
data.head()

Unnamed: 0,date,item_code,quantity
0,2022-08-26,1000,15
1,2022-08-26,500,14
2,2023-01-01,8991102380706,13
3,2023-01-01,8991102381017,13
4,2023-01-01,8886008101053,20


In [5]:
# Transform data

In [6]:
# Parse the date column, then derive calendar features from it
data['date'] = pd.to_datetime(data['date'])
date_parts = data['date'].dt

# Features whose column name matches the pandas accessor attribute
for feature_name in ('year', 'month', 'day'):
    data[feature_name] = getattr(date_parts, feature_name)

# Features whose column name differs from the accessor attribute
data['day_of_week'] = date_parts.dayofweek  # Monday=0 ... Sunday=6
data['day_of_year'] = date_parts.dayofyear



In [7]:
# Pivot to one row per date and one column per item_code.
# The date-derived columns are part of the group key only so that they
# survive as regular columns after the index is reset.
date_keys = ['date', 'year', 'month', 'day', 'day_of_week', 'day_of_year']
item_sales = (
    data
    .groupby(['item_code'] + date_keys)['quantity']
    .sum()
    .unstack(level=0)   # item_code (level 0) becomes the columns
    .fillna(0)          # an item with no sales on a date counts as 0
    .reset_index()
)
item_sales.head()

item_code,date,year,month,day,day_of_week,day_of_year,(90)NA18210500154(91)2403,(90)NA18211207820(91)2410,00000001,00000002,...,CL000448327,CL000450943,COS LT,COSLT-228,EC0102190002,EC0102191301,EC0103190002,EC0106190101,MP-2203,SLM0958266
0,2022-01-03,2022,1,3,0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-01-04,2022,1,4,1,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-01-05,2022,1,5,2,5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022-01-06,2022,1,6,3,6,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-01-07,2022,1,7,4,7,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
import tensorflow as tf

def create_tokenizer(item_code):
    """
    Build a Keras text tokenizer whose vocabulary is the given item codes.

    Args:
        item_code (list or Series): Item codes to fit the tokenizer on.

    Returns:
        tf.keras.preprocessing.text.Tokenizer: Tokenizer fitted on the item codes.
    """
    # filters='' disables character stripping and lower=False disables
    # lower-casing, so each code is kept exactly as it was passed in.
    fitted = tf.keras.preprocessing.text.Tokenizer(lower=False, filters='')
    fitted.fit_on_texts(item_code)
    return fitted

# Item columns start at position 6 of item_sales; spaces are removed so that
# every item code is handled as a single token.
item_code_tokens = item_sales.columns[6:].str.replace(' ', '')
tokenizer = create_tokenizer(item_code_tokens)

# Vocabulary size = number of distinct tokens the tokenizer learned
tokenizer_word_count = len(tokenizer.word_index)

print(f'Tokenizer has {tokenizer_word_count} tokens')

Tokenizer has 13917 tokens


In [9]:
import numpy as np

# Calendar features for every row (date) of item_sales
dates = np.array(item_sales[['month', 'day', 'day_of_week', 'day_of_year']])

# NOTE(review): this is labelled a cyclic encoding, but the raw values are
# passed to sin/cos as radians without scaling by each feature's period
# (e.g. 2*pi*month/12), and summing sin and cos merges them into a single
# value. Left unchanged here because altering it changes the features any
# downstream model sees — confirm the intended encoding.
dates_cyclic = np.sin(dates) + np.cos(dates)

# Only rows from WINDOW onward are used: row j is the target date of the
# window covering rows j-WINDOW .. j.
target_dates = dates_cyclic[WINDOW:].astype(np.float64)
n_targets = len(target_dates)

# One (n_targets, 5) array per product: [token id, month, day, day_of_week,
# day_of_year]. Built with a vectorised column_stack instead of a per-row
# Python loop; the date block is shared by every product.
prefix_features = [
    np.column_stack([
        np.full(n_targets, tokenizer.word_index[product], dtype=np.float64),
        target_dates,
    ])
    for product in item_sales.columns[6:].str.replace(' ', '')
]

# Get the total number of prefix features arrays and the shape of the first array
prefix_features_count = len(prefix_features)
prefix_features_shape = prefix_features[0].shape

print(f"A total of {prefix_features_count} numpy arrays with each one having shape {prefix_features_shape}")

print(prefix_features[0])

A total of 13917 numpy arrays with each one having shape (431, 5)
[[ 1.          1.38177329 -1.37905342  0.68075479 -1.37905342]
 [ 1.          1.38177329 -0.48139935  1.         -0.48139935]
 [ 1.          1.38177329  0.85885106  1.38177329  0.85885106]
 ...
 [ 1.         -0.84887249 -0.83378017 -0.84887249  1.37024645]
 [ 1.         -0.84887249  0.51070471 -1.41044612  0.44592305]
 [ 1.         -1.41044612  1.38177329 -0.67526209 -0.88837995]]


In [10]:
# Convert each item's sales column into its own tf.data.Dataset.
# Item columns start at position 6 (after date, year, month, day, day_of_week,
# day_of_year). This must be the same slice used for the tokenizer and the
# prefix features so that index i refers to the same item in every list;
# slicing from 7 silently dropped the first item and shifted the pairing.
sales_datasets = [tf.data.Dataset.from_tensor_slices(item_sales[column]) for column in item_sales.columns[6:]]
sales_datasets[0].element_spec

TensorSpec(shape=(), dtype=tf.float64, name=None)

In [11]:
def window_dataset(token_time_ds, sales_ds, window_size):
    """
    Create a windowed dataset by combining token_time_ds and sales_ds.

    Each output element is a pair (features, target) where features is the
    prefix row followed by the first window_size sales values of a window,
    and target is the sales value that immediately follows them.

    The pipeline is built lazily with tf.data, so neither input is
    materialised into Python lists.

    Args:
        token_time_ds (tf.data.Dataset or array-like): One prefix row (token
            and time information) per window. Array-likes are sliced along
            their first axis.
        sales_ds (tf.data.Dataset): Dataset of scalar sales values in time order.
        window_size (int): Number of past sales values used as inputs.

    Returns:
        tf.data.Dataset: Dataset of (features, target) pairs. Its length is the
        shorter of the number of prefix rows and the number of complete windows.
    """
    # Accept either a ready-made dataset or an array of per-window rows
    if not isinstance(token_time_ds, tf.data.Dataset):
        token_time_ds = tf.data.Dataset.from_tensor_slices(token_time_ds)

    # window_size inputs plus one target value per window
    span = window_size + 1

    # Sliding windows with stride 1; incomplete trailing windows are dropped
    windows = sales_ds.window(span, shift=1, drop_remainder=True)

    # Each window is itself a dataset; batch it into a single tensor of length
    # span (drop_remainder keeps that length static in the element spec)
    windows = windows.flat_map(lambda w: w.batch(span, drop_remainder=True))

    # Pair the k-th prefix row with the k-th sales window
    paired = tf.data.Dataset.zip((token_time_ds, windows))

    def to_example(prefix, window):
        # Inputs: prefix followed by all but the last sales value; target: last value
        features = tf.concat([prefix, window[:-1]], axis=-1)
        return features, window[-1]

    return paired.map(to_example)


In [None]:
# Build one windowed (features, target) dataset per item by pairing each
# item's prefix features with its sales dataset.
windowed = [
    window_dataset(prefix, sales, WINDOW)
    for prefix, sales in zip(prefix_features, sales_datasets)
]

# Inspect the structure of one dataset instead of dumping the whole list
windowed[0].element_spec