In [833]:
import pandas as pd

In [834]:
# pip install ludwig[full]

In [835]:
# Read the raw training CSV straight from its GitHub URL and preview the first rows.
url = 'https://raw.githubusercontent.com/john-adeojo/walmartdata/main/Walmart%20Store/TRAIN.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [836]:
# Data set splitting
import numpy as np
import hashlib

def split_data(df):
  """Split rows into train / hold-out sets by hashing 'Store_id'.

  Every row of a given store lands in the same set, because the decision
  depends only on the SHA-256 hash of the store id. Stores whose hash modulo
  10 is below 2 (approximately 20% of stores) form the hold-out set.

  Args:
    df: DataFrame with a 'Store_id' column. It is NOT modified.

  Returns:
    (train_set, df_all): ``train_set`` holds only the training rows;
    ``df_all`` holds the hold-out rows followed by the training rows. Both
    carry the extra columns 'hash_val' (the integer hash) and 'set'
    ('hold_out' or 'train').
  """
  # `assign` returns a new frame, so the caller's DataFrame does not silently
  # gain a 'hash_val' column.
  hashed = df.assign(
      hash_val=df['Store_id'].apply(
          lambda x: int(hashlib.sha256(str(x).encode('utf-8')).hexdigest(), 16)))

  # Boolean mask: True for rows that belong to the hold-out set.
  is_holdout = hashed['hash_val'] % 10 < 2

  holdout_set = hashed[is_holdout].copy()
  train_set = hashed[~is_holdout].copy()

  # Tag each row with the name of the set it belongs to.
  holdout_set['set'] = 'hold_out'
  train_set['set'] = 'train'

  df_all = pd.concat([holdout_set, train_set], axis=0)

  return train_set, df_all

# train_set: training rows only; df_all: hold-out rows followed by training rows, tagged via the 'set' column.
train_set, df_all = split_data(df)

In [837]:
def number_encode(df):
  """Encode the categorical columns as integers and derive calendar features.

  The frame is modified IN PLACE and the same object is returned. This is
  kept deliberately: other cells keep using the original variable and rely
  on it carrying the encoded columns.

  Args:
    df: DataFrame with string columns 'Discount', 'Store_Type',
      'Location_Type', 'Region_Code' and a 'Date' column parseable by
      ``pd.to_datetime``.

  Returns:
    The same DataFrame with the four categorical columns replaced by integer
    codes, 'Date' converted to datetime, and new 'DayOfWeek' (Monday=0) and
    'MonthOfYear' columns.

  Raises:
    ValueError: If a categorical column holds a value missing from its
      mapping (this includes a frame that was already encoded). Nothing is
      modified in that case.
  """
  # NOTE(review): the codes skip 3 ('S4', 'L4' and 'R4' map to 4). This looks
  # like a typo but is preserved so existing results do not change - confirm.
  category_codes = {
      'Discount': {'Yes': 1, 'No': 0},
      'Store_Type': {'S1': 0, 'S2': 1, 'S3': 2, 'S4': 4},
      'Location_Type': {'L1': 0, 'L2': 1, 'L3': 2, 'L4': 4, 'L5': 5},
      'Region_Code': {'R1': 0, 'R2': 1, 'R3': 2, 'R4': 4},
  }

  # Validate every column before touching the frame, so a failure cannot
  # leave it half-encoded.
  encoded_columns = {}
  for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
      unknown = sorted(df.loc[unmapped, column].astype(str).unique())
      raise ValueError(
          "Column '{}' contains values with no encoding: {}".format(column, unknown))
    encoded_columns[column] = encoded.astype(int)

  for column, encoded in encoded_columns.items():
    df[column] = encoded

  # Convert 'Date' to datetime
  df['Date'] = pd.to_datetime(df['Date'])

  # Create 'DayOfWeek' and 'MonthOfYear'
  df['DayOfWeek'] = df['Date'].dt.dayofweek
  df['MonthOfYear'] = df['Date'].dt.month

  return df

# number_encode assigns columns on the frame it receives and returns that same
# frame, so train_set_encoded is train_set and all_set_encoded is df_all
# (two names for one object each).
train_set_encoded = number_encode(train_set)
all_set_encoded = number_encode(df_all)

In [838]:
from sklearn.preprocessing import StandardScaler

# Columns that are standardised together by one scaler.
# Note that 'Sales' is part of this list, so it is scaled as well.
sequence_features = ['Sales', 'DayOfWeek', 'MonthOfYear', 'Store_Type', 'Location_Type', 'Region_Code']

# Keep the raw order counts in 'Orders_original' before '#Order' is scaled
# below; the sequence-building cell uses 'Orders_original' as the label.
# NOTE: this cell is not idempotent - re-running it scales already-scaled
# columns and overwrites 'Orders_original' with scaled values.
train_set_encoded['Orders_original'] = train_set_encoded['#Order']
all_set_encoded['Orders_original'] = all_set_encoded['#Order']

# Fit on the training rows only, so hold-out statistics do not leak into the
# scaling, then apply the same transform to both frames.
feature_scaler = StandardScaler()
feature_scaler.fit(train_set_encoded[sequence_features])
train_set_encoded[sequence_features] = feature_scaler.transform(train_set_encoded[sequence_features])
all_set_encoded[sequence_features] = feature_scaler.transform(all_set_encoded[sequence_features])

# '#Order' gets its own scaler, so the feature scaler's fitted statistics are
# not overwritten and both remain available for inverse transforms.
order_scaler = StandardScaler()
order_scaler.fit(train_set_encoded[['#Order']])
train_set_encoded['#Order'] = order_scaler.transform(train_set_encoded[['#Order']])
all_set_encoded['#Order'] = order_scaler.transform(all_set_encoded[['#Order']])

# Backward compatibility: this cell used to end with `scaler` fitted on '#Order'.
scaler = order_scaler


In [839]:
# Inspect the encoded and scaled frame (hold-out rows first, then training rows).
all_set_encoded

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales,hash_val,set,DayOfWeek,MonthOfYear,Orders_original
3,T1000004,251,-0.223948,0.593594,-1.002040,2018-01-01,1,1,-1.517885,-1.272747,9017500930054051502567433507221030433242136850...,hold_out,-1.496987,-1.323177,23
10,T1000011,245,1.609495,-0.771205,-0.253714,2018-01-01,1,1,-1.085051,-0.850758,4999567512076337693454235053624426164377272585...,hold_out,-1.496987,-1.323177,36
24,T1000025,272,-0.223948,1.958393,-0.253714,2018-01-01,1,1,-0.685513,-0.513469,1285565998246243054441738250712298189648882471...,hold_out,-1.496987,-1.323177,48
28,T1000029,268,-0.835095,-0.088805,-1.002040,2018-01-01,1,1,0.213450,0.783158,6300121100003475510973824048995146962150913489...,hold_out,-1.496987,-1.323177,75
30,T1000031,267,1.609495,-0.088805,-1.002040,2018-01-01,1,1,-0.652218,-0.266335,6277985557774950910508828507924182645375487057...,hold_out,-1.496987,-1.323177,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188333,T1188334,339,1.609495,-0.088805,-1.002040,2019-05-31,1,0,0.546399,0.376617,1142475204462047850484627316530716315519920829...,train,0.505464,-0.146513,85
188334,T1188335,151,-0.835095,0.593594,-0.253714,2019-05-31,1,0,-1.018462,-1.033781,6440011109971666630110053009071231257727432540...,train,0.505464,-0.146513,38
188335,T1188336,149,-0.223948,0.593594,-0.253714,2019-05-31,1,1,-0.585628,-0.317150,2568390964900585637524306983364424274399986891...,train,0.505464,-0.146513,51
188336,T1188337,153,1.609495,-0.088805,-1.002040,2019-05-31,1,0,0.712873,0.624502,4434893937680080934441553859218205407746165681...,train,0.505464,-0.146513,90


In [840]:
import pandas as pd
import numpy as np

def transform_data(df, feature_window_size=30, label_window_size=30):
  """Build one fixed-length history/label record per store.

  For every store with at least ``feature_window_size + label_window_size``
  rows, the last ``label_window_size`` rows (by date) provide the labels and
  the ``feature_window_size`` rows immediately before them provide the input
  sequences. Stores with fewer rows are skipped.

  Args:
    df: DataFrame with 'Store_id', a datetime 'Date' column and every column
      named in ``sequence_features`` below. It is sorted IN PLACE by
      ['Store_id', 'Date']; this side effect is kept from the original
      implementation because other cells hold references to the same frame.
    feature_window_size: Number of rows in each input sequence (default 30).
    label_window_size: Number of label rows per store (default 30).

  Returns:
    DataFrame with one row per qualifying store, in ascending 'Store_id'
    order: each sequence feature as a space-separated string ('#Order' is
    renamed to 'Order'), then 'Store_id', then one
    'Order_sequence_label_<YYYY-MM-DD>' column per label step holding the
    'Orders_original' value as a string. Label columns are named after the
    label dates of the first qualifying store, i.e. all stores are assumed to
    share the same label dates. If no store qualifies, an empty frame with
    only the feature and 'Store_id' columns is returned.

  Raises:
    ValueError: If either window size is smaller than 1.
  """
  if feature_window_size < 1 or label_window_size < 1:
    raise ValueError('feature_window_size and label_window_size must be >= 1')

  # Sort DataFrame by 'Store_id' and 'Date' (in place, see docstring)
  df.sort_values(['Store_id', 'Date'], inplace=True)

  # List of sequence features
  sequence_features = ['Orders_original', 'Sales', '#Order', 'Discount', 'DayOfWeek', 'MonthOfYear', 'Holiday', 'Store_Type', 'Location_Type', 'Region_Code']

  total_window = feature_window_size + label_window_size

  records = []       # one dict of feature strings per store
  label_rows = []    # one list of label strings per store
  label_dates = None # label dates of the first qualifying store

  # groupby visits each store once instead of re-filtering the whole frame
  # per store; sort=False keeps the (already sorted) order of appearance.
  for store_id, df_store in df.groupby('Store_id', sort=False):
    if len(df_store) < total_window:
      continue

    history = df_store.iloc[-total_window:-label_window_size]
    future = df_store.iloc[-label_window_size:]

    record = {feature: ' '.join(map(str, history[feature].values)) for feature in sequence_features}
    record['Store_id'] = store_id
    records.append(record)

    label_rows.append([str(value) for value in future['Orders_original'].values])
    if label_dates is None:
      label_dates = [str(day) for day in future['Date'].dt.date.values]

  df_sequences = pd.DataFrame(records, columns=sequence_features + ['Store_id'])
  df_sequences = df_sequences.rename(columns={'#Order': 'Order'})

  # No store has enough history: there are no label dates to name columns after.
  if not records:
    return df_sequences

  label_columns = ['Order_sequence_label_' + day for day in label_dates]
  df_labels = pd.DataFrame(label_rows, columns=label_columns)

  return pd.concat([df_sequences, df_labels], axis=1)

# Build the per-store sequence tables: one from the training rows only and one
# from all rows (hold-out + train). transform_data also sorts its input frame in place.
df_sequences_train = transform_data(train_set_encoded)
df_sequences_all = transform_data(all_set_encoded)
df_sequences_train.head()

Unnamed: 0,Orders_original,Sales,Order,Discount,DayOfWeek,MonthOfYear,Holiday,Store_Type,Location_Type,Region_Code,...,Order_sequence_label_2019-05-22,Order_sequence_label_2019-05-23,Order_sequence_label_2019-05-24,Order_sequence_label_2019-05-25,Order_sequence_label_2019-05-26,Order_sequence_label_2019-05-27,Order_sequence_label_2019-05-28,Order_sequence_label_2019-05-29,Order_sequence_label_2019-05-30,Order_sequence_label_2019-05-31
0,56 49 50 29 33 61 68 69 73 32 48 53 41 48 55 5...,-0.7191607474191822 -0.9338821647659423 -0.912...,-0.41915349610846087 -0.6522177849947374 -0.61...,0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 ...,-0.9963743278180932 -0.4957617152045235 0.0048...,-0.4406789530760858 -0.4406789530760858 -0.440...,0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 ...,-0.8350954358175171 -0.8350954358175171 -0.835...,0.5935940573910438 0.5935940573910438 0.593594...,-1.0020404903955336 -1.0020404903955336 -1.002...,...,57,69,54,65,39,53,57,69,66,62
1,103 101 100 80 89 121 115 109 122 85 107 125 8...,0.5112011382901999 0.34873132060196693 0.25925...,1.145706729270824 1.0791169324461736 1.0458220...,0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 ...,-0.9963743278180932 -0.4957617152045235 0.0048...,-0.4406789530760858 -0.4406789530760858 -0.440...,0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 ...,1.6094947121395253 1.6094947121395253 1.609494...,-0.08880541016086488 -0.08880541016086488 -0.0...,-1.0020404903955336 -1.0020404903955336 -1.002...,...,152,128,132,159,111,121,124,128,126,122
2,39 69 67 68 78 91 38 59 54 57 65 77 55 30 60 5...,-1.1361938271938024 -0.18488814091074188 -0.28...,-0.9851667691179895 0.013680183251766882 -0.05...,0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 ...,-0.9963743278180932 -0.4957617152045235 0.0048...,-0.4406789530760858 -0.4406789530760858 -0.440...,0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 ...,-0.8350954358175171 -0.8350954358175171 -0.835...,-0.7712048777127735 -0.7712048777127735 -0.771...,-0.253713576852066 -0.253713576852066 -0.25371...,...,67,41,67,77,82,75,84,41,61,61
3,49 52 53 56 64 77 62 59 78 63 64 75 46 42 56 5...,-0.7635746071289303 -0.7062611337534833 -0.756...,-0.6522177849947374 -0.5523330897577617 -0.519...,0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 ...,-0.9963743278180932 -0.4957617152045235 0.0048...,-0.4406789530760858 -0.4406789530760858 -0.440...,0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 ...,-0.8350954358175171 -0.8350954358175171 -0.835...,-0.7712048777127735 -0.7712048777127735 -0.771...,0.4946133366914016 0.4946133366914016 0.494613...,...,75,60,57,84,69,72,68,64,67,64
4,68 61 30 73 82 96 92 108 33 69 71 82 67 55 38 ...,-0.4654139165774597 -0.6429060691675191 -1.492...,-0.019614715160558332 -0.2526790040468348 -1.2...,0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 ...,-0.9963743278180932 -0.4957617152045235 0.0048...,-0.4406789530760858 -0.4406789530760858 -0.440...,0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 ...,1.6094947121395253 1.6094947121395253 1.609494...,-0.7712048777127735 -0.7712048777127735 -0.771...,-1.0020404903955336 -1.0020404903955336 -1.002...,...,74,85,82,45,100,85,87,105,110,103


In [None]:
import requests
import yaml
from ludwig.api import LudwigModel

# URL of the raw YAML file in the GitHub repository
url = 'https://raw.githubusercontent.com/john-adeojo/walmartdata/main/Walmart%20Store/transformers_orders.yaml'

# Send a GET request to the URL. The timeout makes a stalled connection fail
# with an exception instead of blocking the kernel indefinitely.
response = requests.get(url, timeout=30)

# Raise an exception if the request was unsuccessful
response.raise_for_status()

# Load the YAML data from the response text (safe_load: no arbitrary object construction)
config = yaml.safe_load(response.text)

# Initialise the Ludwig model from the config dictionary and train it on the
# training-store sequences only.
model = LudwigModel(config=config)
results = model.train(dataset=df_sequences_train)

In [None]:
# from ludwig.api import LudwigModel
# from ludwig.visualize import learning_curves
# from google.colab import drive
# import yaml

# # Mount your Google Drive
# drive.mount('/content/drive')

# # URL of the raw YAML file in the GitHub repository
# url = 'https://raw.githubusercontent.com/john-adeojo/walmartdata/main/Walmart%20Store/transformers.yaml'

# # Send a GET request to the URL
# response = requests.get(url)

# # Raise an exception if the request was unsuccessful
# response.raise_for_status()

# # Load the YAML data from the response text
# config = yaml.safe_load(response.text)

# # Set your output directory path
# output_dir = "/content/drive/My Drive/Data-Centric Solutions/07. Blog Posts/deep learning/results"

# # Set up your experiment
# model = LudwigModel(config=config)
# experiment_results = model.experiment(
#   dataset=df_sequences_train,
#   output_directory=output_dir
# )


In [None]:
# %matplotlib inline

# !ludwig visualize --visualization learning_curves \
#   --output_feature_name Sales_sequence_label_2019-05-31 \
#   --training_statistics "/content/drive/My%20Drive/Data-Centric%20Solutions/07.%20Blog%20Posts/deep%20learning/results/experiment_run/training_statistics.json"

In [None]:
# Predict for every store's sequence (hold-out and train); the second value
# returned by predict is not used here.
predictions, _ = model.predict(dataset=df_sequences_all)
predictions.head()

In [None]:
# Attach the real store ids to the predictions. The row index of `predictions`
# is only a row position, not a store id (Store_id starts at 1), so renaming
# the index to 'Store_id' would attach each prediction to the wrong store.
# Assumes Ludwig returns predictions in the same row order as the input
# dataset (df_sequences_all) - TODO confirm.
if len(predictions) != len(df_sequences_all):
  raise ValueError(
      'predictions has {} rows but df_sequences_all has {}'.format(
          len(predictions), len(df_sequences_all)))
predictions = predictions.reset_index(drop=True).assign(
    Store_id=df_sequences_all['Store_id'].to_numpy())

# Now we convert the wide format dataframe to a long format dataframe
predictions_melted = predictions.melt(id_vars=['Store_id'], var_name='Date', value_name='Predicted_Orders')

# The 'Date' column currently holds the prediction column names, which embed
# the label date as YYYY-MM-DD; extract that date. The pattern is a raw string
# so the backslashes are not treated as (invalid) string escapes.
predictions_melted['Date'] = predictions_melted['Date'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)

# Convert 'Date' column back to datetime format
predictions_melted['Date'] = pd.to_datetime(predictions_melted['Date'])

# Merge with the full dataset. df_all is the same object as all_set_encoded,
# so it already carries a datetime 'Date' and the 'Orders_original' column.
df_analysis = pd.merge(df_all, predictions_melted, on=['Store_id', 'Date'], how='left')

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def calculate_rmse(group):
    """Return the root-mean-squared error between 'Orders_original' and 'Predicted_Orders' for one group."""
    squared_error = mean_squared_error(group['Orders_original'], group['Predicted_Orders'])
    return sqrt(squared_error)

# RMSE per split ('hold_out' vs 'train'), computed only over rows that have a prediction
rmse_by_set = df_analysis.loc[df_analysis['Predicted_Orders'].notnull()].groupby('set').apply(calculate_rmse)

print(rmse_by_set)

In [None]:
import plotly.express as px

# Keep only rows that have a prediction; copy so later column assignments act
# on an independent frame rather than a slice of df_analysis.
plot_data = df_analysis.loc[df_analysis['Predicted_Orders'].notnull()].copy()

# Sample distinct hold-out stores. Sampling rows could pick the same store
# several times, because each store has many rows. The sample size is capped
# so fewer than 10 hold-out stores does not raise, and the seed is fixed so
# the same stores are plotted on every run.
samples = plot_data.loc[plot_data['set'] == 'hold_out', 'Store_id'].drop_duplicates()
samples = samples.sample(n=min(10, len(samples)), random_state=42)
samples_list = list(samples)

for store_id in samples_list:
  # Select the data for a specific store
  df_store = plot_data[plot_data['Store_id'] == store_id]

  # Line plot of actual orders over time
  fig = px.line(df_store, x='Date', y='Orders_original', title='Orders Over Time for Store {}'.format(store_id))

  # Add a line for predicted orders. Figure.add_scatter is used because
  # plotly.graph_objects (`go`) is never imported in this notebook.
  fig.add_scatter(x=df_store['Date'], y=df_store['Predicted_Orders'], mode='lines', name='Predicted Orders')

  fig.show()

In [None]:
# Prediction error = actual - predicted. `assign` returns a new frame, which
# avoids pandas' SettingWithCopyWarning when plot_data is a slice of another frame.
plot_data = plot_data.assign(error=plot_data['Orders_original'] - plot_data['Predicted_Orders'])

In [None]:
import plotly.express as px

# Histogram of prediction errors, coloured by the 'set' column ('train' vs 'hold_out')
fig = px.histogram(plot_data, x="error", color="set", nbins=30,
                   labels={"error": "Error"},
                   title="Histogram of Error by Category",
                   template='plotly_white')

fig.show()
