In [370]:
import pandas as pd

In [371]:
# pip install ludwig[full]

In [372]:
# Location of the raw training CSV in the GitHub repository.
url = 'https://raw.githubusercontent.com/john-adeojo/walmartdata/main/Walmart%20Store/TRAIN.csv'
# Read the CSV over HTTP into a DataFrame.
df = pd.read_csv(url)
# Preview the first five rows as the cell output.
df.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [373]:
# Parse the 'Date' column into pandas datetime values (overwrites the column on `df`).
df['Date'] = pd.to_datetime(df['Date'])
# Independent snapshot of the frame as it is at this point (dates parsed, nothing else changed).
df_original = df.copy()

In [374]:
# Data set splitting
import numpy as np
import hashlib

def split_data(df):
  """Split rows into train / hold-out sets by hashing 'Store_id'.

  Every store id is hashed with SHA-256 and reduced modulo 10; ids whose
  remainder is 0 or 1 are held out. The assignment is therefore deterministic
  and all rows of one store always land in the same set.

  The caller's DataFrame is not modified: the hash column is attached to a
  new frame rather than written into `df`.

  Args:
    df: DataFrame with a 'Store_id' column.

  Returns:
    (train_set, df_predictions) where `train_set` holds the non-hold-out rows
    and `df_predictions` is the hold-out rows followed by the train rows.
    Both frames carry two extra columns: 'hash_val' (the integer hash) and
    'set' ('hold_out' or 'train').
  """
  def _stable_hash(store_id):
    # SHA-256 of the id's text form, read as one big integer.
    return int(hashlib.sha256(str(store_id).encode('utf-8')).hexdigest(), 16)

  # assign() returns a new frame, so `df` keeps exactly the columns it came in with.
  hashed = df.assign(hash_val=df['Store_id'].apply(_stable_hash))

  # Remainders 0 and 1 out of 10 -> roughly 20% of the store ids are held out.
  is_holdout = hashed['hash_val'] % 10 < 2

  # Tag each subset with the name of the set it belongs to.
  holdout_set = hashed[is_holdout].assign(set='hold_out')
  train_set = hashed[~is_holdout].assign(set='train')

  df_predictions = pd.concat([holdout_set, train_set], axis=0)

  return train_set, df_predictions

# Split by store: `train_set` gets the training rows, `df_predictions` gets the
# hold-out rows followed by the training rows, each tagged in the 'set' column.
train_set, df_predictions = split_data(df)

In [375]:
import pandas as pd
import numpy as np

# Rebind `df` to the training rows. This is an alias of `train_set`, not a copy.
df = train_set

def transform_data(df, feature_window_size=30, label_window_size=7):
  """Turn long-format daily rows into one wide row per store.

  For every store with at least `feature_window_size + label_window_size`
  rows, the last `label_window_size` days (by 'Date') become label columns and
  the `feature_window_size` days immediately before them become sequence
  features, each stored as a single space-separated string.

  The input frame is not modified; all derived columns and the sort are applied
  to a new frame, so the function can be called repeatedly on the same raw data.

  Args:
    df: DataFrame with the columns 'Store_id', 'Date', 'Sales', '#Order',
      'Discount' (values 'Yes' / 'No') and 'Holiday'.
    feature_window_size: number of days in each feature sequence.
    label_window_size: number of trailing days used as labels.

  Returns:
    DataFrame with one row per qualifying store and the columns
    'Sales', 'Order', 'Discount', 'DayOfWeek', 'MonthOfYear', 'Holiday'
    (space-separated strings), 'Store_id', and one
    'Sales_sequence_label_<YYYY-MM-DD>' column per label day. Label values
    are kept as strings.

  Raises:
    ValueError: if a window size is smaller than 1, or if no store has enough
      rows to fill both windows.
  """
  if feature_window_size < 1 or label_window_size < 1:
    raise ValueError('feature_window_size and label_window_size must both be >= 1')

  # Build the derived columns on a new frame so the caller's data stays raw.
  prepared = df.assign(
      # 'Yes' / 'No' -> 1 / 0
      Discount=df['Discount'].map({'Yes': 1, 'No': 0}).astype(int),
      Date=pd.to_datetime(df['Date']),
  )
  prepared = prepared.assign(
      DayOfWeek=prepared['Date'].dt.dayofweek,
      MonthOfYear=prepared['Date'].dt.month,
  ).sort_values(['Store_id', 'Date'])

  sequence_features = ['Sales', '#Order', 'Discount', 'DayOfWeek', 'MonthOfYear', 'Holiday']
  rows_needed = feature_window_size + label_window_size

  records = []       # one dict of feature strings (+ Store_id) per store
  label_values = []  # one list of label sales (as strings) per store
  label_dates = []   # one list of 'YYYY-MM-DD' strings per store

  # sort=False keeps the groups in order of appearance, i.e. ascending Store_id
  # after the sort above.
  for store_id, store_rows in prepared.groupby('Store_id', sort=False):
    # Skip stores that cannot fill both windows.
    if len(store_rows) < rows_needed:
      continue

    history = store_rows.iloc[-rows_needed:-label_window_size]
    future = store_rows.iloc[-label_window_size:]

    record = {
        feature: ' '.join(map(str, history[feature].values))
        for feature in sequence_features
    }
    record['Store_id'] = store_id
    records.append(record)
    label_values.append([str(value) for value in future['Sales'].values])
    label_dates.append([str(day) for day in future['Date'].dt.date.values])

  if not records:
    raise ValueError(
        'no store has at least %d rows (feature_window_size + label_window_size)' % rows_needed
    )

  # Label columns are named after the FIRST store's label dates; every other
  # store's labels are placed positionally under those names.
  label_columns = ['Sales_sequence_label_' + day for day in label_dates[0]]

  df_sequences = pd.DataFrame(records).rename(columns={'#Order': 'Order'})
  df_labels = pd.DataFrame(label_values, columns=label_columns)

  return pd.concat([df_sequences, df_labels], axis=1)

# One wide row per store, built from the training rows only.
df_sequences = transform_data(train_set)
# Same transformation over the hold-out + training rows. This rebinds
# `df_predictions` from the long per-day frame to the wide per-store frame, so
# per-day columns such as 'Date' and 'set' are no longer available under this name.
df_predictions = transform_data(df_predictions)

Unnamed: 0,Sales,Order,Discount,DayOfWeek,MonthOfYear,Holiday,Store_id,Sales_sequence_label_2019-05-25,Sales_sequence_label_2019-05-26,Sales_sequence_label_2019-05-27,Sales_sequence_label_2019-05-28,Sales_sequence_label_2019-05-29,Sales_sequence_label_2019-05-30,Sales_sequence_label_2019-05-31
0,27207.0 14484.0 56808.0 27330.0 36516.0 15531....,53 26 100 49 65 27 97 53 80 94 116 88 92 35 34...,0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 ...,3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 ...,4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...,1,40554.0,25035.0,33075.0,37317.0,44652.0,42387.0,39843.78
1,61110.0 28419.0 96768.0 61542.0 69372.0 46215....,122 56 186 118 136 89 284 169 214 229 258 207 ...,0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 ...,3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 ...,4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...,3,94548.0,66036.0,69930.0,72540.0,76428.0,78135.0,75790.95000000001
2,22365.0 38832.0 28824.0 76044.0 33642.0 52155....,39 70 53 139 59 90 77 84 90 116 52 72 62 63 59...,0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 ...,3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 ...,4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...,4,49185.0,53550.0,48219.0,55194.0,25938.0,37119.0,36747.81
3,34671.0 25866.0 69819.0 46002.0 46914.0 26733....,61 45 115 77 78 45 125 78 91 103 102 88 100 60...,0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 ...,3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 ...,4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...,5,55830.0,45726.0,48849.0,46806.0,43197.0,46737.0,44867.52
4,17487.0 64281.0 26178.0 59190.0 22779.0 84000....,35 123 50 111 43 153 74 123 119 157 168 152 55...,0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 ...,3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 5 6 0 1 2 3 4 ...,4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...,6,27150.0,60165.0,52029.0,53100.0,67809.0,72012.0,67691.28


In [376]:
# import pandas as pd
# import numpy as np

# # Convert 'Discount' column to binary
# df['Discount'] = df['Discount'].map({'Yes': 1, 'No': 0}).astype(int)

# # Convert 'Date' to datetime
# df['Date'] = pd.to_datetime(df['Date'])

# # Create 'DayOfWeek' and 'MonthOfYear'
# df['DayOfWeek'] = df['Date'].dt.dayofweek
# df['MonthOfYear'] = df['Date'].dt.month

# # Sort DataFrame by 'Store_id' and 'Date'
# df.sort_values(['Store_id', 'Date'], inplace=True)

# # List of sequence features
# sequence_features = ['Sales', '#Order', 'Discount', 'DayOfWeek', 'MonthOfYear', 'Holiday']

# # Window size for features and labels (3 days for example)
# feature_window_size = 30
# label_window_size = 7

# # List to store sequences
# sequences = []

# # Generate sequences for each store
# for store_id in df['Store_id'].unique():
#     df_store = df[df['Store_id'] == store_id]

#     # Check if store has enough data for the window
#     if len(df_store) >= (feature_window_size + label_window_size):
#         sequence = {feature: ' '.join(map(str, df_store[feature].iloc[-feature_window_size-label_window_size:-label_window_size].values)) for feature in sequence_features}
#         sequence['Sales_sequence_label'] = ' '.join(map(str, df_store['Sales'].iloc[-label_window_size:].values))
#         sequence['Store_id'] = store_id
#         sequences.append(sequence)

# # Convert list of sequences to DataFrame
# df_sequences = pd.DataFrame(sequences)

# df_sequences.rename(columns={'#Order': 'Order'}, inplace=True)
# # df_sequences

# # Split 'Sales_sequence_label' into list of values
# df_sequences['Sales_sequence_label'] = df_sequences['Sales_sequence_label'].str.split(' ')

# # Determine the maximum length of sales sequences
# max_length = df_sequences['Sales_sequence_label'].str.len().max()

# # Convert list into separate columns
# sales_columns = df_sequences['Sales_sequence_label'].apply(pd.Series)

# # Rename columns
# sales_columns = sales_columns.rename(columns = lambda x : 'Sales_sequence_label_' + str(x))

# # Concatenate the sales_columns dataframe with the original dataframe
# df_sequences = pd.concat([df_sequences[:], sales_columns[:]], axis=1)

# # Drop the original 'Sales_sequence_label' column
# df_sequences = df_sequences.drop('Sales_sequence_label', axis=1)
# df_sequences


# Plotting time series for store ID 1

In [377]:
# import pandas as pd
# import plotly.express as px
# import plotly.graph_objects as go

# def plot_data(df, column):
#     fig = px.line(df, x='Date', y=column)
#     fig.update_xaxes(tickangle=90)  # Rotate x-axis labels
#     fig.show()

# plot_data(df_example_store, 'Sales')
# plot_data(df_example_store, '#Order')

In [378]:
# # Data set splitting
# import numpy as np
# import hashlib

# def split_data(df):
#   # Create a new column 'hash_val' that is the hash of the 'Store_id' column
#   df['hash_val'] = df['Store_id'].apply(lambda x: int(hashlib.sha256(str(x).encode('utf-8')).hexdigest(), 16))

#   # Use the 'hash_val' column to create a boolean mask for the holdout set
#   is_holdout = df['hash_val'] % 10 < 2  # Approximately 20% will be in the holdout set

#   # Create the holdout and train sets
#   holdout_set = df[is_holdout].copy()
#   train_set = df[~is_holdout].copy()

#   # Add a 'set' column to each set
#   holdout_set['set'] = 'hold_out'
#   train_set['set'] = 'train'

#   return holdout_set, train_set

In [379]:
# holdout_set

In [380]:
# Concatenate the holdout and train sets
# df_predictions = pd.concat([holdout_set, train_set], axis=0)
# df_predictions.head()

In [381]:
import requests
import yaml
from ludwig.api import LudwigModel

# URL of the raw YAML file in the GitHub repository
url = 'https://raw.githubusercontent.com/john-adeojo/walmartdata/main/Walmart%20Store/timeseries.yaml'

# Send a GET request to the URL. requests applies no timeout by default, so an
# explicit one keeps a stalled connection from hanging the notebook forever.
response = requests.get(url, timeout=30)

# Raise an exception if the request was unsuccessful
response.raise_for_status()

# Parse the YAML text; safe_load builds plain Python objects only.
config = yaml.safe_load(response.text)

# Initialise the Ludwig model from the config dictionary and train it on the
# per-store sequence frame.
model = LudwigModel(config=config)
results = model.train(dataset=df_sequences)

In [382]:
# Run the trained model on the wide frame that covers hold-out and training stores;
# the second element of the returned pair is discarded.
# NOTE(review): the saved output of this cell is a KeyError whose message was not
# captured, so the cause still has to be diagnosed before the cells below can run.
predictions, _ = model.predict(dataset=df_predictions)
predictions.head()

KeyError: ignored

In [None]:
# Prediction column names are expected to embed the label date, e.g.
# 'Sales_sequence_label_YYYY-MM-DD_predictions'. Raw string: '\d' in a normal
# string literal is an invalid escape sequence.
date_pattern = r'(\d{4}-\d{2}-\d{2})'

# Keep only the columns whose name carries a date. Building a new frame instead
# of rebinding `predictions` keeps this cell safe to re-run.
predictions_wide = predictions.filter(regex=date_pattern).copy()

# The wide `df_predictions` frame has a plain 0..n-1 index, so the store id must
# come from its 'Store_id' column rather than from the row index.
# Assumes predict() returns rows in the same order as its input - TODO confirm.
# A length mismatch raises here instead of silently mislabelling stores.
predictions_wide['Store_id'] = df_predictions['Store_id'].to_numpy()

# Wide -> long: one row per (store, label day). The value column is named
# 'Sales_predictions' because that is the name the RMSE and error cells read.
predictions_melted = predictions_wide.melt(id_vars=['Store_id'], var_name='Date', value_name='Sales_predictions')

# Pull the date out of the column name and convert it to datetime.
predictions_melted['Date'] = pd.to_datetime(
    predictions_melted['Date'].str.extract(date_pattern, expand=False)
)

# `df_predictions` now holds the wide per-store frame (no 'Date', 'set' or scalar
# 'Sales'), so the per-day actuals with their train / hold_out tag are rebuilt
# from the untouched snapshot of the raw data.
_, actuals_long = split_data(df_original.copy())

# Left join: days without a prediction keep NaN in 'Sales_predictions'.
df_analysis = pd.merge(actuals_long, predictions_melted, on=['Store_id', 'Date'], how='left')
df_analysis.head()


In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def calculate_rmse(group):
    """Root-mean-squared error between actual and predicted sales.

    Only rows where both 'Sales' and 'Sales_predictions' are present are
    scored; the frame this is applied to comes from a left merge, so rows
    without a prediction hold NaN and would otherwise make the metric fail.

    Args:
        group: DataFrame with numeric 'Sales' and 'Sales_predictions' columns.

    Returns:
        The RMSE as a float, or NaN when no row has both values.
    """
    scored = group[['Sales', 'Sales_predictions']].dropna()
    if scored.empty:
        return float('nan')
    squared_error = (scored['Sales'] - scored['Sales_predictions']) ** 2
    return sqrt(squared_error.mean())

# One RMSE per distinct value of the 'set' column; the result is indexed by that value.
rmse_by_set = df_analysis.groupby('set').apply(calculate_rmse)

print(rmse_by_set)

In [None]:
# import plotly.express as px

# # Select the data for a specific store
# store_id = '105'  # replace with your store id
# df_store = df_final[df_final['Store_id'] == store_id]

# # Create a line plot of sales over time
# fig = px.line(df_store, x='Date', y='Sales', title='Sales Over Time for Store {}'.format(store_id))

# # Add a line for predicted sales
# fig.add_trace(go.Scatter(x=df_store['Date'], y=df_store['Sales_predictions'], mode='lines', name='Predicted Sales'))

# fig.show()

In [None]:
# Signed residual per row: actual minus predicted sales, so a positive value
# means the prediction was too low. Adds the column to `df_analysis` in place.
df_analysis['error'] = df_analysis['Sales'] - df_analysis['Sales_predictions']

In [None]:
import plotly.express as px

# Histogram of the 'error' column in 30 bins, coloured by the value of the 'set' column.
fig = px.histogram(df_analysis, x="error", color="set", nbins=30,
                   labels={"error": "Error"},
                   title="Histogram of Error by Category",
                   template='plotly_white')

# Render the figure as the cell output.
fig.show()
