In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, auc, roc_curve, f1_score, r2_score, confusion_matrix, mean_squared_error

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras import layers
from datetime import datetime
from keras.datasets import mnist
from keras.layers import Dense, LSTM

In [18]:
df = pd.read_excel("Walmart_clusters.xlsx")
class_0 = df[df["cluster_sklearn"] == 0]

In [19]:
class_0.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,yyyy,mm,dd,week_relative,week_absolute,Date,cluster_sklearn
0,3,377219,0,65.065,3.234,216.539,7.567,2010,1,1,52,53,2010-01-01,0
1,5,309443,0,61.722,3.234,213.766,6.639,2010,1,1,52,53,2010-01-01,0
2,7,669225,0,27.723,3.179,191.968,8.873,2010,1,1,52,53,2010-01-01,0
3,9,494392,0,59.275,3.234,216.77,6.476,2010,1,1,52,53,2010-01-01,0
4,15,576183,0,37.713,3.6,133.786,7.9,2010,1,1,52,53,2010-01-01,0


In [20]:
class_0 = class_0.drop(columns=class_0[["Store", "Holiday_Flag", "cluster_sklearn","week_absolute"]])


In [21]:
class_0.columns

Index(['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
       'yyyy', 'mm', 'dd', 'week_relative', 'Date'],
      dtype='object')

In [22]:
class_0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2821 entries, 0 to 2820
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Weekly_Sales   2821 non-null   int64         
 1   Temperature    2821 non-null   float64       
 2   Fuel_Price     2821 non-null   float64       
 3   CPI            2821 non-null   float64       
 4   Unemployment   2821 non-null   float64       
 5   yyyy           2821 non-null   int64         
 6   mm             2821 non-null   int64         
 7   dd             2821 non-null   int64         
 8   week_relative  2821 non-null   int64         
 9   Date           2821 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(5)
memory usage: 242.4 KB


In [23]:
class_0["Temperature"] = class_0["Temperature"].astype(object)
class_0["Fuel_Price"] = class_0["Fuel_Price"].astype(object)
class_0["CPI"] = class_0["CPI"].astype(object)
class_0["Unemployment"] = class_0["Unemployment"].astype(object)


In [24]:
class_0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2821 entries, 0 to 2820
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Weekly_Sales   2821 non-null   int64         
 1   Temperature    2821 non-null   object        
 2   Fuel_Price     2821 non-null   object        
 3   CPI            2821 non-null   object        
 4   Unemployment   2821 non-null   object        
 5   yyyy           2821 non-null   int64         
 6   mm             2821 non-null   int64         
 7   dd             2821 non-null   int64         
 8   week_relative  2821 non-null   int64         
 9   Date           2821 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(4)
memory usage: 242.4+ KB


In [27]:
x1 = pd.to_numeric(class_0["Weekly_Sales"], errors="coerce").to_numpy()
x2 = pd.to_numeric(class_0["Temperature"], errors="coerce").to_numpy()
x3 = pd.to_numeric(class_0["Fuel_Price"], errors="coerce").to_numpy()
x4 = pd.to_numeric(class_0["CPI"], errors="coerce").to_numpy()
x5 = pd.to_numeric(class_0["Unemployment"], errors="coerce").to_numpy()
x6 = pd.to_numeric(class_0["yyyy"], errors="coerce").to_numpy()
x7 = pd.to_numeric(class_0["mm"], errors="coerce").to_numpy()
x8 = pd.to_numeric(class_0["dd"], errors="coerce").to_numpy()
x9 = pd.to_numeric(class_0["week_relative"], errors="coerce").to_numpy()
x10 = pd.to_numeric(class_0["Date"], errors="coerce").to_numpy()

In [39]:
x = np.concatenate([x1.reshape(-1,1), 
                   x2.reshape(-1,1),
                   x3.reshape(-1,1),
                   x4.reshape(-1,1),
                   x5.reshape(-1,1),
                   x6.reshape(-1,1),
                   x7.reshape(-1,1),
                   x8.reshape(-1,1),
                   x9.reshape(-1,1),
                   x10.reshape(-1,1)],
                   axis=1)

In [40]:
x.shape

(2821, 10)

In [41]:
x

array([[3.7721900e+05, 6.5065000e+01, 3.2340000e+00, ..., 1.0000000e+00,
        5.2000000e+01, 1.2623040e+18],
       [3.0944300e+05, 6.1722000e+01, 3.2340000e+00, ..., 1.0000000e+00,
        5.2000000e+01, 1.2623040e+18],
       [6.6922500e+05, 2.7723000e+01, 3.1790000e+00, ..., 1.0000000e+00,
        5.2000000e+01, 1.2623040e+18],
       ...,
       [6.1937000e+05, 7.1140000e+01, 3.6010000e+00, ..., 2.8000000e+01,
        1.5700000e+02, 1.3566528e+18],
       [3.3779600e+05, 5.5100000e+01, 3.7970000e+00, ..., 2.8000000e+01,
        1.5700000e+02, 1.3566528e+18],
       [7.3446400e+05, 5.4470000e+01, 4.0000000e+00, ..., 2.8000000e+01,
        1.5700000e+02, 1.3566528e+18]])

In [47]:
seqLen = 4
featNum = 10
xSeq, y = ConvertDataToSequence(x, seqLen, featNum)
print(f"sequence shape: {xSeq.shape}")
print(f"output shape: {y.shape}")

NameError: name 'ConvertDataToSequence' is not defined