# 1. Loading Libraries

In [None]:
import pandas as pd
import numpy as np
from pandas import datetime
from sklearn import preprocessing

import h5py

import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
%matplotlib inline

# 2. Loading Data

In [None]:
# %% Locations of the raw CSV files

# Directory prefix that is prepended to every file name when loading.
path = "../input/"

# File names of the four raw tables.
filename_store = "stores.csv"
filename_feature = "features.csv"
filename_train = "train.csv"
filename_test = "test.csv"

In [None]:
# %% Read the four CSV files into DataFrames

df_store, df_feature, df_train, df_test = (
    pd.read_csv(path + csv_name)
    for csv_name in (filename_store, filename_feature, filename_train, filename_test)
)

# 3. Explore Data

### 3.1 Store Data

In [None]:
# Show the first and last five rows, the shape and the per-column count of
# missing values of the store table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
print(pd.concat([df_store.head(), df_store.tail()]),"\n")
print("Structure of Store:\n",df_store.shape, "\n")
print("Number of missing values:\n",df_store.isnull().sum().sort_values(ascending=False),"\n")

### 3.2 Feature Data

In [None]:
# Show the first and last five rows, the shape, summary statistics and the
# per-column count of missing values of the feature table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
print(pd.concat([df_feature.head(), df_feature.tail()]),"\n")
print("Structure of Feature: ",df_feature.shape,"\n")
print("Summary Statistic:\n",df_feature.describe(),"\n")
print("Count of missing values:\n",df_feature.isnull().sum().sort_values(ascending=False),"\n")

In [None]:
# %% Per-column overview of the feature table: percentage missing and dtype

feature_percent_missing = df_feature.isnull().sum().mul(100).div(len(df_feature))
feature_data_type = df_feature.dtypes

feature_summary = pd.DataFrame(
    {
        "Percent_missing": feature_percent_missing.round(2),
        "Datatypes": feature_data_type,
    }
)

feature_summary

### 3.3 Train Data

In [None]:
# Show the first and last five rows, the shape and summary statistics of the
# training table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
print(pd.concat([df_train.head(), df_train.tail()]),"\n")
print("Structure of train:\n",df_train.shape,"\n")
print("Summary Statistic:\n",df_train.describe(),"\n")

In [None]:
# %% Per-column overview of the training table: percentage missing and dtype

train_percent_missing = df_train.isnull().sum().mul(100).div(len(df_train))
train_datatype = df_train.dtypes

train_summary = pd.DataFrame(
    {
        "Percent_Missing": train_percent_missing.round(2),
        "Datatypes": train_datatype,
    }
)

train_summary

### 3.4 Test Data

In [None]:
# Show the first and last five rows, the shape and summary statistics of the
# test table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
print(pd.concat([df_test.head(), df_test.tail()]),"\n")
print("Structure of test:\n",df_test.shape,"\n")
print("Summary Statistic:\n",df_test.describe(),"\n")

In [None]:
# Per-column overview of the test table: dtype and percentage missing

test_percent_missing = df_test.isnull().sum().mul(100).div(len(df_test))
test_datatypes = df_test.dtypes

test_summary = pd.DataFrame(
    {
        "Datatypes": test_datatypes,
        "Percent_Missing": test_percent_missing.round(2),
    }
)

test_summary

In [None]:
# Give the 'Date' column the same datetime dtype in every table that has one,
# so the tables can later be joined and filtered on it.

for dated_frame in (df_feature, df_train, df_test):
    dated_frame['Date'] = pd.to_datetime(dated_frame['Date'], format="%Y-%m-%d")

# 4. Joining Tables

In [None]:
# Attach the store attributes to every train/test row (left join on 'Store').
combined_train = df_train.merge(df_store, how="left", on="Store")
combined_test = df_test.merge(df_store, how="left", on="Store")

print(combined_train.head(),"\n", combined_train.shape,"\n")
print(combined_test.head(),"\n", combined_test.shape,"\n")

In [None]:
# Add the per-store, per-date features (inner join on 'Store' and 'Date').
feature_join_keys = ["Store", "Date"]
combined_train = combined_train.merge(df_feature, how="inner", on=feature_join_keys)
combined_test = combined_test.merge(df_feature, how="inner", on=feature_join_keys)

print(combined_train.head())
print(combined_test.head())

In [None]:
# Remove the extra 'IsHoliday_y' column from both combined tables
combined_train = combined_train.drop(columns=["IsHoliday_y"])
combined_test = combined_test.drop(columns=["IsHoliday_y"])

print(combined_train.head())
print(combined_test.head())

In [None]:
# Summary statistics of the numeric columns of the combined training table.
combined_train.describe()

**NOTE:** `Weekly_Sales`, `MarkDown2` and `MarkDown3` contain negative values in `combined_train`; these need to be replaced by 0.

In [None]:
# Summary statistics of the numeric columns of the combined test table.
combined_test.describe()

**NOTE:** `MarkDown1`, `MarkDown2`, `MarkDown3` and `MarkDown5` contain negative values in `combined_test`; these need to be replaced by 0.

# 5. Data pre-processing

### 5.1 Replace missing Values by 0

In [None]:
# Per-column number of missing values, first for combined_test and then for
# combined_train

for merged_frame in (combined_test, combined_train):
    print(merged_frame.isnull().sum())

In [None]:
# Replace every missing value with 0; the combined tables stay untouched.
processed_train, processed_test = (
    merged_frame.fillna(0) for merged_frame in (combined_train, combined_test)
)

### 5.2 Replace negative Weekly_Sales and MarkDown values by 0 in both processed_train and processed_test

In [None]:
# %% processed_train: set negative values of the listed columns to 0
for column_name in ('Weekly_Sales', 'MarkDown2', 'MarkDown3'):
    is_negative = processed_train[column_name] < 0.0
    processed_train.loc[is_negative, column_name] = 0.0
processed_train.describe()

In [None]:
# processed_test: set negative values of the listed columns to 0
for column_name in ('MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown5'):
    is_negative = processed_test[column_name] < 0.0
    processed_test.loc[is_negative, column_name] = 0.0
processed_test.describe()

### 5.3 Label-encode categorical and boolean data

In [None]:
# Check the datatype of all variables in processed_train and processed_test
# (both dtype listings are printed by a single print call, train first).

print(processed_train.dtypes, processed_test.dtypes)

In [None]:
# Names of the columns that are converted to integer codes.
cat_col = ["IsHoliday_x", "Type"]

In [None]:
# Replace each listed column of processed_train by integer codes; the values
# are cast to strings first and a fresh encoder is used per column.
for col in cat_col:
    lbl = preprocessing.LabelEncoder()
    processed_train[col] = lbl.fit_transform(processed_train[col].values.astype('str'))

In [None]:
# Encode the listed columns of processed_test with the same label -> integer
# mapping as the training data. The encoder is fit on the raw training values
# (combined_train, filled with 0 exactly as processed_train was) rather than
# on the test values, so a given label always maps to the same code in both
# sets. A label that never occurs in training raises a ValueError in
# transform instead of silently receiving a shifted code.
for col in cat_col:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(combined_train[col].fillna(0).values.astype('str'))
    processed_test[col] = lbl.transform(processed_test[col].values.astype('str'))

In [None]:
# processed_train.to_csv("Processed_data/processed_train.csv", index=False)
# processed_test.to_csv("Processed_data/processed_test.csv", index=False)

In [None]:
# Preview the first rows of processed_train.
processed_train.head()

In [None]:
# Reorder the columns so that the response ('Weekly_Sales') comes last

ordered_columns = [
    'Store', 'Dept', 'Date', 'Unemployment', 'IsHoliday_x', 'Type', 'Size',
    'Temperature', 'Fuel_Price',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI',
    'Weekly_Sales',
]
processed_train = processed_train[ordered_columns]

In [None]:
# # Save the new processed file
# processed_train.to_csv("Processed_data/processed_train.csv", index=False)

# 6. Baseline Model

In [None]:
import keras
from sklearn.preprocessing import MinMaxScaler

In [None]:
# %% Check the date range in processed_train to perform train-val-test split
# Prints the earliest and the latest value of the 'Date' column.
print(processed_train['Date'].min(), processed_train['Date'].max())

In [None]:
# %% Split train set into train-dev set
# Rows dated on or before split_date form the training set; later rows form
# the dev set. pd.Timestamp replaces the pd.datetime alias, which was removed
# in pandas 2.0.
split_date = pd.Timestamp(2012, 8, 24)
train_set = processed_train.loc[processed_train['Date'] <= split_date]
dev_set = processed_train.loc[processed_train['Date'] > split_date]

In [None]:
# %% Split dev set into validation and test set
# Rows dated on or before split_date_dev form the validation set; later rows
# form the test set. pd.Timestamp replaces the pd.datetime alias, which was
# removed in pandas 2.0.
split_date_dev = pd.Timestamp(2012, 9, 25)
val_set = dev_set.loc[dev_set['Date'] <= split_date_dev]
test_set = dev_set.loc[dev_set['Date'] > split_date_dev]

In [None]:
# Move 'Date' from the columns into the index of each split.
train_set, val_set, test_set = (
    split_frame.set_index('Date') for split_frame in (train_set, val_set, test_set)
)

In [None]:
# Extract the underlying value arrays of the three splits.
train_set_array, val_set_array, test_set_array = (
    split_frame.values for split_frame in (train_set, val_set, test_set)
)

print("Shape of train, val and test array:\n",train_set_array.shape,"\n",val_set_array.shape,"\n",test_set_array.shape)

In [None]:
# Scaling
# The scaler is fit on the training split only; the validation and test
# splits are transformed with the training min/max. Re-fitting on each split
# would give every split its own scale (so equal scaled values would mean
# different things) and would leak validation/test statistics into the
# preprocessing. Validation/test values outside the training range can
# therefore fall outside [0, 1].
sc = MinMaxScaler(feature_range=(0,1))
train_set_scaled = sc.fit_transform(train_set_array)
val_set_scaled = sc.transform(val_set_array)
test_set_scaled = sc.transform(test_set_array)

print(train_set_scaled.shape, val_set_scaled.shape, test_set_scaled.shape)

In [None]:
# Split each scaled array into features (every column except the last) and
# target (the last column). The former empty-list placeholders were dead
# assignments, overwritten immediately, and have been removed.
X_train, y_train = train_set_scaled[:,:-1], train_set_scaled[:,-1]
X_val, y_val = val_set_scaled[:,:-1], val_set_scaled[:,-1]
X_test, y_test = test_set_scaled[:,:-1], test_set_scaled[:,-1]

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

In [None]:
# Insert a length-1 middle axis so each feature array has the 3-D shape
# (rows, 1, features)

X_train = X_train[:, np.newaxis, :]
X_val = X_val[:, np.newaxis, :]
X_test = X_test[:, np.newaxis, :]

In [None]:
# Show the shapes of the three feature arrays.
print(X_train.shape, X_val.shape, X_test.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Flatten

In [None]:
# Initialising RNN
# An empty Sequential model; the following cells add its layers one by one.
regressor = Sequential()

In [None]:
# Adding the first LSTM layer and some Dropout regularization
# Dropout regularization is added to avoid overfitting
# The input shape (time steps, features) is taken from X_train instead of a
# hard-coded feature count, so the model keeps working if columns change.

regressor.add(LSTM(units = 10, return_sequences = True, activation = 'relu', input_shape = X_train.shape[1:]))
regressor.add(Dropout(0.5))

In [None]:
# Second LSTM layer (returns the full sequence), followed by 50% dropout
second_lstm_layer = LSTM(10, activation='relu', return_sequences=True)
regressor.add(second_lstm_layer)
regressor.add(Dropout(0.5))

In [None]:
# # adding a third LSTM layer and some dropout regularization
# regressor.add(LSTM(units = 10, return_sequences = True, activation = 'relu'))
# regressor.add(Dropout(0.7))

In [None]:
# Final LSTM layer: returns only the last output (return_sequences=False),
# followed by 50% dropout
final_lstm_layer = LSTM(10, activation='relu', return_sequences=False)
regressor.add(final_lstm_layer)
regressor.add(Dropout(0.5))

In [None]:
# Output layer: a single sigmoid unit
#regressor.add(Flatten())
output_layer = Dense(1, activation='sigmoid')
regressor.add(output_layer)

In [None]:
# Compiling the RNN
# The target is a continuous (scaled) value, so classification accuracy says
# little about fit quality; mean absolute error is tracked as well.
# 'accuracy' is kept because the evaluation cells read it from the history.
regressor.compile(optimizer='adam', 
                  loss='mean_squared_error', 
                  metrics=['accuracy', 'mae'])

In [None]:
# Train on the training split for 20 epochs in batches of 512, evaluating on
# the validation split after every epoch; the returned history is kept.
history = regressor.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=512,
    verbose=1,
)

In [None]:
# Save Baseline Model

# regressor.save('models/model6_baseline.h5')

In [None]:
# To load the model

# regressor = load_model('models/model1_baseline.h5')

# 7 Predictions

In [None]:
# %% Run the trained model on the (scaled) test features
predicted_sales = regressor.predict(X_test)

In [None]:
# Remove the length-1 middle axis again so X_test is 2-D for inverse scaling
X_test = np.squeeze(X_test, axis=1)

In [None]:
# Shapes of the test features and of the predictions.
print(X_test[:,:].shape, predicted_sales.shape)

In [None]:
# Concatenate in the same order as the scaler saw the columns: the features
# first, the weekly-sales value last. The full row is inverse-transformed and
# only the last column is kept (as a 2-D column). Selecting it with -1 avoids
# hard-coding the number of feature columns.
predicted_weekly_sales = np.concatenate((X_test[:,:], predicted_sales),axis=1)
predicted_weekly_sales = sc.inverse_transform(predicted_weekly_sales)
predicted_weekly_sales = predicted_weekly_sales[:, -1:]
predicted_weekly_sales

In [None]:
# Turn y_test into a single-column 2-D array.
y_test = y_test.reshape(-1, 1)

In [None]:
# Rebuild full rows (features first, observed target last), undo the scaling
# and keep only the last column (as a 2-D column). Selecting it with -1 avoids
# hard-coding the number of feature columns.
observed_weekly_sales = np.concatenate((X_test[:,:], y_test),axis=1)
observed_weekly_sales = sc.inverse_transform(observed_weekly_sales)
observed_weekly_sales = observed_weekly_sales[:, -1:]
observed_weekly_sales

In [None]:
# Shapes of the observed and the predicted sales arrays.
print(observed_weekly_sales.shape, predicted_weekly_sales.shape)

In [None]:
# Plot observed against predicted weekly sales on one large figure.
fontP = FontProperties(size='xx-large')

plt.subplots(figsize=(60,25))
for sales_series, line_colour, legend_text in (
    (observed_weekly_sales, 'red', 'Real weekly sales'),
    (predicted_weekly_sales, 'blue', 'Predicted weekly sales'),
):
    plt.plot(sales_series, color=line_colour, label=legend_text)

plt.title('Walmart Weekly Sales', fontsize=40)
plt.xlabel('Time', fontsize=25)
plt.ylabel('Sales', fontsize=25)
plt.tick_params(axis='both', which='major', labelsize=20)
plt.legend(
    loc=0,
    ncol=1,
    bbox_to_anchor=(0, 0, 1, 1),
    prop=fontP,
    fancybox=True,
    shadow=False,
)

plt.show()

In [None]:
# obs_pred_df = pd.DataFrame({'Date': test_set['Date'],
#               'Observed Sales': observed_weekly_sales.reshape(observed_weekly_sales.shape[0]),
#               'Predicted Sales': predicted_weekly_sales.reshape(predicted_weekly_sales.shape[0])})

# obs_pred_df.head()

In [None]:
# obs_pred_df.set_index('Date', inplace=True)
# obs_pred_df.head()

In [None]:
# obs_pred_df.plot(figsize=(20,10), linewidth=5, fontsize=20)
# plt.xlabel('Year', fontsize=20)
# plt.show()

# 8 Model Evaluation

In [None]:
# Turn the index back into a regular column so it can be used as a column
# below, then preview the result.
test_set = test_set.reset_index()
test_set.head()

In [None]:
# Row weights: 5 where 'IsHoliday_x' is non-zero, 1 otherwise.
weight = np.where(test_set['IsHoliday_x'] != 0, 5, 1)

df = pd.DataFrame(
    {
        "Date": test_set['Date'],
        "Weight": weight,
        "Observed_Values": observed_weekly_sales.ravel(),
        "Predicted_Values": predicted_weekly_sales.ravel(),
    }
)

# Weighted absolute error of every row.
absolute_error = (df['Observed_Values'] - df['Predicted_Values']).abs()
df['Derived'] = df['Weight'] * absolute_error

df.head()

In [None]:
# Weighted mean absolute error: total weighted absolute error divided by the
# total weight. The vectorised Series.sum() replaces the builtin sum(), which
# iterates over the Series element by element in Python.
WMAE = df['Derived'].sum() / df['Weight'].sum()
WMAE

In [None]:
# results = regressor.evaluate(X_test, y_test)
# results

In [None]:
# List the names of the metrics recorded during training.
history_dict = history.history
history_dict.keys()

In [None]:
# Older Keras releases record the accuracy metric under 'acc', newer ones
# under 'accuracy'. Use whichever key this run produced instead of failing
# with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epoch numbers start at 1 on the x-axis.
epochs = range(1, len(loss)+1)

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Training (dots) and validation (line) accuracy per epoch.
for metric_values, line_format, legend_text in (
    (acc, 'bo', 'Training acc'),
    (val_acc, 'b', 'Validation acc'),
):
    plt.plot(epochs, metric_values, line_format, label=legend_text)

plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Drop the reference to the trained model.
del regressor