In [None]:
import pandas as pd
import numpy as np
import io
# from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os

import warnings; warnings.simplefilter('ignore')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

import keras 
# import mlflow 
# import mlflow.sklearn

In [None]:
# !pip install dvc[gdrive]


In [None]:
import dvc.api


In [3]:
# !git clone https://github.com/isaaclucky/sales-prediction.git 


In [2]:
# cd sales-prediction/


In [1]:
# !dvc pull 


In [None]:
# Read the preprocessed train and test tables from the local data/ directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/df_train_prep.csv', 'data/df_test_prep.csv')
)

In [None]:
# Store StateHoliday with pandas' 'string' dtype in both frames
# (presumably so it is handled as a categorical rather than numeric column -- confirm).
df_train['StateHoliday'] = df_train['StateHoliday'].astype('string')
df_test['StateHoliday'] = df_test['StateHoliday'].astype('string')

In [None]:
# Features: everything except the target ('Sales'), 'Customers', 'Date'
# and the 'Unnamed: 0' column. Copies keep the loaded frames untouched.
X_train = df_train.drop(columns=['Sales', 'Customers', 'Date', 'Unnamed: 0']).copy()
# Target series.
Y_train = df_train['Sales'].copy()
# The test frame only loses the 'Unnamed: 0' column.
X_test = df_test.drop(columns=['Unnamed: 0']).copy()

In [None]:
# Columns that get the continuous pipeline (impute -> power transform -> scale).
cont_cols = ['DayOfWeek', 'Until_Holiday', 'Since_Holiday', 'CompetitionOpenMonthDuration',
             'CompetitionDistance', 'PromoOpenMonthDuration']
# Every numeric column of the training features.
num_cols = list(X_train.select_dtypes(include='number').columns)
# Derive the remaining groups with order-preserving comprehensions instead of
# set arithmetic: iterating a set of strings gives an order that changes from
# one interpreter session to the next, which would shuffle the transformed
# feature columns between runs and break models reloaded in a new session.
cat_cols = [col for col in X_train.columns if col not in num_cols]
disc_cols = [col for col in num_cols if col not in cont_cols]


In [None]:
# Continuous features: median-impute, apply a Yeo-Johnson power transform
# (standardisation is left to the explicit scaler that follows), then scale.
cont_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    PowerTransformer(method='yeo-johnson', standardize=False),
    StandardScaler()
)
# Discrete numeric features: fill missing values with the sentinel -1, then scale.
disc_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=-1),
    StandardScaler()
)
# Categorical features: fill missing values with the literal 'unknown', then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising when the fitted encoder is applied to new data.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='unknown'),
    OneHotEncoder(handle_unknown='ignore')
)


# Route each column group to its pipeline; columns not listed in any group are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('continuous', cont_pipeline, cont_cols),
        ('discrete', disc_pipeline, disc_cols),
        ('categorical', cat_pipeline, cat_cols)
    ]
)




In [None]:
# Learn imputation values, power-transform parameters, scaling statistics and
# one-hot categories from the training data only.
X_train_trans = preprocessor.fit_transform(X_train)
# Apply the already-fitted preprocessor to the test data. Re-fitting here would
# scale with test-set statistics and could produce a different one-hot column
# layout than the one the model is trained on.
X_test_trans = preprocessor.transform(X_test)

In [None]:
# get_feature_names(preprocessor)

['continuous__DayOfWeek',
 'continuous__Until_Holiday',
 'continuous__Since_Holiday',
 'continuous__CompetitionOpenMonthDuration',
 'continuous__CompetitionDistance',
 'continuous__PromoOpenMonthDuration',
 'discrete__Year',
 'discrete__Open',
 'discrete__Day',
 'discrete__Week',
 'discrete__Month',
 'discrete__Quarter',
 'discrete__DayOfYear',
 'discrete__Store',
 'discrete__Promo',
 'discrete__Promo2',
 'discrete__SchoolHoliday',
 'discrete__WeekOfYear',
 'discrete__IsWeekDay',
 'onehotencoder__x0_Beginning',
 'onehotencoder__x0_End',
 'onehotencoder__x0_Mid',
 'onehotencoder__x1_Fall',
 'onehotencoder__x1_Summer',
 'onehotencoder__x2_0',
 'onehotencoder__x2_a',
 'onehotencoder__x3_0,0,0,0',
 'onehotencoder__x3_Feb,May,Aug,Nov',
 'onehotencoder__x3_Jan,Apr,Jul,Oct',
 'onehotencoder__x3_Mar,Jun,Sept,Dec',
 'onehotencoder__x4_a',
 'onehotencoder__x4_b',
 'onehotencoder__x4_c',
 'onehotencoder__x4_d',
 'onehotencoder__x5_a',
 'onehotencoder__x5_b',
 'onehotencoder__x5_c']

In [None]:
# Inspect the (rows, columns) shape of the training feature frame.
X_train.shape

(1017209, 25)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import TimeDistributed


In [None]:
# Ordered (unshuffled) hold-out: the first 70% of rows are used for fitting,
# the remaining rows for validation.
train_size = int(0.70 * len(df_train))
test_size = len(df_train) - train_size

# Transformed feature matrix, split by row position.
train = X_train_trans[:train_size]
test = X_train_trans[train_size:]

# Matching target slices, also by row position.
y_train = Y_train.iloc[:train_size]
y_test = Y_train.iloc[train_size:]

In [None]:
# Initialize LSTM model
model = Sequential()
model.add(LSTM(units=128, return_sequences=True, input_shape=(train.shape[1],1)))
model.add(Dropout(0.2))
model.add(LSTM(units=128,return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse', 'mae'])

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 41, 128)           66560     
                                                                 
 dropout (Dropout)           (None, 41, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 41, 128)           131584    
                                                                 
 dropout_1 (Dropout)         (None, 41, 128)           0         
                                                                 
 dense (Dense)               (None, 41, 1)             129       
                                                                 
Total params: 198,273
Trainable params: 198,273
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Preview the training targets laid out as a single row of shape (1, n_samples).
y_train.to_numpy().reshape(1, -1)

array([[5263, 6064, 8314, ..., 6652, 3842, 6143]])

In [None]:
# Standardise the regression target.
# StandardScaler works column-wise, so the target must be a single column of
# shape (n_samples, 1). Passing it as one row (1, n_samples) makes every value
# its own "feature" with a single observation, and the scaled output is all zeros.
scaler_a = StandardScaler()
# Not used in this cell; kept in case a later cell refers to it.
scaler_b = StandardScaler()
y_train_scaled = scaler_a.fit_transform(y_train.values.reshape(-1, 1))
# Reuse the statistics learned on the training targets: re-fitting on the
# validation targets would put the two sets on different scales.
y_test_scaled = scaler_a.transform(y_test.values.reshape(-1, 1))

In [None]:
# Lay the scaled targets out as column vectors of shape (n_samples, 1).
y_train_scaled = np.reshape(y_train_scaled, (-1, 1))
y_test_scaled = np.reshape(y_test_scaled, (-1, 1))

In [None]:
# Fit LSTM model first
history = model.fit(train, y_train_scaled, epochs=30, batch_size=50,
                    validation_data=(test,y_test_scaled), 
                    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                                             patience=3, mode='min')],
                     shuffle=False)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30


In [None]:
# Fit LSTM model second time
history = model.fit(train, y_train_scaled, epochs=15, batch_size=50,
                    validation_data=(test,y_test_scaled), 
                    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                                             patience=3, mode='min')],
                     shuffle=False)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15


In [None]:
from datetime import datetime
from typing import Optional


def generate_model_name(now: Optional[datetime] = None) -> str:
    """Build a timestamped base name (no extension) for a saved LSTM model.

    The name has the form ``DD-MM-YYYY_HH-MM-SS_LSTM``. The time fields are
    separated with hyphens rather than colons because ':' is not allowed in
    file names on Windows.

    Args:
        now: Timestamp to encode. Defaults to the current local time.

    Returns:
        The formatted name, e.g. ``27-05-2022_13-04-05_LSTM``.
    """
    if now is None:
        now = datetime.now()
    return now.strftime("%d-%m-%Y_%H-%M-%S_LSTM")

In [None]:
from pickle import dump


def save_model(model) -> None:
    """Pickle ``model`` to ``<generate_model_name()>.pkl`` in the working directory.

    This is best-effort: a failure is reported on stdout together with the
    underlying error and is not re-raised.

    Args:
        model: The object to serialise with ``pickle.dump``.
    """
    # NOTE(review): whether a Keras model can be pickled depends on the
    # installed version; confirm, or use the model's own save method.
    file_name = generate_model_name() + '.pkl'
    try:
        with open(file_name, 'wb') as handle:
            dump(model, handle)
    except Exception as e:
        # Report which file was being written and why it failed, so the
        # problem is diagnosable instead of being silently discarded.
        print(f'Error while saving model to {file_name}: {e!r}')
In [None]:
# Persist the trained network in HDF5 format under a timestamped file name.
model.save(f"{generate_model_name()}.h5")

In [None]:
# from tensorflow.keras.models import load_model
 
# # load model
# model = load_model('model.h5')

## Prediction