<a href="https://colab.research.google.com/github/hamzafarooq/time_series/blob/master/xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2019 The TensorFlow Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Time series forecasting

In [None]:
import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

# Default size for every matplotlib figure created in this notebook.
mpl.rcParams['figure.figsize'] = (8, 6)
# Axes are drawn without a grid unless a cell turns it on explicitly.
mpl.rcParams['axes.grid'] = False

In [None]:
import statsmodels.api as sm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import imageio
import os
from statsmodels.graphics.tsaplots import plot_acf

## Iowa Dataset
This tutorial uses the <a href="https://console.cloud.google.com/marketplace/details/iowa-department-of-commerce/iowa-liquor-sales" class="external">Iowa Liquor Retail Sales</a> dataset.

This dataset contains every wholesale purchase of liquor in the State of Iowa by retailers for sale to individuals since January 1, 2012. The State of Iowa controls the wholesale distribution of liquor intended for retail sale, which means this dataset offers a complete view of retail liquor sales in the entire state. The dataset contains every wholesale order of liquor by all grocery stores, liquor stores, convenience stores, etc., with details about the store and location, the exact liquor brand and size, and the number of bottles ordered.

In [None]:
# Authenticate the current Colab user; the BigQuery cells further down are
# run after this step.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

In [None]:
%%bigquery --project predictiondeployment df
-- The %%bigquery cell magic has to be the very first line of the cell.
-- Save output in a variable `df`: every sales row of store number 2633.
SELECT 
  * 
FROM `bigquery-public-data.iowa_liquor_sales.sales`
where store_number  = '2633'

Let's take a glance at the data.

In [None]:
# Preview the first rows of the raw query result.
df.head()

In [None]:
from datetime import datetime

# Summary statistics of the query result. This is deliberately the last
# expression of the cell: a notebook only renders a cell's final expression.
df.describe()

In [None]:
# Keep only the sale date and the dollar amount. `.copy()` yields an
# independent frame, so the column assignment below does not write into a
# slice of `df` (which is what raises pandas' SettingWithCopyWarning).
df_single_item_aggregate = df[['date', 'sale_dollars']].copy()
# Parse the dates so they can serve as a datetime index later on.
df_single_item_aggregate['date'] = pd.to_datetime(df_single_item_aggregate['date'])

In [None]:
# Collapse to one row per date by summing the sales of that date; the
# resulting index is named 'date'.
df_single_item_aggregate = df_single_item_aggregate.groupby(['date']).sum().rename_axis('date')

In [None]:
# Disabled: would add a 0/1 flag for dates on or after 2020-01-25 directly to the aggregate.
#df_single_item_aggregate['flag'] = pd.Series(np.where(df_single_item_aggregate.index >= np.datetime64('2020-01-25'), 1, 0),index=df_single_item_aggregate.index)
# Display the daily aggregate.
df_single_item_aggregate


In [None]:
def split_data(data, split_date):
    """Split `data` around `split_date` into two independent copies.

    Rows whose index is less than or equal to `split_date` go into the first
    returned object; rows whose index is strictly greater go into the second.
    """
    up_to_split = data.index <= split_date
    earlier = data[up_to_split].copy()
    after_split = data.index > split_date
    later = data[after_split].copy()
    return earlier, later

In [None]:
# Everything up to and including 2020-04-01 is training data; later rows are
# held out as the test set.
train, test = split_data(df_single_item_aggregate, '2020-04-01')

plt.figure(figsize=(20,10))
plt.xlabel('time')
# The plotted values are the daily sale_dollars totals, so name them as such.
plt.ylabel('sale_dollars')
plt.plot(train.index, train, label='train')
plt.plot(test.index, test, label='test')
# Legend so the two segments can be told apart.
plt.legend()
plt.show()


In [None]:
# Summary statistics of the training portion.
train.describe()

# xgboost Model

In [None]:
def create_features(df, flag_date='2020-01-25'):
    """
    Creates time series features from datetime index

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetime values. It is left unmodified.
    flag_date : str or datetime-like, default '2020-01-25'
        Rows dated on or after this value get ``flag == 1``, all others 0.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed like ``df`` with the columns dayofweek, quarter,
        month, year, dayofyear, dayofmonth, weekofyear and flag.
    """
    # Read the calendar fields straight from the index instead of adding
    # helper columns to the caller's frame.
    dates = pd.DatetimeIndex(df.index)

    # `.dt.weekofyear` no longer exists in pandas 2.x; isocalendar().week is
    # its replacement (ISO week number). It comes back as a nullable UInt32,
    # so convert to a plain int64 array.
    iso_week = dates.isocalendar().week.to_numpy(dtype='int64')

    features = {
        'dayofweek': dates.dayofweek.to_numpy(),
        'quarter': dates.quarter.to_numpy(),
        'month': dates.month.to_numpy(),
        'year': dates.year.to_numpy(),
        'dayofyear': dates.dayofyear.to_numpy(),
        'dayofmonth': dates.day.to_numpy(),
        'weekofyear': iso_week,
        # 1 for rows on or after flag_date, else 0.
        'flag': (dates >= pd.Timestamp(flag_date)).astype('int64'),
    }
    X = pd.DataFrame(features, index=df.index)
    return X

In [None]:
# Features are derived from the datetime index; the target is the daily
# sale_dollars total.
X_train = create_features(train)
y_train = train['sale_dollars']
X_test = create_features(test)
y_test = test['sale_dollars']

X_train.shape, y_train.shape

In [None]:
# First rows of the training feature matrix.
X_train.head()

#

In [None]:
#df['flag'] = pd.Series(np.where(df['date'] >= np.datetime64('2020-01-25'), 1, 0), index=df.index)
# Last rows of the training feature matrix.
X_train.tail()

In [None]:
# Gradient-boosted regressor with up to 1000 trees. Both the training set and
# the test set are passed as evaluation sets.
# NOTE(review): this uses 500 early-stopping rounds while the weekly model
# further down uses 50 — confirm which value is intended. Also confirm that
# monitoring the test set for early stopping is intended.
reg = xgb.XGBRegressor(n_estimators=1000)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=500, # stop after 500 consecutive rounds without a decrease of the error
        verbose=False) # Change verbose to True if you want to see it train

In [None]:
# Bar chart of the fitted model's feature importances.
xgb.plot_importance(reg, height=0.9)


In [None]:
def plot_performance(base_data, date_from, date_to, title=None,
                     pred_index=None, pred_values=None):
    """Plot observed data together with model predictions over a date window.

    Parameters
    ----------
    base_data : pandas.Series or pandas.DataFrame
        Observed values indexed by date; drawn as the 'data' line.
    date_from, date_to :
        Left and right limits of the x axis.
    title : str, optional
        Figure title. Defaults to 'From <date_from> To <date_to>'.
    pred_index, pred_values : optional
        x and y values of the 'prediction' line. When omitted, the notebook
        globals ``X_test.index`` and ``X_test_pred`` are used, so existing
        four-argument calls keep working.
    """
    # Fall back to the notebook-level prediction variables when the caller
    # does not pass predictions explicitly.
    if pred_index is None:
        pred_index = X_test.index
    if pred_values is None:
        pred_values = X_test_pred

    plt.figure(figsize=(15,3))
    if title is None:
        plt.title('From {0} To {1}'.format(date_from, date_to))
    else:
        plt.title(title)
    plt.xlabel('time')
    plt.ylabel('sales')
    # Draw the data that was actually passed in, not a fixed global frame, so
    # the helper also works for the weekly data set.
    plt.plot(base_data.index, base_data, label='data')
    plt.plot(pred_index, pred_values, label='prediction')
    plt.legend()
    plt.xlim(left=date_from, right=date_to)

In [None]:
# Feature importances of the fitted model, then predictions for the test rows.
xgb.plot_importance(reg, height=0.9)
X_test_pred = reg.predict(X_test)
    
# Whole history on the x axis, with the predictions overlaid.
plot_performance(df_single_item_aggregate, df_single_item_aggregate.index[0].date(), df_single_item_aggregate.index[-1].date(),
                 'Original and Predicted Data')

# Same kind of plot with the x axis limited to the test period.
plot_performance(y_test, y_test.index[0].date(), y_test.index[-1].date(),
                 'Test and Predicted Data')

#plot_performance(y_test, '2019-7-01', '2019-8-01', 'Snapshot')

plt.legend()

plt.show()

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the MAPE, in percent, of `y_pred` measured against `y_true`."""
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    # Element-wise error relative to the actual value.
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100
# MAPE of the XGBoost predictions on the test set.
mean_absolute_percentage_error(y_test,X_test_pred)

In [None]:
def calc_smape(y_hat, y):
    """Return the symmetric mean absolute percentage error, in percent."""
    abs_diff = np.abs(y_hat - y)
    scale = np.abs(y) + np.abs(y_hat)
    return 100/len(y) * np.sum(2 * abs_diff / scale)

In [None]:
# sMAPE on the test set. The actuals are passed in the `y_hat` slot and the
# predictions in the `y` slot; the formula treats both arguments symmetrically.
calc_smape(y_test,X_test_pred)

In [None]:
# Spot check: sample some test rows and, for each one, compute the MAPE over
# all test rows that share its (year, weekofyear).
error_by_week = []
# A fixed random_state makes the sampled rows identical on every run; the
# sample size is capped so a test set shorter than 10 rows does not raise.
random_weeks = X_test[['year', 'weekofyear']].sample(n=min(10, len(X_test)), random_state=13)
for _, week in random_weeks.iterrows():
    # Boolean mask selecting every test row in the same year and week.
    index = (X_test.year == week.year) & \
            (X_test.weekofyear == week.weekofyear)
    error_by_week.append(mean_absolute_percentage_error(y_test[index], X_test_pred[index]))
pd.Series(error_by_week, index=random_weeks.index)

## Weekly Prediction

In [None]:
%%bigquery --project predictiondeployment df2
-- Total sale_dollars of store 2633 per (year, week), for dates before 2020-03-01,
-- ordered chronologically by year and week.
SELECT EXTRACT(YEAR FROM date) AS year, EXTRACT(WEEK FROM date) AS week, sum(sale_dollars) as sales
FROM `bigquery-public-data.iowa_liquor_sales.sales`
where store_number  = '2633' and date < '2020-03-01'
group by 1,2
order by 1,2

In [None]:
# First rows of the weekly totals.
df2.head(5)

In [None]:
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler
# Weekly sales totals as a NumPy array, used for the line plot below.
series=np.array(df2['sales'])
def plot_series(time, series, format="-", start=0, end=None):
    """Plot series[start:end] against time[start:end] with the given line format."""
    window = slice(start, end)
    plt.plot(time[window], series[window], format)
    plt.xlabel("Time")
    plt.ylabel("Value")
    plt.grid(True)
#values = series
#values = values.reshape((len(values), 1))
# train the normalization
#scaler = MinMaxScaler(feature_range=(0, 1))
#scaler = scaler.fit(values)
#print('Min: %f, Max: %f' % (scaler.data_min_, scaler.data_max_))
# normalize the dataset and print the first 5 rows
#normalized = scaler.transform(values)
#series=normalized.ravel()

import csv

# One integer time step per row of df2: 0, 1, ..., len(df2) - 1.
time_step = list(range(len(df2)))
time = np.array(time_step)

# Wide figure showing the whole weekly series.
plt.figure(figsize=(25, 6))
plot_series(time, series)


In [None]:
import datetime
# Build one timestamp per row of df2: now, one day ago, two days ago, ...
# NOTE(review): df2 holds one row per (year, week), yet these are consecutive
# *daily* timestamps anchored on the moment the cell runs. They are not the
# real week dates and they change with every run, which also moves which rows
# fall on each side of the fixed split date used later — confirm this is intended.
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(len(df2))]

In [None]:
# Attach the timestamps in ascending order, so the first row gets the oldest one.
df2['date']=sorted(date_list)

In [None]:
# Check the newly added date column.
df2.head(5)

In [None]:
# The year/week columns are not used once every row carries a date. drop()
# returns a new frame, and errors='ignore' keeps the cell safe to re-run.
df2 = df2.drop(columns=['year', 'week'], errors='ignore')
# Peek at the date stored at row position 370. It is the last expression of
# the cell so that the notebook actually displays it.
df2['date'][370:371]

In [None]:
# Move the date column into the index; the date-based split and the
# datetime-index features used in the following cells need it there.
# (An opening ''' with no closing quotes would make this cell a syntax error.)
df2 = df2.set_index('date')

In [None]:
# Check the frame after re-indexing.
df2.head(5)

In [None]:
# Split the weekly frame at 2020-05-16: rows up to and including that date
# form the training set, later rows form the test set.
train, test = split_data(df2, '2020-05-16')

# Plot both parts on one figure.
plt.figure(figsize=(20,10))
plt.xlabel('date')
plt.ylabel('sales')
plt.plot(train.index,train)
plt.plot(test.index,test)
plt.show()

In [None]:
# Features are derived from the datetime index; the target is the weekly
# sales total.
X_train = create_features(train)
y_train = train['sales']
X_test = create_features(test)
y_test = test['sales']

X_train.shape, y_train.shape

In [None]:
# Refit the same kind of model on the weekly data; both the training and the
# test set are passed as evaluation sets.
reg = xgb.XGBRegressor(n_estimators=1000)
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=50, # stop after 50 consecutive rounds without a decrease of the error
        verbose=True) # set verbose to False to silence the training log

In [None]:
# Feature importances of the weekly model, then predictions for the test rows.
xgb.plot_importance(reg, height=0.9)
X_test_pred = reg.predict(X_test)
    
# x axis spanning the full date range of df2.
plot_performance(df2, df2.index[0].date(), df2.index[-1].date(),
                 'Original and Predicted Data')

# x axis limited to the test period.
plot_performance(y_test, y_test.index[0].date(), y_test.index[-1].date(),
                 'Test and Predicted Data')

#plot_performance(y_test, '2019-7-01', '2019-8-01', 'Snapshot')

plt.legend()

plt.show()

# LSTM Model



In [None]:
#importing required libraries
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

In [None]:
# Number of leading rows of the daily aggregate used for training the LSTM;
# the remaining rows are used for validation.
TRAIN_SPLIT = 867
# Seed TensorFlow's random number generator.
tf.random.set_seed(13)

In [None]:

# `new_data` is a second name for the daily aggregate frame (the same object,
# not a copy), so it already carries that frame's index.
new_data = df_single_item_aggregate

# Raw values as a NumPy array; the scaler and the windowing code work on this.
dataset = new_data.values


In [None]:
train = df_single_item_aggregate[0:TRAIN_SPLIT]
valid = df_single_item_aggregate[TRAIN_SPLIT:]


#converting dataset into x_train and y_train
# Fit the scaler on the training rows only, so the minimum/maximum of the
# validation period do not leak into training; then scale the whole series
# with those training statistics (validation values may fall outside [0, 1]).
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset[:TRAIN_SPLIT])
scaled_data = scaler.transform(dataset)

# Sliding windows: each sample holds the 60 scaled values preceding position
# i, and its target is the scaled value at position i.
x_train, y_train = [], []
for i in range(60,len(train)):
    x_train.append(scaled_data[i-60:i,0])
    y_train.append(scaled_data[i,0])
x_train, y_train = np.array(x_train), np.array(y_train)

# Add a trailing axis of length 1: (samples, 60) -> (samples, 60, 1).
x_train = np.reshape(x_train, (x_train.shape[0],x_train.shape[1],1))

In [None]:
# Two stacked LSTM layers with 50 units each, followed by a single-unit dense
# output layer.
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1],1)))
model.add(LSTM(units=50))
model.add(Dense(1))

# Train for a single epoch with one sample per batch.
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=1, batch_size=1, verbose=2)

# Predict one value per validation row. The input starts 60 rows before the
# validation period so the first validation row also has 60 preceding values.
inputs = new_data[len(new_data) - len(valid) - 60:].values
inputs = inputs.reshape(-1,1)
inputs  = scaler.transform(inputs)

# Same 60-step windows as for training.
# Note: this rebinds X_test, which earlier cells use for the XGBoost features.
X_test = []
for i in range(60,inputs.shape[0]):
    X_test.append(inputs[i-60:i,0])
X_test = np.array(X_test)

X_test = np.reshape(X_test, (X_test.shape[0],X_test.shape[1],1))
sale_dollars = model.predict(X_test)
# Convert the scaled predictions back to the original units.
sale_dollars = scaler.inverse_transform(sale_dollars)

In [None]:
# Root-mean-squared error between the actual validation sales and the LSTM
# predictions. Both sides are flattened to 1-D arrays first, so the result is
# always a single float rather than depending on how the installed pandas
# version reduces a DataFrame under np.mean.
rms = np.sqrt(np.mean(np.power(valid['sale_dollars'].values - sale_dollars.ravel(), 2)))
rms

In [None]:
#for plotting
train = df_single_item_aggregate[0:TRAIN_SPLIT]
# .copy() makes `valid` an independent frame, so adding a column does not
# write into a slice of df_single_item_aggregate (SettingWithCopyWarning).
valid = df_single_item_aggregate[TRAIN_SPLIT:].copy()
# Flatten the (n, 1) prediction array into one value per validation row.
valid['Predictions'] = sale_dollars.ravel()
plt.figure(figsize=(15,5))
plt.xlabel('time')
# The plotted values are daily sale_dollars totals.
plt.ylabel('sale_dollars')
plt.plot(train['sale_dollars'], label='train')
plt.plot(valid['sale_dollars'], label='actual')
plt.plot(valid['Predictions'], label='prediction')
plt.legend()
plt.show()
