# Interbus Sales Prediction - ARIMA model

<img src="connective.png" alt="drawing" align="left" width="300"/> <img src="madlib.png" alt="drawing" align="left" width="150"/> 

### [Documentation](http://madlib.apache.org/docs/latest)

In [None]:
# SQLAlchemy engine used by pandas (`read_sql_query` / `to_sql`) in the cells below.
# NOTE(review): the user, password and host are hard-coded in this URL (and again in
# the `%sql` connection cell) - consider reading them from the environment instead.
from sqlalchemy.engine import create_engine
engine = create_engine("postgresql://gpadmin:pivotal@10.0.2.6:5432/gpadmin")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the `sql` IPython extension, which provides the %sql / %%sql magics used below.
%load_ext sql

In [None]:
# Open the %sql connection (same connection URL as the SQLAlchemy engine above).
%sql postgresql://gpadmin:pivotal@10.0.2.6:5432/gpadmin

In [None]:
# Quick connectivity check: ask the server for its version string.
%sql select version()

In [None]:
%%sql
DROP TABLE IF EXISTS madlib.ventas_timeseries_hourly;

-- Aggregate the raw sales rows into an hourly series. "fecha" is the sale
-- timestamp truncated to the hour and "ventas" is the number of rows in that hour.
CREATE TABLE madlib.ventas_timeseries_hourly AS (
    SELECT
        count(*) AS ventas,
        date_trunc('hour', ventas."fecha_venta")::timestamp AS fecha
    FROM interbus.ventas
    GROUP BY fecha
    ORDER BY fecha
);

-- Report how many rows the new table holds.
SELECT count(*) FROM madlib.ventas_timeseries_hourly;

In [None]:
# Load the hourly sales series from the database and label every row with
# calendar features: weekday name, weekend flag and type of day.
# NOTE: the query expects the original column names (fecha / ventas). A later
# cell overwrites this table with the renamed dataframe, so the CREATE TABLE
# cell has to be re-run before this one is executed again.
sql = """
select * from madlib.ventas_timeseries_hourly
order by fecha;
"""
df0 = pd.read_sql_query(sql, engine)

# English column names used by the rest of the notebook
df0.rename(columns={"fecha": "dates", "ventas": "sales"}, inplace=True)

# Name of the day (pandas numbers weekdays Monday=0 ... Sunday=6)
days = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
        4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
df0['weekday'] = df0['dates'].dt.weekday.map(days)

# Weekend flag. It is stored as the strings 'True' / 'False' (not booleans)
# because later cells group by and compare against those strings.
df0['weekend'] = np.where(df0['weekday'].isin(['Saturday', 'Sunday']), 'True', 'False')

# Use the timestamps as the index of the historical data
df0.set_index('dates', drop=True, inplace=True)

# Type of day: working day by default, weekends are non-working days
df0['type_day'] = 'working_day'
df0.loc[df0['weekend'] == 'True', 'type_day'] = 'non_working_day'

# The data is hourly, so holidays are matched on the calendar day of each row.
# Boolean masks (instead of label lookups) also make a holiday without any
# sales row a harmless no-op instead of a KeyError.
calendar_day = df0.index.normalize()

# Public holidays (January to August 2019)
public_holidays = pd.to_datetime([
    '2019-01-01', '2019-01-06', '2019-04-09',
    '2019-05-01', '2019-05-02', '2019-08-15',
])
df0.loc[calendar_day.isin(public_holidays), 'type_day'] = 'non_working_day'

# Holiday periods (both ends inclusive). Every hour of every day in the period
# is labelled; selecting with pd.date_range only matched the midnight rows.
holiday_periods = [
    ('2019-04-10', '2019-04-19'),
    ('2019-07-15', '2019-08-24'),
]
for first_day, last_day in holiday_periods:
    in_period = (calendar_day >= pd.Timestamp(first_day)) & (calendar_day <= pd.Timestamp(last_day))
    df0.loc[in_period, 'type_day'] = 'holydays'



In [None]:
# Write the labelled dataframe back to the database (this takes a long time).
# `if_exists='replace'` drops any previous table before inserting, so the
# separate DROP through `engine.execute` (removed in SQLAlchemy 2.0) is not
# needed. Chunked multi-row INSERTs reduce the number of round trips.
df0.to_sql(
    'ventas_timeseries_hourly',
    schema='madlib',
    con=engine,
    if_exists='replace',
    chunksize=1000,
    method='multi',
)

In [None]:
# Cluster table: mean of the remaining columns (sales) for every
# (weekday, weekend, type_day) combination found in the history
df0_grouped = df0.groupby(by=['weekday', 'weekend', 'type_day']).mean()
df0_grouped

In [None]:
%%sql
DROP TABLE IF EXISTS
    madlib.ventas_timeseries_hourly_output,
    madlib.ventas_timeseries_hourly_output_residual,
    madlib.ventas_timeseries_hourly_output_summary;

-- Fit an ARIMA model on the sales series with MADlib.
SELECT madlib.arima_train(
    'madlib.ventas_timeseries_hourly',         -- input table
    'madlib.ventas_timeseries_hourly_output',  -- output (model) table
    'dates',                                   -- timestamp column
    'sales',                                   -- time series column
    NULL,                                      -- grouping columns (none)
    FALSE,                                     -- include_mean
    ARRAY[6, 1, 1]                             -- non-seasonal orders (p, d, q)
);

-- Training summary written next to the model table.
SELECT * FROM madlib.ventas_timeseries_hourly_output_summary;

In [None]:
%%sql
-- Show the model table written by the arima_train call above
SELECT * FROM madlib.ventas_timeseries_hourly_output;

In [None]:
%%sql
-- Show the residual table that accompanies the trained model
SELECT * FROM madlib.ventas_timeseries_hourly_output_residual;

In [None]:
%%sql
DROP TABLE IF EXISTS madlib.ventas_timeseries_hourly_forecast_output;

-- Forecast 130 steps beyond the end of the training series.
SELECT madlib.arima_forecast(
    'madlib.ventas_timeseries_hourly_output',           -- model table from arima_train
    'madlib.ventas_timeseries_hourly_forecast_output',  -- forecast output table
    130                                                 -- steps ahead
);

SELECT *
FROM madlib.ventas_timeseries_hourly_forecast_output
ORDER BY steps_ahead;

In [None]:
# Turn the MADlib ARIMA forecast into a dataframe indexed by date and label it
# with the same calendar features as the historical data.

# Load the forecast table
sql = """
SELECT * FROM madlib.ventas_timeseries_hourly_forecast_output
order by steps_ahead;
"""
df = pd.read_sql_query(sql, engine)

# Drop rows without a forecast and keep the rest ordered by horizon
df.dropna(inplace=True)
df.sort_values(by=['steps_ahead'], inplace=True)

# Date of every forecast step: one calendar day per step, counted from the
# second-to-last timestamp of the history. Deriving the date from
# `steps_ahead` keeps dates and rows aligned even if dropna() removed rows
# (assigning a fixed 130-element index would then fail on the length).
# NOTE(review): the training table is built per hour, yet each step is mapped
# to one DAY here - confirm which granularity is intended.
# NOTE(review): anchoring on index[-2] makes step 1 fall one day after the
# second-to-last observation, not after the last one - confirm.
start_date = df0.index[len(df0)-2]
df['dates'] = start_date + pd.to_timedelta(df['steps_ahead'], unit='D')

# Name of the day and weekend flag (strings 'True' / 'False', as in the history)
df['weekday'] = df['dates'].dt.weekday.map(days)
df['weekend'] = np.where(df['weekday'].isin(['Saturday', 'Sunday']), 'True', 'False')

# Use the dates as the index of the forecast
df.set_index('dates', drop=True, inplace=True)

# Type of day. Dates are matched with boolean masks so that a holiday outside
# the forecast horizon is simply skipped instead of raising or adding rows.
df['type_day'] = 'working_day'
forecast_day = df.index.normalize()

# Public holidays (October to December 2019)
forecast_public_holidays = pd.to_datetime([
    '2019-10-12', '2019-11-01', '2019-12-06', '2019-12-08', '2019-12-25',
])
df.loc[forecast_day.isin(forecast_public_holidays), 'type_day'] = 'non_working_day'

# Weekends are non-working days
df.loc[df['weekend'] == 'True', 'type_day'] = 'non_working_day'

# Holiday periods (both ends inclusive); they override the labels set above
forecast_holiday_periods = [
    ('2019-08-25', '2019-09-01'),
    ('2019-12-06', '2019-12-08'),
    ('2019-12-24', '2019-12-31'),
]
for first_day, last_day in forecast_holiday_periods:
    in_period = (forecast_day >= pd.Timestamp(first_day)) & (forecast_day <= pd.Timestamp(last_day))
    df.loc[in_period, 'type_day'] = 'holydays'

# The horizon is now encoded in the index; keep only the forecast value
df.drop(columns=['steps_ahead'], inplace=True)
df.rename(columns={"forecast_value": "ARIMA_forecast"}, inplace=True)

df.head()

In [None]:
# Per-weekday sales statistics (mean, standard deviation, box plot), computed
# after removing outliers with the 1.5 * IQR rule.


def _drop_iqr_outliers(values):
    """Return *values* restricted to the open interval between the IQR fences.

    The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR; points lying exactly on a
    fence are dropped as well (strict comparisons).
    """
    q1, q3 = np.quantile(values, [0.25, 0.75])
    spread = 1.5 * (q3 - q1)
    return values[(values < q3 + spread) & (values > q1 - spread)]


# One outlier-free sales series per weekday, always in Monday ... Sunday order
weekday_names = [days[number] for number in range(7)]
sales_by_weekday = [
    _drop_iqr_outliers(df0.loc[df0['weekday'] == name, 'sales'])
    for name in weekday_names
]

# Mean value and standard deviation per weekday (same order)
mw_week = np.array([series.mean() for series in sales_by_weekday])
sd_week = np.array([series.std() for series in sales_by_weekday])
x_label = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']

# Keep the individual per-day names available for ad-hoc inspection
(sales_mo, sales_tu, sales_we, sales_th,
 sales_fr, sales_sa, sales_su) = sales_by_weekday
mv_mo, mv_tu, mv_we, mv_th, mv_fr, mv_sa, mv_su = mw_week
sd_mo, sd_tu, sd_we, sd_th, sd_fr, sd_sa, sd_su = sd_week

# Bar charts
for y_label, heights in (('Mean Value', mw_week), ('Standard Deviation', sd_week)):
    plt.figure(figsize=(12,4))
    plt.bar(x=x_label, height=heights)
    plt.ylabel(y_label)
    plt.show()

# Box plot. The series are passed in the same Monday ... Sunday order as the
# tick labels (previously Tuesday and Thursday were drawn under each other's
# label). Boxes are drawn at positions 1..N, which is where the ticks go.
plt.figure(figsize=(12,4))
plt.ylabel('Sales Boxplot')
plt.boxplot(x=sales_by_weekday)
plt.xticks(range(1, len(x_label) + 1), x_label)
plt.show()

In [None]:
# Attach descriptive estimates (historical mean sales) to every forecast row.
# Forecast rows whose category never occurs in the history get NaN.
df_aux = df.copy()

# Mean sales of the matching (weekday, weekend, type_day) cluster
df_aux = df_aux.join(
    df0_grouped['sales'].rename('descriptive_cluster'),
    on=['weekday', 'weekend', 'type_day'],
)

# Mean sales by weekday. Only the numeric 'sales' column is averaged; taking
# the mean of the string label columns raises on recent pandas versions.
df_weekday = df0.groupby('weekday')[['sales']].mean()
df_aux['descriptive_weekday'] = df_aux['weekday'].map(df_weekday['sales'])

# Mean sales by weekend flag. The flag holds the strings 'True' / 'False', so
# the lookup has to use those strings rather than the booleans True / False.
df_weekend = df0.groupby('weekend')[['sales']].mean()
df_aux['descriptive_weekend'] = df_aux['weekend'].map(df_weekend['sales'])

# Mean sales by type of day
df_typeday = df0.groupby('type_day')[['sales']].mean()
df_aux['descriptive_typeday'] = df_aux['type_day'].map(df_typeday['sales'])

df_aux.head()

In [None]:
# Hybrid predictive model: weighted blend of the ARIMA forecast and the
# descriptive averages. Weights in order: ARIMA, weekday mean, weekend mean,
# type-of-day mean (a larger first weight gives ARIMA more influence).
# NOTE(review): the weights add up to 1.2 rather than 1 - confirm this is intended.
coefs = [0.2, 0.4, 0.2, 0.4]
w_arima, w_weekday, w_weekend, w_typeday = coefs

df_aux['ARIMA_forecast_descriptive'] = (
    df_aux['ARIMA_forecast'] * w_arima
    + df_aux['descriptive_weekday'] * w_weekday
    + df_aux['descriptive_weekend'] * w_weekend
    + df_aux['descriptive_typeday'] * w_typeday
)


In [None]:

# Obtain which days are weekdays and which are non-working days
#df_aux['dates'] = df_aux.index
#df_aux['weekday'] = df_aux['dates'].dt.weekday
#df_aux['workday'] = np.logical_and(df_aux['weekday'] != 5, df_aux['weekday'] != 6)
#df_aux.drop(['dates'], inplace=True, axis=1)

# set name of the day
#df_aux['weekday'] = df_aux['weekday'].map(days)



# Obtain mean values for sales on weekdays and on non-working days
#mv_wd = pd.read_sql_query('select sum(mv_wd) from interbus.ventas_time_series_stats', engine).iloc[0,0]
#mv_we = pd.read_sql_query('select sum(mv_we) from interbus.ventas_time_series_stats', engine).iloc[0,0]
#print('average daily sales in weekday:',mv_wd,'\n', 'average daily sales in weekend:', mv_we)


In [None]:
# Combine real sales and predictions side by side, aligned on the date index
# (pd.concat with axis=1 keeps the union of both indexes).
df_total = pd.concat([df0[['sales']], df_aux], axis=1)

# NaN values are left in place: history rows have no forecast and forecast
# rows have no sales. The earlier `df_total.fillna(0)` discarded its result
# and the per-column fill sat behind `if 0 == 1`, so nothing was ever filled;
# both were removed and the resulting data is unchanged.
# To zero-fill the numeric columns, assign the result back, e.g.
#   cols = ['sales', 'ARIMA_forecast', 'ARIMA_forecast_descriptive']
#   df_total[cols] = df_total[cols].fillna(0)

In [None]:
# Plot real sales together with every prediction series on one figure
plot_columns = [
    'sales',
    'ARIMA_forecast',
    'descriptive_cluster',
    'descriptive_weekday',
    'descriptive_typeday',
    'ARIMA_forecast_descriptive',
]
df_total[plot_columns].plot(figsize=(14,4))
plt.show()

In [None]:
# Store the combined sales / prediction table in the database.
# `if_exists='replace'` drops any previous table before inserting, so the
# separate DROP through `engine.execute` (removed in SQLAlchemy 2.0) is not
# needed. Chunked multi-row INSERTs reduce the number of round trips.
df_total.to_sql(
    'sales_prediction_hourly',
    schema='madlib',
    con=engine,
    if_exists='replace',
    chunksize=1000,
    method='multi',
)

In [None]:
# Preview the last rows of the combined table
df_total.tail()