In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import boto3
from datetime import datetime
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
from pmdarima import model_selection
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# -- Read in data

df = pd.read_csv('s3://xxxxxx/xxxxx.csv')

# Give the aggregated count column a short, usable name.
df = df.rename(columns={'COUNT(X.REFNUM)': 'AVOL'})

# -- List the matter names so one can be selected for time series analysis
# `unique` is a method: without the call parentheses the cell would only
# display the bound method object instead of the distinct values.

df['M_NAME'].unique()

In [None]:
# Keep only the rows for the chosen matter. The explicit copy makes the result
# an independent frame, so assigning columns to it later does not raise
# pandas' SettingWithCopyWarning.
df = df[df['M_NAME'] == 'MNS8'].copy()

In [None]:
# -- Inspect data: first rows, followed by the dtype of every column

print(df.head(), '\n Data Types:', df.dtypes, sep='\n')

In [None]:
# -- Parse the month-received column into datetimes and use it as the row index

df['MONTH'] = pd.to_datetime(df['MONTH'])
df = df.set_index('MONTH')

# -- Display the index to confirm its dtype
df.index

In [None]:
 # -- convert apps volume to time series
# Single-column frame holding only the application volumes.
time_series = df['AVOL'].to_frame()
time_series.head(3)

In [None]:
# -- Put the rows in ascending index order (row axis is the default)

time_series = time_series.sort_index()

In [None]:
# -- Restrict the series to the analysis window
#
# Total range of the data is '2016-03':'2022-02'. March is left out of the
# window because its data is incomplete, so the slice starts at 2016-04.
# Both end labels are inclusive.

time_series = time_series.loc['2016-04':'2022-02']

In [None]:
# Quick visual check of the selected window. The value returned by plt.plot
# is kept in ts_plot and echoed as the cell output.
ts_plot = plt.plot(time_series)
ts_plot

In [None]:
# ADFTest is never imported by name in this notebook, so reach it through the
# already-imported pmdarima package to avoid a NameError.
adf_test = pm.arima.ADFTest(alpha=0.05)
adf_test.should_diff(time_series)

In [None]:
# -- Test Stationarity of Time Series

def test_stationarity(tseries):
    """Plot rolling statistics and print an Augmented Dickey-Fuller report.

    Draws the series together with its 12-observation rolling mean and rolling
    standard deviation, then runs ``adfuller`` with AIC-based lag selection and
    prints the test statistic, p-value, number of lags used, number of
    observations used and the critical values.

    Parameters
    ----------
    tseries : pandas Series or single-column DataFrame
        The time series to examine.

    Returns
    -------
    None
        The results are plotted and printed, not returned.
    """
    # Determining rolling statistics:
    rolling_window = tseries.rolling(window=12)
    rolmean = rolling_window.mean()
    rolstd = rolling_window.std()

    # Plot rolling statistics:
    plt.plot(tseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling StD')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(tseries, autolag='AIC')
    # The first four entries of the result are statistic, p-value, lags used
    # and observations used. Each needs its own label: a missing comma between
    # the last two labels would silently fuse them into a single string.
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lag Used',
               'Number of Observations Used'],
    )
    # The fifth entry maps significance level -> critical value.
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)

In [None]:
# Run the stationarity diagnostics on the selected window.
test_stationarity(tseries=time_series)

In [None]:
# -- Split into train / test and plot both

# Hold out the final observations for testing and train on everything before
# them. Both slices are cut at the same position, so they can neither overlap
# nor leave a gap whatever the length of the series. Two independent cuts
# such as [:85] and [-20:] only line up when the series has exactly 105 rows;
# on a shorter series the training data would include the test period.
test_size = 20
train = time_series[:-test_size].dropna()
test = time_series[-test_size:].dropna()
plt.plot(train)
plt.plot(test)

In [None]:
# Automatic order search, kept for reference (presumably the source of the
# orders recorded below):
# arima_model = pm.auto_arima(train, error_action='ignore', trace=True,
#                       suppress_warnings=True, maxiter=150,
#                       seasonal=True, m=12, return_conf_int=True)
#
# -- MNS8 Best model: ARIMA(p=0,d=1,q=1)(P=2/4?,D=0,Q=0)[12]
# -- MNDA Best model: ARIMA(2,1,0)(2,0,0)[12]
# -- MNPLF Best model:  ARIMA(0,1,1)(2,0,0)[12]
# -- MNPL Best model:  ARIMA(4,1,1)(2,0,0)[12]

# Fit the fixed-order model for MNS8. Only keywords that the ARIMA estimator
# itself defines are passed. error_action, trace, seasonal, m and
# return_conf_int are not ARIMA parameters; any extra keyword is forwarded to
# the underlying statsmodels model as an unknown argument. The seasonal period
# of 12 is already the last element of seasonal_order.
arima_model = pm.arima.ARIMA(order=(0, 1, 1), seasonal_order=(2, 0, 0, 12),
                             suppress_warnings=True, maxiter=10)
arima_model.fit(train)

In [None]:
# Fitted-model report; echoing it as the last expression renders it as the cell output.
fit_summary = arima_model.summary()
fit_summary

In [None]:
# -- collate and define results / figures

# Forecast exactly as many periods as the hold-out set contains, so the
# results always line up with test.index.
predictions = arima_model.predict(n_periods=len(test), return_conf_int=True)
forecast, conf_int = predictions

# predict may return pandas objects carrying their own index. Passing such an
# object to pd.Series(..., index=...) re-aligns by label and produces NaN where
# labels differ, so plain arrays are used to relabel the values positionally.
conf_int = np.asarray(conf_int)  # one row per period: [lower, upper]

forecast = pd.Series(np.asarray(forecast), index=test.index)
low_conf = pd.Series(conf_int[:, 0], index=test.index)
high_conf = pd.Series(conf_int[:, 1], index=test.index)

In [None]:
# -- plot results: training history, forecast and its confidence band

fig, ax = plt.subplots(figsize=(8,5))
ax.fill_between(test.index, high_conf, low_conf,
                color="springgreen", label="Confidence Int (95%)")
ax.plot(train, label="Training", color="black")
ax.plot(forecast, label="Predicted", color="seagreen")
ax.legend(loc="upper left")
ax.grid(which='minor')
plt.show()

In [None]:
# -- R squared of the forecast against the held-out observations

r_squared = r2_score(test, forecast)
print(r_squared)

In [None]:
# -- Apply forecast to new time periods

new_dates = pd.to_datetime(['2022-03-01', '2022-04-01', '2022-05-01', '2022-06-01',
                            '2022-07-01', '2022-08-01', '2022-09-01', '2022-10-01',
                            '2022-11-01', '2022-12-01', '2023-01-01', '2023-02-01'])

# Index.append returns a new index instead of modifying the existing one, so
# calling it without using the result adds nothing. Reindexing on the union of
# the current and new dates creates the extra rows (filled with NaN), and
# re-running the cell does not duplicate them.
time_series.index = pd.to_datetime(time_series.index)
time_series = time_series.reindex(time_series.index.union(new_dates))
#time_series.index

In [None]:
# -- plot final forecast results
# NOTE(review): only `time_series` is drawn here; confirm whether forecast
# values were meant to be added to it before this plot.

plt.plot(time_series)