<a href="https://colab.research.google.com/github/konorbj/sales_forecast/blob/master/SalesAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import scipy
import numpy as np
import pandas as pd
from dateutil.parser import parse

from google.colab import drive
from statsmodels.tsa.stattools import adfuller

In [None]:
# Mount Google Drive at /content/gdrive so the CSV paths used below resolve.
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Load the sales table from the mounted Drive folder with default parsing
# (no index column, dates left as strings).
df = pd.read_csv('/content/gdrive/My Drive/Sales/SalesByDate.csv')

In [None]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,Date_Part,Year,Sales
0,"Jan 1, 2016",2016,204
1,"Jan 2, 2016",2016,5615
2,"Jan 3, 2016",2016,828
3,"Jan 4, 2016",2016,2607
4,"Jan 5, 2016",2016,2518


In [None]:
# Summary statistics (count, mean, spread, quartiles) of the Sales column.
df['Sales'].describe()

count     1651.000000
mean      3060.120533
std       2585.821457
min          1.000000
25%       1445.000000
50%       2214.000000
75%       3882.500000
max      19078.000000
Name: Sales, dtype: float64

# Check for stationarity and compute sample entropy

In [None]:
# Augmented Dickey-Fuller test on the natural log of the Sales values.
X = np.log(df['Sales'].values)
result = adfuller(X)

# The statistic and p-value are read from positions 0 and 1 of the result;
# position 4 is a mapping of significance level -> critical value.
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
critical_values = result[4]
for key in critical_values:
    value = critical_values[key]
    print('\t%s: %.3f' % (key, value))

ADF Statistic: -4.852573
p-value: 0.000043
	1%: -3.434
	5%: -2.863
	10%: -2.568


In [None]:
def sampen(L, m, r):
    """Compute the sample entropy (SampEn) of a one-dimensional series.

    Templates are the overlapping windows of the series. Two templates
    "match" when their Chebyshev distance (largest absolute element-wise
    difference) is at most ``r``.

    * ``B`` counts matches between each of the first ``N - m`` templates of
      length ``m`` and all ``N - m + 1`` templates of length ``m``, minus the
      self-match.
    * ``A`` counts matches among all ``N - m`` templates of length ``m + 1``,
      minus the self-match.

    Parameters
    ----------
    L : sequence of numbers (list, ndarray, pandas Series, ...)
        The series; it is converted to an ndarray once up front.
    m : int
        Template length, must be >= 1.
    r : float
        Matching tolerance.

    Returns
    -------
    numpy.float64
        ``-log(A / B)``. Returns ``nan`` when ``B == 0`` (no length-``m``
        matches, or a series too short to form templates) and ``inf`` when
        ``A == 0`` but ``B != 0``. These are the values the unguarded
        expression evaluates to, returned here without triggering
        divide-by-zero / log RuntimeWarnings.

    Raises
    ------
    ValueError
        If ``m < 1``.
    """
    if m < 1:
        raise ValueError('m must be a positive integer')

    # Convert once so slicing below is cheap ndarray slicing regardless of
    # the container type that was passed in.
    data = np.asarray(L)
    N = len(data)

    def count_matches(probes, candidates):
        # For every probe, count candidates within tolerance r and drop the
        # probe's own self-match.
        total = 0
        for probe in probes:
            within_r = np.abs(probe - candidates).max(axis=1) <= r
            total += int(np.sum(within_r)) - 1
        return total

    # All templates of length m; the last one is excluded from the probes so
    # that the number of probes equals the number of length-(m + 1) templates.
    templates_m = np.array([data[i:i + m] for i in range(N - m + 1)])
    B = count_matches(templates_m[:-1], templates_m)

    # Same counting for templates of length m + 1.
    templates_m1 = np.array([data[i:i + m + 1] for i in range(N - m)])
    A = count_matches(templates_m1, templates_m1)

    if B == 0:
        # Ratio A / B is undefined.
        return np.float64(np.nan)
    if A == 0:
        # -log(0): no longer templates match at all.
        return np.float64(np.inf)
    return -np.log(A / B)

In [None]:
# Sample entropy of the raw Sales column with template length m = 2 and
# tolerance r = 0.2 * std(Sales).
sampen(df['Sales'], 2, np.std(df['Sales']) * 0.2)

0.7395657878048252

# Apply difference transform to the time series

In [None]:
# Reload the CSV with the parsed Date_Part column as the index and discard
# the Year column, leaving only the remaining columns keyed by date.
series = (
    pd.read_csv(
        '/content/gdrive/My Drive/Sales/SalesByDate.csv',
        index_col='Date_Part',
        parse_dates=True,
    )
    .drop(columns=['Year'])
)

In [None]:
# Preview the date-indexed frame to confirm the index was parsed as dates.
series.head()

Unnamed: 0_level_0,Sales
Date_Part,Unnamed: 1_level_1
2016-01-01,204
2016-01-02,5615
2016-01-03,828
2016-01-04,2607
2016-01-05,2518


In [None]:
from datetime import timedelta

# Sampling interval of the index, inferred from its timestamps and expressed
# as a Timedelta.
freq_data = pd.infer_freq(series.index)
df_time_delta = pd.to_timedelta(pd.tseries.frequencies.to_offset(freq_data))

# Candidate seasonal periods of 1, 7, 365, 183 and 90 days, each converted to
# a whole number of sampling intervals.
time_lags_expected = [
    int(timedelta(days=period_days) / df_time_delta)
    for period_days in (1, 7, 365, 183, 90)
]

In [None]:
from scipy.fftpack import fft, fftfreq

# Number of strongest spectral peaks to keep as candidate seasonal periods.
no_of_seasons = 5
nseries = np.asarray(series['Sales'])

# Compute FFT
series_fft = fft(nseries)

# Magnitude of each FFT coefficient (used here as the "power" of a frequency).
power = np.abs(series_fft)

# Frequencies matching each coefficient; fftfreq is called with its default
# spacing, so they are in cycles per sample.
sample_freq = fftfreq(series_fft.size)

# Keep only the positive frequencies (drops the zero-frequency term and the
# mirrored negative half of the spectrum).
pos_mask = np.where(sample_freq > 0)
freqs = sample_freq[pos_mask]
powers = power[pos_mask]

# Indices of the `no_of_seasons` largest magnitudes (returned unordered).
top_powers = np.argpartition(powers, -no_of_seasons)[-no_of_seasons:]

# Period in samples = 1 / frequency.
time_periods_from_fft = 1 / freqs[top_powers]

# Round to the nearest whole number of samples. Plain astype(int) truncates,
# so a period of e.g. 6.996 samples would be reported as 6 instead of 7.
time_periods = np.rint(time_periods_from_fft).astype(int)

In [None]:
from statsmodels.tsa.stattools import acf

# For every expected lag, take the FFT-derived period closest to it and accept
# it as that seasonality when it lies within 5% of the expected lag.
for time_lag in time_lags_expected:
    nearest_time_lag = time_periods.flat[np.abs(time_periods - time_lag).argmin()]

    # Using 5% for range comparison. The window is inclusive on both sides:
    # the previous range(lag - tol, lag + tol) was empty whenever
    # int(0.05 * lag) == 0 (every lag below 20, e.g. 1 and 7), so short
    # seasonalities could never be identified even on an exact match.
    tolerance = int(0.05 * time_lag)
    if abs(nearest_time_lag - time_lag) <= tolerance:

        # Autocorrelation at the lag identified from the FFT (last element of
        # the ACF computed up to that lag).
        acf_score = acf(series, nlags=nearest_time_lag)[-1]

        # Compare against the 2 / sqrt(n) significance bound.
        if acf_score >= 2 / np.sqrt(len(series)):
            # ACF is significant and FFT identifies seasonality
            print('Metrics is seasonal')
        else:
            print('ACF value is not significant')
    else:
        print('Seasonality could not be identified')
        

Seasonality could not be identified
Seasonality could not be identified
Seasonality could not be identified
Metrics is seasonal
ACF value is not significant




In [None]:
from statsmodels.tsa.stattools import kpss
def kpss_test(series, **kw):
    """Run the KPSS test on ``series`` and print a short report.

    Parameters
    ----------
    series : array-like
        Data passed straight through to ``kpss``.
    **kw
        Extra keyword arguments forwarded unchanged to ``kpss``.

    Prints the test statistic, p-value, number of lags, the critical values
    and a one-line verdict. The verdict is "not stationary" when the p-value
    is below 0.05, otherwise "stationary". Returns ``None``.

    NOTE(review): statsmodels documents the KPSS p-value as interpolated from
    a table and bounded to [0.01, 0.1]; a printed 0.1 may therefore mean
    "at least 0.1" - confirm against the installed statsmodels version.
    """
    statistic, p_value, n_lags, critical_values = kpss(series, **kw)
    # Format Output
    print(f'KPSS Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print(f'num lags: {n_lags}')
    print('Critical Values:')
    for key, value in critical_values.items():
        print(f'   {key} : {value}')
    print(f'Result: The series is {"not " if p_value < 0.05 else ""}stationary')

# KPSS test on the raw Sales column, with no extra options passed to kpss.
kpss_test(df['Sales'])

KPSS Statistic: 0.2528451636081054
p-value: 0.1
num lags: 25
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
Result: The series is stationary




In [None]:
# Augmented Dickey-Fuller test on the raw Sales column, with the lag order
# selected by AIC.
# NOTE: this rebinds `series` from the date-indexed DataFrame loaded earlier
# to the plain Sales column of `df`; cells above that index series['Sales']
# will fail if re-run after this one.
series = df['Sales']
result = adfuller(series, autolag='AIC')
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
# Print the header once, not once per critical value.
print('Critical Values:')
for key, value in result[4].items():
    print(f'   {key}, {value}')

ADF Statistic: -5.773421619779879
p-value: 5.326991877177091e-07
Critial Values:
   1%, -3.434380559955255
Critial Values:
   5%, -2.8633202588932547
Critial Values:
   10%, -2.567717771455621
