In [1]:
import pandas as pd
import numpy as np
import datetime
import sys, os
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.tsa.seasonal import seasonal_decompose
from keras.layers import Dense

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
from statsmodels.tsa.stattools import adfuller

2021-07-23 15:11:12.758819: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-23 15:11:12.758875: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Location of the pickled sales DataFrame and the folder that receives exported figures.
pathForData = 'data/dfSalesAnonimized.pkl'
pathForImages = 'images/'

# Later cells export HTML/PNG files into pathForImages; create the folder up front
# so those exports do not fail on a fresh checkout where it does not exist yet.
os.makedirs(pathForImages, exist_ok=True)


## To-do:
- [x] Use the augmented Dickey-Fuller test to check whether the time series is stationary  

In [3]:
# Load the pickled DataFrame from pathForData and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle(pathForData)
df.head()

Unnamed: 0,date,product,quantity,sum,company,taxNumber
0,2018-09-13,product_item_1,quantity_item_1,35750.23432,company_item_1,taxNumber_item_1
1,2018-10-11,product_item_1,quantity_item_1,35912.426543,company_item_1,taxNumber_item_1
2,2018-07-14,product_item_1,quantity_item_1,82452.931598,company_item_1,taxNumber_item_1
3,2018-08-14,product_item_1,quantity_item_1,82452.931598,company_item_1,taxNumber_item_1
4,2018-09-27,product_item_1,quantity_item_1,82452.931598,company_item_1,taxNumber_item_1


In [4]:
# Column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308243 entries, 0 to 308242
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   date       308243 non-null  datetime64[ns]
 1   product    308243 non-null  object        
 2   quantity   308243 non-null  object        
 3   sum        308243 non-null  float64       
 4   company    308243 non-null  object        
 5   taxNumber  308243 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 14.1+ MB


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame.
df.describe()

Unnamed: 0,sum
count,308243.0
mean,111538.6
std,295342.9
min,-1363824.0
25%,32146.21
50%,44837.7
75%,81304.85
max,19392930.0


In [6]:
# Earliest value in the 'date' column.
df['date'].min()

Timestamp('2018-01-03 00:00:00')

In [7]:
# Latest value in the 'date' column.
df['date'].max()

Timestamp('2021-04-30 00:00:00')

In [8]:
# Random preview of 10 rows; the fixed seed makes the same rows appear on
# every Restart & Run All instead of a different draw each time.
df.sample(10, random_state=42)

Unnamed: 0,date,product,quantity,sum,company,taxNumber
137678,2019-04-10,product_item_395,quantity_item_2,51951.609154,company_item_71,taxNumber_item_71
162913,2019-10-01,product_item_556,quantity_item_1,39243.420608,company_item_7,taxNumber_item_7
203862,2020-07-30,product_item_177,quantity_item_4,66927.775173,company_item_43,taxNumber_item_43
195805,2020-09-18,product_item_113,quantity_item_4,52837.717242,company_item_718,taxNumber_item_716
293043,2021-01-28,product_item_414,quantity_item_5,168297.914179,company_item_88,taxNumber_item_88
239428,2020-09-11,product_item_446,quantity_item_1,56411.999641,company_item_718,taxNumber_item_716
222205,2020-07-13,product_item_341,quantity_item_1,32146.206243,company_item_60,taxNumber_item_60
40069,2018-04-05,product_item_301,quantity_item_5,95150.683064,company_item_29,taxNumber_item_29
196683,2020-02-05,product_item_114,quantity_item_4,43960.980745,company_item_727,taxNumber_item_725
122879,2019-12-16,product_item_243,quantity_item_1,32146.206243,company_item_6,taxNumber_item_6


In [None]:
# Augmented Dickey-Fuller stationarity test.
# The raw frame holds many individual records per date and its rows are not in
# chronological order, so the 'sum' column by itself is not a time series.
# Collapse it into one total per date, ordered by date, before testing.
dailySales = df.groupby('date')['sum'].sum().sort_index()

# adfuller returns a tuple: [0] test statistic, [1] p-value, [4] critical values.
result = adfuller(dailySales.to_numpy())
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

In [None]:
# Per-date count of non-null entries in every column.
dfCount = df.groupby('date').count()

In [None]:
# Line chart of the per-date record count taken from the 'quantity' column of dfCount.
fig = px.line(
    dfCount['quantity'],
    width=1200,
    height=400,
    labels={'value': 'Number of sales'},
)
# Draw the line with a width of 1.
fig.update_traces(line_width=1)
fig.show()

In [None]:
# Calendar helper columns for the year-over-year plots. The vectorized .dt
# accessors replace a per-row Python lambda over the whole frame.
df['year'] = df['date'].dt.year
# Day of the year, counted from 1.
df['day'] = df['date'].dt.dayofyear
df.head()

In [None]:
# Scatter of every record's 'sum' against its day of the year, one trace per year.
fig = go.Figure()
# Iterate over the years actually present in the data (groupby yields them in
# sorted order) rather than a hard-coded list, so no year is silently dropped
# and the frame is not re-filtered once per year.
for year, dfT in df.groupby('year'):
    fig.add_trace(go.Scatter(x=dfT['day'],
                             y=dfT['sum'],
                             mode='markers',
                             marker=dict(opacity=0.3, size=4),
                             name=str(year)))

# The fixed y-range keeps the bulk of the points readable; values outside it are cut off.
fig.update_layout(yaxis_range=(-2000000, 10000000),
                  title='Daily sales',
                  xaxis_title='Day of year',
                  yaxis_title='Sum',
                  width=1200, height=400)
fig.write_html(pathForImages+'dailySales.html')
fig.write_image(pathForImages+'dailySales.png', width=1200, height=400, scale=2)

In [None]:
# Weekly totals. Only the 'sum' column is aggregated: summing the whole frame
# would also push the string columns and the year/day helper columns through
# the aggregation although nothing below uses them.
dfW = df.groupby(pd.Grouper(key='date', freq='W'))[['sum']].sum()

# Multiplicative decomposition of the weekly totals.
# NOTE(review): a multiplicative model needs strictly positive values; confirm
# that no weekly total is zero or negative, otherwise switch to model='additive'.
result_mul = seasonal_decompose(dfW['sum'], model='multiplicative', extrapolate_trend='freq')

# One subplot row per component: (title, series, extra Scatter options).
panels = [
    ("Initial Data", result_mul.observed, {}),
    ("Trend", result_mul.trend, {}),
    ("Seasonal", result_mul.seasonal, {}),
    ("Residual", result_mul.resid, {'mode': 'markers'}),
]

fig = make_subplots(rows=len(panels), cols=1,
                    shared_xaxes=True,
                    subplot_titles=[title for title, _, _ in panels])

for rowNumber, (_, component, scatterOptions) in enumerate(panels, start=1):
    fig.add_trace(
        go.Scatter(x=dfW.index, y=component, **scatterOptions),
        row=rowNumber, col=1
    )

fig.update_layout(height=800, title_text="Multiplicative Decompose",showlegend=False)
fig.show()
fig.write_html(pathForImages+'decomposition.html')

In [None]:
# Line chart of the weekly 'sum' totals over the date index of dfW.
fig = px.line(
    data_frame=dfW,
    x=dfW.index,
    y='sum',
    title='Weekly sales',
)
fig.show()

In [None]:
# Weekly totals of 'sum', indexed by date. The column is selected before the
# aggregation so the other columns are not summed only to be discarded.
dfW = df.set_index('date')[['sum']].groupby(pd.Grouper(freq='W')).sum()

# One line per calendar year, coloured by the year of each weekly bucket.
fig = px.line(dfW, y="sum", title='Weekly sales', color=dfW.index.year)
fig.show()
fig.write_html(pathForImages+'weeklySales.html')

In [None]:
# Monthly totals of 'sum', indexed by date. The column is selected before the
# aggregation so the other columns are not summed only to be discarded.
# NOTE(review): recent pandas releases prefer the alias 'ME' over 'M' for
# month-end grouping; update the alias once the pinned pandas version requires it.
dfM = df.set_index('date')[['sum']].groupby(pd.Grouper(freq='M')).sum()

# Month number (1-12), used as the x axis of the year-over-year comparison.
dfM['monthNumber'] = dfM.index.month

fig = px.line(dfM, x=dfM.index, y="sum", title='Monthly sales')
fig.show()
fig.write_html(pathForImages+'monthlySales.html')

In [None]:
# Year-over-year view: monthly totals against the month number, one line per year.
fig = px.line(data_frame=dfM, x='monthNumber', y='sum',
              color=dfM.index.year, title='Monthly sales')
fig.show()
fig.write_html(pathForImages + 'monthlySalesY2Y.html')

In [None]:
# Two side-by-side box plots of the monthly totals: grouped by year (left)
# and grouped by month (right).
fig, axes = plt.subplots(1, 2, figsize=(20, 7), dpi=300)
axYear, axMonth = axes

sns.boxplot(data=dfM, x=dfM.index.year, y='sum', ax=axYear)
sns.boxplot(data=dfM, x=dfM.index.month, y='sum', ax=axMonth)

axYear.set_title('Year-wise Box Plot\n(The Trend)', fontsize=18)
axMonth.set_title('Month-wise Box Plot\n(The Seasonality)', fontsize=18)
plt.show()