## HTML

### Weekly Sales Summary

Let's first take a close look at the trend of weekly sales over this nearly three-year period. The sales plot captures the sales trend from Feb-2010 to Oct-2012. As we can see, weekly sales per store range approximately from 0 to 4 million. The sales trend for each store stays flat over time except during the two holiday seasons, which run approximately from Nov 20 to Dec 31 in both years. This period covers two big holidays, Thanksgiving and Christmas, when retailers traditionally offer huge discounts to customers. During these high-selling periods, sales rise to 1.5 - 2 times their usual level. We therefore ought to pay extra attention to the predictions for this holiday period.

## Reloading Dataset

In [2]:
import pandas as pd
import numpy as np

# Load the three raw tables. IsHoliday is dropped from the features table so
# that the merge below yields a single IsHoliday column (presumably train.csv
# already carries the flag -- the merged frame still has one).
trainDf = pd.read_csv("./data/train.csv")
feaDf = pd.read_csv("./data/features.csv").drop(["IsHoliday"], axis=1)
storesDf = pd.read_csv("./data/stores.csv")

# Attach the per-(Store, Date) features, then the per-store attributes.
# validate="many_to_one" makes pandas raise if the right-hand table repeats a
# key, which would otherwise silently duplicate sales rows.
df1 = pd.merge(trainDf, feaDf, on=["Store", "Date"], validate="many_to_one")
train = pd.merge(df1, storesDf, on="Store", validate="many_to_one")
print("Dataset size:", train.shape)

# Rename by name instead of overwriting the whole column list positionally:
# a positional overwrite silently mislabels data if the merge layout changes.
train = train.rename(columns={"Type": "Store_Type"})

expectedCols = ['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Temperature',
                'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
                'MarkDown5', 'CPI', 'Unemployment', 'Store_Type', 'Size']
assert list(train.columns) == expectedCols, (
    "Unexpected column layout after merge: %s" % list(train.columns)
)

display(train.head(3))

Dataset size: (421570, 16)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Store_Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,A,151315
1,1,2,2010-02-05,50605.27,False,42.31,2.572,,,,,,211.096358,8.106,A,151315
2,1,3,2010-02-05,13740.12,False,42.31,2.572,,,,,,211.096358,8.106,A,151315


### Dropping Columns (MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5)

In [3]:
# Remove all five MarkDown columns. Reassigning (rather than inplace=True)
# together with errors="ignore" keeps the cell idempotent: re-running it once
# the columns are already gone no longer raises a KeyError.
markdownCols = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]
train = train.drop(columns=markdownCols, errors="ignore")
display(train.head(5))

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment,Store_Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,211.096358,8.106,A,151315
1,1,2,2010-02-05,50605.27,False,42.31,2.572,211.096358,8.106,A,151315
2,1,3,2010-02-05,13740.12,False,42.31,2.572,211.096358,8.106,A,151315
3,1,4,2010-02-05,39954.04,False,42.31,2.572,211.096358,8.106,A,151315
4,1,5,2010-02-05,32229.38,False,42.31,2.572,211.096358,8.106,A,151315


## Data Analysis

In [4]:
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as offline

# Set up Plotly's offline mode so iplot() can render figures inside the notebook.
# NOTE(review): per the plotly.offline docs, connected=True loads plotly.js from
# a CDN instead of embedding it in the notebook -- confirm for the installed version.
init_notebook_mode(connected=True)

In [5]:
# Store ids (as strings) and dates, each in order of first appearance in `train`.
namesLst = list(map(str, train.Store.unique()))
dateLst = train.Date.unique()

# Total sales per (store, date) in millions, rounded to 2 decimals, computed in
# one grouped pass. The original filtered the whole frame once per store/date
# pair with chained boolean masks, which is slow and relies on pandas
# re-aligning the second mask. Pairs with no rows become 0.0, matching the sum
# of an empty selection.
salesByStoreDate = (
    train.groupby(["Store", "Date"])["Weekly_Sales"]
    .sum()
    .unstack("Date")
    .reindex(index=[int(name) for name in namesLst], columns=dateLst)
    .fillna(0.0)
    .div(1000000)
    .round(2)
)

# Map each integer store id to its list of sales, ordered like dateLst.
storeSalesDict = {}
for name in namesLst:
    # Echo the store id so the cell's printed output is unchanged.
    print(name)
    storeSalesDict[int(name)] = salesByStoreDate.loc[int(name)].tolist()


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45


In [6]:
# Parse the date labels once: every store shares the same x axis, so there is
# no need to re-run pd.to_datetime for each trace.
plotDates = pd.to_datetime(dateLst)

# One line trace per store, named "Store#<id>".
traces = [
    go.Scatter(x=plotDates, y=sales, name="Store#" + str(storeId))
    for storeId, sales in storeSalesDict.items()
]


In [7]:
import numpy as np
# Visibility masks for the figure's update-menu buttons: row i is True only at
# position i, i.e. "show trace i and hide every other store". Built directly as
# a boolean identity matrix; the previous numpy-zeros and string-matrix steps
# were overwritten before being used.
nStores = len(storeSalesDict)
aa = [[row == col for col in range(nStores)] for row in range(nStores)]

In [8]:
# Single "Store 1" button: applies the first visibility mask (aa[0]) to the
# traces and sets the figure title to 'Store 1 Sales'.
store1Button = {
    'label': 'Store 1',
    'method': 'update',
    'args': [{'visible': aa[0]}, {'title': 'Store 1 Sales'}],
}

# NOTE(review): active=-1 should leave no button pre-selected when the figure
# first renders -- confirm against Plotly's updatemenus reference.
updatemenus = [
    {
        'type': "buttons",
        'active': -1,
        'buttons': [store1Button],
    },
]

In [15]:
# Range-selector shortcuts on the time axis: look back 1, 3, 6 or 12 months,
# plus a final button that restores the full range.
rangeButtons = [
    dict(count=months, label="%dm" % months, step="month", stepmode="backward")
    for months in (1, 3, 6, 12)
]
rangeButtons.append(dict(step="all"))

# Y axis: weekly sales in millions, a tick every 1 unit, grid lines shown.
salesAxis = dict(
    title="Weekly Sales (in Millions)",
    titlefont=dict(size=18),
    exponentformat='e',
    showexponent='all',
    tick0=5,
    dtick=1,
    ticks="outside",
    tickwidth=2,
    showgrid=True,
)

# X axis: dates, with the range selector above and a range slider below.
timeAxis = dict(
    title="Time Series (in Months)",
    titlefont=dict(size=18),
    exponentformat='e',
    showexponent='all',
    zeroline=True,
    showgrid=False,
    rangeselector=dict(visible=True, buttons=rangeButtons),
    rangeslider=dict(visible=True),
)

# Layout for the per-store weekly sales figure (legend hidden; the update-menu
# defined earlier is attached here).
layout = go.Layout(
    title='Weekly Sales (Figure 1)',
    autosize=True,
    yaxis=salesAxis,
    xaxis=timeAxis,
    margin=dict(l=60, r=30, b=80, t=40),
    showlegend=False,
    updatemenus=updatemenus,
)

In [16]:
# Combine the per-store traces with the layout defined above and render the
# figure inline in the notebook.
fig = go.Figure(data=traces, layout=layout)
iplot(fig, show_link=True)

Let's first take a close look at the trend of weekly sales over this nearly three-year period. The sales plot captures the sales trend from Feb-2010 to Oct-2012. As we can see, weekly sales per store range approximately from 0 to 4 million. The sales trend for each store stays flat over time except during the two holiday seasons, which run approximately from Nov 20 to Dec 31 in both years. This period covers two big holidays, Thanksgiving and Christmas, when retailers traditionally offer huge discounts to customers. During these high-selling periods, sales rise to 1.5 - 2 times their usual level. We therefore ought to pay extra attention to the predictions for this holiday period.

### Weekly Sales by Store Type

In [10]:
def store_type_sales_millions(frame, store_types, is_holiday):
    """Return total Weekly_Sales per store type, in millions rounded to 2 d.p.

    Parameters
    ----------
    frame : DataFrame
        Must contain the Store_Type, IsHoliday and Weekly_Sales columns.
    store_types : list
        Store-type labels; the result follows this order exactly.
    is_holiday : bool
        Which IsHoliday value to keep before summing.

    Returns
    -------
    list of float
        One total per entry of ``store_types`` (0.0 for a type with no rows).
    """
    totals = (
        frame.loc[frame.IsHoliday == is_holiday]
        .groupby("Store_Type")["Weekly_Sales"]
        .sum()
    )
    return totals.reindex(store_types, fill_value=0.0).div(1000000).round(2).tolist()


# Use one label list for both x and y so the bars cannot drift out of step
# with their labels (the y values used to be hard-coded as A, B, C).
storeTypes = sorted(set(train.Store_Type))

trace1 = go.Bar(
    x=storeTypes,
    y=store_type_sales_millions(train, storeTypes, False),
    name="Working day"
)
trace2 = go.Bar(
    x=storeTypes,
    y=store_type_sales_millions(train, storeTypes, True),
    name="Holiday"
)

data = [trace1, trace2]
layout = go.Layout(
    title = "Store Type Sale (in Mil)",
    barmode='group',
    showlegend=True,
    xaxis = dict(title = "Store Type"),
    yaxis = dict(title = " Sales (in Million)")
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=True)

### BarPlot - Sales vs Department

In [36]:
# Show the first three rows of `train` as a reminder of the available columns.
display(train.head(3))

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment,Store_Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,211.096358,8.106,A,151315
1,1,2,2010-02-05,50605.27,False,42.31,2.572,211.096358,8.106,A,151315
2,1,3,2010-02-05,13740.12,False,42.31,2.572,211.096358,8.106,A,151315


In [6]:
# Total sales per department in millions (2 d.p.), from a single grouped pass.
# Labels and values come from the same Series, so they are aligned by
# construction rather than by repeated list(set(...)) calls returning the same
# order every time.
deptSales = train.groupby("Dept")["Weekly_Sales"].sum().div(1000000).round(2)
deptLst = deptSales.index.tolist()
sales_dept = deptSales.tolist()

# Horizontal bars: sales on x, department number on y.
trace3 = go.Bar(
    x=sales_dept,
    y=deptLst,
    orientation="h"
)

data = [trace3]
layout = go.Layout(
    barmode='group',
    width=700,
    height=1200,
    title='Departments Weekly Sales (in Mil)',
    yaxis=dict(title="Dept Number",
               exponentformat='e',
               showexponent='all',
               titlefont=dict(size=18),
               tick0=5,
               ticks="outside",
               dtick=1,
               tickwidth=2,
               showgrid=False),
    xaxis=dict(title="Weekly Sales(in Million $)",
               titlefont=dict(size=18),
               zeroline=True,
               showgrid=True)
)

# One red value label per bar, placed at the bar's end and shifted 40px right.
# Built by pairing each department with its own total (the old loop indexed
# with i-1, wrapping to the last element first) and attached to the layout
# once instead of on every iteration.
annotations = [
    dict(x=sales,
         y=dept,
         text="%0.2fM" % sales,
         font=dict(family='Arial', size=12, color='red'),
         showarrow=True,
         align="center",
         ax=40, ay=0,
         arrowhead=0)
    for dept, sales in zip(deptLst, sales_dept)
]
layout['annotations'] = annotations

fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=True)

In [65]:
# Sanity check: there must be exactly one sales total per distinct department.
# Previously both lengths were evaluated but only the second was shown, so a
# mismatch would have gone unnoticed.
assert len(sales_dept) == len(set(train.Dept)), "sales_dept is out of step with the departments"
len(sales_dept)

81