In [1]:
import os
import tarfile
import urllib
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import dask.dataframe as dd
import dask.array as da
import dask.bag as dbs
import warnings
warnings.filterwarnings('ignore')


### Data Analysis
- The thrust of the data consists of multiple liquor vendors selling to thousands of Iowa retail stores
the thousands of liquor products. The retail stores order these products from different vendors to satisfy the local demands
from the consumers. Even though the individual consumers do not enter the data, the aggregation of 
all the retail stores' sales in the same area does reflect the local demands on the liquor products.

- The data analysis can be carried out from the prospective of the vendors, 
the stores and the products.  As in any market within the capitalism system, the stores are open and are closed.
The new products are introduced into the market, while some of the existing products cease to 
gain popularity among the consumers. They are forced to exit the liquor market.

- **Vendor Analysis**:
    - As there are near $10K$ liquor products and only $300+$ vendors. 
    So most of the vendors must be selling multiple products. How many product does a vendor sell?
    How does it evolve w.r.t. time?  How many categories do these product fall into?
    
    - How many sales channels (retail stores) does each vendor have and how does it evolve w.r.t. time?
    - Are different vendors supplying the same product? Are different vendors supplying the same stores?
    - Are there direct competitions among different vendors? Depict a story (case study) on their sale-channel
    growth/shrinkage.

In [17]:
countiesdf = pd.read_csv('county.csv', index_col = 0)
pricesdf = pd.read_csv('price.csv', index_col = 0)
productsdf = pd.read_csv('product.csv', index_col = 0)
storesdf = pd.read_csv('store.csv', index_col = 0)
vendorsdf = pd.read_csv('vendor.csv', index_col = 0)
categoriesdf = pd.read_csv('category.csv', index_col = 0)
transactionsdf = pd.read_csv('transaction.csv', index_col = 0)

- **Vendor Analysis**:
    - As there are near $10K$ liquor products and only $300+$ vendors. 
    So most of the vendors must be selling multiple products. How many product does a vendor sell?
    How does it evolve w.r.t. time?  How many categories do these product fall into?
 

In [53]:
vendor_products = transactionsdf[["Date", 'VendorNumber', 'ItemNumber']]
vendor_products = pd.merge(vendor_products , vendorsdf, on='VendorNumber', how='left')
vendor_products.head()

Unnamed: 0,Date,VendorNumber,ItemNumber,VendorName
0,2012-01-03,380,84636,Phillips Beverage
1,2012-01-03,65,20247,Jim Beam Brands
2,2012-01-03,300,65199,McCormick Distilling Co.
3,2012-01-03,260,41715,DIAGEO AMERICAS
4,2012-01-03,260,43331,DIAGEO AMERICAS


In [54]:
vendor_products.dtypes

Date            object
VendorNumber     int64
ItemNumber       int64
VendorName      object
dtype: object

In [56]:
vendor_products['Date'] = pd.to_datetime(vendor_products['Date'], infer_datetime_format=True)
vendor_products.dtypes



Date            datetime64[ns]
VendorNumber             int64
ItemNumber               int64
VendorName              object
dtype: object

In [75]:
vendor_products =  vendor_products.drop(['VendorNumber'], axis=1)
vendor_products

Unnamed: 0,Date,ItemNumber,VendorName
0,2012-01-03,84636,Phillips Beverage
1,2012-01-03,20247,Jim Beam Brands
2,2012-01-03,65199,McCormick Distilling Co.
3,2012-01-03,41715,DIAGEO AMERICAS
4,2012-01-03,43331,DIAGEO AMERICAS
...,...,...,...
23774986,2022-04-29,35315,SAZERAC COMPANY INC
23774987,2022-04-29,48105,MOET HENNESSY USA
23774988,2022-04-29,37996,DIAGEO AMERICAS
23774989,2022-04-29,41692,Phillips Beverage


In [109]:
vendor_products = vendor_products.groupby([pd.Grouper(key='Date', freq='Y'),
                                           'VendorName']).nunique()



KeyError: 'The grouper name Date is not found'

In [111]:
vendor_products.rename(columns={ 'ItemNumber' : "ProductCount"}, inplace = True)


In [186]:
vendor_products

pandas.core.frame.DataFrame

# - How many sales channels (retail stores) does each vendor have and how does it evolve w.r.t. time?


  # - Are different vendors supplying the same product? Are different vendors supplying the same stores?

- **Store Analysis**:
    - Categorize the retail liquor stores into different types, chain-super market, specialized
    liquor stores, convenience stores, news stands, tobacco stores,..... and report the relevant statistics.
    - New stores pop up and some of them shut down after a few years. Analyze the store opening dynamics
    and report the findings on store life-cycle, store survival analysis, survial curve, survival
    probabilities, etc. Refine your findings based on the store categories you design. Is it easy for
    the stores to experience sales volume growth? With new stores popping up for competition, how does
    it affect the sales of the existing stores?  
    Does the pie (Gross sales volume in **Iowa**) grow bigger? Do the stores share the bigger pie?
    Does the winner take all or more players share smaller slices?
    - In terms of store inventories, report the varieties of product/product category for different
    store types. 
    What product/product category are the major sales contributors for different types of stores?
    - Study the monthly sales volumes and gross profits (without taking price-discounts into account)
    of the stores. 
    - Use the county population information, 
    say https://www.iowa-demographics.com/counties_by_population, to estimate the stores-per-capita in different
        counties. Use the chain-supermarket **Hy-Vee** as a case study, depict its store distribution
        in different counties of Iowa and the their liquor sales dynamics.
    - Within the context of **B2B**, 
    the stores play the role of the vendors' customers.
    Thus the concept and tools of customer analytics can be used to analyze the stores.
    As not all the stores can continue to survive or they might change their vendors, the vendors often like to know the
    **customer lifetime value** of their customers (the stores).
    Provide an analysis on the **CLTV** (in a fixed time horizon) based on the natures of the stores.
    You may visit https://exponea.com/blog/customer-lifetime-value-guide/
    for an introduction to customer life-time value computation. Note that
    the 'customer churning', in the context of the liquor stores as the vendors' customers, must
    include the store shutting-down in your discussion.

In [162]:
transactionsdf.head()

Unnamed: 0_level_0,Date,StoreNumber,Category,VendorNumber,ItemNumber,BottlesSold,Volume(Gallons),Sale
InvoiceNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
S03401400012,2012-01-03,2233,1081315,380,84636,12,,124.92
S03389000026,2012-01-03,3583,1011200,65,20247,2,,13.9
S03387100016,2012-01-03,4132,1081200,300,65199,4,,71.96
S03411000130,2012-01-03,2625,1031200,260,41715,2,,23.12
S03377200020,2012-01-03,4438,1062310,260,43331,2,,14.5


In [160]:
transactionsdf.columns

Index(['Date', 'StoreNumber', 'Category', 'VendorNumber', 'ItemNumber',
       'BottlesSold', 'Volume(Gallons)', 'Sale'],
      dtype='object')

In [18]:
transactionsdf['Date'] = pd.to_datetime(transactionsdf['Date'], 
                                        infer_datetime_format=True)
transactionsdf['Date'] = transactionsdf['Date'].dt.year







In [19]:
x = transactionsdf[['Date', 'StoreNumber','Sale']]


In [20]:
x = x.reset_index().drop(['InvoiceNumber'], axis=1)

In [30]:
x['Sale']= x['Sale'].str.replace(',','')


In [31]:
x.loc[:, 'Sale'] = pd.to_numeric(x['Sale'])

In [32]:
y = x.groupby(['Date','StoreNumber']).agg({'Sale' : 'sum'})


In [33]:
y

Unnamed: 0_level_0,Unnamed: 1_level_0,Sale
Date,StoreNumber,Unnamed: 2_level_1
2012,2106,992310.70
2012,2113,37908.58
2012,2130,1038701.83
2012,2132,57185.95
2012,2152,99460.09
...,...,...
2022,9039,29097.24
2022,9041,14390.40
2022,9042,9752.04
2022,9047,27878.40


In [37]:
dft = y.pivot_table(values = 'Sale',index = ['Date'],columns = 'StoreNumber')
dft

StoreNumber,2106,2113,2130,2132,2152,2161,2178,2190,2191,2200,...,9928,9929,9930,9931,9932,9934,9936,9937,9938,9946
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012,992310.7,37908.58,1038701.83,57185.95,99460.09,42363.05,256012.21,7376906.0,840531.37,236079.07,...,,,,,,,,,,
2013,1040650.27,99257.86,1065606.14,,106852.55,,282712.16,969511.1,844405.86,219279.51,...,,,,,,,,,,
2014,1342161.28,99913.51,1034376.03,,94892.9,,275606.81,1323649.0,1009545.66,212404.67,...,,,,,,,,,,
2015,1434369.85,85763.42,1108184.99,,72080.36,,277987.96,1228572.0,1275911.76,223914.99,...,,,,,,,,,,
2016,1305048.54,84942.05,1114976.91,,13752.24,,276450.17,1472957.0,1233432.76,234775.51,...,19.5,78.0,144.0,,,,,,,
2017,1286825.13,83315.77,1120884.72,,,,301828.34,1569203.0,1160123.69,232860.72,...,1872.0,,144.0,1260.0,388.8,346.56,,,,
2018,1337244.94,80578.24,1184941.2,,,,311540.25,1755604.0,1195099.55,226023.45,...,,,,1260.0,,346.56,31.14,157.5,246.72,
2019,1223186.94,78947.79,1173271.75,,,,330402.66,2124405.0,1285690.27,244140.88,...,,,,,,,,,,
2020,786504.43,7437.09,951198.97,,,,362299.62,2683365.0,1055849.43,260461.55,...,,,,,,,,,,4320.0
2021,1033483.12,,1242342.77,,,,2933.14,2629204.0,1346072.45,276140.46,...,,,,,,,,,,


In [38]:
dft.fillna(0, inplace=True)
dft.sort_values(list(dft.columns),inplace=True)
dft = dft.sort_index()

In [39]:
dft

StoreNumber,2106,2113,2130,2132,2152,2161,2178,2190,2191,2200,...,9928,9929,9930,9931,9932,9934,9936,9937,9938,9946
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012,992310.7,37908.58,1038701.83,57185.95,99460.09,42363.05,256012.21,7376906.0,840531.37,236079.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013,1040650.27,99257.86,1065606.14,0.0,106852.55,0.0,282712.16,969511.1,844405.86,219279.51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014,1342161.28,99913.51,1034376.03,0.0,94892.9,0.0,275606.81,1323649.0,1009545.66,212404.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015,1434369.85,85763.42,1108184.99,0.0,72080.36,0.0,277987.96,1228572.0,1275911.76,223914.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016,1305048.54,84942.05,1114976.91,0.0,13752.24,0.0,276450.17,1472957.0,1233432.76,234775.51,...,19.5,78.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017,1286825.13,83315.77,1120884.72,0.0,0.0,0.0,301828.34,1569203.0,1160123.69,232860.72,...,1872.0,0.0,144.0,1260.0,388.8,346.56,0.0,0.0,0.0,0.0
2018,1337244.94,80578.24,1184941.2,0.0,0.0,0.0,311540.25,1755604.0,1195099.55,226023.45,...,0.0,0.0,0.0,1260.0,0.0,346.56,31.14,157.5,246.72,0.0
2019,1223186.94,78947.79,1173271.75,0.0,0.0,0.0,330402.66,2124405.0,1285690.27,244140.88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020,786504.43,7437.09,951198.97,0.0,0.0,0.0,362299.62,2683365.0,1055849.43,260461.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4320.0
2021,1033483.12,0.0,1242342.77,0.0,0.0,0.0,2933.14,2629204.0,1346072.45,276140.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
dft.iloc[:, 0:-1] = dft.iloc[:, 0:-1].cumsum()


In [41]:
dft

StoreNumber,2106,2113,2130,2132,2152,2161,2178,2190,2191,2200,...,9928,9929,9930,9931,9932,9934,9936,9937,9938,9946
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012,992310.7,37908.58,1038701.83,57185.95,99460.09,42363.05,256012.21,7376906.0,840531.37,236079.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013,2032960.97,137166.44,2104307.97,57185.95,206312.64,42363.05,538724.37,8346417.0,1684937.23,455358.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014,3375122.25,237079.95,3138684.0,57185.95,301205.54,42363.05,814331.18,9670067.0,2694482.89,667763.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015,4809492.1,322843.37,4246868.99,57185.95,373285.9,42363.05,1092319.14,10898640.0,3970394.65,891678.24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016,6114540.64,407785.42,5361845.9,57185.95,387038.14,42363.05,1368769.31,12371600.0,5203827.41,1126453.75,...,19.5,78.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017,7401365.77,491101.19,6482730.62,57185.95,387038.14,42363.05,1670597.65,13940800.0,6363951.1,1359314.47,...,1891.5,78.0,288.0,1260.0,388.8,346.56,0.0,0.0,0.0,0.0
2018,8738610.71,571679.43,7667671.82,57185.95,387038.14,42363.05,1982137.9,15696400.0,7559050.65,1585337.92,...,1891.5,78.0,288.0,2520.0,388.8,693.12,31.14,157.5,246.72,0.0
2019,9961797.65,650627.22,8840943.57,57185.95,387038.14,42363.05,2312540.56,17820810.0,8844740.92,1829478.8,...,1891.5,78.0,288.0,2520.0,388.8,693.12,31.14,157.5,246.72,0.0
2020,10748302.08,658064.31,9792142.54,57185.95,387038.14,42363.05,2674840.18,20504170.0,9900590.35,2089940.35,...,1891.5,78.0,288.0,2520.0,388.8,693.12,31.14,157.5,246.72,4320.0
2021,11781785.2,658064.31,11034485.31,57185.95,387038.14,42363.05,2677773.32,23133380.0,11246662.8,2366080.81,...,1891.5,78.0,288.0,2520.0,388.8,693.12,31.14,157.5,246.72,0.0


In [1]:
import bar_chart_race as bcr
bcr.bar_chart_race(df = dft, 
                   n_bars = 20,
                   period_length = 3500,
                   bar_size = 0.85,
                   steps_per_period = 45,
                   sort='desc',
                   title='Store Sales since 2012')


NameError: name 'dft' is not defined