### Loading and inspecting

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

breadbasket = pd.read_csv('BreadBasket_DMS.csv')

In [2]:
breadbasket.dtypes

Date           object
Time           object
Transaction     int64
Item           object
dtype: object

In [3]:
breadbasket.head()

Unnamed: 0,Date,Time,Transaction,Item
0,2016-10-30,09:58:11,1,Bread
1,2016-10-30,10:05:34,2,Scandinavian
2,2016-10-30,10:05:34,2,Scandinavian
3,2016-10-30,10:07:57,3,Hot chocolate
4,2016-10-30,10:07:57,3,Jam


In [4]:
breadbasket.describe()

Unnamed: 0,Transaction
count,21293.0
mean,4951.990889
std,2787.7584
min,1.0
25%,2548.0
50%,5067.0
75%,7329.0
max,9684.0


    we have 9684 transactions in the dataset

### Exploring the variables

    to examine the date and time, we must reformat this variable

In [5]:
breadbasket['DateTime'] = pd.to_datetime(breadbasket.Date + ' ' + breadbasket.Time)

In [6]:
# How many products are sold by the bakery and the most popular ones

breadbasket.Item.unique()

array(['Bread', 'Scandinavian', 'Hot chocolate', 'Jam', 'Cookies',
       'Muffin', 'Coffee', 'Pastry', 'Medialuna', 'Tea', 'NONE', 'Tartine',
       'Basket', 'Mineral water', 'Farm House', 'Fudge', 'Juice',
       "Ella's Kitchen Pouches", 'Victorian Sponge', 'Frittata',
       'Hearty & Seasonal', 'Soup', 'Pick and Mix Bowls', 'Smoothies',
       'Cake', 'Mighty Protein', 'Chicken sand', 'Coke',
       'My-5 Fruit Shoot', 'Focaccia', 'Sandwich', 'Alfajores', 'Eggs',
       'Brownie', 'Dulce de Leche', 'Honey', 'The BART', 'Granola',
       'Fairy Doors', 'Empanadas', 'Keeping It Local', 'Art Tray',
       'Bowl Nic Pitt', 'Bread Pudding', 'Adjustment', 'Truffles',
       'Chimichurri Oil', 'Bacon', 'Spread', 'Kids biscuit', 'Siblings',
       'Caramel bites', 'Jammie Dodgers', 'Tiffin', 'Olum & polenta',
       'Polenta', 'The Nomad', 'Hack the stack', 'Bakewell',
       'Lemon and coconut', 'Toast', 'Scone', 'Crepes', 'Vegan mincepie',
       'Bare Popcorn', 'Muesli', 'Crisps', 'Pi

In [7]:
breadbasket.Item.value_counts()

Coffee          5471
Bread           3325
Tea             1435
Cake            1025
Pastry           856
                ... 
Raw bars           1
Adjustment         1
Chicken sand       1
Polenta            1
Bacon              1
Name: Item, Length: 95, dtype: int64

    we need to introduce the classification system since there are 95 items

In [8]:
# Classification

beverage = ['Hot chocolate', 'Coffee', 'Tea', 'Mineral water', 'Juice', 'Coke', 'Smoothies']
other = ['NONE', 'Christmas common', 'Gift voucher', "Valentine's card", 'Tshirt', 'Afternoon with the baker', 'Postcard', 'Siblings', 'Nomad bag', 'Adjustment', 'Drinking chocolate spoons ', 'Coffee granules ']
kids = ["Ella's Kitchen Pouches", 'My-5 Fruit Shoot', 'Kids biscuit']
snacks = ['Mighty Protein', 'Pick and Mix Bowls', 'Caramel bites', 'Bare Popcorn', 'Crisps', 'Cherry me Dried fruit', 'Raw bars']
bread = ['Bread', 'Toast', 'Baguette', 'Focaccia', 'Scandinavian']
breakfast_pastry = ['Muffin', 'Pastry', 'Medialuna', 'Scone']
dessert = ['Cookies', 'Tartine', 'Fudge', 'Victorian Sponge', 'Cake', 'Alfajores', 'Brownie', 'Bread Pudding', 'Bakewell', 'Raspberry shortbread sandwich', 'Lemon and coconut', 'Crepes', 'Chocolates', 'Truffles', 'Panatone']
condiments = ['Jam', 'Dulce de Leche', 'Honey', 'Gingerbread syrup', 'Extra Salami or Feta', 'Bacon', 'Spread', 'Chimichurri Oil']
breakfast = ['Eggs', 'Frittata', 'Granola', 'Muesli', 'Duck egg', 'Brioche and salami']
lunch = ['Soup', 'Sandwich', 'Chicken sand', 'Salad', 'Chicken Stew']
other_food = [x for x in breadbasket.Item.unique() if x not in beverage 
                and x not in other and x not in kids and x not in snacks 
                and x not in bread and x not in breakfast_pastry 
                and x not in dessert and x not in condiments 
                and x not in breakfast and x not in lunch]

In [9]:
# Dummies

breadbasket['beverage'] = np.where(breadbasket.Item.isin(beverage), 1, 0)
breadbasket['other'] = np.where(breadbasket.Item.isin(other), 1, 0)
breadbasket['kids'] = np.where(breadbasket.Item.isin(kids), 1, 0)
breadbasket['snacks'] = np.where(breadbasket.Item.isin(snacks), 1, 0)
breadbasket['bread'] = np.where(breadbasket.Item.isin(bread), 1, 0)
breadbasket['breakfast_pastry'] = np.where(breadbasket.Item.isin(breakfast_pastry), 1, 0)
breadbasket['dessert'] = np.where(breadbasket.Item.isin(dessert), 1, 0)
breadbasket['condiments'] = np.where(breadbasket.Item.isin(condiments), 1, 0)
breadbasket['breakfast'] = np.where(breadbasket.Item.isin(breakfast), 1, 0)
breadbasket['lunch'] = np.where(breadbasket.Item.isin(lunch), 1, 0)
breadbasket['other_food'] = np.where(breadbasket.Item.isin(other_food), 1, 0)

In [10]:
breadbasket.head()

Unnamed: 0,Date,Time,Transaction,Item,DateTime,beverage,other,kids,snacks,bread,breakfast_pastry,dessert,condiments,breakfast,lunch,other_food
0,2016-10-30,09:58:11,1,Bread,2016-10-30 09:58:11,0,0,0,0,1,0,0,0,0,0,0
1,2016-10-30,10:05:34,2,Scandinavian,2016-10-30 10:05:34,0,0,0,0,1,0,0,0,0,0,0
2,2016-10-30,10:05:34,2,Scandinavian,2016-10-30 10:05:34,0,0,0,0,1,0,0,0,0,0,0
3,2016-10-30,10:07:57,3,Hot chocolate,2016-10-30 10:07:57,1,0,0,0,0,0,0,0,0,0,0
4,2016-10-30,10:07:57,3,Jam,2016-10-30 10:07:57,0,0,0,0,0,0,0,1,0,0,0


### Processing the Data

In [11]:
# Data aggregated by transaction

bread_group = breadbasket.groupby(['Transaction', 'DateTime']).sum()

In [12]:
bread_group

Unnamed: 0_level_0,Unnamed: 1_level_0,beverage,other,kids,snacks,bread,breakfast_pastry,dessert,condiments,breakfast,lunch,other_food
Transaction,DateTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2016-10-30 09:58:11,0,0,0,0,1,0,0,0,0,0,0
2,2016-10-30 10:05:34,0,0,0,0,2,0,0,0,0,0,0
3,2016-10-30 10:07:57,1,0,0,0,0,0,1,1,0,0,0
4,2016-10-30 10:08:41,0,0,0,0,0,1,0,0,0,0,0
5,2016-10-30 10:13:03,1,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9680,2017-04-09 14:24:03,0,0,0,0,1,0,0,0,0,0,0
9681,2017-04-09 14:30:09,1,1,0,0,0,0,1,0,0,0,1
9682,2017-04-09 14:32:58,2,0,0,0,0,1,0,0,0,0,1
9683,2017-04-09 14:57:06,1,0,0,0,0,1,0,0,0,0,0


In [13]:
# Resetting the index

bread_group.reset_index(level=['DateTime'], inplace = True)

In [14]:
# Creating a column for day of week and for hour

bread_group['hour'] = bread_group.DateTime.dt.hour
bread_group['day'] = bread_group.DateTime.dt.day_name()
bread_group.day.value_counts()

Saturday     2068
Friday       1488
Sunday       1264
Thursday     1252
Tuesday      1203
Monday       1135
Wednesday    1121
Name: day, dtype: int64

In [15]:
bread_group.hour.value_counts()

11    1445
12    1347
10    1267
13    1163
14    1130
9     1007
15     924
16     583
8      375
17     160
18      52
19      34
7       16
20      15
22       7
23       3
21       2
1        1
Name: hour, dtype: int64

    11am has the most transactions, then comes 10am

In [16]:
# Dummy variables out of the day column

bread_days = pd.get_dummies(data = bread_group, columns = ['day'])
bread_days.drop(columns = 'DateTime', inplace = True, axis = 1)

    our plan is to use k-means clustring
    usage of Principal Component Analysis(PCA):
        PCA projects our data onto a lower dimensional subspace

In [17]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 4)

principalComponents = pca.fit_transform(bread_days)
principalDf = pd.DataFrame(data = principalComponents, columns = ['pc1','pc2','pc3','pc4'])

principalDf.head()

Unnamed: 0,pc1,pc2,pc3,pc4
0,3.197413,-0.926669,0.244933,0.005982
1,2.227896,-1.274346,1.18337,-0.15923
2,2.107647,0.424881,-0.219768,0.879731
3,2.19896,-0.580674,-0.783352,-0.031002
4,2.206812,0.037386,0.443747,-0.333642


        k means clustering

In [18]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 5)
bread_clusters = kmeans.fit(principalDf)
bread_clusters.cluster_centers_

array([[  3.46747051e+00,  -5.43764991e-02,  -3.01345914e-02,
         -3.01334314e-03],
       [ -3.07553455e-01,  -2.48882808e-02,  -3.58032734e-02,
         -1.33269902e-02],
       [ -4.42371509e+00,  -8.64897886e-02,   1.99164332e-02,
         -1.07060284e-02],
       [  1.63146200e+00,   5.06637926e-02,   2.89305612e-02,
          7.19914885e-03],
       [ -2.29642498e+00,   3.66004677e-02,   1.77781698e-02,
          1.32943920e-02]])

In [19]:
# Applying the labels back to the original data

bread_days['labels'] = bread_clusters.fit_predict(principalDf)
bread_days.reset_index('Transaction',inplace = True)
bread_merged = pd.merge(breadbasket, bread_days[['Transaction','labels']],on = 'Transaction', how = 'outer')
bread_merged.head()

Unnamed: 0,Date,Time,Transaction,Item,DateTime,beverage,other,kids,snacks,bread,breakfast_pastry,dessert,condiments,breakfast,lunch,other_food,labels
0,2016-10-30,09:58:11,1,Bread,2016-10-30 09:58:11,0,0,0,0,1,0,0,0,0,0,0,3
1,2016-10-30,10:05:34,2,Scandinavian,2016-10-30 10:05:34,0,0,0,0,1,0,0,0,0,0,0,3
2,2016-10-30,10:05:34,2,Scandinavian,2016-10-30 10:05:34,0,0,0,0,1,0,0,0,0,0,0,3
3,2016-10-30,10:07:57,3,Hot chocolate,2016-10-30 10:07:57,1,0,0,0,0,0,0,0,0,0,0,3
4,2016-10-30,10:07:57,3,Jam,2016-10-30 10:07:57,0,0,0,0,0,0,0,1,0,0,0,3


In [20]:
bread_merged.labels.value_counts()

4    5296
3    4446
2    4041
1    3932
0    3578
Name: labels, dtype: int64

    the largest cluster is 5

In [21]:
# Checking if the clusters captured a different type of transaction by looking at
# the hour breakdown for each cluster

pd.crosstab(bread_days.hour, bread_days.labels)

labels,0,1,2,3,4
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,0,0,1,0
7,0,0,0,16,0
8,0,0,0,375,0
9,25,0,0,982,0
10,263,0,0,1004,0
11,334,1111,0,0,0
12,267,1080,0,0,0
13,33,0,0,0,1130
14,2,0,0,0,1128
15,0,0,924,0,0


    clusters 0,2,4 center around noon
    cluster 1 is in the early morning
    cluster 3 is an evening cluster

In [22]:
pd.crosstab(bread_group.day, bread_days.labels)

labels,0,1,2,3,4
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Friday,160,358,269,326,355
Monday,95,243,232,315,250
Saturday,224,441,411,393,506
Sunday,127,297,209,295,304
Thursday,120,287,194,387,264
Tuesday,116,293,223,300,271
Wednesday,71,225,213,329,283


In [23]:
# Let's look at the top 5 products per cluster

a = bread_merged.groupby(['labels']).Item.value_counts()
b = a.to_frame('counts').reset_index()
b.set_index('Item', inplace = True)
b.groupby('labels').counts.nlargest(5)

labels  Item         
0       Coffee           1372
        Tea               290
        Hot chocolate     177
        Bread             168
        Pastry            149
1       Bread             893
        Coffee            821
        NONE              185
        Cake              164
        Pastry            164
2       Coffee            927
        Bread             560
        Tea               382
        Cake              328
        Hot chocolate     183
3       Coffee           1163
        Bread            1027
        Pastry            389
        Medialuna         260
        Tea               198
4       Coffee           1188
        Bread             677
        Tea               402
        Sandwich          381
        Cake              302
Name: counts, dtype: int64

    Result:
        - in cluster 0 the most popular is coffee
        - in cluster 1: Bread
        - 2: Coffee
        - 3: Coffee
        - 4: Coffee