# Practical case - Groceries

In [1]:
%pylab
%matplotlib inline

%config InlineBackend.figure_format = 'retina'

import numpy as np

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


The `data/groceries.csv` file contains the transactions recorded during one month in a grocery shop. Each row lists the items of a single ticket.

With that data, and using both the Apriori and the FP-Growth algorithms, we have to:
- get the list of frequent itemsets with a minimum support of 0.15.
- get the association rules for that dataset with a minimum support of 0.05 and a minimum confidence of 0.25.

### Load dataset

In [2]:
import csv

# Read every ticket of the CSV file: each row becomes one transaction,
# i.e. a list with the item names found on that row.
# The `with` block guarantees the file handle is closed once all rows have
# been consumed, and `newline=""` lets the csv module handle line endings
# itself, as its documentation recommends.
with open("data/groceries.csv", "r", newline="") as groceries_handle:
    groceries_file = csv.reader(groceries_handle)
    groceries = list(groceries_file)

groceries

[['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'],
 ['tropical fruit', 'yogurt', 'coffee'],
 ['whole milk'],
 ['pip fruit', 'yogurt', 'cream cheese ', 'meat spreads'],
 ['other vegetables',
  'whole milk',
  'condensed milk',
  'long life bakery product'],
 ['whole milk', 'butter', 'yogurt', 'rice', 'abrasive cleaner'],
 ['rolls/buns'],
 ['other vegetables',
  'UHT-milk',
  'rolls/buns',
  'bottled beer',
  'liquor (appetizer)'],
 ['pot plants'],
 ['whole milk', 'cereals'],
 ['tropical fruit',
  'other vegetables',
  'white bread',
  'bottled water',
  'chocolate'],
 ['citrus fruit',
  'tropical fruit',
  'whole milk',
  'butter',
  'curd',
  'yogurt',
  'flour',
  'bottled water',
  'dishes'],
 ['beef'],
 ['frankfurter', 'rolls/buns', 'soda'],
 ['chicken', 'tropical fruit'],
 ['butter', 'sugar', 'fruit/vegetable juice', 'newspapers'],
 ['fruit/vegetable juice'],
 ['packaged fruit/vegetables'],
 ['chocolate'],
 ['specialty bar'],
 ['other vegetables'],
 ['butter milk

### Apriori algorithm 

In [3]:
import apriori

#### Get the elements list with a minimum support of 0.15.

In [4]:
# Run Apriori over all the tickets with min_support set to 0.15
# (verbose=True asks the module to report what it finds):
F, support = apriori.apriori(
    groceries,
    min_support=0.15,
    verbose=True,
)

{soda}:  sup = 0.174
{rolls/buns}:  sup = 0.184
{other vegetables}:  sup = 0.193
{whole milk}:  sup = 0.256


#### Get the association rules with that dataset with a minimum support of 0.05 and confidence of 0.25.

In [5]:
# Run Apriori again, this time with min_support lowered to 0.05;
# F and support are the inputs of the rule generation step:
F, support = apriori.apriori(
    groceries,
    min_support=0.05,
    verbose=True,
)

{domestic eggs}:  sup = 0.063
{whipped/sour cream}:  sup = 0.072
{pork}:  sup = 0.058
{napkins}:  sup = 0.052
{shopping bags}:  sup = 0.099
{brown bread}:  sup = 0.065
{sausage}:  sup = 0.094
{canned beer}:  sup = 0.078
{root vegetables}:  sup = 0.109
{pastry}:  sup = 0.089
{newspapers}:  sup = 0.08
{fruit/vegetable juice}:  sup = 0.072
{soda}:  sup = 0.174
{frankfurter}:  sup = 0.059
{beef}:  sup = 0.052
{curd}:  sup = 0.053
{bottled water}:  sup = 0.111
{bottled beer}:  sup = 0.081
{rolls/buns}:  sup = 0.184
{butter}:  sup = 0.055
{other vegetables}:  sup = 0.193
{pip fruit}:  sup = 0.076
{whole milk}:  sup = 0.256
{yogurt}:  sup = 0.14
{tropical fruit}:  sup = 0.105
{coffee}:  sup = 0.058
{margarine}:  sup = 0.059
{citrus fruit}:  sup = 0.083
{whole milk, rolls/buns}:  sup = 0.057
{whole milk, yogurt}:  sup = 0.056
{whole milk, other vegetables}:  sup = 0.075


In [6]:
# Build the association rules from the itemsets and supports computed above,
# using 0.25 as the min_confidence value:
H = apriori.generate_rules(
    F,
    support,
    min_confidence=0.25,
    verbose=True,
)

{rolls/buns} ---> {whole milk}:  conf = 0.308, sup = 0.057
{yogurt} ---> {whole milk}:  conf = 0.402, sup = 0.056
{other vegetables} ---> {whole milk}:  conf = 0.387, sup = 0.075
{whole milk} ---> {other vegetables}:  conf = 0.293, sup = 0.075


### FP-Growth algorithm 

In [7]:
import pyfpgrowth

#### Get the elements list with a minimum support of 0.15.

In this case, FP-Growth has to be given the minimum number of appearances in the dataset (an absolute count) instead of a relative support, so we have to calculate it first:

In [8]:
import math

# Smallest whole number of tickets that amounts to at least 15 % of them all.
num_tickets = len(groceries)
support_threshold = math.ceil(0.15 * num_tickets)
support_threshold

1476

In [9]:
# Run FP-Growth, passing the support threshold as an absolute ticket count
patterns = pyfpgrowth.find_frequent_patterns(
    groceries,
    support_threshold=support_threshold,
)
patterns

{('other vegetables',): 1903,
 ('rolls/buns',): 1809,
 ('soda',): 1715,
 ('whole milk',): 2513}

#### Get the association rules with that dataset with a minimum support of 0.05 and confidence of 0.25.

In [10]:
import math

# Smallest whole number of tickets that amounts to at least 5 % of them all.
num_tickets = len(groceries)
support_threshold = math.ceil(0.05 * num_tickets)
support_threshold

492

In [11]:
# Run FP-Growth again with the lower absolute support threshold
patterns = pyfpgrowth.find_frequent_patterns(
    groceries,
    support_threshold=support_threshold,
)
patterns

{('beef',): 516,
 ('bottled beer',): 792,
 ('bottled water',): 1087,
 ('brown bread',): 638,
 ('butter',): 545,
 ('canned beer',): 764,
 ('citrus fruit',): 814,
 ('coffee',): 571,
 ('curd',): 524,
 ('domestic eggs',): 624,
 ('frankfurter',): 580,
 ('fruit/vegetable juice',): 711,
 ('margarine',): 576,
 ('napkins',): 515,
 ('newspapers',): 785,
 ('other vegetables',): 1903,
 ('other vegetables', 'whole milk'): 736,
 ('pastry',): 875,
 ('pip fruit',): 744,
 ('pork',): 567,
 ('rolls/buns',): 1809,
 ('rolls/buns', 'whole milk'): 557,
 ('root vegetables',): 1072,
 ('sausage',): 924,
 ('shopping bags',): 969,
 ('soda',): 1715,
 ('tropical fruit',): 1032,
 ('whipped/sour cream',): 705,
 ('whole milk',): 2513,
 ('whole milk', 'yogurt'): 551,
 ('yogurt',): 1372}

In [12]:
# Build the association rules from the mined patterns,
# using 0.25 as the confidence threshold:
rules = pyfpgrowth.generate_association_rules(
    patterns,
    confidence_threshold=0.25,
)
rules

{('other vegetables',): (('whole milk',), 0.38675775091960063),
 ('rolls/buns',): (('whole milk',), 0.30790491984521834),
 ('whole milk',): (('other vegetables',), 0.29287703939514526),
 ('yogurt',): (('whole milk',), 0.40160349854227406)}