This example is based on the `mlxtend` library's documentation page ([link](https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/)).

### Import Packages

In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from itertools import combinations

### Data

Let's create a toy data set for this exercise.

In [2]:
# Five toy market-basket transactions; each inner list is one order.
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    # Note: 'Onion' is listed twice in this last order.
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

### Prepare (transform) data

Binary representation

In [21]:
# Let the encoder learn the set of items from the transactions ...
te = TransactionEncoder()
te.fit(dataset)

# ... then encode every transaction as one boolean row (one column per item).
te_ary = te.transform(dataset)

# Preview at most the first ten encoded transactions.
te_ary[:10]

array([[False, False, False,  True, False,  True,  True,  True,  True,
        False,  True],
       [False, False,  True,  True, False,  True, False,  True,  True,
        False,  True],
       [ True, False, False,  True, False,  True,  True, False, False,
        False, False],
       [False,  True, False, False, False,  True,  True, False, False,
         True,  True],
       [False,  True, False,  True,  True,  True, False, False,  True,
        False, False]])

In [22]:
# Show the same matrix as 0/1 integers; the result is only displayed, te_ary is not reassigned.
te_ary.astype("int")

array([[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1],
       [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
       [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0]])

In [24]:
# Wrap the boolean matrix in a DataFrame, labelling the columns with the
# item names the encoder learned.
item_labels = te.columns_
df = pd.DataFrame(data=te_ary, columns=item_labels)

df.head()

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


In [27]:
# Build a reduced frame without these four item columns; `df` itself is left unchanged.
dropped_items = ["Unicorn", "Ice cream", "Dill", "Apple"]
df2 = df.drop(columns=dropped_items)

In [29]:
# Display the reduced frame.
df2

Unnamed: 0,Corn,Eggs,Kidney Beans,Milk,Nutmeg,Onion,Yogurt
0,False,True,True,True,True,True,True
1,False,True,True,False,True,True,True
2,False,True,True,True,False,False,False
3,True,False,True,True,False,False,True
4,True,True,True,False,False,True,False


In [5]:
# Number of transactions (rows); used as the denominator when computing support below.
len(df)

5

### `Support`

Calculate `support` for each individual product.

In [30]:
# Support of an item = number of transactions containing it / number of
# transactions. Computed on the reduced frame and listed from highest to lowest.
n_rows_reduced = len(df2)
item_supports2 = df2.sum().div(n_rows_reduced).sort_values(ascending=False)

print(item_supports2)

Kidney Beans    1.0
Eggs            0.8
Yogurt          0.6
Onion           0.6
Milk            0.6
Nutmeg          0.4
Corn            0.4
dtype: float64


In [25]:
# Support of an item = number of transactions containing it / number of
# transactions. Computed on the full frame and listed from highest to lowest.
n_rows = len(df)
item_supports = df.sum().div(n_rows).sort_values(ascending=False)

print(item_supports)

Kidney Beans    1.0
Eggs            0.8
Yogurt          0.6
Onion           0.6
Milk            0.6
Nutmeg          0.4
Corn            0.4
Unicorn         0.2
Ice cream       0.2
Dill            0.2
Apple           0.2
dtype: float64


Extract all two-way combinations.

In [26]:
# Every unordered pair of items from the first transaction (its 6 items give 15 pairs).
list(combinations(dataset[0], 2))

[('Milk', 'Onion'),
 ('Milk', 'Nutmeg'),
 ('Milk', 'Kidney Beans'),
 ('Milk', 'Eggs'),
 ('Milk', 'Yogurt'),
 ('Onion', 'Nutmeg'),
 ('Onion', 'Kidney Beans'),
 ('Onion', 'Eggs'),
 ('Onion', 'Yogurt'),
 ('Nutmeg', 'Kidney Beans'),
 ('Nutmeg', 'Eggs'),
 ('Nutmeg', 'Yogurt'),
 ('Kidney Beans', 'Eggs'),
 ('Kidney Beans', 'Yogurt'),
 ('Eggs', 'Yogurt')]

Calculate `support` for each pair of items.

In [45]:
# Gather the item pairs of every transaction into one flat list.
# set(order) removes items repeated within a single order, so the same pair
# is not produced twice for that order.
item_pairs = [
    pair
    for order in dataset
    for pair in combinations(set(order), 2)
]

len(item_pairs)

56

In [42]:
import numpy as np

# Use a seeded generator so this random toy matrix (and anything derived from
# it) is identical on every Restart & Run All.
rng = np.random.default_rng(42)

# 8 x 8 matrix of integers drawn from {0, 1, 2, 3, 4}.
randomset = rng.integers(5, size=(8, 8))

In [43]:
# Same pair extraction as for the grocery transactions, applied to the rows of
# the random matrix. The result gets its own name: assigning it to `item_pairs`
# would overwrite the grocery pairs that the following cells display and count
# when the notebook is run top to bottom.
random_item_pairs = [
    pair
    for row in randomset
    for pair in combinations(set(row), 2)
]

len(random_item_pairs)

61

In [46]:
# Inspect the collected pairs; a pair appears once for every transaction that contains it.
item_pairs

[('Eggs', 'Yogurt'),
 ('Eggs', 'Onion'),
 ('Eggs', 'Nutmeg'),
 ('Eggs', 'Milk'),
 ('Eggs', 'Kidney Beans'),
 ('Yogurt', 'Onion'),
 ('Yogurt', 'Nutmeg'),
 ('Yogurt', 'Milk'),
 ('Yogurt', 'Kidney Beans'),
 ('Onion', 'Nutmeg'),
 ('Onion', 'Milk'),
 ('Onion', 'Kidney Beans'),
 ('Nutmeg', 'Milk'),
 ('Nutmeg', 'Kidney Beans'),
 ('Milk', 'Kidney Beans'),
 ('Eggs', 'Yogurt'),
 ('Eggs', 'Dill'),
 ('Eggs', 'Onion'),
 ('Eggs', 'Nutmeg'),
 ('Eggs', 'Kidney Beans'),
 ('Yogurt', 'Dill'),
 ('Yogurt', 'Onion'),
 ('Yogurt', 'Nutmeg'),
 ('Yogurt', 'Kidney Beans'),
 ('Dill', 'Onion'),
 ('Dill', 'Nutmeg'),
 ('Dill', 'Kidney Beans'),
 ('Onion', 'Nutmeg'),
 ('Onion', 'Kidney Beans'),
 ('Nutmeg', 'Kidney Beans'),
 ('Eggs', 'Apple'),
 ('Eggs', 'Milk'),
 ('Eggs', 'Kidney Beans'),
 ('Apple', 'Milk'),
 ('Apple', 'Kidney Beans'),
 ('Milk', 'Kidney Beans'),
 ('Yogurt', 'Unicorn'),
 ('Yogurt', 'Milk'),
 ('Yogurt', 'Corn'),
 ('Yogurt', 'Kidney Beans'),
 ('Unicorn', 'Milk'),
 ('Unicorn', 'Corn'),
 ('Unicorn', 'Kidney

Count how frequent each item-pair is.

In [47]:
from collections import Counter

# Sort the two items of each pair so that (a, b) and (b, a) are counted under the same key.
Counter(tuple(sorted(elem)) for elem in item_pairs)

Counter({('Eggs', 'Yogurt'): 2,
         ('Eggs', 'Onion'): 3,
         ('Eggs', 'Nutmeg'): 2,
         ('Eggs', 'Milk'): 2,
         ('Eggs', 'Kidney Beans'): 4,
         ('Onion', 'Yogurt'): 2,
         ('Nutmeg', 'Yogurt'): 2,
         ('Milk', 'Yogurt'): 2,
         ('Kidney Beans', 'Yogurt'): 3,
         ('Nutmeg', 'Onion'): 2,
         ('Milk', 'Onion'): 1,
         ('Kidney Beans', 'Onion'): 3,
         ('Milk', 'Nutmeg'): 1,
         ('Kidney Beans', 'Nutmeg'): 2,
         ('Kidney Beans', 'Milk'): 3,
         ('Dill', 'Eggs'): 1,
         ('Dill', 'Yogurt'): 1,
         ('Dill', 'Onion'): 1,
         ('Dill', 'Nutmeg'): 1,
         ('Dill', 'Kidney Beans'): 1,
         ('Apple', 'Eggs'): 1,
         ('Apple', 'Milk'): 1,
         ('Apple', 'Kidney Beans'): 1,
         ('Unicorn', 'Yogurt'): 1,
         ('Corn', 'Yogurt'): 1,
         ('Milk', 'Unicorn'): 1,
         ('Corn', 'Unicorn'): 1,
         ('Kidney Beans', 'Unicorn'): 1,
         ('Corn', 'Milk'): 1,
         ('Corn',

Let's save the counts in a `Counter` and sort them in descending order of frequency.

In [52]:
# Build the pair counter in this cell so it also runs on a fresh kernel
# (the name was previously first assigned only in a later cell).
item_pair_ct = Counter(tuple(sorted(elem)) for elem in item_pairs)

type(item_pair_ct)

collections.Counter

In [53]:
# Count every pair under an order-independent key (items sorted alphabetically).
item_pair_ct = Counter(tuple(sorted(pair)) for pair in item_pairs)

# (pair, count) tuples, most frequent first; equal counts keep first-seen order.
item_pair_ct.most_common()

[(('Eggs', 'Kidney Beans'), 4),
 (('Eggs', 'Onion'), 3),
 (('Kidney Beans', 'Yogurt'), 3),
 (('Kidney Beans', 'Onion'), 3),
 (('Kidney Beans', 'Milk'), 3),
 (('Eggs', 'Yogurt'), 2),
 (('Eggs', 'Nutmeg'), 2),
 (('Eggs', 'Milk'), 2),
 (('Onion', 'Yogurt'), 2),
 (('Nutmeg', 'Yogurt'), 2),
 (('Milk', 'Yogurt'), 2),
 (('Nutmeg', 'Onion'), 2),
 (('Kidney Beans', 'Nutmeg'), 2),
 (('Corn', 'Kidney Beans'), 2),
 (('Milk', 'Onion'), 1),
 (('Milk', 'Nutmeg'), 1),
 (('Dill', 'Eggs'), 1),
 (('Dill', 'Yogurt'), 1),
 (('Dill', 'Onion'), 1),
 (('Dill', 'Nutmeg'), 1),
 (('Dill', 'Kidney Beans'), 1),
 (('Apple', 'Eggs'), 1),
 (('Apple', 'Milk'), 1),
 (('Apple', 'Kidney Beans'), 1),
 (('Unicorn', 'Yogurt'), 1),
 (('Corn', 'Yogurt'), 1),
 (('Milk', 'Unicorn'), 1),
 (('Corn', 'Unicorn'), 1),
 (('Kidney Beans', 'Unicorn'), 1),
 (('Corn', 'Milk'), 1),
 (('Eggs', 'Ice cream'), 1),
 (('Corn', 'Eggs'), 1),
 (('Ice cream', 'Onion'), 1),
 (('Corn', 'Ice cream'), 1),
 (('Ice cream', 'Kidney Beans'), 1),
 (('Corn',

In [54]:
# (pair, count) tuples ordered from most to least frequent.
item_pair_ct_sorted = item_pair_ct.most_common()

# Convert each count into a support value: the fraction of all transactions
# that contain the pair. Dict insertion order keeps the descending ordering.
n_transactions = len(dataset)
item_pair_pct_sorted = {
    pair: count / n_transactions
    for pair, count in item_pair_ct_sorted
}

print(item_pair_pct_sorted)

{('Eggs', 'Kidney Beans'): 0.8, ('Eggs', 'Onion'): 0.6, ('Kidney Beans', 'Yogurt'): 0.6, ('Kidney Beans', 'Onion'): 0.6, ('Kidney Beans', 'Milk'): 0.6, ('Eggs', 'Yogurt'): 0.4, ('Eggs', 'Nutmeg'): 0.4, ('Eggs', 'Milk'): 0.4, ('Onion', 'Yogurt'): 0.4, ('Nutmeg', 'Yogurt'): 0.4, ('Milk', 'Yogurt'): 0.4, ('Nutmeg', 'Onion'): 0.4, ('Kidney Beans', 'Nutmeg'): 0.4, ('Corn', 'Kidney Beans'): 0.4, ('Milk', 'Onion'): 0.2, ('Milk', 'Nutmeg'): 0.2, ('Dill', 'Eggs'): 0.2, ('Dill', 'Yogurt'): 0.2, ('Dill', 'Onion'): 0.2, ('Dill', 'Nutmeg'): 0.2, ('Dill', 'Kidney Beans'): 0.2, ('Apple', 'Eggs'): 0.2, ('Apple', 'Milk'): 0.2, ('Apple', 'Kidney Beans'): 0.2, ('Unicorn', 'Yogurt'): 0.2, ('Corn', 'Yogurt'): 0.2, ('Milk', 'Unicorn'): 0.2, ('Corn', 'Unicorn'): 0.2, ('Kidney Beans', 'Unicorn'): 0.2, ('Corn', 'Milk'): 0.2, ('Eggs', 'Ice cream'): 0.2, ('Corn', 'Eggs'): 0.2, ('Ice cream', 'Onion'): 0.2, ('Corn', 'Ice cream'): 0.2, ('Ice cream', 'Kidney Beans'): 0.2, ('Corn', 'Onion'): 0.2}


### `Support` Filter

In [55]:
# Support threshold used for the rest of this exercise.
min_support = 0.6

# Keep only the individual items whose support reaches the threshold.
meets_min_support = item_supports >= min_support
item_supports[meets_min_support]

Kidney Beans    1.0
Eggs            0.8
Yogurt          0.6
Onion           0.6
Milk            0.6
dtype: float64

In [56]:
# Select the item pairs whose support reaches the threshold, then print each
# one together with its support.
frequent_pairs = [
    (pair, support)
    for pair, support in item_pair_pct_sorted.items()
    if support >= min_support
]

for pair, support in frequent_pairs:
    print(pair, support)

('Eggs', 'Kidney Beans') 0.8
('Eggs', 'Onion') 0.6
('Kidney Beans', 'Yogurt') 0.6
('Kidney Beans', 'Onion') 0.6
('Kidney Beans', 'Milk') 0.6


### Using `mlxtend`

In [57]:
from mlxtend.frequent_patterns import apriori

# Let mlxtend find the frequent itemsets on the full boolean frame, using the
# same 0.6 support threshold as the manual filter; use_colnames=True reports
# the itemsets by item name (see the `itemsets` column of the output).
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Eggs, Kidney Beans)"
6,0.6,"(Eggs, Onion)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Yogurt, Kidney Beans)"


In [58]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets, selecting on the
# confidence metric with a threshold of 0.7. The result is only displayed.
association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf
1,(Kidney Beans),(Eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0
2,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
3,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
4,(Milk),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
5,(Onion),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
6,(Yogurt),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
7,"(Eggs, Onion)",(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
8,"(Eggs, Kidney Beans)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
9,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf


If you are interested in rules according to a different metric of interest, you can simply adjust the `metric` and `min_threshold` arguments. For example, if you are only interested in rules that have a lift score of >= 1.2, you would do the following:

In [59]:
# Same call as before, but rules are now selected by lift (threshold 1.2)
# and kept in `rules` for further filtering.
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.2)

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
1,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
2,"(Eggs, Kidney Beans)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6
3,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
4,(Eggs),"(Onion, Kidney Beans)",0.8,0.6,0.6,0.75,1.25,0.12,1.6
5,(Onion),"(Eggs, Kidney Beans)",0.6,0.8,0.6,1.0,1.25,0.12,inf


Pandas DataFrames make it easy to filter the results further. Let's say we are only interested in rules that satisfy the following criteria:

1. at least 2 antecedents
2. a confidence > 0.75
3. a lift score > 1.2

We could compute the antecedent length as follows:

In [62]:
# Peek at the antecedents column: each entry holds the item(s) on the left-hand side of a rule.
rules['antecedents']

0                   (Eggs)
1                  (Onion)
2     (Eggs, Kidney Beans)
3    (Onion, Kidney Beans)
4                   (Eggs)
5                  (Onion)
Name: antecedents, dtype: object

In [60]:
# Add a column with the number of items in each rule's antecedent.
antecedent_sizes = rules['antecedents'].apply(len)
rules['antecedent_len'] = antecedent_sizes

rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
0,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1
1,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,1
2,"(Eggs, Kidney Beans)",(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,2
3,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,2
4,(Eggs),"(Onion, Kidney Beans)",0.8,0.6,0.6,0.75,1.25,0.12,1.6,1
5,(Onion),"(Eggs, Kidney Beans)",0.6,0.8,0.6,1.0,1.25,0.12,inf,1


Then, we can use pandas' selection syntax as shown below:

In [63]:
# One boolean mask per criterion, combined with & to select the matching rules.
has_two_or_more_antecedents = rules['antecedent_len'] >= 2
confidence_above_075 = rules['confidence'] > 0.75
lift_above_12 = rules['lift'] > 1.2

rules[has_two_or_more_antecedents & confidence_above_075 & lift_above_12]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len
3,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,2
