# Association rules with apriori frequent itemset mining

In [1]:
# Import libs

from apriori_algorithm import apriori

In [2]:
# Toy market-basket data: every inner list is one transaction
# (the items bought together in a single purchase).
itemSetList = [['Beer', 'Nuts', 'Diaper'],                      # transaction 1
               ['Beer', 'Coffee', 'Diaper'],                    # transaction 2
               ['Beer', 'Diaper', 'Eggs'],                      # transaction 3
               ['Nuts', 'Eggs', 'Milk'],                        # transaction 4
               ['Nuts', 'Coffee', 'Diaper', 'Eggs', 'Milk']]    # transaction 5

In [3]:
# Mine the frequent itemsets and the association rules from the transactions,
# using minimum support = 0.5 and minimum confidence = 0.5.
# NOTE: this cell needs `apriori` to be the function imported from
# apriori_algorithm; the mlxtend import further down rebinds the same name,
# so re-running this cell after that import would call the mlxtend function.
freqItemSet, rules = apriori(
    itemSetList,
    minSup=0.5,
    minConf=0.5,
)

In [4]:
# Show the frequent itemsets, one output line per dictionary key
# (judging by the printed result the key looks like the itemset size - TODO confirm).
for size, itemsets in freqItemSet.items():
    # Convert each itemset to a plain list so it prints as ['a', 'b']
    as_lists = [list(itemset) for itemset in itemsets]
    print(f'{size}: {as_lists}')

1: [['Diaper'], ['Beer'], ['Nuts'], ['Eggs']]
2: [['Beer', 'Diaper']]


In [5]:
# Show each mined rule as "antecedent --> consequent" with its confidence.
# Only the first three fields of a rule are read.
for rule in rules:
    antecedent, consequent, confidence = rule[0], rule[1], rule[2]
    print(f'{antecedent} --> {consequent} [conf: {confidence}]')

{'Diaper'} --> {'Beer'} [conf: 0.75]
{'Beer'} --> {'Diaper'} [conf: 1.0]


# Association rule mining with mlxtend library


In [6]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

In [7]:
# Five example shopping baskets; each inner list is one transaction.
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    # 'Onion' is listed twice in this basket, exactly as in the source data
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

# Encode the list-of-lists transactions into a table (one column per item)
# that the mlxtend apriori implementation can consume.
# NOTE: `apriori` in this cell is the mlxtend function imported just above,
# not the apriori_algorithm function used at the start of the notebook.
te = TransactionEncoder()
te_ary = te.fit_transform(dataset)
df = pd.DataFrame(data=te_ary, columns=te.columns_)

# Keep the itemsets whose support is at least 0.6 and label them with
# the item names rather than column indices.
frequent_itemsets = apriori(
    df,
    min_support=0.6,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.6,(Milk)
3,0.6,(Onion)
4,0.6,(Yogurt)
5,0.8,"(Kidney Beans, Eggs)"
6,0.6,"(Onion, Eggs)"
7,0.6,"(Milk, Kidney Beans)"
8,0.6,"(Kidney Beans, Onion)"
9,0.6,"(Kidney Beans, Yogurt)"


In [8]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence is at least 0.7.
assoc_rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,
)

# Display only the first seven columns of the rules table
assoc_rules.iloc[:, :7]


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(Kidney Beans),(Eggs),1.0,0.8,0.8,0.8,1.0
1,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0
2,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25
3,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25
4,(Milk),(Kidney Beans),0.6,1.0,0.6,1.0,1.0
5,(Onion),(Kidney Beans),0.6,1.0,0.6,1.0,1.0
6,(Yogurt),(Kidney Beans),0.6,1.0,0.6,1.0,1.0
7,"(Kidney Beans, Onion)",(Eggs),0.6,0.8,0.6,1.0,1.25
8,"(Kidney Beans, Eggs)",(Onion),0.8,0.6,0.6,0.75,1.25
9,"(Onion, Eggs)",(Kidney Beans),0.6,1.0,0.6,1.0,1.0


# The cereal and basketball example from the slides

In [16]:
# Generate the data (5000 transactions in total - see the slides of the lecture).
# Every transaction holds two items: whether the person plays basketball and
# whether they eat cereal. The item labels keep the spelling 'cerial' so that
# they match the labels shown in the result tables of this notebook.

# (transaction, number of identical cases) - one row per combination
case_counts = [
    (['basketball', 'cerial'], 2000),          # play basketball and eat cereal
    (['basketball', 'not_cerial'], 1000),      # play basketball and not eat cereal
    (['not_basketball', 'cerial'], 1750),      # not basketball and eat cereal
    (['not_basketball', 'not_cerial'], 250),   # not basketball and not cereal
]

# Expand the table into the flat transaction list. list(items) makes a fresh
# copy per case, so the transactions are independent list objects.
data = [list(items) for items, count in case_counts for _ in range(count)]

# Should be 5000 cases in total - fail loudly if the counts above are edited
# inconsistently instead of silently mining the wrong data set.
EXPECTED_TOTAL_CASES = 5000
assert len(data) == EXPECTED_TOTAL_CASES, (
    f'expected {EXPECTED_TOTAL_CASES} transactions, got {len(data)}'
)
print(f'Total cases: {len(data)}')

# The data look like this:
# [
#   ['basketball', 'cerial'],
#   ['basketball', 'cerial'],
# ...
#   ['basketball', 'not_cerial']
# ...
#   ['not_basketball', 'cerial']
# ...
#   ['not_basketball', 'not_cerial']
# ]

Total cases: 5000


In [21]:
# Encode the generated basketball/cereal transactions into a table
# (one column per item) for the mlxtend apriori implementation.
te = TransactionEncoder()
te_ary = te.fit_transform(data)
df = pd.DataFrame(data=te_ary, columns=te.columns_)

# Keep the itemsets whose support is at least 0.2, labelled by item name.
frequent_itemsets = apriori(
    df,
    min_support=0.2,
    use_colnames=True,
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.6,(basketball)
1,0.75,(cerial)
2,0.4,(not_basketball)
3,0.25,(not_cerial)
4,0.4,"(basketball, cerial)"
5,0.2,"(basketball, not_cerial)"
6,0.35,"(cerial, not_basketball)"


In [24]:
# Take a look at the example rules of the lecture's slide
# (values in brackets are support and confidence):
#   basketball -> cerial      [40%, 66.7%] (Lift: 0.89)
#   basketball -> not_cerial  [20%, 33.3%] (Lift: 1.33)
# Both have a confidence above the 0.3 threshold used here, so both appear
# in the resulting table.
assoc_rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.3,
)

# Display only the first seven columns of the rules table
assoc_rules.iloc[:, :7]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(basketball),(cerial),0.6,0.75,0.4,0.666667,0.888889
1,(cerial),(basketball),0.75,0.6,0.4,0.533333,0.888889
2,(basketball),(not_cerial),0.6,0.25,0.2,0.333333,1.333333
3,(not_cerial),(basketball),0.25,0.6,0.2,0.8,1.333333
4,(cerial),(not_basketball),0.75,0.4,0.35,0.466667,1.166667
5,(not_basketball),(cerial),0.4,0.75,0.35,0.875,1.166667
