# Association Analysis: Frequent Itemsets

In [None]:
#Uncomment to install mlxtend into the running kernel's environment
#%pip install mlxtend

In [1]:
#Example taken from http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
#Create transaction data frame
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

#Five example shopping baskets, one inner list per transaction
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

#One-hot-encode the baskets (a requirement for this apriori implementation):
#first let the encoder learn the item names, then turn every basket into a True/False row
te = TransactionEncoder()
te.fit(dataset)
te_ary = te.transform(dataset)
print(te_ary)

[[False False False  True False  True  True  True  True False  True]
 [False False  True  True False  True False  True  True False  True]
 [ True False False  True False  True  True False False False False]
 [False  True False False False  True  True False False  True  True]
 [False  True False  True  True  True False False  True False False]]


In [13]:
#Wrap the one-hot-encoded matrix in a data frame whose columns carry the encoder's item names
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


In [14]:
#Alternative A: Apriori
from mlxtend.frequent_patterns import apriori

#Mine the itemsets whose support is at least 0.8;
#use_colnames=True reports item names rather than column indices
apriori(
    df,
    min_support=0.8,
    use_colnames=True,
)

Unnamed: 0,support,itemsets
0,0.8,(Eggs)
1,1.0,(Kidney Beans)
2,0.8,"(Eggs, Kidney Beans)"


In [15]:
#Alternative B: FP-Growth
from mlxtend.frequent_patterns import fpgrowth

#Same data frame and support threshold as the Apriori call, mined with FP-Growth
fpgrowth(
    df,
    min_support=0.8,
    use_colnames=True,
)

Unnamed: 0,support,itemsets
0,1.0,(Kidney Beans)
1,0.8,(Eggs)
2,0.8,"(Eggs, Kidney Beans)"


In [16]:
#With use_colnames=False the itemsets hold column positions of df instead of item names
apriori(df, use_colnames=False, min_support=0.6)

Unnamed: 0,support,itemsets
0,0.8,(3)
1,1.0,(5)
2,0.6,(6)
3,0.6,(8)
4,0.6,(10)
5,0.8,"(3, 5)"
6,0.6,"(8, 3)"
7,0.6,"(5, 6)"
8,0.6,"(8, 5)"
9,0.6,"(10, 5)"


In [17]:
#Filter by number of items and support
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
#Record how many items each itemset contains
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

#Itemsets of length 2 that have a support of at least 60 percent
#(apriori already ran with min_support=0.6, so the support condition keeps every row;
# raise the threshold, e.g. to 0.8, to narrow the selection further)
frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.6) ]

Unnamed: 0,support,itemsets,length
5,0.8,"(Eggs, Kidney Beans)",2
6,0.6,"(Eggs, Onion)",2
7,0.6,"(Kidney Beans, Milk)",2
8,0.6,"(Kidney Beans, Onion)",2
9,0.6,"(Yogurt, Kidney Beans)",2


In [18]:
#Keep only the rows whose itemset is exactly {Eggs, Onion}; sets compare regardless of element order
frequent_itemsets.loc[ frequent_itemsets['itemsets'] == {'Eggs', 'Onion'} ]

Unnamed: 0,support,itemsets,length
6,0.6,"(Eggs, Onion)",2


In [19]:
#Order the itemsets from lowest to highest support
frequent_itemsets.sort_values('support', ascending=True)

Unnamed: 0,support,itemsets,length
2,0.6,(Milk),1
3,0.6,(Onion),1
4,0.6,(Yogurt),1
6,0.6,"(Eggs, Onion)",2
7,0.6,"(Kidney Beans, Milk)",2
8,0.6,"(Kidney Beans, Onion)",2
9,0.6,"(Yogurt, Kidney Beans)",2
10,0.6,"(Eggs, Kidney Beans, Onion)",3
0,0.8,(Eggs),1
5,0.8,"(Eggs, Kidney Beans)",2
