In [1]:
with open('store_data.csv') as f:
    records = []
    for line in f:
        records.append(line.strip().split(','))

# Association Rule Mining

This exercise sheet covers the following concepts.
- Finding frequent item sets
- Picking thresholds for the creation of good rules

## Libraries and Data

The first part of the exercise is about association rule mining. In Python, you can use the ```mlxtend``` library for the mining of association rules. 

We use data about [store baskets](https://user.informatik.uni-goettingen.de/~sherbold/store_data.csv) in this exercise. You can use the following code to load the data. The code creates a list of records, where each record is a list of the items that are part of the transaction.

In [2]:
inst_all= []
for el in records:
    for x in el:
        inst_all.append(x)
#print(inst_all)

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


#dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
#           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
#           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

dataset=  records

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
print(df.shape)
df.head(3)

(7501, 120)


Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.020397,(almonds)
1,0.033329,(avocado)
2,0.010799,(barbecue sauce)
3,0.014265,(black tea)
4,0.011465,(body spray)
...,...,...
252,0.011065,"(ground beef, mineral water, milk)"
253,0.017064,"(ground beef, spaghetti, mineral water)"
254,0.015731,"(spaghetti, mineral water, milk)"
255,0.010265,"(olive oil, spaghetti, mineral water)"


## Mining rules from the frequent itemsets

Determine good rules from the results for this data. Use lift and confidence as metrics for your evaluations. 

In [14]:
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)

In [15]:
7501*0.01

75.01

In [16]:
from ipywidgets import *
dfx =[]
def func(f):
    global dfxx
    dfx=association_rules(frequent_itemsets, metric="lift", min_threshold=f)
    dfxx = dfx.iloc[[index for  index,  setx in enumerate(dfx['antecedents']) if 'milk' in setx]] # antecends must contain spagetti
    #dfxx=dfxx[dfxx['antecedents'].apply(lambda x: len(x)) == 1 ] #access only values, where antecends have two entries
    
    display(dfxx)
interact(func,f=FloatSlider(1.1,min=0, max=3, step=0.1,continuous_update=False)); 


interactive(children=(FloatSlider(value=1.1, continuous_update=False, description='f', max=3.0), Output()), _d…

In [17]:
#dfxx = dfx.iloc[[index for  index,  setx in enumerate(dfx['antecedents']) if 'spaghetti' in setx]] # antecends must contain spagetti
#dfxx[dfxx['antecedents'].apply(lambda x: len(x)) == 2 ] #access only values, where antecends have two entries

In [18]:
dfxx["consequents"].iloc[1]

frozenset({'cake'})

In [20]:
#dfxx = dfx.iloc[[index for  index,  setx in enumerate(dfx['antecedents']) if 'milk' in setx]] # antecends must contain spagetti
dfxx= dfxx[dfxx["antecedents"] == frozenset({'milk'})]
dfxx.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
17,(milk),(burgers),0.129583,0.087188,0.017864,0.13786,1.581175,0.006566,1.058774
35,(milk),(cake),0.129583,0.081056,0.013332,0.102881,1.269256,0.002828,1.024328
53,(milk),(chicken),0.129583,0.059992,0.014798,0.114198,1.903546,0.007024,1.061194
77,(milk),(chocolate),0.129583,0.163845,0.032129,0.247942,1.513276,0.010898,1.111823
103,(milk),(cooking oil),0.129583,0.05106,0.011465,0.088477,1.732817,0.004849,1.041049


## Validation of the rules

Randomly split your records into two sets with roughly 50% of data each. Now use the Apriori algorithm to determine rules on both of these sets. Do you find similar rules on both sets? What does the similarity/the differences indicate?

In [24]:
#splitting_idea1
df1= df[0:int(len(df)/2)]
df2= df.drop(df1.index)

In [28]:
df1 = df.sample(frac=0.5, replace=False)
df2 = df.drop(df1.index)
# df2 = df1.merge(df, indicator=True, how='outer')
print(len(df1), len(df2))
display(pd.merge(df1,df2, how='inner'))

3750 3751


Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31941,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
31942,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
31943,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
31944,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [29]:
frequent_itemsets1=apriori(df1, min_support=0.005, use_colnames=True)
frequent_itemsets2=apriori(df2, min_support=0.005, use_colnames=True)

association_rules1=association_rules(frequent_itemsets1, metric="lift", min_threshold=2.34)
association_rules2=association_rules(frequent_itemsets2, metric="lift", min_threshold=2.34)
similarity= pd.merge(association_rules1, association_rules2, how='inner', on=['antecedents','consequents'])
print(len(association_rules1), len(association_rules2))
print(len(similarity))
similarity.sort_values(by=['confidence_x'], inplace=False)

304 466
120


Unnamed: 0,antecedents,consequents,antecedent support_x,consequent support_x,support_x,confidence_x,lift_x,leverage_x,conviction_x,antecedent support_y,consequent support_y,support_y,confidence_y,lift_y,leverage_y,conviction_y
66,(spaghetti),"(olive oil, frozen vegetables)",0.168267,0.009867,0.005067,0.030111,3.051784,0.003406,1.020873,0.179952,0.012797,0.006398,0.035556,2.778519,0.004096,1.023598
76,(spaghetti),"(ground beef, grated cheese)",0.168267,0.010933,0.005067,0.030111,2.754049,0.003227,1.019773,0.179952,0.011730,0.005599,0.031111,2.652222,0.003488,1.020003
101,(spaghetti),"(ground beef, pancakes)",0.168267,0.012800,0.005067,0.030111,2.352417,0.002913,1.017848,0.179952,0.016262,0.007731,0.042963,2.641870,0.004805,1.027899
23,(spaghetti),"(ground beef, burgers)",0.168267,0.011467,0.005600,0.033281,2.902370,0.003671,1.022565,0.179952,0.012530,0.005332,0.029630,2.364697,0.003077,1.017622
98,(spaghetti),"(olive oil, ground beef)",0.168267,0.013867,0.005600,0.033281,2.400037,0.003267,1.020082,0.179952,0.014396,0.006665,0.037037,2.572702,0.004074,1.023512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,"(ground beef, eggs)",(spaghetti),0.017867,0.168267,0.008533,0.477612,2.838423,0.005527,1.592175,0.022127,0.179952,0.009331,0.421687,2.343329,0.005349,1.418000
50,"(ground beef, frozen vegetables)",(spaghetti),0.018400,0.168267,0.008800,0.478261,2.842279,0.005704,1.594156,0.015463,0.179952,0.008531,0.551724,3.065951,0.005749,1.829338
22,"(ground beef, burgers)",(spaghetti),0.011467,0.168267,0.005600,0.488372,2.902370,0.003671,1.625661,0.012530,0.179952,0.005332,0.425532,2.364697,0.003077,1.427491
63,"(olive oil, frozen vegetables)",(spaghetti),0.009867,0.168267,0.005067,0.513514,3.051784,0.003406,1.709674,0.012797,0.179952,0.006398,0.500000,2.778519,0.004096,1.640096
