### Associating rule learning  
Plan:
1. Define Support, Confidence and Lift of all pairs of products
2. Find TOP-3 products
3. Find TOP-3 pairs of products
4. Save df to Excel file

In [1]:
#importing libraries

import numpy as np
import pandas as pd
from apyori import apriori


In [2]:
# creating dataframe from our csv file

gr_data = pd.read_csv('Groceries.csv', header=None)

print(gr_data.shape)
gr_data.head()

(9835, 32)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,...,,,,,,,,,,
1,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
2,whole milk,,,,,,,,,,...,,,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,


In [3]:
# making list of lists with all transactions

transactions = []
for i in range(gr_data.shape[0]):
    transactions.append([str(gr_data.values[i,j]) for j in range(gr_data.shape[1]) if pd.notna(gr_data.values[i,j])])

In [4]:
# building association rules

association_rules = apriori(transactions, min_support=0.0045, min_confidence=0.2, min_lift=3, max_length=2)

In [5]:
# list of rules

association_results = list(association_rules)
association_results[0]

RelationRecord(items=frozenset({'baking powder', 'whipped/sour cream'}), support=0.004575495678698526, ordered_statistics=[OrderedStatistic(items_base=frozenset({'baking powder'}), items_add=frozenset({'whipped/sour cream'}), confidence=0.25862068965517243, lift=3.607850330154072)])

In [6]:
association_results[0][0]

frozenset({'baking powder', 'whipped/sour cream'})

In [7]:
# beautiful output for better analysis

for item in association_results:
    products = [x for x in item[0]]
    print("Rule: " + products[0] + " -> " + products[1] )

    print("Support: " + str(item[1]))

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")

Rule: baking powder -> whipped/sour cream
Support: 0.004575495678698526
Confidence: 0.25862068965517243
Lift: 3.607850330154072
Rule: beef -> root vegetables
Support: 0.017386883579054397
Confidence: 0.3313953488372093
Lift: 3.0403668431100312
Rule: berries -> whipped/sour cream
Support: 0.009049313675648195
Confidence: 0.27217125382262997
Lift: 3.796885505454703
Rule: liquor -> bottled beer
Support: 0.004677173360447382
Confidence: 0.4220183486238532
Lift: 5.240594013529793
Rule: red/blush wine -> bottled beer
Support: 0.004880528723945094
Confidence: 0.253968253968254
Lift: 3.1537598204264876
Rule: sugar -> flour
Support: 0.00498220640569395
Confidence: 0.28654970760233917
Lift: 8.46311223504206
Rule: root vegetables -> herbs
Support: 0.007015760040671073
Confidence: 0.43124999999999997
Lift: 3.956477378731343
Rule: sausage -> sliced cheese
Support: 0.007015760040671073
Confidence: 0.2863070539419087
Lift: 3.047434930215013


In [8]:
# creating a new df from the list of rules
# using dictionary

dictt = {}
i = 0
for item in association_results:
    dictt[i] = [item[0], item[1], item[2][0][2], item[2][0][3]]
    i += 1
df = pd.DataFrame(dictt, index = ['Pairs of products', 'Support', 'Confidence', 'Lift'])

In [9]:
# swaping rows and columns

df = df.T
df.head()

Unnamed: 0,Pairs of products,Support,Confidence,Lift
0,"(baking powder, whipped/sour cream)",0.0045755,0.258621,3.60785
1,"(beef, root vegetables)",0.0173869,0.331395,3.04037
2,"(berries, whipped/sour cream)",0.00904931,0.272171,3.79689
3,"(liquor, bottled beer)",0.00467717,0.422018,5.24059
4,"(red/blush wine, bottled beer)",0.00488053,0.253968,3.15376


In [10]:
# creating a new df
# using list comprehensions

df = pd.DataFrame({'Pairs of products' : [item[0] for item in association_results],
                   'Support' : [item[1] for item in association_results],
                   'Confidence' : [item[2][0][2] for item in association_results],
                   'Lift' : [item[2][0][3] for item in association_results]})

In [11]:
df.head()

Unnamed: 0,Pairs of products,Support,Confidence,Lift
0,"(baking powder, whipped/sour cream)",0.004575,0.258621,3.60785
1,"(beef, root vegetables)",0.017387,0.331395,3.040367
2,"(berries, whipped/sour cream)",0.009049,0.272171,3.796886
3,"(liquor, bottled beer)",0.004677,0.422018,5.240594
4,"(red/blush wine, bottled beer)",0.004881,0.253968,3.15376


In [12]:
# TOP-3 pairs of products means max support

df.sort_values('Support', ascending=False, inplace=True)
df.head(3)

Unnamed: 0,Pairs of products,Support,Confidence,Lift
1,"(beef, root vegetables)",0.017387,0.331395,3.040367
2,"(berries, whipped/sour cream)",0.009049,0.272171,3.796886
6,"(root vegetables, herbs)",0.007016,0.43125,3.956477


In [13]:
# TOP-3 products

# making flattened array that contains all items
array = gr_data.to_numpy().reshape(-1)

# making pd.Series from array
ser = pd.Series(array).dropna()

# getting top-3
ser.value_counts()[:3]

whole milk          2513
other vegetables    1903
rolls/buns          1809
dtype: int64

In [14]:
# saving to file

# df.iloc[0,0] --> frozenset({'beef', 'root vegetables'})
# need to cut

df['Pairs of products'] = df['Pairs of products'].astype(str).str.lstrip('frozenset({)').str.rstrip('})')

df.to_excel('ass.rules.groceries.xlsx')