In [1]:
import pandas as pd
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


In [2]:
# Load the two CSV inputs: the orders file and the products file.
df = pd.read_csv("instacart_orders.csv")
product_df = pd.read_csv("instacart_products.csv")

In [3]:
# Replace product ids with product names: left-join the orders onto the
# product table by product_id, then keep only order_id and product_name.
merged_products_df = df.merge(product_df, how="left", on="product_id")[
    ["order_id", "product_name"]
]

In [4]:
# Preview the first 25 rows of the merged order_id / product_name frame.
merged_products_df.head(25)

Unnamed: 0,order_id,product_name
0,1,Bulgarian Yogurt
1,1,Organic 4% Milk Fat Whole Milk Cottage Cheese
2,1,Organic Celery Hearts
3,1,Cucumber Kirby
4,1,Lightly Smoked Sardines in Olive Oil
5,1,Bag of Organic Bananas
6,1,Organic Hass Avocado
7,1,Organic Whole String Cheese
8,36,Grated Pecorino Romano Cheese
9,36,Spring Water


In [5]:
# Group product names by order: maps str(order_id) -> list of
# str(product_name), in the row order of merged_products_df.
# The two columns are iterated directly instead of using
# DataFrame.iterrows(), which builds a Series object for every row and is
# very slow on a large frame.
store_record = {}
for order_id, product_name in zip(merged_products_df['order_id'],
                                  merged_products_df['product_name']):
    # setdefault creates the list the first time an order id is seen.
    store_record.setdefault(str(order_id), []).append(str(product_name))

In [6]:
# Look up the product list stored under the order key '1'.
store_record['1']

['Bulgarian Yogurt',
 'Organic 4% Milk Fat Whole Milk Cottage Cheese',
 'Organic Celery Hearts',
 'Cucumber Kirby',
 'Lightly Smoked Sardines in Olive Oil',
 'Bag of Organic Bananas',
 'Organic Hass Avocado',
 'Organic Whole String Cheese']

In [7]:
# Build the transaction list, keeping only orders that contain more than
# one product.
item_database = [basket for basket in store_record.values() if len(basket) > 1]

In [8]:
# Inspect the transaction at index 2 of item_database.
item_database[2]

['Shelled Pistachios',
 'Organic Biologique Limes',
 'Organic Raw Unfiltered Apple Cider Vinegar',
 'Organic Baby Arugula',
 'Organic Hot House Tomato',
 'Green Peas',
 'Bunched Cilantro',
 'Flat Parsley, Bunch',
 'Fresh Dill']

In [9]:
# Use TransactionEncoder to one-hot encode the transactions. The encoded
# matrix is requested in sparse form and wrapped in a sparse DataFrame whose
# columns are the item names learned by the encoder.
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
oht_ary = te.fit_transform(item_database, sparse=True)
sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)
sparse_df



Unnamed: 0,#2 Coffee Filters,#4 Natural Brown Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,0 Calorie Fuji Apple Pear Water Beverage,0% Fat Black Cherry Greek Yogurt y,0% Fat Blueberry Greek Yogurt,0% Fat Free Organic Milk,0% Fat Greek Yogurt Black Cherry on the Bottom,0% Fat Organic Greek Vanilla Yogurt,0% Fat Peach Greek Yogurt,...,with Olive Oil Mayonnaise,with Olive Oil Mayonnaise Dressing,with Sweet & Smoky BBQ Sauce Cheeseburger Sliders,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum,with Xylitol Original Flavor 18 Sticks Sugar Free Gum,with Xylitol Unwrapped Original Flavor 50 Sticks Sugar Free Gum,with Xylitol Watermelon Twist 18 Sticks Sugar Free Gum,with a Splash of Mango Coconut Water,with a Splash of Pineapple Coconut Water
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45104,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Create frequent itemsets: keep itemsets whose support is at least 0.01,
# reported by column (product) name rather than column index.
itemsets = apriori(sparse_df, min_support=0.01, use_colnames=True)

# Convert the itemsets into association rules, keeping rules whose
# confidence is at least 0.01.
rules = association_rules(itemsets, metric='confidence', min_threshold=0.01)

# Print the 8 rules with the highest support, hiding the columns that are
# not needed for this view. (A bare sort_values(...).head(8) expression that
# preceded this print was removed: it was not the last statement of the
# cell, so its result was computed and then discarded.)
print(rules.sort_values(by=['support'], ascending=False)
      .drop(columns=['antecedent support', 'consequent support', 'conviction'])
      .head(8))

                 antecedents               consequents   support  confidence  \
8   (Bag of Organic Bananas)    (Organic Strawberries)  0.024874    0.202893   
9     (Organic Strawberries)  (Bag of Organic Bananas)  0.024874    0.290298   
4   (Bag of Organic Bananas)    (Organic Hass Avocado)  0.019198    0.156600   
5     (Organic Hass Avocado)  (Bag of Organic Bananas)  0.019198    0.325686   
15         (Organic Avocado)                  (Banana)  0.018289    0.302419   
14                  (Banana)         (Organic Avocado)  0.018289    0.123540   
0   (Bag of Organic Bananas)    (Organic Baby Spinach)  0.017491    0.142676   
1     (Organic Baby Spinach)  (Bag of Organic Bananas)  0.017491    0.222441   

        lift  leverage  
8   2.367946  0.014369  
9   2.367946  0.014369  
4   2.656611  0.011972  
5   2.656611  0.011972  
15  2.042757  0.009336  
14  2.042757  0.009336  
0   1.814447  0.007851  
1   1.814447  0.007851  


In [11]:
# Keep only the rules whose consequent holds a single item, then show the 6
# with the highest support.
# The mask is a boolean Series rather than a Python list: an empty list
# (no rules at all) would be treated by DataFrame indexing as a column
# selection, and the following sort would fail with a KeyError.
(
    rules[rules['consequents'].map(len).eq(1)]
    .sort_values(by=['support'], ascending=False)
    .head(6)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8,(Bag of Organic Bananas),(Organic Strawberries),0.122595,0.085683,0.024874,0.202893,2.367946,0.014369,1.147044
9,(Organic Strawberries),(Bag of Organic Bananas),0.085683,0.122595,0.024874,0.290298,2.367946,0.014369,1.2363
4,(Bag of Organic Bananas),(Organic Hass Avocado),0.122595,0.058947,0.019198,0.1566,2.656611,0.011972,1.115785
5,(Organic Hass Avocado),(Bag of Organic Bananas),0.058947,0.122595,0.019198,0.325686,2.656611,0.011972,1.301183
15,(Organic Avocado),(Banana),0.060477,0.148045,0.018289,0.302419,2.042757,0.009336,1.2213
14,(Banana),(Organic Avocado),0.148045,0.060477,0.018289,0.12354,2.042757,0.009336,1.071952


In [12]:
# Print the 8 rules with the highest lift, hiding the columns that are not
# needed for this view. (A bare sort_values(...).head(8) expression that
# preceded this print was removed: it was not the last statement of the
# cell, so its result was computed and then discarded.)
print(rules.sort_values(by=['lift'], ascending=False)
      .drop(columns=['antecedent support', 'consequent support', 'conviction'])
      .head(8))

                 antecedents               consequents   support  confidence  \
23                   (Limes)             (Large Lemon)  0.012858    0.266667   
22             (Large Lemon)                   (Limes)  0.012858    0.198087   
34     (Organic Raspberries)    (Organic Strawberries)  0.012570    0.287234   
35    (Organic Strawberries)     (Organic Raspberries)  0.012570    0.146701   
24             (Large Lemon)         (Organic Avocado)  0.011040    0.170082   
25         (Organic Avocado)             (Large Lemon)  0.011040    0.182551   
6   (Bag of Organic Bananas)     (Organic Raspberries)  0.014521    0.118445   
7      (Organic Raspberries)  (Bag of Organic Bananas)  0.014521    0.331814   

        lift  leverage  
23  4.108197  0.009728  
22  4.108197  0.009728  
34  3.352278  0.008820  
35  3.352278  0.008820  
24  2.812338  0.007115  
25  2.812338  0.007115  
6   2.706591  0.009156  
7   2.706591  0.009156  


In [13]:
# Keep only the rules whose consequent holds a single item, then show the 6
# with the highest lift.
# The mask is a boolean Series rather than a Python list: an empty list
# (no rules at all) would be treated by DataFrame indexing as a column
# selection, and the following sort would fail with a KeyError.
(
    rules[rules['consequents'].map(len).eq(1)]
    .sort_values(by=['lift'], ascending=False)
    .head(6)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
23,(Limes),(Large Lemon),0.048218,0.064911,0.012858,0.266667,4.108197,0.009728,1.275122
22,(Large Lemon),(Limes),0.064911,0.048218,0.012858,0.198087,4.108197,0.009728,1.18689
34,(Organic Raspberries),(Organic Strawberries),0.043762,0.085683,0.01257,0.287234,3.352278,0.00882,1.282773
35,(Organic Strawberries),(Organic Raspberries),0.085683,0.043762,0.01257,0.146701,3.352278,0.00882,1.120637
24,(Large Lemon),(Organic Avocado),0.064911,0.060477,0.01104,0.170082,2.812338,0.007115,1.132067
25,(Organic Avocado),(Large Lemon),0.060477,0.064911,0.01104,0.182551,2.812338,0.007115,1.143912
