In [32]:
!pip install apyori

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [33]:
import os 
import pandas as pd
import numpy as np
from apyori import apriori
from collections import Counter
from datetime import datetime
from itertools import combinations
import matplotlib.pyplot as plt

In [34]:
# Read the order line items and the product catalogue from the working directory.
df_order_products = pd.read_csv("order_products__prior.csv")
df_products = pd.read_csv("products.csv")

**Table info**

In [35]:
# Non-null value count for every column of the order line items.
df_order_products.count()

order_id             32434489
product_id           32434489
add_to_cart_order    32434489
reordered            32434489
dtype: int64

In [36]:
# Non-null value count for every column of the product catalogue.
df_products.count()

product_id       49688
product_name     49688
aisle_id         49688
department_id    49688
dtype: int64

In [37]:
# Non-null value count per column of the order line items, shown again right before the rule mining section.
df_order_products.count()

order_id             32434489
product_id           32434489
add_to_cart_order    32434489
reordered            32434489
dtype: int64

**Association rules for products**

In [38]:
# Only the (order_id, product_id) pairs are needed for rule mining.
transactions_df = df_order_products.loc[:, ['order_id', 'product_id']]
transactions_df.count()

order_id      32434489
product_id    32434489
dtype: int64

In [39]:
# Count the distinct orders and distinct products.
# Series.nunique() avoids materialising a Python set of every row value.
n_orders = transactions_df.order_id.nunique()
n_products = transactions_df.product_id.nunique()
print(n_orders, n_products)

3214874 49677


The data contains 3,214,874 distinct orders and 49,677 distinct products.

In [40]:
# Support of each product: the number of rows it appears in divided by the number of distinct orders.
product_frequency = transactions_df['product_id'].value_counts().div(n_orders)
product_frequency.count()

49677

In [41]:
# Keep only the products whose support reaches the minimum threshold.
min_support = 0.01

products_apriori = product_frequency.loc[product_frequency.ge(min_support)]
print(products_apriori)

24852    0.146993
13176    0.118030
21137    0.082331
21903    0.075251
47209    0.066436
           ...   
33000    0.010547
20995    0.010230
21709    0.010199
19678    0.010120
40604    0.010063
Name: product_id, Length: 102, dtype: float64


In [42]:
# Number of products whose support is at least min_support.
products_apriori.count()

102

In [43]:

# Restrict the line items to the frequent products selected above.
transactions_apriori = transactions_df.loc[
    transactions_df['product_id'].isin(products_apriori.index)
]


In [44]:
# Number of frequent products remaining in each order.
order_sizes = transactions_apriori['order_id'].value_counts()
order_sizes

325855     29
1007609    28
1730767    27
1564244    27
129627     26
           ..
2553157     1
1211777     1
1211768     1
1211766     1
3421083     1
Name: order_id, Length: 2354512, dtype: int64

In [45]:
# An order with a single frequent product cannot contribute a product pair, so keep sizes >= 2 only.
orders_apriori = order_sizes.loc[order_sizes.ge(2)]
print(orders_apriori)

325855     29
1007609    28
1730767    27
1564244    27
129627     26
           ..
1670095     2
980523      2
939234      2
1759208     2
170203      2
Name: order_id, Length: 1664863, dtype: int64


In [46]:
# Drop the line items that belong to orders with fewer than two frequent products.
transactions_apriori = transactions_apriori.loc[
    transactions_apriori['order_id'].isin(orders_apriori.index)
]
transactions_apriori

Unnamed: 0,order_id,product_id
1,2,28985
5,2,17794
10,3,24838
12,3,21903
14,3,46667
...,...,...
32434456,3421080,27845
32434459,3421080,41950
32434460,3421080,31717
32434474,3421082,16797


In [47]:
#combining name of products with its names
# Lookup table: product_id -> product_name.
arr1 = dict(zip(df_products['product_id'], df_products['product_name']))

In [48]:
#driver function
def association_rules(order_products, support, min_len = 2, max_len=5,
                      confidence = 0.2, lift = 1.0, product_names = None):
    """Mine association rules ``A -> B`` from (order_id, product_id) rows.

    Parameters
    ----------
    order_products : pandas.DataFrame
        Must contain ``order_id`` and ``product_id`` columns, one row per
        product placed in an order.
    support : float
        Minimum support, expressed as a share of the distinct orders, that a
        single product must reach to be considered and that an itemset must
        reach for rules to be derived from it.
    min_len : int
        Smallest itemset size rules are derived from.
    max_len : int
        Largest itemset size that is enumerated per order.
    confidence : float
        Minimum ``support_AB / support_A`` a rule needs to be kept.
    lift : float
        Minimum ``confidence / support_B`` a rule needs to be kept.
    product_names : mapping, optional
        ``product_id -> product_name`` lookup used to label the rules.
        When omitted, the notebook-level ``arr1`` dictionary is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``A``, ``B``, ``support_A``, ``support_B``, ``support_AB``,
        ``confidence`` and ``lift``, sorted by descending lift. ``A`` and
        ``B`` hold a product name, or a tuple of names when that side of the
        rule contains several products.
    """
    if product_names is None:
        # Fall back to the lookup built earlier in the notebook.
        product_names = arr1

    baskets = order_products[['order_id', 'product_id']]
    n_orders = baskets.order_id.nunique()

    # Support of every single product, then keep the frequent ones.
    item_support = baskets.product_id.value_counts() / n_orders
    frequent_items = item_support[item_support >= support]

    frequent_rows = baskets[baskets.product_id.isin(frequent_items.index)]
    items_per_order = frequent_rows.order_id.value_counts()
    # Keep every order that can still form a pair. Filtering on min_len here
    # would drop small orders and undercount the support of the sub-itemsets
    # (A and B) whenever min_len > 2.
    usable_orders = items_per_order[items_per_order >= 2]
    frequent_rows = frequent_rows[frequent_rows.order_id.isin(usable_orders.index)]

    def _itemsets(rows):
        """Yield every sorted product combination of size 2..max_len per order."""
        for _, products in rows.groupby('order_id')['product_id']:
            basket = sorted(products)
            for size in range(2, min(max_len, len(basket)) + 1):
                yield from combinations(basket, size)

    # Support of every enumerated itemset (including the infrequent ones, which
    # are still needed when looking up the support of a rule side).
    itemset_support = {
        itemset: count / n_orders
        for itemset, count in Counter(_itemsets(frequent_rows)).items()
    }
    # Single products are keyed by id, itemsets by tuple, so the keys never clash.
    support_of = {**dict(frequent_items.items()), **itemset_support}

    # Split every frequent itemset into all (antecedent, consequent) pairs.
    antecedents, consequents, itemsets = [], [], []
    for itemset, itemset_freq in itemset_support.items():
        if itemset_freq < support or len(itemset) < min_len:
            continue
        for size in range(1, len(itemset)):
            for lhs in combinations(itemset, size):
                rhs = list(itemset)
                for item in lhs:
                    rhs.remove(item)
                # A one-product side is stored as the bare id, not a 1-tuple.
                antecedents.append(lhs[0] if len(lhs) == 1 else lhs)
                consequents.append(rhs[0] if len(rhs) == 1 else tuple(rhs))
                itemsets.append(itemset)

    rules = pd.DataFrame({
        'A': antecedents,
        'B': consequents,
        'support_A': [support_of[side] for side in antecedents],
        'support_B': [support_of[side] for side in consequents],
        'support_AB': [support_of[both] for both in itemsets],
    }).astype({'support_A': float, 'support_B': float, 'support_AB': float})

    #confidence calculation
    rules['confidence'] = rules.support_AB / rules.support_A
    #lift calculation
    rules['lift'] = rules.confidence / rules.support_B
    #filtering on the basis of confidence and lift
    rules = rules[(rules.confidence >= confidence) & (rules.lift >= lift)]
    rules = rules.sort_values(by = 'lift', ascending = False).reset_index(drop = True)

    #productid to name
    def _to_names(side):
        """Translate a product id, or a tuple of ids, into product name(s)."""
        if isinstance(side, tuple):
            return tuple(product_names[item] for item in side)
        return product_names[side]

    # Series.map works on every pandas version; DataFrame.applymap is deprecated.
    for column in ('A', 'B'):
        rules[column] = rules[column].map(_to_names)

    print('{} rules were generated'.format(len(rules)))

    return rules

In [49]:
# Mine rules at 1% support with itemsets of up to 4 products, timing the run.
start = datetime.now()
rules1 = association_rules(df_order_products, 0.01, max_len=4)
print('lapse time: ', datetime.now() - start)

11 rules were generated
lapse time:  0:02:15.937587


In [50]:
# Rules mined with support = 0.01 and max_len = 4.
rules1

Unnamed: 0,A,B,support_A,support_B,support_AB,confidence,lift
0,Organic Raspberries,Organic Strawberries,0.042632,0.082331,0.010533,0.247072,3.000973
1,Organic Fuji Apple,Banana,0.02788,0.146993,0.010558,0.378693,2.576259
2,Organic Raspberries,Bag of Organic Bananas,0.042632,0.11803,0.012599,0.295519,2.503775
3,Organic Hass Avocado,Bag of Organic Bananas,0.066436,0.11803,0.019391,0.29188,2.472945
4,Organic Avocado,Banana,0.054999,0.146993,0.016609,0.301982,2.054395
5,Organic Strawberries,Bag of Organic Bananas,0.082331,0.11803,0.01917,0.232837,1.972702
6,Strawberries,Banana,0.044466,0.146993,0.012825,0.288434,1.962229
7,Large Lemon,Banana,0.047485,0.146993,0.012716,0.26779,1.821783
8,Organic Baby Spinach,Bag of Organic Bananas,0.075251,0.11803,0.015668,0.208217,1.764107
9,Organic Baby Spinach,Banana,0.075251,0.146993,0.015987,0.212445,1.445272


In [51]:
# Mine rules at 0.5% support with itemsets of up to 3 products, timing the run.
start = datetime.now()
rules2 = association_rules(df_order_products, 0.005, max_len=3)
print('lapse time: ', datetime.now() - start)

37 rules were generated
lapse time:  0:02:43.444100


In [52]:
# Rules mined with support = 0.005 and max_len = 3.
rules2

Unnamed: 0,A,B,support_A,support_B,support_AB,confidence,lift
0,Organic Cilantro,Limes,0.021626,0.043743,0.005464,0.252647,5.775753
1,Organic Garlic,Organic Yellow Onion,0.034147,0.035282,0.006866,0.201069,5.698983
2,Organic Lemon,Organic Hass Avocado,0.027294,0.066436,0.006609,0.242131,3.64456
3,Organic Cucumber,Organic Hass Avocado,0.025006,0.066436,0.00543,0.217136,3.268339
4,Organic Raspberries,Organic Strawberries,0.042632,0.082331,0.010533,0.247072,3.000973
5,Organic Blueberries,Organic Strawberries,0.031124,0.082331,0.007389,0.237418,2.883704
6,Organic Large Extra Fancy Fuji Apple,Bag of Organic Bananas,0.02338,0.11803,0.007267,0.310836,2.633546
7,Organic Fuji Apple,Banana,0.02788,0.146993,0.010558,0.378693,2.576259
8,Organic Raspberries,Bag of Organic Bananas,0.042632,0.11803,0.012599,0.295519,2.503775
9,Organic Cucumber,Organic Strawberries,0.025006,0.082331,0.00515,0.205928,2.501234


In [53]:
# Mine pair-only rules (max_len = 2) at 0.3% support, timing the run.
start = datetime.now()
rules3 = association_rules(df_order_products, 0.003, max_len=2)
print('lapse time: ', datetime.now() - start)

97 rules were generated
lapse time:  0:01:33.449797


In [54]:
# Rules mined with support = 0.003 and max_len = 2.
rules3

Unnamed: 0,A,B,support_A,support_B,support_AB,confidence,lift
0,Lime Sparkling Water,Sparkling Water Grapefruit,0.014478,0.023605,0.004132,0.285395,12.090626
1,Bunched Cilantro,Limes,0.014162,0.043743,0.003885,0.274347,6.271838
2,Jalapeno Peppers,Limes,0.013239,0.043743,0.003544,0.267686,6.119579
3,Organic Ginger Root,Organic Garlic,0.016202,0.034147,0.003357,0.207192,6.067660
4,Organic Cilantro,Limes,0.021626,0.043743,0.005464,0.252647,5.775753
...,...,...,...,...,...,...,...
92,Lime Sparkling Water,Banana,0.014478,0.146993,0.003041,0.210072,1.429125
93,Organic Baby Arugula,Banana,0.022654,0.146993,0.004737,0.209106,1.422556
94,Sparkling Water Grapefruit,Banana,0.023605,0.146993,0.004911,0.208062,1.415453
95,Apple Honeycrisp Organic,Banana,0.026446,0.146993,0.005422,0.205034,1.394853
