In [28]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

! pip install mlxtend



# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [11]:
# load the data set and show the first five transaction
df = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


Get the unique product that has been purchased

In [17]:
unique_products = df.stack().dropna().unique()
print(set(unique_products))

{'Milk', 'Meat', 'Cheese', 'Wine', 'Diaper', 'Bread', 'Pencil', 'Eggs'}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [23]:
#create an itemset based on the products


# encoding the feature
one_hot_encoded = pd.get_dummies(df.stack().dropna()).groupby(level = 0).max()



In [26]:
  # create new dataframe from the encoded features
unique_products_order = df.stack().dropna().unique()
itemset = one_hot_encoded.reindex(columns = unique_products_order, fill_value = 0)
  # show the new dataframe
print(itemset)

   Bread  Cheese  Meat  Wine  Pencil  Eggs  Diaper  Milk
0      1       1     1     0       0     0       0     0
1      0       1     1     1       1     0       0     0
2      0       0     1     1       0     1       0     0
3      0       0     1     0       0     0       1     1
4      0       1     0     1       0     0       0     0
5      0       0     0     0       1     0       0     1
6      0       0     0     0       1     0       1     0


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

In [27]:
itemset_cleaned = itemset.iloc[:, 1:]
print(itemset_cleaned)

   Cheese  Meat  Wine  Pencil  Eggs  Diaper  Milk
0       1     1     0       0     0       0     0
1       1     1     1       1     0       0     0
2       0     1     1       0     1       0     0
3       0     1     0       0     0       1     1
4       1     0     1       0     0       0     0
5       0     0     0       1     0       0     1
6       0     0     0       1     0       1     0


## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [41]:
transactions = df.applymap(lambda x: str(x) if pd.notna(x) else 'NaN').values.tolist()
te = TransactionEncoder()
one_hot_encoded = te.fit(transactions).transform(transactions)
df_one_hot = pd.DataFrame(one_hot_encoded, columns=te.columns_)
frequent_itemsets = apriori(df_one_hot, min_support=0.2, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="support", min_threshold=0.2)
print(frequent_itemsets)
print()

     support              itemsets
0   0.425397               (Bagel)
1   0.504762               (Bread)
2   0.501587              (Cheese)
3   0.406349              (Diaper)
4   0.438095                (Eggs)
5   0.476190                (Meat)
6   0.501587                (Milk)
7   0.869841                 (NaN)
8   0.361905              (Pencil)
9   0.438095                (Wine)
10  0.279365        (Bread, Bagel)
11  0.225397         (Milk, Bagel)
12  0.336508          (Bagel, NaN)
13  0.238095       (Bread, Cheese)
14  0.231746       (Bread, Diaper)
15  0.206349         (Bread, Meat)
16  0.279365         (Bread, Milk)
17  0.396825          (Bread, NaN)
18  0.200000       (Bread, Pencil)
19  0.244444         (Bread, Wine)
20  0.200000      (Cheese, Diaper)
21  0.298413        (Cheese, Eggs)
22  0.323810        (Meat, Cheese)
23  0.304762        (Milk, Cheese)
24  0.393651         (Cheese, NaN)
25  0.200000      (Cheese, Pencil)
26  0.269841        (Cheese, Wine)
27  0.317460        

Then, we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [43]:
transactions = df.applymap(lambda x: str(x) if pd.notna(x) else 'NaN').values.tolist()
te = TransactionEncoder()
one_hot_encoded = te.fit(transactions).transform(transactions)
df_one_hot = pd.DataFrame(one_hot_encoded, columns=te.columns_)
frequent_itemsets = apriori(df_one_hot, min_support=0.2, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

       antecedents consequents   support  confidence      lift
0          (Bagel)     (Bread)  0.279365    0.656716  1.301042
1          (Bagel)       (NaN)  0.336508    0.791045  0.909413
2          (Bread)       (NaN)  0.396825    0.786164  0.903801
3           (Eggs)    (Cheese)  0.298413    0.681159  1.358008
4           (Meat)    (Cheese)  0.323810    0.680000  1.355696
5         (Cheese)      (Meat)  0.323810    0.645570  1.355696
6           (Milk)    (Cheese)  0.304762    0.607595  1.211344
7         (Cheese)      (Milk)  0.304762    0.607595  1.211344
8         (Cheese)       (NaN)  0.393651    0.784810  0.902245
9           (Wine)    (Cheese)  0.269841    0.615942  1.227986
10        (Diaper)       (NaN)  0.317460    0.781250  0.898152
11          (Eggs)      (Meat)  0.266667    0.608696  1.278261
12          (Eggs)       (NaN)  0.336508    0.768116  0.883053
13          (Meat)       (NaN)  0.368254    0.773333  0.889051
14          (Milk)       (NaN)  0.409524    0.816456  0

Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__

In [None]:
# Antecedent Support (antecedent support):
The proportion of transactions in the dataset that contain the antecedent (the items on the left-hand side of the rule).

# Consequent Support (consequent support):
The proportion of transactions in the dataset that contain the consequent (the items on the right-hand side of the rule).
   
# Support (support):
The proportion of transactions in the dataset that contain both the antecedent and the consequent.
    
# Confidence (confidence):
Confidence measures how often the rule has been found to be true. It is the ratio of the number of transactions that include both the antecedent and the consequent to the number of transactions that include the antecedent.
    
# Lift (lift):
Lift measures the ratio of the observed support to the expected support if the antecedent and consequent were independent.
    
# Leverage (leverage):
Leverage measures the difference between the observed frequency of the antecedent and consequent appearing together and the frequency that would be expected if they were independent.
    
# Conviction (conviction):
Conviction measures the degree of implication of the consequent from the antecedent. A high conviction means that the consequent is highly dependent on the antecedent.