In [9]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [10]:
# load the data set and show the first five transaction
url = 'https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


Get the unique product that has been purchased

In [11]:
unique_products = set(np.concatenate(df.apply(pd.unique)))
print(unique_products)

{nan, 'Meat', 'Bread', 'Pencil', 'Cheese', 'Bagel', 'Eggs', 'Milk', 'Diaper', 'Wine'}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [12]:
#create an itemset based on the products
df = df.applymap(lambda x: str(x))
te = TransactionEncoder()
te_ary = te.fit_transform(df.values.tolist())


# encoding the feature
df_encoded = pd.DataFrame(te_ary.astype(int), columns=te.columns_)

In [13]:
  # create new dataframe from the encoded features
df_encoded = pd.concat([df_encoded.loc[:, 'Bagel':'nan']], axis=1)

  # show the new dataframe
df_encoded.loc[:, 'Bagel':'nan']

Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine,nan
0,0,1,1,1,1,1,0,1,1,0
1,0,1,1,1,0,1,1,1,1,0
2,0,0,1,0,1,1,1,0,1,1
3,0,0,1,0,1,1,1,0,1,1
4,0,0,0,0,0,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...
310,0,1,1,0,1,0,0,0,0,1
311,0,0,0,0,0,1,1,1,0,1
312,0,1,1,1,1,1,0,1,1,0
313,0,0,1,0,0,1,0,0,0,1


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

In [14]:
df_new_encoded = df_encoded.drop(['nan'], axis=1)
df_new_encoded

Unnamed: 0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...
310,0,1,1,0,1,0,0,0,0
311,0,0,0,0,0,1,1,1,0
312,0,1,1,1,1,1,0,1,1
313,0,0,1,0,0,1,0,0,0


## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [15]:
frequent_itemsets = apriori(df_new_encoded, min_support=0.2, use_colnames=True)

print("\nFrequently Purchased Products:")
frequent_itemsets


Frequently Purchased Products:




Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.279365,"(Bread, Bagel)"


Then, we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [16]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

print("\nAssociation Rules based on Confidence:")
rules[['antecedents', 'consequents', 'antecedent support', 'consequent support', 'support', 'confidence', 'lift', 'leverage', 'conviction']]


Association Rules based on Confidence:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
2,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
3,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
4,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
5,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
8,"(Eggs, Cheese)",(Meat),0.298413,0.47619,0.215873,0.723404,1.519149,0.073772,1.893773
9,"(Eggs, Meat)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__

- Antecedent Support: the proportion of transactions in the dataset that contain the antecedent of a rule. It measures how frequently the items in the rule co-occur in the dataset.
- Consequent Support: the proportion of transactions in the dataset that contain the consequent of a rule. Consequent support focuses on the occurrence of the items in the consequent of a rule.
- Confidence: measures the probability of the occurrence of the consequent given the antecedent.It indicates how likely it is that the rule is true.
- Lift: measures how much more likely the antecedent and consequent are to occur together compared to if they were statistically independent. Lift > 1 indicates a positive association, while Lift < 1 indicates a negative association.
- Leverage: measures the difference between the observed frequency of the items in the rule and the expected frequency if they were independent.  Leverage > 0 indicates a positive association, while Leverage < 0 indicates a negative association.
- Conviction: measures the ratio of the expected frequency that A occurs without B to the observed frequency of A not occurring with B. High conviction values indicate strong dependency between the antecedent and consequent.
