In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#! pip install mlxtend

In [1]:
from mlxtend.frequent_patterns import apriori, association_rules

# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [3]:
# load the data set and show the first five transaction
url = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


Get the unique product that has been purchased

In [4]:
uniqueProd = df["6"].unique()
print(set(uniqueProd))

{'Diaper', 'Milk', 'Pencil', 'Eggs', 'Meat', 'Wine', 'Bread', nan, 'Bagel', 'Cheese'}


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [5]:
#create an itemset based on the products

itemset = set(uniqueProd)
# encoding the feature
encodedValue = []
for index, row in df.iterrows():
    rowset = set(row) 
    labels = {}
    uncommons = list(itemset - rowset)
    commons = list(itemset.intersection(rowset))
    for i in uncommons:
        labels[i] = 0
    for j in commons:
        labels[j] = 1
    encodedValue.append(labels)


In [6]:
  # create new dataframe from the encoded features
encodedDf = pd.DataFrame(encodedValue)
  # show the new dataframe
encodedDf

Unnamed: 0,NaN,Milk,Bagel,Diaper,Pencil,Eggs,Meat,Wine,Bread,Cheese
0,0,0,0,1,1,1,1,1,1,1
1,0,1,0,1,1,0,1,1,1,1
2,1,1,0,0,0,1,1,1,0,1
3,1,1,0,0,0,1,1,1,0,1
4,1,0,0,0,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...
310,1,0,0,0,0,1,0,0,1,1
311,1,1,0,0,1,0,1,0,0,0
312,0,0,0,1,1,1,1,1,1,1
313,1,0,0,0,0,0,1,0,0,1


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

In [10]:
encodedDf = encodedDf.drop(encodedDf.columns[0], axis=1)
encodedDf

Unnamed: 0,Milk,Bagel,Diaper,Pencil,Eggs,Meat,Wine,Bread,Cheese
0,0,0,1,1,1,1,1,1,1
1,1,0,1,1,0,1,1,1,1
2,1,0,0,0,1,1,1,0,1
3,1,0,0,0,1,1,1,0,1
4,0,0,0,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...
310,0,0,0,0,1,0,0,1,1
311,1,0,0,1,0,1,0,0,0
312,0,0,1,1,1,1,1,1,1
313,0,0,0,0,0,1,0,0,1


## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [11]:
freqPurchasedProd = apriori(encodedDf, min_support=0.2, use_colnames=True)
freqPurchasedProd.head(33)



Unnamed: 0,support,itemsets
0,0.501587,(Milk)
1,0.425397,(Bagel)
2,0.406349,(Diaper)
3,0.361905,(Pencil)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.438095,(Wine)
7,0.504762,(Bread)
8,0.501587,(Cheese)
9,0.225397,"(Milk, Bagel)"


Then, we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [12]:
assRules = association_rules(freqPurchasedProd, metric="confidence", min_threshold=0.6)
assRules.head(14)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
2,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
3,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624,0.387409
4,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
5,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891
6,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891,0.526414
7,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754,0.330409
8,"(Milk, Meat)",(Cheese),0.244444,0.501587,0.203175,0.831169,1.657077,0.080564,2.952137,0.524816
9,"(Milk, Cheese)",(Meat),0.304762,0.47619,0.203175,0.666667,1.4,0.05805,1.571429,0.410959


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__

# Antecedent Support:
- Explanation: The support of the antecedent itemset A.
- Formula: support(A)=proportion of transaction containing A.
- Therefore, Measures the frequency of occurrence of the antecedent itemset A in the dataset.

# Consequent Support:
- Explanation: The support of the consequent itemset C.
- Formula: support(C)=proportion of transaction containing C.
- Therefore, Measures the frequency of occurrence of the antecedent itemset C in the dataset.

# Support:
- Explanation: The support of the combined itemset A∪C
- Formula: support(A→C)=support(A∪C).
- Therefore, Measures the frequency of occurrence of the combined itemset A∪C in the dataset.

# Confidence:
- Explanation: The conditional probability of observing the consequent C given the antecedent A.
- Formula: confidence(A→C)=support(A→C)/support(A).
- Therefore, Measures how often the rule A→C is true.

# Lift:
- Explanation: Compares the likelihood of observing A and C together against the likelihood if they were independent.
- Formula: lift(A→C)=confidence(A→C)/support(C).
- Therefore, A lift value greater than 1 indicates a positive correlation between A and C.

# Leverage:
- Explanation: Measures the difference between the observed frequency of A and C appearing together and the frequency expected under independence.
- Formula: levarage(A→C)=support(A→C)−support(A)×support(C).
- Therefore, A value of 0 indicates independence.

# Conviction:
- Explanation: Measures how much the consequent C depends on the antecedent A.
- Formula: conviction(A→C)=(1−support(C))/(1−confidence(A→C)).
- Therefore, A high conviction value indicates a strong dependency.