In [125]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from mlxtend.frequent_patterns import apriori,association_rules
from mlxtend.preprocessing import TransactionEncoder
! pip install mlxtend

# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here: 
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [126]:
# load the data set and show the first five transaction
url = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


Get the unique product that has been purchased

In [127]:
items = set(df.values.flatten())
items

{'Bagel',
 'Bread',
 'Cheese',
 'Diaper',
 'Eggs',
 'Meat',
 'Milk',
 'Pencil',
 'Wine',
 nan}

## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [128]:
#create an itemset based on the products
te = TransactionEncoder()
itemset = te.fit_transform(df.apply(lambda x: x.dropna().tolist()))
df_itemset = pd.DataFrame(itemset, columns=te.columns_)
# encoding the feature
encoded_vals = []
for index, row in df.iterrows(): 
    labels = {}
    uncommons = list(set(df_itemset) - set(row))
    commons = list(set(df_itemset).intersection(row))
    for uc in uncommons:
        labels[uc] = 0
    for com in commons:
        labels[com] = 1
    encoded_vals.append(labels)

In [129]:
# create new dataframe from the encoded features
df_ohe = pd.DataFrame(encoded_vals)
# show the new dataframe
df_ohe

Unnamed: 0,Bagel,Milk,Diaper,Wine,Meat,Pencil,Eggs,Bread,Cheese
0,0,0,1,1,1,1,1,1,1
1,0,1,1,1,1,1,0,1,1
2,0,1,0,1,1,0,1,0,1
3,0,1,0,1,1,0,1,0,1
4,0,0,0,1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
310,0,0,0,0,0,0,1,1,1
311,0,1,0,0,1,1,0,0,0
312,0,0,1,1,1,1,1,1,1
313,0,0,0,0,1,0,0,0,1


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

In [133]:
#The empty or NaN column has already dropped on the itemset before

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products. 
For this case study, we will min_support=0.2

In [131]:
frequently_items = apriori(df_ohe, min_support = 0.2, use_colnames = True, verbose = 1)

Processing 120 combinations | Sampling itemset size 3




Then, we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [132]:
association_rules(frequently_items, metric = "confidence", min_threshold = 0.6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
2,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
3,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754,0.330409
4,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624,0.387409
5,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891
6,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891,0.526414
7,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
8,"(Meat, Cheese)",(Milk),0.32381,0.501587,0.203175,0.627451,1.250931,0.040756,1.337845,0.296655
9,"(Meat, Milk)",(Cheese),0.244444,0.501587,0.203175,0.831169,1.657077,0.080564,2.952137,0.524816


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__

In [None]:
# Antecedent Support:
# Definition: The support of the antecedent (left-hand side) of an association rule.
# Purpose: Indicates the frequency of occurrence of the antecedent.

# Consequent Support:
# Definition: The support of the consequent (right-hand side) of an association rule.
# Purpose: Indicates the frequency of occurrence of the consequent.
    
# Support:
# Definition: Support measures the frequency of a particular itemset in the dataset.
# Purpose: Indicates how often the itemset appears in the dataset.
    
# Confidence:
# Definition: Confidence measures the reliability of the rule by indicating the proportion of transactions containing the antecedent where the consequent is also present.
# Purpose: Provides an indication of how often the rule has been found to be true.
    
# Lift:
# Definition: Lift measures how much more likely the consequent is given the antecedent, compared to its likelihood without the antecedent.
# Purpose: Lift > 1 indicates that the presence of the antecedent increases the likelihood of the consequent.
    
# Leverage:
# Definition: Leverage measures the difference between the observed frequency of the itemset and what would be expected if the items were independent.
# Purpose: Indicates whether the antecedent and consequent appearing together is a result of a meaningful pattern.
    
# Conviction:
# Definition: Conviction measures how much more likely the antecedent implies the consequent, compared to if they were independent.
# Purpose: A high conviction value indicates a strong implication from the antecedent to the consequent.
