In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

Collecting mlxtend==0.23.1
  Obtaining dependency information for mlxtend==0.23.1 from https://files.pythonhosted.org/packages/1c/07/512f6a780239ad6ce06ce2aa7b4067583f5ddcfc7703a964a082c706a070/mlxtend-0.23.1-py3-none-any.whl.metadata
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
    --------------------------------------- 0.0/1.4 MB 682.7 kB/s eta 0:00:03
   - -------------------------------------- 0.1/1.4 MB 787.7 kB/s eta 0:00:02
   -- ------------------------------------- 0.1/1.4 MB 845.5 kB/s eta 0:00:02
   ---- ----------------------------------- 0.2/1.4 MB 833.5 kB/s eta 0:00:02
   ----- ---------------------------------- 0.2/1.4 MB 892.5 kB/s eta 0:00:02
   ------- -------------------------------- 0.3/1.4 MB 927.4 kB/s eta 0:00:02
   -------- ------------------------------- 0.3/1.4 MB 951.8 kB/s eta 0:00:02
   --------- --------------------

# Association Rule for Store Dataset

In this case study, we will explore how association rule can be used to analyze the items that are usualy purcased together.

you can refer to this article to find out about apriori and association rule:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [100]:
# load the data set ans show the first five transaction
df = pd.read_csv('https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv', delimiter=",")

df.head(5)

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of product that has been purchased


In [101]:
print(df['0'].unique())

['Bread' 'Cheese' 'Meat' 'Eggs' 'Wine' 'Bagel' 'Pencil' 'Diaper' 'Milk']


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [102]:
#create an itemset based on the products
encode_dict = {
    'Milk': 1,
    'Bagel': 1,
    np.nan: 0,
    'Wine': 1,
    'Cheese': 1,
    'Diaper': 1,
    'Meat': 1,
    'Eggs': 1,
    'Bread': 1,
    'Pencil': 1
}

# encoding the feature
filtered_dict = {k: v for k, v in encode_dict.items() if pd.notna(k)}
df_encoded = pd.DataFrame([filtered_dict])
print(df_encoded)

   Milk  Bagel  Wine  Cheese  Diaper  Meat  Eggs  Bread  Pencil
0     1      1     1       1       1     1     1      1       1


In [108]:
# create new dataframe from the encoded features
df_melt = df.stack().reset_index(level=1, drop=True).reset_index()
df_melt.columns = ['Row', 'Item']
df_melt['Encoded'] = df_melt['Item'].map(encode_dict).fillna(0).astype(int)
new_df = df_melt.pivot_table(index='Row', columns='Item', values='Encoded', aggfunc='max', fill_value=0)
new_df = df_encoded.reset_index(drop=True)

 # show the new dataframe
df_encoded.head()

Item,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


In [109]:
# Since, the encoded dataframe consist of the empty column. We will drop the NaN column or u can use the index.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
df_imputed = pd.DataFrame(imputer.fit_transform(new_df), columns=new_df.columns)
df_imputed.head(5)

Item,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


Since, the encoded dataframe consist of the empty column. We will drop the NaN column or select all columns other than the first column.

## Apriori Algorithm

We will use appriori algorithm to determine the frequently purchased products.
For this case study, we will min_support=0.2

In [110]:
#Set threshold value untuk digunakan dalam penghitungan support
from mlxtend.frequent_patterns import apriori, association_rules
frequent_itemsets = apriori(new_df, min_support=0.2, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
# printing the frequent itemset
frequent_itemsets.head(33)



Unnamed: 0,support,itemsets,length
0,0.425397,(Bagel),1
1,0.504762,(Bread),1
2,0.501587,(Cheese),1
3,0.406349,(Diaper),1
4,0.438095,(Eggs),1
5,0.47619,(Meat),1
6,0.501587,(Milk),1
7,0.361905,(Pencil),1
8,0.438095,(Wine),1
9,0.279365,"(Bread, Bagel)",2


The we will generate association rule of the frequent itemset based on confidence level with the threshold=0.6

In [112]:
rules = association_rules(frequent_itemsets, metric ="lift", min_threshold = 0.6) 
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False]) 
rules.head(14)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
50,"(Milk, Meat)",(Cheese),0.244444,0.501587,0.203175,0.831169,1.657077,0.080564,2.952137,0.524816
44,"(Eggs, Meat)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667,0.518717
45,"(Eggs, Cheese)",(Meat),0.298413,0.47619,0.215873,0.723404,1.519149,0.073772,1.893773,0.487091
18,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
20,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891
46,"(Cheese, Meat)",(Eggs),0.32381,0.438095,0.215873,0.666667,1.521739,0.074014,1.685714,0.507042
51,"(Milk, Cheese)",(Meat),0.304762,0.47619,0.203175,0.666667,1.4,0.05805,1.571429,0.410959
1,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
21,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891,0.526414
52,"(Cheese, Meat)",(Milk),0.32381,0.501587,0.203175,0.627451,1.250931,0.040756,1.337845,0.296655


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, __conviction__, __conviction__ and the interpretation from the case above (please use text section)

Antecedent support refers to the proportion of transactions that contain the antecedent (e.g., Milk, Meat) relative to the total number of transactions, indicating how frequently the antecedent appears. Consequent support is the proportion of transactions containing the consequent (e.g., Cheese), reflecting how common the consequent is in the dataset.

Support measures how often the rule (antecedent to consequent) occurs in the dataset, i.e., the proportion of transactions that include both the antecedent and consequent. Confidence indicates the likelihood that the consequent will appear given that the antecedent is present. A higher confidence suggests a stronger association between the antecedent and consequent.

Lift compares the observed support of the rule with the expected support if the antecedent and consequent were independent. Values greater than 1 indicate a positive association. Leverage quantifies how much more likely the antecedent and consequent co-occur than would be expected by chance, with higher values indicating a stronger relationship.

Conviction measures the degree to which the rule is reliable by indicating the ratio of the likelihood of observing the consequent when the antecedent occurs versus when it does not. A value greater than 1 suggests that the rule is likely to be valid.

Zhang's Metric is a measure used to evaluate the strength of association rules. It combines several factors, including support, confidence, and lift, to provide a more comprehensive assessment of the rule’s effectiveness. A higher Zhang's metric indicates that the rule is strong and reliable.

For example, in the case of the rule "Milk and Meat to Cheese," a confidence of 0.83 means there's an 83% chance that Cheese will be bought if Milk and Meat are bought. The lift of 1.66 shows it's more likely to happen than by random chance, and the conviction of 2.95 shows the rule is fairly reliable.