In [10]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder

!pip install mlxtend==0.23.1

  and should_run_async(code)




# Association Rule for Store Dataset

In this case study, we will explore how association rules can be used to analyze the items that are usually purchased together.

You can refer to these articles to learn about the Apriori algorithm and association rules:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use the dataset of the transaction in a certain store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [11]:
# Read the store's transaction table from the public gist, then preview the
# first five transactions.
dataset_url = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"
df = pd.read_csv(dataset_url)
df.head()

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of products that have been purchased


In [12]:
# Distinct values found in the column labelled '1' (NaN shows up for empty cells)
column_1_values = df['1'].unique()
print(column_1_values)

['Wine' 'Cheese' 'Meat' 'Pencil' 'Bread' 'Diaper' 'Eggs' nan 'Bagel'
 'Milk']


  and should_run_async(code)


## Preprocess Data

In this step, we will transform our dataset so that we will have a one hot encoding based on the purchased products.

In [13]:
# Build a lookup of every distinct value in the table (NaN included), all
# initialised to 0, then flag the values that occur in the first transaction.
all_items = df.values.flatten()
unique_items = set(all_items)
presence_map = dict.fromkeys(unique_items, 0)

first_row_items = df.iloc[0].values
presence_map.update(dict.fromkeys(first_row_items, 1))

# --- One-hot encode the whole table ---

# Each transaction as a plain list of its cells. Empty cells are NOT removed
# here; they are still present as NaN.
reshaped_data = df.values.tolist()

# All cells of all transactions in one flat list (row after row)
flat_items = [item for sublist in reshaped_data for item in sublist]

# Column vector of labels for the encoder. dtype=str makes the NaN -> "nan"
# conversion explicit, so empty cells end up in their own "nan" category.
flat_items_array = np.array(flat_items, dtype=str).reshape(-1, 1)

# Dense one-hot encoding, stored as integers (1 and 0)
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(flat_items_array).astype(int)

# One row per cell, one column per label found by the encoder
encoded_df = pd.DataFrame(encoded_data, columns=encoder.categories_[0])

# Tag every encoded cell with the position of the transaction it came from
customer_ids = [i for i, row in enumerate(reshaped_data) for _ in row]
encoded_df['customer_id'] = customer_ids

# Collapse the per-cell rows into one row per transaction
final_df = encoded_df.groupby('customer_id').sum()

# Summing yields counts, so an item listed twice in one transaction would give 2.
# Cap the real item columns at 1 to keep them valid 0/1 indicators. The "nan"
# column keeps its count of empty cells; it is dropped in a later cell.
item_columns = final_df.columns.drop('nan', errors='ignore')
final_df[item_columns] = final_df[item_columns].clip(upper=1)

presence_map

  and should_run_async(code)


{nan: 0,
 'Eggs': 1,
 'Milk': 0,
 'Bread': 1,
 'Meat': 1,
 'Pencil': 1,
 'Bagel': 0,
 'Cheese': 1,
 'Diaper': 1,
 'Wine': 1}

In [14]:
  # create new dataframe from the encoded features
transformed_df = final_df.copy()  # independent copy, so later edits cannot alter final_df

# Preview the first rows of the new dataframe
transformed_df.head()

  and should_run_async(code)


Unnamed: 0_level_0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine,nan
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,1,1,1,1,1,0,1,1,0
1,0,1,1,1,0,1,1,1,1,0
2,0,0,1,0,1,1,1,0,1,2
3,0,0,1,0,1,1,1,0,1,2
4,0,0,0,0,0,1,0,1,1,4


In [15]:
# The encoded dataframe contains a "nan" column that stands for the empty cells,
# so drop it by name (selecting the remaining columns by position would also work).
# errors="ignore" keeps this cell safe to re-run after the column is already gone.
transformed_df = transformed_df.drop(columns=["nan"], errors="ignore")

transformed_df.head()

  and should_run_async(code)


Unnamed: 0_level_0,Bagel,Bread,Cheese,Diaper,Eggs,Meat,Milk,Pencil,Wine
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,1,1,1,1,1,0,1,1
1,0,1,1,1,0,1,1,1,1
2,0,0,1,0,1,1,1,0,1
3,0,0,1,0,1,1,1,0,1
4,0,0,0,0,0,1,0,1,1


Since the encoded dataframe contains a column for the empty (NaN) entries, we will drop the NaN column by name or, equivalently, select all columns other than the last one.

## Apriori Algorithm

We will use the Apriori algorithm to determine the frequently purchased products.
For this case study, we will use min_support=0.2.

In [16]:
# Set the threshold value used when computing support
from mlxtend.frequent_patterns import apriori, association_rules

# apriori expects a one-hot frame of True/False (or 0/1) values; casting to bool
# passes the documented input type and also guards against counts above 1.
apriori(transformed_df.astype(bool), min_support=0.2, use_colnames=True)

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.279365,"(Bread, Bagel)"


Then we will generate association rules from the frequent itemsets based on the confidence level, with threshold=0.6.

In [17]:
# Mine the frequent itemsets. apriori expects True/False (or 0/1) values, so the
# one-hot frame is cast to bool before it is passed in.
frequent_itemsets = apriori(transformed_df.astype(bool), min_support=0.2, use_colnames=True)

# Keep only the rules whose confidence reaches the threshold
confidence_threshold = 0.6
rules = association_rules(
    frequent_itemsets, metric="confidence", min_threshold=confidence_threshold
).drop(columns=["zhangs_metric"])  # non-mutating drop instead of inplace=True
rules

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203
2,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891
3,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754
4,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
5,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624
8,"(Eggs, Cheese)",(Meat),0.298413,0.47619,0.215873,0.723404,1.519149,0.073772,1.893773
9,"(Eggs, Meat)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667


Provide an explanation of __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__, and __conviction__, and the interpretation from the case above (please use a text section).


Antecedent support, consequent support, and support are key metrics in association rule mining that provide foundational insights into item relationships. Antecedent support measures the proportion of transactions that include the antecedent item(s), reflecting how frequently these items appear in the dataset. For example, if "Cheese" is present in 50% of all transactions, its antecedent support is 0.50, indicating that it is a commonly purchased item. Similarly, consequent support represents the proportion of transactions that contain the consequent item(s), showing their overall popularity. For instance, if "Milk" appears in 30% of transactions, its consequent support is 0.30, helping to understand the prevalence of the consequent independently of its relationship with the antecedent.

On the other hand, support measures the proportion of transactions where both the antecedent and consequent appear together. For example, if "Cheese" and "Wine" co-occur in 25% of transactions, the support is 0.25, indicating how frequently this combination is observed in the dataset. Together, these metrics provide a comprehensive picture of item relationships: antecedent and consequent support highlight the individual frequencies of items, while support shows the strength of their co-occurrence. High support suggests that the combination is common and relevant for actionable insights, whereas very low support might limit the practical utility of a rule even if other metrics like confidence or lift are high.
Confidence quantifies how often the consequent appears when the antecedent is present, with higher values indicating stronger likelihoods. For example, combinations like (Milk, Meat → Cheese) (83.12%) and (Eggs, Meat → Cheese) (80.95%) have high confidence, suggesting that customers frequently purchase these items together.

Lift evaluates how much more likely the consequent is to occur with the antecedent compared to its random occurrence, with values greater than 1 indicating a positive association. Strong lift values, such as 1.65 in (Milk, Meat → Cheese), highlight meaningful relationships that can be leveraged for creating product bundles or promotions. Leverage measures the degree to which item combinations exceed their expected occurrence by chance (the observed joint support minus the support expected if the antecedent and consequent were independent), with larger values pointing to significant co-occurrence. For instance, combinations like (Eggs, Meat → Cheese) and (Bagel → Bread) demonstrate actionable relationships for cross-selling. Meanwhile, conviction captures how strongly the consequent depends on the antecedent, with higher values (e.g., 2.95 for Milk, Meat → Cheese) signifying that the antecedent rarely appears without the consequent (i.e., the rule is seldom wrong), making the rule highly dependable for predictions.

Overall, the table reveals patterns of strong associations, such as between Eggs, Meat, and Cheese, which suggest opportunities for recommendations or bundling. Moderate associations, like (Cheese → Milk), offer weaker but still useful insights for promoting related products, while weaker rules may result from chance. These metrics—antecedent support, consequent support, support, confidence, lift, leverage, and conviction—are invaluable for applications like market basket analysis, where frequently bought-together items can inform cross-selling strategies, recommendation systems, and inventory planning.