In [4]:
# 📘 Frequent Itemsets Analysis - Group 10
# Kevin Korir - Data Simulation Task

# %% [markdown]
# ## 🛍️ PART 1: Simulate Supermarket Transactions
# [Student: Kevin] This section generates 3000 fake supermarket transactions.
# Each transaction contains 2–7 items randomly selected from a pool of 30 items.
# The output is saved to `supermarket_transactions.csv`.

# %%
# [Student: Kevin] Import required libraries
import random
import pandas as pd

# [Student: Kevin] Define a list of 30+ unique supermarket items
items = [
    'Milk', 'Bread', 'Butter', 'Cheese', 'Eggs', 'Juice', 'Apples', 'Bananas',
    'Chicken', 'Beef', 'Fish', 'Rice', 'Pasta', 'Tomatoes', 'Onions', 'Potatoes',
    'Carrots', 'Peppers', 'Yogurt', 'Cereal', 'Chips', 'Soda', 'Cookies', 'Ice Cream',
    'Toilet Paper', 'Soap', 'Shampoo', 'Toothpaste', 'Detergent', 'Coffee', 'Tea', 'Water'
]

# [Student: Kevin] Generate 3000 transactions
transactions = []
for _ in range(3000):
    transaction = random.sample(items, k=random.randint(2, 7))  # 2 to 7 random items
    transactions.append(transaction)

# [Student: Kevin] Convert list of transactions into a DataFrame for inspection
transaction_df = pd.DataFrame({'Transaction': transactions})
transaction_df.head()

# [Student: Kevin] Save transactions to CSV (as comma-separated items per row)
transaction_df.to_csv("supermarket_transactions.csv", index=False)
print("✅ supermarket_transactions.csv created with 3000 transactions.")

# %% [markdown]
# ## 🔢 PART 2 and Beyond (To be implemented by other group members)
# [Student: Margret] One-hot encode transactions for Apriori
# [Student: Geoffrey] Generate frequent itemsets using apriori()
# [Student: Bricole] Identify closed frequent itemsets
# [Student: Bricole] Identify maximal frequent itemsets
# Each member must add inline comments to their section of code
# All output files will be saved in .csv format as required


✅ supermarket_transactions.csv created with 3000 transactions.


# 2 Generate frequent itemsets using apriori()

## 2.1 Install & Import Libraries

In [None]:
# using pip to install required libraries
pip install pandas mlxtend

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

## 2.2 Loading csv

In [4]:
df = pd.read_csv('supermarket_transactions.csv')
df.head()

Unnamed: 0,Transaction
0,"['Toilet Paper', 'Shampoo', 'Carrots', 'Chips'..."
1,"['Tomatoes', 'Carrots', 'Bananas', 'Bread', 'C..."
2,"['Peppers', 'Soda', 'Carrots', 'Toothpaste', '..."
3,"['Water', 'Yogurt', 'Tomatoes', 'Carrots', 'To..."
4,"['Eggs', 'Peppers', 'Carrots']"


## 2.3 One-Hot Encode

In [5]:
# Turn the comma‑separated string into a list
transactions = df['Transaction'].str.split(',')

In [6]:
# One-Hot Encode
encoder = TransactionEncoder() # initialize the encoder
df_encoded = pd.DataFrame(encoder.fit(transactions).transform(transactions), columns=encoder.columns_)

## 2.4 Frequent itemsets

In [7]:
# Mine for frequent itemsets with a minimum support threshold
#  - min_support=0.05 means “itemsets appearing in at least 5% of transactions”
#  - use_colnames=True returns item names instead of column indices
frequent_itemsets = apriori(df_encoded, min_support=0.05, use_colnames=True)
frequent_itemsets.sort_values('support', ascending=False).head(10)


Unnamed: 0,support,itemsets
30,0.087333,( 'Water')
21,0.085667,( 'Potatoes')
24,0.084,( 'Soap')
0,0.083333,( 'Apples')
2,0.082333,( 'Beef')
28,0.081667,( 'Tomatoes')
19,0.081,( 'Pasta')
27,0.080333,( 'Toilet Paper')
1,0.079333,( 'Bananas')
22,0.079333,( 'Rice')
