Name: Hope Kimandi
ADM no: 670317

In [1]:
# Student: Hope

import csv                     # used to save the generated transactions into a CSV file
import random                  # used to randomly select items for each transaction
import pandas as pd            # imported for creating tables
import itertools               # imported for generating combinations
import math                    # imported for math operations 

random.seed(42)                # fixed seed: every run produces the same transactions

num_transactions = 4000        # how many supermarket baskets to simulate
min_items = 2                  # smallest allowed basket size
max_items = 7                  # largest allowed basket size

# catalogue of 33 distinct products a customer can choose from
# (order matters: random.sample picks by position, so keep it stable)
item_pool = [
    'apples', 'grapes', 'bananas', 'juice', 'oranges',
    'tomatoes', 'kiwi', 'cookies', 'milk', 'potatoes',
    'yogurt', 'rice', 'onion', 'cereal', 'flour', 'cakes',
    'bread', 'butter', 'peanut butter', 'sugar', 'salt',
    'coffee', 'tea', 'chocolate', 'eggs', 'soap', 'pens', 'pencil',
    'shampoo', 'toothpaste', 'detergent', 'napkins', 'olive oil',
]

# Build all baskets in one pass. For each basket the size is drawn first
# (randint is evaluated as an argument) and then that many distinct products
# are sampled, so random numbers are consumed in the order size -> sample.
transactions = [
    random.sample(item_pool, random.randint(min_items, max_items))
    for _ in range(num_transactions)
]

# Persist the baskets: one CSV row per transaction, with a 1-based id and
# the basket's products packed into a single cell separated by ";".
with open('supermarket_transactions.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['transaction_id', 'items'])   # header row
    writer.writerows(
        [transaction_id, ';'.join(basket)]
        for transaction_id, basket in enumerate(transactions, start=1)
    )

print('Saved supermarket_transactions.csv')    # confirmation for the notebook output


Saved supermarket_transactions.csv


In [2]:
# Student: Hope

# Build the one-hot encoded table (intended for Apriori):
#   rows    = one per transaction, in generation order
#   columns = every product in item_pool, in pool order
#   cell    = 1 if the product is in that basket, otherwise 0
one_hot_rows = []
for basket in transactions:
    bought = set(basket)                         # set gives O(1) membership tests
    one_hot_rows.append([1 if product in bought else 0 for product in item_pool])

one_hot = pd.DataFrame(
    one_hot_rows,
    index=range(len(transactions)),
    columns=item_pool,
    dtype='int64',                               # integer 0/1 flags
)

# write the encoded table; the row index becomes the 'transaction_id' column
one_hot.to_csv('supermarket_onehot.csv', index_label='transaction_id')

# confirmation for the notebook output
print('Saved supermarket_onehot.csv')

Saved supermarket_onehot.csv


In [3]:
# Student: Hope

# minimum support threshold: an itemset must occur in at least 5% of all transactions
min_support = 0.05

# Convert the fractional threshold into an absolute transaction count, rounding UP.
# The product is rounded to 9 decimals before ceil() because binary floating
# point can land a hair above the exact value (e.g. 0.07 * 100 evaluates to
# 7.000000000000001), which would otherwise demand one transaction too many.
min_count = math.ceil(round(min_support * len(transactions), 9))

def get_frequent_1_itemsets(transactions_list, threshold=None):
    """Return the frequent single-item itemsets with their support counts.

    Support is the number of *transactions* that contain the item, so an item
    repeated inside one transaction is counted once for that transaction.

    Args:
        transactions_list: iterable of transactions, each an iterable of
            hashable items.
        threshold: minimum number of transactions an item must appear in to
            be kept. When omitted, the module-level ``min_count`` is used.

    Returns:
        dict mapping ``frozenset([item])`` to its support count, containing
        only items whose count is at least the threshold. Keys are frozensets
        so they can be used as dictionary keys and combined with set algebra.
    """
    if threshold is None:
        threshold = min_count                    # fall back to the notebook-wide minimum count

    counts = {}
    for t in transactions_list:
        # dict.fromkeys() drops repeats while keeping first-seen order, so the
        # result order stays deterministic (a plain set would not guarantee that)
        for item in dict.fromkeys(t):
            counts[item] = counts.get(item, 0) + 1

    return {
        frozenset([item]): cnt
        for item, cnt in counts.items()
        if cnt >= threshold
    }

def apriori_gen(prev_freq_itemsets, k):
    """Generate size-k candidate itemsets from the frequent (k-1)-itemsets.

    Implements the two classic Apriori steps:
      * join  - union every unordered pair of previous itemsets and keep the
                unions that have exactly k items;
      * prune - discard a union unless every one of its (k-1)-item subsets is
                itself a key of ``prev_freq_itemsets``.

    Args:
        prev_freq_itemsets: dict whose keys are the frequent itemsets
            (frozensets) of size k-1; the values are not used here.
        k: size of the candidate itemsets to produce.

    Returns:
        set of frozensets, one per surviving candidate.
    """
    survivors = set()

    # combinations(..., 2) visits each unordered pair of keys exactly once
    for left, right in itertools.combinations(prev_freq_itemsets, 2):
        merged = left | right
        if len(merged) != k:                     # pair does not join into a k-itemset
            continue

        # prune: a candidate with any infrequent (k-1)-subset cannot be frequent
        every_subset_frequent = all(
            frozenset(subset) in prev_freq_itemsets
            for subset in itertools.combinations(merged, k - 1)
        )
        if every_subset_frequent:
            survivors.add(merged)

    return survivors


def count_candidates(candidates, transactions_list, threshold=None):
    """Count the support of each candidate itemset and keep the frequent ones.

    Args:
        candidates: iterable of candidate itemsets (frozensets). It is read
            only once, so a one-shot iterator is accepted.
        transactions_list: iterable of transactions, each an iterable of
            hashable items.
        threshold: minimum number of containing transactions a candidate
            needs to be kept. When omitted, the module-level ``min_count``
            is used.

    Returns:
        dict mapping each candidate that meets the threshold to the number
        of transactions that contain it.
    """
    if threshold is None:
        threshold = min_count                    # fall back to the notebook-wide minimum count

    counts = {c: 0 for c in candidates}          # every candidate starts at zero

    for t in transactions_list:
        tset = set(t)                            # set makes the subset test fast
        # iterate the dict, not `candidates`: the latter may already be an
        # exhausted iterator, which would silently leave every count at zero
        for c in counts:
            if c.issubset(tset):
                counts[c] += 1

    return {c: cnt for c, cnt in counts.items() if cnt >= threshold}

# Main Apriori driver: grow the frequent itemsets one size at a time

# level 1: the frequent single items seed both the result and the first join
L1 = get_frequent_1_itemsets(transactions)
frequent_itemsets = dict(L1)       # accumulates the frequent itemsets of every size
prev_L = L1                        # frequent itemsets found at the previous level

# k is the itemset size being attempted; it starts at 2 and keeps the value of
# the level that finally came up empty once the loop exits
for k in itertools.count(2):
    candidates_k = apriori_gen(prev_L, k)               # join + prune from level k-1

    if not candidates_k:                                # nothing left to try -> done
        break

    Lk = count_candidates(candidates_k, transactions)   # keep candidates meeting min support

    if not Lk:                                          # no candidate was frequent -> done
        break

    frequent_itemsets.update(Lk)                        # record this level's itemsets
    prev_L = Lk                                         # they feed the next level's join


# Convert results into a DataFrame and save

total_transactions = len(transactions)                   # denominator for relative support

# one output row per frequent itemset
fi_rows = [
    {
        'itemset': ','.join(sorted(itemset)),            # sorted so the text form is canonical
        'support_count': cnt,                            # raw number of containing transactions
        'support': cnt / total_transactions,             # fraction of all transactions
    }
    for itemset, cnt in frequent_itemsets.items()
]

# Columns are named explicitly so that an empty result (no itemset reached the
# minimum support) still yields a frame with a 'support' column to sort on,
# instead of raising KeyError, and the CSV still gets its header row.
fi_df = pd.DataFrame(
    fi_rows, columns=['itemset', 'support_count', 'support']
).sort_values(
    by='support', ascending=False
).reset_index(drop=True)

fi_df.to_csv('frequent_itemsets.csv', index=False)       # saving frequent itemsets to CSV

print('Saved frequent_itemsets.csv')                     # confirmation message


Saved frequent_itemsets.csv


In [4]:
# Student: Hope

# Load the frequent itemsets that the Apriori step wrote to disk
fi_df = pd.read_csv('frequent_itemsets.csv')

# Lookup table: itemset (frozenset of item names) -> support count.
# The 'itemset' column stores the items as one comma-separated string.
fi_count_map = {}
for itemset_text, support_count in zip(fi_df['itemset'], fi_df['support_count']):
    fi_count_map[frozenset(itemset_text.split(','))] = int(support_count)


# Closed frequent itemsets
# An itemset is closed when no proper superset has exactly the same support count.

closed_rows = []  # rows for the closed-itemset CSV

for itemset_text, raw_count in zip(fi_df['itemset'], fi_df['support_count']):
    A_items = frozenset(itemset_text.split(','))    # parse the comma-separated items
    A_count = int(raw_count)                        # support count of this itemset

    # A is "absorbed" if some frequent proper superset occurs equally often
    absorbed = any(
        A_items < B_items and A_count == B_count
        for B_items, B_count in fi_count_map.items()
    )
    if absorbed:
        continue                                    # not closed -> skip it

    closed_rows.append({
        'itemset': ','.join(sorted(A_items)),
        'support_count': A_count,
        'support': A_count / len(transactions)
    })

# write the closed itemsets out
closed_df = pd.DataFrame(closed_rows)
closed_df.to_csv('closed_itemsets.csv', index=False)
print('Saved closed_itemsets.csv')                     # confirmation

# Identify Maximal Frequent Itemsets
# A maximal itemset has no frequent proper superset

maximal_rows = []                                      # rows for the maximal-itemset CSV
fi_sets = [frozenset(s.split(',')) for s in fi_df['itemset']]  # every frequent itemset as a frozenset

for A in fi_sets:
    if any(A < B for B in fi_sets):                   # some frequent proper superset exists
        continue                                      # -> A is not maximal

    # Take the count straight from the itemset -> count map. This avoids
    # scanning the whole DataFrame for a rebuilt string key on every hit and
    # does not depend on how the items are ordered in the CSV text.
    cnt = fi_count_map[A]
    maximal_rows.append({
        'itemset': ','.join(sorted(A)),
        'support_count': cnt,
        'support': cnt / len(transactions)
    })

# convert to DataFrame and save to CSV
maximal_df = pd.DataFrame(maximal_rows)
maximal_df.to_csv('maximal_itemsets.csv', index=False)   # save maximal itemsets
print('Saved maximal_itemsets.csv')                      # confirmation


Saved closed_itemsets.csv
Saved maximal_itemsets.csv
