# Fk-1 x F1 method

In [1]:
import pandas as pd
import itertools

#Generating Frequent 1 itemset
def generate_frequent_1_itemsets(transactions, min_support):
    """Return the frequent 1-itemsets of *transactions*.

    Args:
        transactions: Sized collection of iterables of hashable items.
        min_support: Minimum fraction of transactions (occurrences divided
            by ``len(transactions)``) an item needs to be kept.

    Returns:
        A set of single-element frozensets, one per frequent item.
    """
    total = len(transactions)

    # Tally how many times each item is seen across all transactions.
    tally = {}
    for basket in transactions:
        for item in basket:
            tally[item] = tally.get(item, 0) + 1

    # Keep only the items whose relative frequency reaches the threshold.
    frequent = set()
    for item, occurrences in tally.items():
        if occurrences / total >= min_support:
            frequent.add(frozenset([item]))
    return frequent

#Generating candidate k-itemsets: every pair of frequent (k-1)-itemsets whose union has exactly k items yields a candidate (for k = 2 this is F1 x F1)
#Pruning is included in this step: a candidate is kept only if all of its (k-1)-subsets are frequent
def generate_candidates(prev_itemsets, k):
    """Build the pruned candidate k-itemsets from the frequent (k-1)-itemsets.

    Args:
        prev_itemsets: Collection of frozensets found frequent at the
            previous level; also used for the subset membership check.
        k: Size of the candidates to generate.

    Returns:
        A set of frozensets of size ``k`` whose every (k-1)-subset is in
        ``prev_itemsets``.
    """
    candidates = set()

    # Every ordered pair (including an itemset with itself) is examined.
    for left, right in itertools.product(prev_itemsets, repeat=2):
        merged = left.union(right)
        if len(merged) != k:
            continue

        # Pruning: drop the candidate if any (k-1)-subset is not frequent.
        subsets = itertools.combinations(merged, k - 1)
        if all(frozenset(subset) in prev_itemsets for subset in subsets):
            candidates.add(merged)

    return candidates

#Support Counting
def count_support(transactions, candidate_itemsets):
    """Count in how many transactions each candidate itemset occurs.

    Args:
        transactions: Iterable of transactions (iterables of items).
        candidate_itemsets: Re-iterable collection of set-like itemsets
            exposing ``issubset``.

    Returns:
        A dict mapping each itemset that occurs at least once to its
        count; itemsets that never occur are absent from the result.
    """
    support = {}

    for basket in transactions:
        for candidate in candidate_itemsets:
            if not candidate.issubset(basket):
                continue
            support[candidate] = support.get(candidate, 0) + 1

    return support

#Association rule generation
def generate_association_rules(frequent_itemsets, transactions, min_confidence):
    """Generate single-antecedent association rules from frequent itemsets.

    For every itemset with more than one item and every item in it, the
    rule ``{item} => itemset - {item}`` is evaluated with
    ``confidence = count(itemset) / count({item})``.

    Args:
        frequent_itemsets: Iterable of frozensets.
        transactions: Collection of transactions passed to ``count_support``.
        min_confidence: Minimum confidence for a rule to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples, where
        antecedent and consequent are frozensets.
    """
    multi_item = [itemset for itemset in frequent_itemsets if len(itemset) > 1]

    # Collect every itemset whose count is needed so the transactions are
    # scanned once instead of twice per rule.
    needed = set(multi_item)
    for itemset in multi_item:
        needed.update(frozenset([item]) for item in itemset)
    support = count_support(transactions, needed)

    association_rules = []
    for itemset in multi_item:
        itemset_count = support.get(itemset)
        if not itemset_count:
            # The itemset never occurs in the transactions, so no rule can
            # be derived from it (and its confidence would be undefined).
            continue

        for item in itemset:
            a = frozenset([item])
            c = itemset - a
            # The antecedent occurs wherever the itemset does, so its
            # count is present and non-zero here.
            confidence = itemset_count / support[a]

            if confidence >= min_confidence:
                association_rules.append((a, c, confidence))

    return association_rules

#Apriori logic
def apriori(transactions, min_support):
    """Run the level-wise Apriori search over *transactions*.

    Args:
        transactions: Sized collection of transactions (sets of items).
        min_support: Minimum fraction of transactions an itemset must
            appear in to be considered frequent.

    Returns:
        A tuple ``(result, total_itemsets, total_candidates)``: the list
        of all frequent itemsets found, how many there are, and the number
        of candidates generated (the distinct items count as the level-1
        candidates).
    """
    num_transactions = len(transactions)

    current_level = generate_frequent_1_itemsets(transactions, min_support)
    result = list(current_level)
    total_itemsets = len(current_level)
    # Level-1 candidates are all the distinct items in the data.
    total_candidates = len(set().union(*transactions))

    size = 1
    while current_level:
        size += 1
        candidates = generate_candidates(current_level, size)
        total_candidates += len(candidates)

        counts = count_support(transactions, candidates)
        current_level = {
            itemset
            for itemset, count in counts.items()
            if count / num_transactions >= min_support
        }

        # Both updates are no-ops when the level is empty, which also
        # terminates the loop.
        total_itemsets += len(current_level)
        result.extend(current_level)

    return result, total_itemsets, total_candidates

# Test set used for self-testing
# transactions = [
#     {'Bread', 'Milk'},
#     {'Bread', 'Diaper', 'Beer', 'Eggs'},
#     {'Milk', 'Diaper', 'Beer', 'Coke'},
#     {'Bread', 'Milk', 'Diaper', 'Beer'},
#     {'Bread', 'Milk', 'Diaper', 'Coke'}
# ]

# Load the groceries data, skipping the 'Item(s)' column via the usecols filter.
# NOTE(review): 'Item(s)' is presumably a per-row item count rather than an item — confirm against the CSV.
df = pd.read_csv('/kaggle/input/groceries/groceries.csv', usecols=lambda column: column != 'Item(s)')

# Turn each row into a set of its non-null cell values; one set per transaction.
transactions = df.apply(lambda row: set(row.dropna()), axis=1).tolist()

# Minimum support, compared inside apriori against count / number of transactions.
min_support = 0.01
result, total_itemsets, total_candidates = apriori(transactions, min_support)

print(f"\nTotal number of frequent itemsets: {total_itemsets}")
print(f"Total number of candidates generated: {total_candidates} \n")

print("Frequent itemsets:")
for itemset in result:
    # Convert to a plain set so it prints as {...} instead of frozenset({...}).
    itemset = set(itemset)
    print(itemset)

# Minimum confidence a rule needs in order to be reported.
min_confidence = 0.1
association_rules = generate_association_rules(result, transactions, min_confidence)

print("\nAssociation Rules:")
for rule in association_rules:
    # Each rule is an (antecedent, consequent, confidence) tuple.
    print(f"{set(rule[0])} => {set(rule[1])} (Confidence: {rule[2]})")


Total number of frequent itemsets: 333
Total number of candidates generated: 4579 

Frequent itemsets:
{'beef'}
{'curd'}
{'roll products'}
{'specialty chocolate'}
{'meat'}
{'mustard'}
{'ham'}
{'shopping bags'}
{'pickled vegetables'}
{'seasonal products'}
{'frankfurter'}
{'cake bar'}
{'baking powder'}
{'rolls/buns'}
{'misc. beverages'}
{'packaged fruit/vegetables'}
{'whipped/sour cream'}
{'pip fruit'}
{'white bread'}
{'frozen dessert'}
{'grapes'}
{'coffee'}
{'soda'}
{'flower (seeds)'}
{'processed cheese'}
{'butter'}
{'hygiene articles'}
{'pasta'}
{'salty snack'}
{'butter milk'}
{'frozen vegetables'}
{'red/blush wine'}
{'chewing gum'}
{'detergent'}
{'cat food'}
{'white wine'}
{'long life bakery product'}
{'pastry'}
{'other vegetables'}
{'UHT-milk'}
{'cling film/bags'}
{'waffles'}
{'liquor'}
{'sliced cheese'}
{'napkins'}
{'sausage'}
{'domestic eggs'}
{'margarine'}
{'ice cream'}
{'bottled beer'}
{'frozen meals'}
{'hard cheese'}
{'potted plants'}
{'dessert'}
{'canned vegetables'}
{'brown b

# Fk-1 x Fk-1 method

In [2]:
import pandas as pd

def generate_frequent_1_itemsets(transactions, min_support):
    """Find the individual items that meet the minimum support.

    Args:
        transactions: Sized collection of iterables of hashable items.
        min_support: Minimum value of occurrences / ``len(transactions)``
            for an item to be reported.

    Returns:
        A set containing one single-element frozenset per frequent item.
    """
    num_transactions = len(transactions)

    occurrences = {}
    for transaction in transactions:
        for item in transaction:
            # First sighting starts the counter at zero before the bump.
            occurrences.setdefault(item, 0)
            occurrences[item] += 1

    def is_frequent(count):
        return count / num_transactions >= min_support

    return {
        frozenset([item])
        for item in occurrences
        if is_frequent(occurrences[item])
    }

#Same code as above, just changed the candidate generation logic for Fk-1 x Fk-1 method
def generate_candidates(frequent_itemsets, k):
    """Generate candidate k-itemsets with the F(k-1) x F(k-1) join.

    Two (k-1)-itemsets are merged when, in sorted order, their first k-2
    items are identical. Each itemset is sorted before the comparison, so
    the result does not depend on the iteration order of frozensets or on
    how an input tuple happens to be ordered.

    Args:
        frequent_itemsets: Iterable of frequent (k-1)-itemsets, each an
            iterable of mutually orderable items (frozenset or tuple).
        k: Size of the candidates to generate.

    Returns:
        A set of sorted tuples, each holding k items.
    """
    # Group the itemsets by their sorted (k-2)-item prefix; only itemsets
    # in the same group can be joined, which avoids comparing every pair.
    by_prefix = {}
    for itemset in frequent_itemsets:
        ordered = tuple(sorted(itemset))
        by_prefix.setdefault(ordered[:-1], []).append(ordered)

    candidates = set()
    for group in by_prefix.values():
        for index, first in enumerate(group):
            for second in group[index + 1:]:
                new_candidate = tuple(sorted(set(first) | set(second)))
                # Guards against duplicated inputs, whose union is too small.
                if len(new_candidate) == k:
                    candidates.add(new_candidate)

    return candidates

def count_support(transactions, candidates):
    """Count in how many transactions each candidate itemset occurs.

    Args:
        transactions: Iterable of transactions (iterables of items).
        candidates: Iterable of hashable itemsets (tuples or frozensets).

    Returns:
        A dict mapping every distinct candidate to the number of
        transactions that contain all of its items; candidates that never
        occur map to 0.
    """
    # Convert each distinct candidate to a set once. Keying by candidate
    # also removes duplicates, so a repeated candidate is not counted
    # several times for the same transaction.
    candidate_sets = {candidate: frozenset(candidate) for candidate in candidates}
    item_counts = {candidate: 0 for candidate in candidate_sets}

    for transaction in transactions:
        # One conversion per transaction instead of one per candidate.
        basket = set(transaction)
        for candidate, members in candidate_sets.items():
            if members <= basket:
                item_counts[candidate] += 1

    return item_counts

def generate_association_rules(frequent_itemsets, transactions, min_confidence):
    """Generate single-antecedent association rules from frequent itemsets.

    For every itemset with more than one item and every item in it, the
    rule ``{item} => itemset - {item}`` is evaluated with
    ``confidence = count(itemset) / count({item})``.

    Args:
        frequent_itemsets: Iterable of hashable itemsets (tuples or
            frozensets).
        transactions: Collection of transactions passed to ``count_support``.
        min_confidence: Minimum confidence for a rule to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples, where
        antecedent and consequent are frozensets.
    """
    multi_item = [itemset for itemset in frequent_itemsets if len(itemset) > 1]

    # Collect every itemset whose count is needed so the transactions are
    # scanned once instead of twice per rule.
    needed = set(multi_item)
    for itemset in multi_item:
        needed.update(frozenset([item]) for item in itemset)
    support = count_support(transactions, needed)

    association_rules = []
    for itemset in multi_item:
        itemset_count = support[itemset]

        for item in itemset:
            a = frozenset([item])
            c = frozenset(itemset) - a
            antecedent_count = support[a]
            if antecedent_count == 0:
                # Confidence is undefined when the antecedent never occurs.
                continue
            confidence = itemset_count / antecedent_count

            if confidence >= min_confidence:
                association_rules.append((a, c, confidence))

    return association_rules

def apriori(transactions, min_support):
    """Run the level-wise Apriori search over *transactions*.

    Args:
        transactions: Sized collection of transactions (sets of items).
        min_support: Minimum fraction of transactions an itemset must
            appear in to be considered frequent.

    Returns:
        A tuple ``(result, total_itemsets, total_candidates)``: the list
        of all frequent itemsets found, how many there are, and the number
        of candidates generated (the distinct items count as the level-1
        candidates).
    """
    num_transactions = len(transactions)

    level = generate_frequent_1_itemsets(transactions, min_support)
    result = list(level)
    total_itemsets = len(level)
    # Level-1 candidates are all the distinct items in the data.
    total_candidates = len(set().union(*transactions))

    k = 1
    while level:
        k += 1
        candidates = generate_candidates(level, k)
        total_candidates += len(candidates)

        support = count_support(transactions, candidates)
        level = {
            itemset
            for itemset, count in support.items()
            if count / num_transactions >= min_support
        }

        # Both updates are no-ops when the level is empty, which also
        # terminates the loop.
        total_itemsets += len(level)
        result.extend(level)

    return result, total_itemsets, total_candidates

# Test set used for self-testing
# transactions = [
#     {'Bread', 'Milk'},
#     {'Bread', 'Diaper', 'Beer', 'Eggs'},
#     {'Milk', 'Diaper', 'Beer', 'Coke'},
#     {'Bread', 'Milk', 'Diaper', 'Beer'},
#     {'Bread', 'Milk', 'Diaper', 'Coke'}
# ]

# Load the groceries data, skipping the 'Item(s)' column via the usecols filter.
# NOTE(review): 'Item(s)' is presumably a per-row item count rather than an item — confirm against the CSV.
df = pd.read_csv('/kaggle/input/groceries/groceries.csv', usecols=lambda column: column != 'Item(s)')

# Turn each row into a set of its non-null cell values; one set per transaction.
transactions = df.apply(lambda row: set(row.dropna()), axis=1).tolist()

# Minimum support, compared inside apriori against count / number of transactions.
min_support = 0.01
result, total_itemsets, total_candidates = apriori(transactions, min_support)

print(f"\nTotal number of frequent itemsets: {total_itemsets}")
print(f"Total number of candidates generated: {total_candidates} \n")

print("Frequent itemsets:")
for itemset in result:
    # Itemsets may be frozensets or tuples here; convert to a plain set for uniform printing.
    itemset = set(itemset)
    print(itemset)

# Minimum confidence a rule needs in order to be reported.
min_confidence = 0.1
association_rules = generate_association_rules(result, transactions, min_confidence)

print("\nAssociation Rules:")
for rule in association_rules:
    # Each rule is an (antecedent, consequent, confidence) tuple.
    print(f"{set(rule[0])} => {set(rule[1])} (Confidence: {rule[2]})")


Total number of frequent itemsets: 333
Total number of candidates generated: 4674 

Frequent itemsets:
{'beef'}
{'curd'}
{'roll products'}
{'specialty chocolate'}
{'meat'}
{'mustard'}
{'ham'}
{'shopping bags'}
{'pickled vegetables'}
{'seasonal products'}
{'frankfurter'}
{'cake bar'}
{'baking powder'}
{'rolls/buns'}
{'misc. beverages'}
{'packaged fruit/vegetables'}
{'whipped/sour cream'}
{'pip fruit'}
{'white bread'}
{'frozen dessert'}
{'grapes'}
{'coffee'}
{'soda'}
{'flower (seeds)'}
{'processed cheese'}
{'butter'}
{'hygiene articles'}
{'pasta'}
{'salty snack'}
{'butter milk'}
{'frozen vegetables'}
{'red/blush wine'}
{'chewing gum'}
{'detergent'}
{'cat food'}
{'white wine'}
{'long life bakery product'}
{'pastry'}
{'other vegetables'}
{'UHT-milk'}
{'cling film/bags'}
{'waffles'}
{'liquor'}
{'sliced cheese'}
{'napkins'}
{'sausage'}
{'domestic eggs'}
{'margarine'}
{'ice cream'}
{'bottled beer'}
{'frozen meals'}
{'hard cheese'}
{'potted plants'}
{'dessert'}
{'canned vegetables'}
{'brown b