In [2]:
import csv

def read_products(file_path):
    """
    Reads product names from an external CSV file.
    Each row must contain one product.
    Returns a list of product names.
    """
    products = []
    with open(file_path, mode="r", newline="") as file:
        reader = csv.reader(file)
        for row in reader:
            if row and row[0].strip():
                products.append(row[0].strip())
    return products

def generate_transaction(product_list, txn_index, items_per_txn=3):
    """
    Deterministically generates a single transaction.
    Uses a cyclic selection of products from the product_list based on txn_index.
    """
    txn = []
    n = len(product_list)
    for i in range(items_per_txn):
        prod_index = (txn_index + i) % n
        txn.append(product_list[prod_index])
    return txn

def create_database(product_list, offset, total_txns=20, items_per_txn=3):
    """
    Builds a database of transactions.
    The offset ensures that each database is slightly different.
    """
    db = []
    for t in range(total_txns):
        txn = generate_transaction(product_list, txn_index=t + offset, items_per_txn=items_per_txn)
        db.append(txn)
    return db

def export_database_csv(db, file_name):
    """
    Writes the database of transactions to a CSV file.
    The CSV will have two columns: TransactionID and Items.
    Items are joined by a semicolon.
    """
    with open(file_name, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["TransactionID", "Items"])
        for idx, txn in enumerate(db, start=1):
            writer.writerow([idx, ";".join(txn)])

def main():
    # Read products from external file (do not hard-code items here)
    product_file = "products.csv"
    product_names = read_products(product_file)
    if len(product_names) < 10:
        print("Error: At least 10 products are required. Found:", len(product_names))
        return

    # Generate 5 different databases (each with 20 transactions)
    num_dbs = 5
    for db_num in range(1, num_dbs + 1):
        db_offset = db_num  # Different offset for each database
        database = create_database(product_names, offset=db_offset, total_txns=20, items_per_txn=3)
        csv_file_name = f"db_transactions_{db_num}.csv"
        export_database_csv(database, csv_file_name)
        print(f"Database {db_num} created and saved as '{csv_file_name}' with 20 transactions.")

if __name__ == "__main__":
    main()


Database 1 created and saved as 'db_transactions_1.csv' with 20 transactions.
Database 2 created and saved as 'db_transactions_2.csv' with 20 transactions.
Database 3 created and saved as 'db_transactions_3.csv' with 20 transactions.
Database 4 created and saved as 'db_transactions_4.csv' with 20 transactions.
Database 5 created and saved as 'db_transactions_5.csv' with 20 transactions.


In [3]:
import csv
from itertools import combinations

def load_transactions(filename):
    """
    Load transactions from a CSV file.
    Assumes a header with columns "TransactionID" and "Items",
    where items are semicolon-separated.
    """
    transactions = []
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip header
        for row in reader:
            items = row[1].split(';')
            transactions.append(set(items))
    return transactions

def get_support(itemset, transactions):
    """
    Count the number of transactions that contain the itemset.
    """
    return sum(1 for transaction in transactions if itemset.issubset(transaction))

def generate_candidates(prev_frequents, k):
    """
    Generate candidate k-itemsets by joining frequent (k-1)-itemsets.
    Only candidates that have the correct size (k) are retained.
    """
    candidates = set()
    list_prev = list(prev_frequents)
    for i in range(len(list_prev)):
        for j in range(i + 1, len(list_prev)):
            candidate = list_prev[i].union(list_prev[j])
            if len(candidate) == k:
                candidates.add(candidate)
    return candidates

def apriori_frequent_itemsets(transactions, min_support):
    """
    Generate all frequent itemsets using the Apriori algorithm.
    """
    # First, count individual items to get frequent 1-itemsets
    item_counts = {}
    for transaction in transactions:
        for item in transaction:
            item_counts[item] = item_counts.get(item, 0) + 1

    frequents = {frozenset([item]): count
                 for item, count in item_counts.items()
                 if count >= min_support}
    all_frequents = dict(frequents)
    k = 2

    while frequents:
        # Generate candidate k-itemsets from frequent (k-1)-itemsets
        candidates = generate_candidates(list(frequents.keys()), k)
        candidate_counts = {}
        for candidate in candidates:
            count = get_support(candidate, transactions)
            if count >= min_support:
                candidate_counts[frozenset(candidate)] = count

        # Update for the next level
        frequents = candidate_counts
        all_frequents.update(frequents)
        k += 1

    return all_frequents

def generate_association_rules(frequent_itemsets, transactions, min_confidence):
    """
    Generate association rules from frequent itemsets.
    For each frequent itemset of size >= 2, consider every non-empty
    proper subset as the antecedent and compute the confidence.
    """
    rules = []
    for itemset, itemset_support in frequent_itemsets.items():
        if len(itemset) < 2:
            continue  # Rules require at least 2 items
        # Iterate over all possible non-empty proper subsets
        for i in range(1, len(itemset)):
            for antecedent in combinations(itemset, i):
                antecedent = frozenset(antecedent)
                consequent = itemset - antecedent
                antecedent_support = frequent_itemsets.get(antecedent, get_support(antecedent, transactions))
                confidence = itemset_support / antecedent_support
                if confidence >= min_confidence:
                    rules.append((antecedent, consequent, itemset_support, confidence))
    return rules

def process_file(filename, min_support, min_confidence):
    transactions = load_transactions(filename)
    print(f"Processing file: {filename}")

    frequent_itemsets = apriori_frequent_itemsets(transactions, min_support)
    print("Frequent Itemsets:")
    for itemset, support in frequent_itemsets.items():
        print(f"  {set(itemset)}: support = {support}")

    rules = generate_association_rules(frequent_itemsets, transactions, min_confidence)
    print("\nAssociation Rules:")
    for antecedent, consequent, support, confidence in rules:
        print(f"  {set(antecedent)} -> {set(consequent)} (support: {support}, confidence: {confidence:.2f})")
    print("-" * 40)

def main():
    min_support = 2      # Minimum support count
    min_confidence = 0.5 # Minimum confidence threshold
    transaction_files = [f"db_transactions_{i}.csv" for i in range(1, 6)]

    for filename in transaction_files:
        process_file(filename, min_support, min_confidence)

if __name__ == "__main__":
    main()


Processing file: db_transactions_1.csv
Frequent Itemsets:
  {'Pasta'}: support = 5
  {'Olive Oil'}: support = 6
  {'Rice'}: support = 4
  {'Coffee'}: support = 6
  {'Tea'}: support = 6
  {'Sugar'}: support = 6
  {'Salt'}: support = 6
  {'Flour'}: support = 6
  {'Butter'}: support = 6
  {'Cheese'}: support = 5
  {'\ufeffProduct'}: support = 4
  {'Flour', 'Sugar'}: support = 2
  {'Sugar', 'Tea'}: support = 4
  {'Cheese', 'Flour'}: support = 2
  {'Pasta', 'Coffee'}: support = 2
  {'Tea', 'Coffee'}: support = 4
  {'Pasta', 'Rice'}: support = 3
  {'Salt', 'Tea'}: support = 2
  {'Olive Oil', 'Rice'}: support = 2
  {'Butter', '\ufeffProduct'}: support = 2
  {'Salt', 'Flour'}: support = 4
  {'Cheese', 'Butter'}: support = 4
  {'Olive Oil', 'Tea'}: support = 2
  {'Pasta', 'Olive Oil'}: support = 4
  {'Salt', 'Sugar'}: support = 4
  {'Sugar', 'Coffee'}: support = 2
  {'\ufeffProduct', 'Rice'}: support = 2
  {'Cheese', '\ufeffProduct'}: support = 3
  {'Salt', 'Butter'}: support = 2
  {'Flour', 'B

In [4]:
import csv
import itertools
import pandas as pd
import time
import sys
from mlxtend.frequent_patterns import apriori as mlxtend_apriori, association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder

class TransactionData:
    @staticmethod
    def load_csv(file_path):
        """
        Load transactions from a CSV file.
        Expects a header with "TransactionID" and "Items" columns.
        Items should be semicolon-separated.
        """
        transactions = []
        try:
            with open(file_path, 'r', newline='') as file:
                csv_reader = csv.DictReader(file)
                for row in csv_reader:
                    items_field = row.get("Items", "")
                    if items_field:
                        items = [item.strip() for item in items_field.split(';') if item.strip()]
                        transactions.append(set(items))
            return transactions
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            sys.exit(1)

    @staticmethod
    def to_one_hot(transactions):
        """
        Convert a list of transaction sets into a one-hot encoded DataFrame.
        """
        transaction_list = [list(tr) for tr in transactions]
        encoder = TransactionEncoder()
        encoded_array = encoder.fit_transform(transaction_list)
        return pd.DataFrame(encoded_array, columns=encoder.columns_)

def compute_support(candidate, transactions):
    """
    Compute the support count for a candidate itemset.
    """
    return sum(1 for tr in transactions if candidate.issubset(tr))

def compute_frequent_itemsets_brute(transactions, min_support_count):
    """
    Enumerate candidate itemsets of increasing sizes and record those meeting the support threshold.
    Returns a dictionary mapping frozenset(itemset) to its support count.
    """
    all_items = set()
    for tr in transactions:
        all_items.update(tr)

    frequent_sets = {}
    size = 1
    candidates = [frozenset([itm]) for itm in all_items]

    while candidates:
        current_frequents = {}
        for cand in candidates:
            supp = compute_support(cand, transactions)
            if supp >= min_support_count:
                current_frequents[cand] = supp
        if not current_frequents:
            break
        frequent_sets.update(current_frequents)
        size += 1
        candidates = [frozenset(combo) for combo in itertools.combinations(all_items, size)]
    return frequent_sets

def derive_rules_brute(frequent_sets, transactions, min_conf):
    """
    Generate association rules from frequent itemsets using brute-force.
    Each rule (antecedent => consequent) is output if its confidence is at least min_conf.
    Returns a list of tuples: (antecedent, consequent, support, confidence).
    """
    rules = []
    for itemset, itemset_support in frequent_sets.items():
        if len(itemset) < 2:
            continue  # Cannot derive a rule from a singleton
        for i in range(1, len(itemset)):
            for antecedent in itertools.combinations(itemset, i):
                antecedent = frozenset(antecedent)
                consequent = itemset - antecedent
                if not consequent:
                    continue
                antecedent_support = frequent_sets.get(antecedent, compute_support(antecedent, transactions))
                if antecedent_support > 0:
                    confidence = itemset_support / antecedent_support
                    if confidence >= min_conf:
                        rules.append((antecedent, consequent, itemset_support, confidence))
    return rules

def display_rules(rules, method_label):
    print(f"\nAssociation Rules via {method_label}:")
    for antecedent, consequent, supp, conf in rules:
        print(f"  {set(antecedent)} => {set(consequent)} (support: {supp}, confidence: {conf:.2f})")

def execute_brute_force(transactions, support_frac, min_conf):
    total_transactions = len(transactions)
    min_support_count = max(1, int(total_transactions * support_frac))
    start = time.perf_counter()
    freq_sets = compute_frequent_itemsets_brute(transactions, min_support_count)
    rules = derive_rules_brute(freq_sets, transactions, min_conf)
    elapsed = time.perf_counter() - start
    print(f"\nBrute-Force Method completed in {elapsed:.4f} seconds.")
    display_rules(rules, "Brute-Force")
    return rules, elapsed

def execute_mlxtend_apriori(onehot_df, support_frac, min_conf):
    start = time.perf_counter()
    freq_itemsets = mlxtend_apriori(onehot_df, min_support=support_frac, use_colnames=True)
    freq_itemsets['length'] = freq_itemsets['itemsets'].apply(len)
    rules_df = association_rules(freq_itemsets, metric="confidence", min_threshold=min_conf)
    elapsed = time.perf_counter() - start
    print(f"\nMLXtend Apriori completed in {elapsed:.4f} seconds.")
    print("\nAssociation Rules via MLXtend Apriori:")
    for _, row in rules_df.iterrows():
        ant = set(row['antecedents'])
        cons = set(row['consequents'])
        print(f"  {ant} => {cons} (support: {row['support']:.2f}, confidence: {row['confidence']:.2f})")
    return rules_df, elapsed

def execute_mlxtend_fpgrowth(onehot_df, support_frac, min_conf):
    start = time.perf_counter()
    freq_itemsets = fpgrowth(onehot_df, min_support=support_frac, use_colnames=True)
    freq_itemsets['length'] = freq_itemsets['itemsets'].apply(len)
    rules_df = association_rules(freq_itemsets, metric="confidence", min_threshold=min_conf)
    elapsed = time.perf_counter() - start
    print(f"\nMLXtend FP-Growth completed in {elapsed:.4f} seconds.")
    print("\nAssociation Rules via MLXtend FP-Growth:")
    for _, row in rules_df.iterrows():
        ant = set(row['antecedents'])
        cons = set(row['consequents'])
        print(f"  {ant} => {cons} (support: {row['support']:.2f}, confidence: {row['confidence']:.2f})")
    return rules_df, elapsed

def main():
    files_input = input("Enter CSV file paths (comma-separated): ").strip()
    file_paths = [path.strip() for path in files_input.split(',') if path.strip()]
    support_input = input("Enter minimum support (fraction, e.g., 0.05): ").strip()
    conf_input = input("Enter minimum confidence (fraction, e.g., 0.5): ").strip()
    try:
        support_frac = float(support_input)
        min_conf = float(conf_input)
    except Exception as e:
        print("Invalid input for support or confidence.", e)
        sys.exit(1)

    for file in file_paths:
        print(f"\nProcessing file: {file}")
        transactions = TransactionData.load_csv(file)
        print(f"Loaded {len(transactions)} transactions. Sample transactions:")
        for tr in transactions[:5]:
            print("  ", tr)

        onehot_df = TransactionData.to_one_hot(transactions)

        brute_rules, brute_time = execute_brute_force(transactions, support_frac, min_conf)
        apriori_rules, apriori_time = execute_mlxtend_apriori(onehot_df, support_frac, min_conf)
        fpgrowth_rules, fpgrowth_time = execute_mlxtend_fpgrowth(onehot_df, support_frac, min_conf)

        print("\nSummary for", file)
        print(f"  Brute-Force: {brute_time:.4f} sec, Rules: {len(brute_rules)}")
        print(f"  MLXtend Apriori: {apriori_time:.4f} sec, Rules: {len(apriori_rules)}")
        print(f"  MLXtend FP-Growth: {fpgrowth_time:.4f} sec, Rules: {len(fpgrowth_rules)}")
        print("-" * 50)

if __name__ == "__main__":
    main()


Enter CSV file paths (comma-separated): db_transactions_1.csv,db_transactions_2.csv
Enter minimum support (fraction, e.g., 0.05): 0.2
Enter minimum confidence (fraction, e.g., 0.5): 0.2

Processing file: db_transactions_1.csv
Loaded 20 transactions. Sample transactions:
   {'Pasta', 'Olive Oil', 'Rice'}
   {'Pasta', 'Olive Oil', 'Coffee'}
   {'Tea', 'Olive Oil', 'Coffee'}
   {'Tea', 'Sugar', 'Coffee'}
   {'Salt', 'Sugar', 'Tea'}

Brute-Force Method completed in 0.0008 seconds.

Association Rules via Brute-Force:
  {'Salt'} => {'Sugar'} (support: 4, confidence: 0.67)
  {'Sugar'} => {'Salt'} (support: 4, confidence: 0.67)
  {'Sugar'} => {'Tea'} (support: 4, confidence: 0.67)
  {'Tea'} => {'Sugar'} (support: 4, confidence: 0.67)
  {'Olive Oil'} => {'Coffee'} (support: 4, confidence: 0.67)
  {'Coffee'} => {'Olive Oil'} (support: 4, confidence: 0.67)
  {'Tea'} => {'Coffee'} (support: 4, confidence: 0.67)
  {'Coffee'} => {'Tea'} (support: 4, confidence: 0.67)
  {'Flour'} => {'Butter'} (suppo