### Part 2

In [2]:
import itertools
import pandas as pd

In [3]:
# Item catalogue, kept in alphabetical order
items = sorted([
    'Shampoo', 'Apple', 'Banana', 'Milk', 'Eggs',
    'Soap', 'Bacon', 'Sugar', 'Water', 'Yogurt',
])

In [4]:
def findsubsets(s, k):
    '''
    Enumerates every k-itemset that can be drawn from list s
    Parameters:
        s (list): Items to choose from
        k (int): Number of items in each subset
    Returns:
        subsets (list): All k-item tuples, in the order itertools.combinations yields them
    '''
    subsets = [combo for combo in itertools.combinations(s, k)]
    return subsets

In [5]:
def generate_possible_rules(freq):
    '''
    Lists every candidate association rule that splits an itemset in two
    Parameters:
        freq (tuple): Frequent itemset
    Returns:
        possible_rules (list): List of [left, right] pairs of tuples, where left is a
            non-empty proper subset of freq and right holds the remaining items
    '''
    # Left-hand sides are enumerated by size (1 .. len(freq) - 1); the right-hand
    # side is whatever is left of freq, in freq's original order.
    return [
        [lhs, tuple(member for member in freq if member not in lhs)]
        for lhs_size in range(1, len(freq))
        for lhs in itertools.combinations(freq, lhs_size)
    ]

In [6]:
def brute_force(items, filename, support, confidence):
    '''
    Uses brute force method to generate frequent itemsets and association rules
    Parameters:
        items (list): List of items; each item must be a column name in the database
        filename (str): Filename of the CSV database you want to read from
        support (float): Minimum support level for finding frequent itemsets
        confidence (float): Minimum confidence level for finding association rules
    Returns:
        [freq_itemsets, rules, sups, cons] (list): A list of frequent itemsets, a list of
            association rules, the support of each frequent itemset (parallel to
            freq_itemsets), and the confidence of each rule (parallel to rules)
    '''
    df = pd.read_csv(filename)
    freq_itemsets = []
    rules = []
    sups = []
    cons = []

    # Support is a fraction of all transactions, so the denominator has to come
    # from the data that was actually read rather than a fixed row count.
    n_transactions = len(df)
    if n_transactions == 0:
        # No transactions: nothing can be frequent and every support is undefined.
        return [freq_itemsets, rules, sups, cons]

    # Read each item's column once as a boolean array (cell truthiness), instead of
    # re-reading individual cells for every candidate subset.
    present = {item: df[item].astype(bool).to_numpy() for item in items}

    # Support of every candidate itemset, keyed by the tuple findsubsets produced.
    supports = {}
    for k in range(1, len(items) + 1):
        for subset in findsubsets(items, k):
            # A transaction matches when every item of the subset is present in it.
            mask = present[subset[0]]
            for item in subset[1:]:
                mask = mask & present[item]
            subset_support = int(mask.sum()) / n_transactions

            supports[subset] = subset_support
            if subset_support >= support:
                freq_itemsets.append(subset)
                sups.append(subset_support)

    for freq in freq_itemsets:
        for rule in generate_possible_rules(freq):
            antecedent = rule[0]
            antecedent_support = supports[antecedent]
            if antecedent_support == 0:
                # Confidence is undefined when the antecedent never occurs
                # (only reachable when the support threshold is 0 or less).
                continue
            # The two sides of a rule partition freq, so the support of their union
            # is the support of freq itself; looking it up by freq avoids depending
            # on items having been sorted beforehand.
            rule_confidence = supports[freq] / antecedent_support
            if rule_confidence >= confidence:
                rules.append(rule)
                cons.append(rule_confidence)

    return [freq_itemsets, rules, sups, cons]

Frequent itemsets and association rules for Database1.csv with minimum support 0.5 and minimum confidence 0.75

In [7]:
# Run the brute-force search once; the frequent itemsets are the first entry of the result
result = brute_force(
    items=items,
    filename='Database1.csv',
    support=0.5,
    confidence=0.75,
)
result[0]

[('Apple',),
 ('Banana',),
 ('Eggs',),
 ('Soap',),
 ('Water',),
 ('Yogurt',),
 ('Soap', 'Water')]

In [8]:
# Second entry of brute_force's return value: the association rules as [left, right] pairs
result[1]

[[('Soap',), ('Water',)], [('Water',), ('Soap',)]]