In [148]:
import csv
import datetime
import math
import numpy as np
from itertools import chain, combinations
from collections import defaultdict

### Basic I/O

In [149]:
def read_file(filename):
    ds = []
    with open(filename, newline='') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',')
        for row in filereader:
            ds.append(tuple(sorted(row[:-1])))
        return ds

In [150]:
read = read_file('GroceryStore.csv')[:5]
print(read)

[('Butter', 'Cheese', 'Coffee Powder', 'Ghee', 'Lassi', 'Yougurt'), ('Coffee Powder', 'Ghee'), ('Butter', 'Cheese', 'Lassi', 'Tea Powder'), ('Bread', 'Butter', 'Cheese', 'Coffee Powder', 'Panner', 'Tea Powder'), ('Butter', 'Cheese', 'Coffee Powder', 'Sugar', 'Sweet', 'Yougurt')]


In [151]:
def write_file(data, filename):
    with open(filename, 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        for row in data:
            filewriter.writerow(row)

### Apriori Algorithm

In [152]:
def calculate_support_count(instance, data, is_set):
    count = 0
    if is_set:
        for row in data:
            if set(instance).issubset(set(row)): 
                count = count + 1
    else:
        for row in data:
            if instance in row:
                count = count + 1
    return count

In [153]:
def calculate_support(data, items):
    dct = {}
    lgtn = len(data)
    if isinstance(items, set): 
        for i in items:
            support_count = calculate_support_count(i, data, True)
            dct[i] = support_count / lgtn
    else: 
        for i in items:
            support_count = calculate_support_count(i, data, False)
            dct[i] = support_count / lgtn
    return dct

In [154]:
def support_elimination(data, items, minimal_support):
    dct = calculate_support(data, items)
    support_resistant = []
    for key in dct:
        if dct[key] >= minimal_support:
            support_resistant.append(key)
    return support_resistant

In [155]:
def find_unique_level_one(data, sort_by_support_desc):
    dct = defaultdict(int)
    for y in data:
        for x in y:
            dct[x] += 1
    if sort_by_support_desc:
        return tuple([i for i in dict(sorted(dct.items(), key=lambda item: item[1], reverse=True))])
    else:
        return tuple(sorted([i for i in dct]))

In [156]:
data = read_file('GroceryStore.csv')
unique = find_unique_level_one(data, False)
support_elimination(data, unique, 0.4)

['Bread',
 'Butter',
 'Cheese',
 'Coffee Powder',
 'Ghee',
 'Lassi',
 'Milk',
 'Panner',
 'Sugar',
 'Sweet',
 'Tea Powder',
 'Yougurt']

In [157]:
def generate_candidates_fk1_1(previous_step_f_itemset, step_one_f_itemset):
    lst = []
    if isinstance(previous_step_f_itemset[0], list) or isinstance(previous_step_f_itemset[0], tuple): 
        for i in previous_step_f_itemset:
            for k in step_one_f_itemset:
                if k not in i:
                    lst.append(tuple(sorted([*i, k])))
    else:
        for i in previous_step_f_itemset:
            for k in step_one_f_itemset:
                if k != i:
                    lst.append(tuple(sorted((i, k))))
    return set(lst)

In [158]:
print(generate_candidates_fk1_1(unique, unique), end=" ")

{('Lassi', 'Panner'), ('Bread', 'Cheese'), ('Bread', 'Yougurt'), ('Coffee Powder', 'Yougurt'), ('Ghee', 'Yougurt'), ('Panner', 'Sugar'), ('Bread', 'Butter'), ('Bread', 'Sweet'), ('Bread', 'Tea Powder'), ('Bread', 'Panner'), ('Cheese', 'Sugar'), ('Coffee Powder', 'Sweet'), ('Butter', 'Sugar'), ('Ghee', 'Sweet'), ('Ghee', 'Tea Powder'), ('Milk', 'Yougurt'), ('Coffee Powder', 'Panner'), ('Ghee', 'Panner'), ('Tea Powder', 'Yougurt'), ('Coffee Powder', 'Tea Powder'), ('Cheese', 'Milk'), ('Butter', 'Milk'), ('Milk', 'Sweet'), ('Lassi', 'Sugar'), ('Milk', 'Panner'), ('Sugar', 'Yougurt'), ('Sweet', 'Yougurt'), ('Lassi', 'Milk'), ('Milk', 'Tea Powder'), ('Cheese', 'Ghee'), ('Butter', 'Ghee'), ('Cheese', 'Coffee Powder'), ('Sugar', 'Sweet'), ('Cheese', 'Lassi'), ('Sweet', 'Tea Powder'), ('Butter', 'Lassi'), ('Butter', 'Coffee Powder'), ('Sugar', 'Tea Powder'), ('Panner', 'Yougurt'), ('Bread', 'Sugar'), ('Bread', 'Milk'), ('Coffee Powder', 'Sugar'), ('Ghee', 'Sugar'), ('Butter', 'Cheese'), ('Chee

In [114]:
def apriori_algorigth(filename, max_length, min_support):
    data = read_file(filename)
    one_unique = find_unique_level_one(data, False)
    el_one_unique = support_elimination(data, one_unique, min_support)
    lim = 1
    el_k_unique = el_one_unique
    while(lim < max_length):
        k_unique = generate_candidates_fk1_1(el_k_unique, el_one_unique)
        el_k_unique = support_elimination(data, k_unique, min_support)
        lim = lim + 1
    return el_k_unique

In [48]:
a = apriori_algorigth('GroceryStore.csv', max_length=4, min_support=0.048)

In [49]:
write_file(a, f'Result-{datetime.datetime.now()}.csv')

### Association Rule

In [115]:
def association_rule_props(itemset, data):
    T = len(data)
    support_count_dict = {}
    gen_dict = {}
    for item in itemset:
        if item not in support_count_dict:
            support_count_dict[item] = calculate_support_count(item, data, True)
        all_subsets = chain.from_iterable(combinations(item, i) for i in range(1, len(item)))
        for a in all_subsets:
            if a not in support_count_dict:
                support_count_dict[a] = calculate_support_count(a, data, True)
            diff = tuple(set(item).difference(set(a)))
            if not (a, diff) in gen_dict:
                gen_dict[(a, diff)] = {
                    'support': support_count_dict[item] / T,
                    'confidence': support_count_dict[item] / support_count_dict[a]
                }
    return gen_dict

In [116]:
def association_rule(min_support, min_confidence, data_file_name, frequent_file_name):
    data = read_file(data_file_name)
    frequent = read_file(frequent_file_name)
    characteristics = association_rule_props(frequent, data)
    return {k: v for k, v in characteristics.items() if v['support'] >= min_support and v['confidence'] >= min_confidence}

In [117]:
min_support = 0.096
min_confidence = 0.49
association_rule(min_support, min_confidence, 'GroceryStore.csv', 'Result-2021-03-15 15:47:31.656068.csv')

{(('Lassi', 'Panner'), ('Sweet',)): {'support': 0.10098994092288041,
  'confidence': 0.5066079295154186},
 (('Lassi', 'Sweet'), ('Panner',)): {'support': 0.10098994092288041,
  'confidence': 0.49107142857142855},
 (('Panner', 'Sweet'), ('Lassi',)): {'support': 0.10098994092288041,
  'confidence': 0.5049900199600799},
 (('Butter', 'Panner'), ('Ghee',)): {'support': 0.0971579115439885,
  'confidence': 0.4901329037454692},
 (('Butter', 'Milk'), ('Sugar',)): {'support': 0.09875459045186014,
  'confidence': 0.49658771577679645},
 (('Butter', 'Sweet'), ('Sugar',)): {'support': 0.0997125977965831,
  'confidence': 0.4911521824616595},
 (('Sugar', 'Sweet'), ('Butter',)): {'support': 0.0997125977965831,
  'confidence': 0.5006012024048097}}

### FP-Growth Algorithm

In [207]:
class node:
    def __init__(self, entity, parent, count):
        self.entity = entity
        self.parent_node = parent
        self.child_nodes = {}
        self.count = count

In [240]:
def fp_tree(data, min_support):
    one_unique = find_unique_level_one(data, True)
    el_one_unique_sorted = support_elimination(data, one_unique, min_support)
    node_links = {}
    for i in el_one_unique_sorted:
        node_links[i] = []
    null_node = node(None, None, 1)
    for item in data:
        item_ordered = sorted(item, key=lambda x: el_one_unique_sorted.index(x))
        start_node = null_node
        for nd in item_ordered:
            if nd in start_node.child_nodes:
                start_node.child_nodes[nd].count += 1
                start_node = start_node.child_nodes[nd]
            else:
                this_node = node(nd, start_node, 1)
                start_node.child_nodes[nd] = this_node
                start_node = this_node
                node_links[i].append(this_node) 
    node_links_sorted = {k: v for k, v in sorted(dct.items(), key=lambda x: el_one_unique_sorted.index(x[0]))}
    return (null_node, node_links_sorted)

In [241]:
def get_frequent_itemsets(node_links):
    pass

In [242]:
def fp_growth(filename, max_length, min_support):
    data = read_file(filename)
    tree, node_links = fp_tree(data, min_support)
    print(tree.child_nodes['Milk'].child_nodes)

In [243]:
fp_growth('GroceryStore.csv', 5, 0.4)

{'Ghee': <__main__.node object at 0x7fca8aae0850>, 'Bread': <__main__.node object at 0x7fca8aae0fa0>, 'Sweet': <__main__.node object at 0x7fca8c0bf970>, 'Yougurt': <__main__.node object at 0x7fca8c0bfcd0>, 'Coffee Powder': <__main__.node object at 0x7fca8b890130>, 'Sugar': <__main__.node object at 0x7fca8bbf7b50>, 'Butter': <__main__.node object at 0x7fca8bc067c0>, 'Tea Powder': <__main__.node object at 0x7fca8bc0cd60>, 'Cheese': <__main__.node object at 0x7fca8ba54460>, 'Lassi': <__main__.node object at 0x7fca8ba657c0>, 'Panner': <__main__.node object at 0x7fca8ba903a0>}


### Experiment on the Dataset