In [14]:
##Functions for the algorithm

In [15]:
def read_data(file_loc='breadbasket.csv'):
    """Load a CSV file as a dictionary of transactions.

    Every row of the file becomes one transaction holding the distinct field
    values of that row, in order of first appearance. All rows are read, so a
    header row (if the file has one) is returned as a transaction as well.

    Parameters
    ----------
    file_loc : str
        Path of the CSV file to read.

    Returns
    -------
    dict
        Maps the 1-based row number to the list of unique values in that row.
    """
    trans = {}
    # newline='' leaves newline handling to the csv module, as its
    # documentation recommends, so quoted fields with embedded line breaks
    # are parsed correctly.
    with open(file_loc, newline='') as f:
        for row_number, row in enumerate(csv.reader(f, delimiter=','), start=1):
            # dict.fromkeys drops duplicates but keeps first-seen order, so the
            # result no longer varies with per-process string hashing the way
            # list(set(row)) does.
            trans[row_number] = list(dict.fromkeys(row))
    return trans

In [16]:
#function to get frequent one itemset
def frequent_one_item(Transaction,min_support):
    """Find the frequent 1-itemsets of a collection of transactions.

    Parameters
    ----------
    Transaction : sequence of sequences
        Transactions, addressed by position 0 .. len(Transaction) - 1.
    min_support : int
        Minimum number of occurrences an item needs to be kept.

    Returns
    -------
    list
        One single-element list ``[item]`` per frequent item.

    Side effects
    ------------
    Records the count of every frequent item in the module-level
    ``Frequent_items_value`` mapping (defined outside this function) under the
    1-tuple key ``(item,)``.
    """
    candidate1 = {}

    # Count every occurrence of every item across all transactions.
    for i in range(len(Transaction)):
        for item in Transaction[i]:
            candidate1[item] = candidate1.get(item, 0) + 1

    frequentitem1 = []                      #frequent 1 itemsets with minimum support count
    for item, count in candidate1.items():
        if count >= min_support:
            frequentitem1.append([item])
            # The key must be the 1-tuple (item,). tuple(item) would split a
            # string such as '13' into ('1', '3') and raise for integers.
            Frequent_items_value[(item,)] = count

    return frequentitem1

In [17]:
#class of Hash node
class Hash_node:
    """A single node of the hash tree.

    A freshly created node is a leaf with no children and an empty bucket.
    """

    def __init__(self):
        # hash key -> child node
        self.children = dict()
        # True while the node is a leaf
        self.Leaf_status = True
        # itemset -> count, for the itemsets stored in this node
        self.bucket = dict()

In [18]:
#class of constructing and getting hashtree
class HashTree:
    """Hash tree that stores candidate itemsets together with a counter each.

    Itemsets are kept as tuples in the ``bucket`` dictionaries of the nodes.
    A leaf whose bucket reaches ``max_leaf_count`` entries is split: its
    itemsets are pushed one level down, routed by the hash of their next item.
    """

    # class constructor
    def __init__(self, max_leaf_count, max_child_count):
        """Create an empty tree.

        max_leaf_count  -- bucket size at which a leaf is split
        max_child_count -- modulus of the hash function (children per node)
        """
        self.root = Hash_node()
        self.max_leaf_count = max_leaf_count
        self.max_child_count = max_child_count
        self.frequent_itemsets = []   # not used by any method of this class

    # function to recursive insertion to make hashtree
    def recursively_insert(self, node, itemset, index, count):
        """Insert ``itemset`` (a tuple) with ``count`` below ``node``.

        ``index`` is the position of the item that is hashed next.
        """
        # Every item has already been hashed: the itemset lives in this node.
        if index == len(itemset):
            node.bucket[itemset] = node.bucket.get(itemset, 0) + count
            return

        if node.Leaf_status:                             #if node is leaf
            node.bucket[itemset] = node.bucket.get(itemset, 0) + count
            if len(node.bucket) == self.max_leaf_count:  #bucket is full -> split
                overflow = node.bucket
                # Give the node a fresh, empty bucket instead of deleting the
                # attribute, so later accesses on this (now internal) node
                # cannot raise AttributeError.
                node.bucket = {}
                node.Leaf_status = False
                # Re-route the old entries through this node. Entries that end
                # exactly here stay in its bucket, all others are hashed on the
                # item at ``index`` and move to a child.
                for old_itemset, old_count in overflow.items():
                    self.recursively_insert(node, old_itemset, index, old_count)
        else:                                            #if node is not leaf
            hash_key = self.hash_function(itemset[index])
            if hash_key not in node.children:
                node.children[hash_key] = Hash_node()
            self.recursively_insert(node.children[hash_key], itemset, index + 1, count)

    def insert(self, itemset):
        """Insert a candidate itemset with an initial count of 0."""
        itemset = tuple(itemset)
        self.recursively_insert(self.root, itemset, 0, 0)

    # to add support to candidate itemsets. Traverse the tree and find the bucket in which this itemset is present.
    def add_support(self, itemset):
        """Increment the counter of ``itemset`` if the tree contains it."""
        itemset = tuple(itemset)
        node = self.root
        index = 0
        # Descend until a leaf is reached or every item has been hashed. The
        # second condition keeps itemset[index] inside the itemset.
        while not node.Leaf_status and index < len(itemset):
            node = node.children.get(self.hash_function(itemset[index]))
            if node is None:          #no such branch: itemset is not stored
                return
            index += 1
        if itemset in node.bucket:    #found the itemset in this bucket
            node.bucket[itemset] += 1 #increment the count of this itemset.

    # to traverse the hashtree to get frequent itemsets with minimum support count
    def get_frequent_itemsets(self, node, support_count,frequent_itemsets):
        """Collect all itemsets below ``node`` whose count is >= ``support_count``.

        Matching itemsets are appended (as lists) to ``frequent_itemsets`` and
        their counts are written to the module-level ``Frequent_items_value``
        mapping, which is defined outside this class.
        """
        # Internal nodes may hold itemsets that end at their depth, so the
        # bucket is inspected on every node. For a leaf there are no children.
        for key, value in node.bucket.items():
            if value >= support_count:                       #if it satisfies the condition
                frequent_itemsets.append(list(key))          #then add it to frequent itemsets.
                Frequent_items_value[key] = value

        for child in node.children.values():
            self.get_frequent_itemsets(child, support_count,frequent_itemsets)

    # hash function for making HashTree
    def hash_function(self, val):
        """Map an item to a child slot in ``range(max_child_count)``.

        Values accepted by ``int()`` hash exactly as before. Anything else
        (e.g. item names) falls back to the built-in ``hash``, which is stable
        within one interpreter run, all that a tree living in memory needs.
        """
        try:
            key = int(val)
        except (TypeError, ValueError):
            key = hash(val)
        return key % self.max_child_count

#To generate hash tree from candidate itemsets
def generate_hash_tree(candidate_itemsets, max_leaf_count, max_child_count):
    """Build a HashTree holding every candidate itemset.

    Parameters
    ----------
    candidate_itemsets : iterable
        Itemsets to insert, in this order.
    max_leaf_count : int
        Forwarded to ``HashTree`` as its ``max_leaf_count``.
    max_child_count : int
        Forwarded to ``HashTree`` as its ``max_child_count``.

    Returns
    -------
    HashTree
        The populated tree.
    """
    # Passed by keyword: the previous positional call handed the two limits
    # to the constructor in swapped order.
    htree = HashTree(max_leaf_count=max_leaf_count, max_child_count=max_child_count)
    for itemset in candidate_itemsets:
        htree.insert(itemset)                                     #to insert itemset into Hashtree
    return htree

#to generate subsets of itemsets of size k
def generate_k_subsets(dataset, length):
    """Return every ``length``-sized combination of every itemset in ``dataset``.

    The combinations are returned as lists, grouped by itemset in input order.
    """
    return [
        list(combo)
        for itemset in dataset
        for combo in itertools.combinations(itemset, length)
    ]

def subset_generation(ck_data,l):
    """Return an iterator over the distinct ``l``-sized combinations of ``ck_data``.

    Each combination is yielded as a list. Their order is unspecified because
    duplicates are removed through a set.
    """
    distinct_combos = set(itertools.combinations(ck_data, l))
    return map(list, distinct_combos)

#apriori generate function to generate ck
def apriori_generate(dataset,k):
    """Generate size-``k`` candidates from the itemsets in ``dataset``.

    Returns a pair ``(ck, final_ck)``: ``ck`` holds all joined candidates and
    ``final_ck`` only those whose every (k-1)-subset, sorted, is an element of
    ``dataset``.
    """
    prefix_len = k - 2

    #join step: merge two itemsets whenever their first k-2 items agree
    joined = []
    for position, left in enumerate(dataset):
        left_prefix = list(left)[:prefix_len]
        for right in dataset[position + 1:]:
            if left_prefix == list(right)[:prefix_len]:
                merged = list(set(left) | set(right))
                merged.sort()
                joined.append(merged)

    #prune step: keep a candidate only if all of its (k-1)-subsets are known
    survivors = [
        candidate
        for candidate in joined
        if all(
            sorted(subset) in dataset
            for subset in subset_generation(set(candidate), k - 1)
        )
    ]

    return joined, survivors

def generateL(ck,min_support):
    """Select the candidates of ``ck`` that occur in at least ``min_support`` transactions.

    Transactions are read from the module-level ``Transaction1``. The count
    of every selected candidate is stored in the module-level
    ``Frequent_items_value`` under the candidate's tuple form. Both names are
    defined outside this function.

    Returns the selected candidates as sorted lists.
    """
    support_ck = {}
    for transaction in Transaction1:
        transaction_items = set(transaction)
        for candidate in ck:
            if set(candidate) <= transaction_items:
                key = tuple(candidate)
                support_ck[key] = support_ck.get(key, 0) + 1

    frequent_item = []
    for key, occurrences in support_ck.items():
        if occurrences < min_support:
            continue
        frequent_item.append(sorted(key))
        Frequent_items_value[key] = occurrences

    return frequent_item

In [19]:
def association_rules(items_grater_then_min_support):
    """Derive rule dictionaries from the given itemsets.

    For every itemset that is not a plain string, one dictionary is produced
    that maps each member to the list of the remaining members. A string
    itemset contributes an empty dictionary. After those, one single-entry
    dictionary ``{first_remaining_member: member}`` is added for every entry
    of the dictionaries above.

    Returns the list of all these dictionaries.
    """
    rules = []
    for itemset in items_grater_then_min_support:
        per_member = {}
        if type(itemset) is not str:
            members = list(itemset)
            for position, member in enumerate(members):
                # everything except the member at this position
                per_member[member] = members[:position] + members[position + 1:]
        rules.append(per_member)

    # Only the first of the remaining members is used as the key here.
    reversed_rules = [
        {others[0]: member}
        for per_member in rules
        for member, others in per_member.items()
    ]
    rules.extend(reversed_rules)
    return rules

def confidence(associations, d, min_confidence):
    """Compute the confidence of every rule and keep the confident ones.

    For a rule entry ``key: value`` the confidence is

        count(key items + value items) / count(value items)

    where the counts are looked up in ``d``.

    Parameters
    ----------
    associations : iterable of dict
        Rule dictionaries. Keys and values are either a single item (str) or
        an iterable of items.
    d : dict
        Maps an itemset (a str for one item, otherwise an iterable of items)
        to the number of transactions containing it.
    min_confidence : float
        Rules with a confidence below this value are dropped.

    Returns
    -------
    dict
        ``{first key item: (value item set, confidence, numerator, denominator)}``.
        Rules sharing the same first key item overwrite each other. The same
        dictionary is also printed.
    """
    def as_itemset(obj):
        # a plain string is one item, not a collection of characters
        return {obj} if isinstance(obj, str) else set(obj)

    # Index the counts by itemset so each rule needs two exact lookups.
    counts = {frozenset(as_itemset(key)): count for key, count in d.items()}

    ans = {}
    for rule in associations:
        for key, value in rule.items():
            left = as_itemset(key)
            right = as_itemset(value)
            # Exact matches only: the numerator used to take the count of any
            # superset of the rule's items (the last one seen), and counts of
            # a previous rule leaked into rules whose itemsets are not in d.
            up = counts.get(frozenset(left | right))
            down = counts.get(frozenset(right))
            if up is None or not down:
                continue
            ratio = up / down
            if ratio >= min_confidence:
                ans[tuple(left)[0]] = right, ratio, up, down
    print(ans)
    # Returned as well, so callers that store the result get the rules.
    return ans

In [20]:
## main apriori algorithm function

In [21]:
import csv
from itertools import combinations

In [22]:
def frequence(items_lst, trans, check=False):
    """Count in how many transactions each candidate occurs.

    Parameters
    ----------
    items_lst : iterable
        Candidates. Each one must be hashable because it is used as a
        dictionary key.
    trans : dict
        Transactions. Only the values (collections of items) are used.
    check : bool
        False: every candidate is a single item.
        True: every candidate is a collection of items that must all be
        present in a transaction.

    Returns
    -------
    dict
        Maps each candidate that occurs at least once to its number of
        containing transactions. Candidates that never occur are omitted.
    """
    # Build the transaction sets once instead of once per candidate.
    transaction_sets = [set(items) for items in trans.values()]

    items_counts = {}
    for candidate in items_lst:
        needed = set(candidate) if check else {candidate}
        hits = sum(1 for transaction in transaction_sets if needed.issubset(transaction))
        if hits:
            # Added to any existing entry, so a candidate listed twice keeps
            # accumulating exactly as it did before.
            items_counts[candidate] = items_counts.get(candidate, 0) + hits
    return items_counts
    

def support(items_counts, trans):
    """Turn absolute counts into the fraction of all transactions.

    Returns a new dictionary with the keys of ``items_counts``; each value is
    the corresponding count divided by ``len(trans)``.
    """
    total_trans = len(trans)
    return {item: count / total_trans for item, count in items_counts.items()}

In [23]:
def main(min_support, min_confidence, file_loc):
    """Run the whole pipeline on one CSV file and print the confident rules.

    Steps: read the transactions, count every single item and every
    combination of items of size 2 up to the longest transaction, keep the
    itemsets whose support is strictly above ``min_support``, build rules from
    the largest itemset size that has any such itemset, and hand them to
    ``confidence`` (which prints the result).

    Parameters
    ----------
    min_support : float
        Support threshold as a fraction of all transactions (exclusive).
    min_confidence : float
        Confidence threshold passed on to ``confidence``.
    file_loc : str
        Path of the CSV file with the transactions.
    """
    # The path argument was previously ignored and the default file was
    # always read.
    trans = read_data(file_loc)

    items_lst = set()
    for transaction in trans.values():
        items_lst.update(transaction)
    # default=0 keeps an empty file from raising in max()
    longest_transaction = max((len(transaction) for transaction in trans.values()), default=0)

    itemcount_track = []                    # absolute counts, one dict per itemset size
    items_grater_then_min_support = []      # supports above the threshold, per size

    items_counts = frequence(items_lst, trans)
    itemcount_track.append(items_counts)
    items_grater_then_min_support.append(
        {item: share for item, share in support(items_counts, trans).items() if share > min_support}
    )

    for size in range(2, longest_transaction + 1):
        items_counts = frequence(combinations(items_lst, size), trans, check=True)
        itemcount_track.append(items_counts)
        # computed once and reused for both the emptiness test and the append
        frequent = {
            itemset: share
            for itemset, share in support(items_counts, trans).items()
            if share > min_support
        }
        if frequent:
            items_grater_then_min_support.append(frequent)

    # merge the counts of all sizes into one lookup table
    d = {}
    for counts in itemcount_track:
        d.update(counts)

    associations = association_rules(items_grater_then_min_support[-1])
    confidence(associations, d, min_confidence)
    

In [24]:
main(min_support=0.02, min_confidence=0.8, file_loc='breadbasket.csv')

{'13': ({'Basket'}, 1.0, 1, 1), '30-10-2016 10:31': ({'13', 'morning', 'weekend', 'Basket'}, 1.0, 1, 1), 'morning': ({'30-10-2016 10:05'}, 1.0, 2, 2), 'weekend': ({'Tea', '30-10-2016 10:19', 'morning', '7'}, 1.0, 1, 1), '10': ({'30-10-2016 10:25', 'Scandinavian', 'morning', 'weekend'}, 1.0, 1, 1), '30-10-2016 10:25': ({'10', 'morning', 'Scandinavian', 'weekend'}, 1.0, 1, 1), '3': ({'Hot chocolate'}, 1.0, 1, 1), '30-10-2016 10:07': ({'morning', '3', 'Hot chocolate', 'weekend'}, 1.0, 1, 1), '12': ({'Tartine'}, 1.0, 1, 1), '30-10-2016 10:30': ({'Tea', 'morning', 'weekend', '12'}, 1.0, 1, 1), 'Transaction': ({'weekday_weekend'}, 1.0, 1, 1), 'Item': ({'Transaction'}, 1.0, 1, 1), 'date_time': ({'Transaction', 'period_day', 'Item', 'weekday_weekend'}, 1.0, 1, 1), 'period_day': ({'Transaction', 'weekday_weekend', 'Item', 'date_time'}, 1.0, 1, 1), 'weekday_weekend': ({'Transaction', 'period_day', 'Item', 'date_time'}, 1.0, 1, 1), '30-10-2016 10:13': ({'5', 'morning', 'weekend', 'Pastry'}, 1.0, 