## 2023638 | Anton Dementyev | Coursework 1: Algorithm Implementation

In [378]:
import csv
import datetime
import math
import numpy as np
import threading
from itertools import chain, combinations
from collections import defaultdict
from timeit import default_timer as timer

### Basic I/O

In [512]:
def read_file(filename, skipEmptyColumn):
    """Load a comma-separated file as a list of sorted tuples.

    Args:
        filename: Path of the CSV file to read.
        skipEmptyColumn: When truthy, the last field of every row is
            discarded before sorting (presumably a trailing empty column
            in the source data -- confirm against the dataset).

    Returns:
        A list with one tuple per CSV row; the fields inside each tuple
        are sorted in ascending order.
    """
    with open(filename, newline='') as handle:
        rows = csv.reader(handle, delimiter=',')
        if skipEmptyColumn:
            return [tuple(sorted(fields[:-1])) for fields in rows]
        return [tuple(sorted(fields)) for fields in rows]

In [503]:
# Smoke test: load the dataset (dropping the last field of each row) and
# show only the first five transactions to keep the output short.
read = read_file('GroceryStore.csv', True)[:5]
print(read)

[('Butter', 'Cheese', 'Coffee Powder', 'Ghee', 'Lassi', 'Yougurt'), ('Coffee Powder', 'Ghee'), ('Butter', 'Cheese', 'Lassi', 'Tea Powder'), ('Bread', 'Butter', 'Cheese', 'Coffee Powder', 'Panner', 'Tea Powder'), ('Butter', 'Cheese', 'Coffee Powder', 'Sugar', 'Sweet', 'Yougurt')]


In [504]:
def write_file(data, filename, reverse):
    """Write rows to a comma-separated file, optionally in reverse order.

    Args:
        data: Sequence of rows (each row an iterable of fields). ``None``
            or an empty list is reported and nothing is written.
        filename: Path of the CSV file to create or overwrite.
        reverse: When truthy, rows are written last-to-first.

    The caller's ``data`` is never modified.
    """
    # Validate before touching the data so that None does not blow up on
    # the reversal step.
    if data is None or data == []:
        print('Attempting to write empty dataset')
        return
    # Iterate over a reversed view instead of calling data.reverse(): the
    # in-place reversal would flip the caller's list on every call, so
    # re-running a notebook cell would alternate the output order.
    rows = reversed(data) if reverse else data
    with open(filename, 'w', newline='') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',')
        filewriter.writerows(rows)

#### ________________________________________________________________________

### Apriori Algorithm

In [382]:
def calculate_support_count(instance, data, is_set):
    """Count the rows of ``data`` that contain ``instance``.

    Args:
        instance: Either a collection of items (when ``is_set`` is truthy)
            or a single item (otherwise).
        data: Iterable of rows, each row being a collection of items.
        is_set: When truthy, a row matches if every element of
            ``instance`` occurs in it; otherwise a row matches if
            ``instance`` itself is a member of the row.

    Returns:
        int: the number of matching rows.
    """
    if is_set:
        # Build the lookup set once; it does not change between rows.
        wanted = set(instance)
        return sum(1 for row in data if wanted.issubset(row))
    return sum(1 for row in data if instance in row)

In [383]:
def calculate_support(data, items):
    """Compute the support (fraction of rows) of every entry in ``items``.

    Args:
        data: Sized collection of rows (transactions).
        items: Collection of candidates. When it is a ``set`` every
            candidate is treated as an itemset (all of its elements must be
            in a row); otherwise every candidate is treated as a single item.

    Returns:
        dict mapping each candidate to ``support_count / len(data)``.
        For an empty ``data`` every candidate gets support ``0.0`` instead
        of triggering a division by zero.
    """
    total = len(data)
    if total == 0:
        # Nothing can be supported by an empty dataset.
        return {candidate: 0.0 for candidate in items}
    # The container type of `items` decides how each candidate is matched.
    is_set = isinstance(items, set)
    return {
        candidate: calculate_support_count(candidate, data, is_set) / total
        for candidate in items
    }

In [384]:
def support_elimination(data, items, minimal_support):
    """Keep only the candidates whose support reaches ``minimal_support``.

    Args:
        data: Collection of rows passed through to ``calculate_support``.
        items: Candidates passed through to ``calculate_support``.
        minimal_support: Inclusive lower bound on the support value.

    Returns:
        list of surviving candidates, in the iteration order of the
        dictionary returned by ``calculate_support``.
    """
    supports = calculate_support(data, items)
    return [
        candidate
        for candidate, value in supports.items()
        if value >= minimal_support
    ]

In [385]:
def find_unique_level_one(data, sort_by_support_desc):
    """Collect the distinct items that occur anywhere in ``data``.

    Args:
        data: Iterable of rows, each an iterable of hashable items.
        sort_by_support_desc: When truthy, items are ordered by how many
            times they occur (most frequent first; ties keep first-seen
            order). Otherwise items are sorted in ascending order.

    Returns:
        tuple of distinct items in the requested order.
    """
    occurrences = defaultdict(int)
    for transaction in data:
        for entry in transaction:
            occurrences[entry] += 1
    if not sort_by_support_desc:
        return tuple(sorted(occurrences))
    # sorted() is stable, so equally frequent items stay in first-seen order.
    ranked = sorted(occurrences, key=occurrences.get, reverse=True)
    return tuple(ranked)

In [505]:
# Load the full dataset, list its distinct items in ascending order and
# print the ones whose support is at least 0.4.
data = read_file('GroceryStore.csv', True)
unique = find_unique_level_one(data, False)
print(support_elimination(data, unique, 0.4), end=" ")

['Bread', 'Butter', 'Cheese', 'Coffee Powder', 'Ghee', 'Lassi', 'Milk', 'Panner', 'Sugar', 'Sweet', 'Tea Powder', 'Yougurt'] 

In [506]:
def generate_candidates_fk1_1(previous_step_f_itemset, step_one_f_itemset):
    """Generate candidate itemsets by extending each previous itemset with one item.

    Args:
        previous_step_f_itemset: Sequence of itemsets from the previous
            step. Its first element decides the mode: if it is a list or
            tuple every entry is treated as an itemset, otherwise every
            entry is treated as a single item.
        step_one_f_itemset: Iterable of single items used for the extension.

    Returns:
        set of sorted tuples, each being a previous itemset plus one item
        it did not already contain. An empty ``previous_step_f_itemset``
        yields an empty set.
    """
    # Guard the [0] lookup below: there is nothing to extend.
    if len(previous_step_f_itemset) == 0:
        return set()
    if isinstance(previous_step_f_itemset[0], (list, tuple)):
        return {
            tuple(sorted((*itemset, single)))
            for itemset in previous_step_f_itemset
            for single in step_one_f_itemset
            if single not in itemset
        }
    return {
        tuple(sorted((item, single)))
        for item in previous_step_f_itemset
        for single in step_one_f_itemset
        if single != item
    }

In [507]:
# Pair every distinct item with every other one to preview the 2-item candidates.
print(generate_candidates_fk1_1(unique, unique), end=" ")

{('Lassi', 'Panner'), ('Bread', 'Cheese'), ('Bread', 'Yougurt'), ('Coffee Powder', 'Yougurt'), ('Ghee', 'Yougurt'), ('Panner', 'Sugar'), ('Bread', 'Butter'), ('Bread', 'Sweet'), ('Bread', 'Tea Powder'), ('Bread', 'Panner'), ('Cheese', 'Sugar'), ('Coffee Powder', 'Sweet'), ('Butter', 'Sugar'), ('Ghee', 'Sweet'), ('Ghee', 'Tea Powder'), ('Milk', 'Yougurt'), ('Coffee Powder', 'Panner'), ('Ghee', 'Panner'), ('Tea Powder', 'Yougurt'), ('Coffee Powder', 'Tea Powder'), ('Cheese', 'Milk'), ('Butter', 'Milk'), ('Milk', 'Sweet'), ('Lassi', 'Sugar'), ('Milk', 'Panner'), ('Sugar', 'Yougurt'), ('Sweet', 'Yougurt'), ('Lassi', 'Milk'), ('Milk', 'Tea Powder'), ('Cheese', 'Ghee'), ('Butter', 'Ghee'), ('Cheese', 'Coffee Powder'), ('Sugar', 'Sweet'), ('Cheese', 'Lassi'), ('Sweet', 'Tea Powder'), ('Butter', 'Lassi'), ('Butter', 'Coffee Powder'), ('Sugar', 'Tea Powder'), ('Panner', 'Yougurt'), ('Bread', 'Sugar'), ('Bread', 'Milk'), ('Coffee Powder', 'Sugar'), ('Ghee', 'Sugar'), ('Butter', 'Cheese'), ('Chee

In [509]:
def apriori_algorigth(filename, max_length, min_support):
    """Mine frequent itemsets of size 2..``max_length`` from a CSV dataset.

    Args:
        filename: CSV file of transactions; the last field of every row is
            dropped when the file is read.
        max_length: Largest itemset size to generate.
        min_support: Inclusive minimum support for an itemset to be kept.

    Returns:
        list of frequent itemsets (sorted tuples) with two or more items,
        in the order the levels were produced. Single items are used to
        seed the search but are not part of the result. An empty list is
        returned when the dataset has fewer distinct items than
        ``max_length`` or when no item reaches ``min_support``.
    """
    data = read_file(filename, True)
    one_unique = find_unique_level_one(data, False)
    if len(one_unique) < max_length:
        # Always return a list so callers need not special-case None.
        return []
    el_one_unique = support_elimination(data, one_unique, min_support)
    frequent = []
    el_k_unique = el_one_unique
    level = 1
    while level < max_length:
        if el_k_unique == []:
            # The previous level produced nothing, so no larger itemset can be frequent.
            break
        k_unique = generate_candidates_fk1_1(el_k_unique, el_one_unique)
        el_k_unique = support_elimination(data, k_unique, min_support)
        frequent.extend(el_k_unique)
        level += 1
    return frequent

<span style="color:gray">Testing Apriori</span>

In [510]:
# Parameters for the Apriori run; these names are read again by later cells.
dataset_name = 'GroceryStore.csv'
min_support = 0.1
max_itemset_length = 5

# Wall-clock timing of a single run, including reading the CSV file.
start = timer()
apriori_frequent = apriori_algorigth(dataset_name, max_length=max_itemset_length, min_support=min_support)
end = timer()
print(f'Apriori algorithm took appx. {"{:.2f}".format(end - start)}s to run on the {dataset_name} dataset')

Apriori algorithm took appx. 2.23s to run on the GroceryStore.csv dataset


In [484]:
# Build a timestamp without spaces or colons so the file name is valid on
# every common filesystem, and keep the path in a variable so later cells
# can refer to the file that was actually written.
apriori_output_file = f'Apriori-{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")}.csv'
write_file(apriori_frequent, apriori_output_file, reverse=True)

<h5 style="color: #7d3434">NB: some functions defined in this section are reused in the sections that follow.</h5>

#### ________________________________________________________________________

### Association Rule

In [511]:
def association_rule_props(itemset, data):
    """Compute support, confidence and lift for every rule derivable from the itemsets.

    For each itemset, every non-empty proper subset ``x`` is used as the
    antecedent and the remaining items ``y`` as the consequent of a rule
    ``x -> y``.

    Args:
        itemset: Iterable of itemsets (tuples of items).
        data: Sized collection of transactions used to count support.

    Returns:
        dict keyed by ``(x, y)`` tuples, each value being a dict with the
        keys ``'support'``, ``'confidence'`` and ``'lift'``.
    """
    T = len(data)
    # Cache of support counts so that each distinct tuple is counted once.
    support_count_dict = {}
    gen_dict = {}
    for item in itemset:
        if item not in support_count_dict:
            support_count_dict[item] = calculate_support_count(item, data, True)
        all_subsets = chain.from_iterable(combinations(item, i) for i in range(1, len(item)))
        for x in all_subsets:
            if x not in support_count_dict:
                support_count_dict[x] = calculate_support_count(x, data, True)
            # Keep the consequent in the itemset's own order. Building it
            # from a set difference made the element order depend on string
            # hashing, so rule keys could differ between kernel sessions.
            antecedent = set(x)
            y = tuple(element for element in item if element not in antecedent)
            if y not in support_count_dict:
                support_count_dict[y] = calculate_support_count(y, data, True)
            if (x, y) not in gen_dict:
                support_count = support_count_dict[item]
                support_count_x = support_count_dict[x]
                support = support_count / T
                support_x = support_count_x / T
                gen_dict[(x, y)] = {
                    'support': support,
                    'confidence': support_count / support_count_x,
                    'lift': support / (support_x * support_count_dict[y] / T)
                }
    return gen_dict

In [513]:
def association_rule(min_support, min_confidence, data_file_name, frequent_file_name):
    """Return the association rules that meet both thresholds.

    Args:
        min_support: Inclusive minimum value of a rule's ``'support'``.
        min_confidence: Inclusive minimum value of a rule's ``'confidence'``.
        data_file_name: CSV file of transactions (last field of each row is dropped).
        frequent_file_name: CSV file of frequent itemsets (rows read in full).

    Returns:
        dict of the rules produced by ``association_rule_props`` whose
        support and confidence both reach the given thresholds.
    """
    transactions = read_file(data_file_name, skipEmptyColumn=True)
    itemsets = read_file(frequent_file_name, skipEmptyColumn=False)
    rules = association_rule_props(itemsets, transactions)
    selected = {}
    for rule, metrics in rules.items():
        if metrics['support'] >= min_support and metrics['confidence'] >= min_confidence:
            selected[rule] = metrics
    return selected

<span style="color:gray">Testing Association Rule</span>

In [516]:
# Thresholds for the rule filter (min_support is reassigned here).
min_support = 0.1
min_confidence = 0.48
# NOTE(review): the itemset file name below is a literal from one specific
# earlier run; the write cell generates a new timestamped name each time, so
# this path has to be updated by hand for the cell to work on a fresh run.
association_rule(min_support, min_confidence, dataset_name, 'Apriori-2021-03-20 17:37:12.342135.csv')

{(('Lassi', 'Panner'), ('Sweet',)): {'support': 0.10098994092288041,
  'confidence': 0.5066079295154186,
  'lift': 1.1573538072424097},
 (('Lassi', 'Sweet'), ('Panner',)): {'support': 0.10098994092288041,
  'confidence': 0.49107142857142855,
  'lift': 1.129897265666002},
 (('Panner', 'Sweet'), ('Lassi',)): {'support': 0.10098994092288041,
  'confidence': 0.5049900199600799,
  'lift': 1.1644891366016128}}

#### ________________________________________________________________________

### FP-Growth Algorithm

In [249]:
class node:
    """A single vertex of the FP-tree."""

    def __init__(self, entity, parent, count):
        """Create a node holding ``entity`` with the given parent and count.

        Args:
            entity: The item stored in this node.
            parent: The node this one hangs from.
            count: Initial occurrence count for the node.
        """
        # Item label and link to the parent.
        self.entity = entity
        self.parent_node = parent
        # Children keyed by their item, filled in by whoever builds the tree.
        self.child_nodes = dict()
        self.count = count
        # Per-item counters that default to zero on first access.
        self.cond_count = defaultdict(int)

In [409]:
def fp_tree(data, min_support):
    """Build an FP-tree from the transactions in ``data``.

    Args:
        data: Collection of transactions (each an iterable of items).
        min_support: Inclusive minimum support an item needs to be placed
            in the tree.

    Returns:
        tuple ``(null_node, node_links)`` where ``null_node`` is the root
        (its entity is ``None``) and ``node_links`` maps every retained
        item, most frequent first, to the list of tree nodes holding it.
    """
    one_unique = find_unique_level_one(data, True)
    el_one_unique_sorted = support_elimination(data, one_unique, min_support)
    # Position of every retained item in the frequency ordering; a dict
    # lookup replaces the repeated linear list.index() scans.
    rank = {entity: position for position, entity in enumerate(el_one_unique_sorted)}
    # Built in frequency order; dicts keep insertion order, so no re-sort is needed.
    node_links = {entity: [] for entity in el_one_unique_sorted}
    null_node = node(None, None, 1)
    for item in data:
        # Drop items that did not reach min_support: they have no rank and
        # no node_links entry, so they used to raise ValueError here.
        item_ordered = sorted((x for x in item if x in rank), key=rank.__getitem__)
        start_node = null_node
        for nd in item_ordered:
            if nd in start_node.child_nodes:
                # Shared prefix: bump the existing node and descend.
                start_node.child_nodes[nd].count += 1
                start_node = start_node.child_nodes[nd]
            else:
                # New branch: create the node and register it for its item.
                this_node = node(nd, start_node, 1)
                start_node.child_nodes[nd] = this_node
                start_node = this_node
                node_links[nd].append(this_node)
    return (null_node, node_links)

In [410]:
def get_frequent_itemsets(node_links):
    """Propagate each node's count to the ``cond_count`` of all its ancestors.

    For every node listed in ``node_links`` the node's ``count`` is added,
    under the node's own ``entity``, to the ``cond_count`` of each ancestor
    up to and including the root (the node whose ``entity`` is ``None``).

    Args:
        node_links: Mapping from item to the list of tree nodes holding it.

    Returns:
        None. The tree nodes are updated in place; no itemsets are
        collected or returned yet.
    """
    for linked_nodes in node_links.values():
        for nd in linked_nodes:
            current = nd
            # Walk towards the root, which is recognised by its None entity.
            while current.entity is not None:
                current.parent_node.cond_count[nd.entity] += nd.count
                current = current.parent_node

In [411]:
def fp_growth(filename, max_length, min_support):
    """Build the FP-tree for a CSV dataset and fill in its conditional counts.

    Args:
        filename: CSV file of transactions.
        max_length: Currently unused by this function.
        min_support: Inclusive minimum support for an item to enter the tree.

    Returns:
        None. The tree is built and annotated, but nothing is returned yet.
    """
    # read_file requires the skipEmptyColumn flag; drop the last field of
    # every row, exactly as the Apriori pipeline does for the same dataset.
    data = read_file(filename, True)
    tree, node_links = fp_tree(data, min_support)
    get_frequent_itemsets(node_links)

In [412]:
# Time one FP-Growth run on the same dataset the message reports, so the
# printed name can never disagree with the file that was processed.
start = timer()
fp_growth(dataset_name, 5, 0.205)
end = timer()
print(f'FP-Growth algorithm took appx. {"{:.2f}".format(end - start)}s to run on the {dataset_name} dataset')

FP-Growth algorithm took appx. 0.13s to run on the GroceryStore.csv dataset


### Experiment on the Dataset