## 2023638 | Anton Dementyev | Coursework 1: Algorithm Implementation

In [669]:
import csv
import datetime
import math
import threading
import pprint
from itertools import chain, combinations
from collections import defaultdict
from timeit import default_timer as timer

### Basic I/O

In [680]:
def read_file(filename, skipEmptyColumn):
    """Load a comma-separated file as a list of sorted tuples, one per row.

    Args:
        filename: path of the CSV file to read.
        skipEmptyColumn: when truthy, the last cell of every row is dropped
            before the row is sorted; otherwise every cell is kept.

    Returns:
        A list with one tuple per CSV row, cells sorted in ascending order.
    """
    with open(filename, newline='') as handle:
        rows = csv.reader(handle, delimiter=',')
        # Rows become tuples because tuples are hashable, which lets them be
        # used as dict keys / set members by the rest of the notebook.
        if skipEmptyColumn:
            return [tuple(sorted(cells[:-1])) for cells in rows]
        return [tuple(sorted(cells)) for cells in rows]

<span style="color:gray">Testing Read File</span>

In [503]:
# Smoke test: load the grocery dataset with the last column of every row
# dropped, keep only the first five transactions and display them.
read = read_file('GroceryStore.csv', True)[:5]
print(read)

[('Butter', 'Cheese', 'Coffee Powder', 'Ghee', 'Lassi', 'Yougurt'), ('Coffee Powder', 'Ghee'), ('Butter', 'Cheese', 'Lassi', 'Tea Powder'), ('Bread', 'Butter', 'Cheese', 'Coffee Powder', 'Panner', 'Tea Powder'), ('Butter', 'Cheese', 'Coffee Powder', 'Sugar', 'Sweet', 'Yougurt')]


In [672]:
def write_file(data, filename, reverse):
    """Write an iterable of rows to a comma-separated file.

    Args:
        data: iterable of rows (each row an iterable of cells), or None.
        filename: path of the CSV file to create / overwrite.
        reverse: when truthy, rows are written in reverse order, which can be
            helpful to produce a required ordering of itemsets.

    Nothing is written when ``data`` is None or contains no rows; a notice is
    printed instead.
    """
    # Work on a private copy: the caller's sequence is never mutated, and
    # non-list iterables (tuples, dict views, sets) are handled uniformly.
    rows = [] if data is None else list(data)
    if not rows:
        print('Attempting to write empty dataset')
        return
    if reverse:
        rows.reverse()
    with open(filename, 'w', newline='') as csvfile:
        csv.writer(csvfile, delimiter=',').writerows(rows)

#### ________________________________________________________________________

### Apriori Algorithm

In [683]:
def calculate_support_count(instance, data, is_set):
    """Count the rows of ``data`` that contain ``instance``.

    Args:
        instance: a single item, or a collection of items when ``is_set``.
        data: iterable of rows (transactions).
        is_set: when truthy, ``instance`` is treated as an itemset and a row
            matches if it contains every member; otherwise a row matches if
            ``instance in row``.

    Returns:
        The number of matching rows.
    """
    if is_set:
        # Build the itemset once instead of once per row; issubset accepts any
        # iterable, so the row does not need to be converted either.
        wanted = set(instance)
        return sum(1 for row in data if wanted.issubset(row))
    # A plain item needs no set conversion at all.
    return sum(1 for row in data if instance in row)

In [705]:
def calculate_support(data, items):
    """Compute the support (fraction of rows) of items or itemsets.

    Args:
        data: sequence of rows (transactions).
        items: iterable of itemsets to evaluate, or None to evaluate every
            individual item that occurs in ``data``.

    Returns:
        A dict mapping each item / itemset to its support. With
        ``items=None`` the keys appear in order of first occurrence in
        ``data``. For empty ``data`` every requested itemset gets 0.0.
    """
    total = len(data)
    # If no itemset is supplied, decompose the data into single items.
    if items is None:
        # Single pass over the data: each item is counted at most once per row,
        # which matches counting the rows that contain it.
        counts = {}
        for row in data:
            seen = set()
            for entry in row:
                if entry not in seen:
                    seen.add(entry)
                    counts[entry] = counts.get(entry, 0) + 1
        return {entry: count / total for entry, count in counts.items()}
    if total == 0:
        # Avoid dividing by zero: nothing can be supported by an empty dataset.
        return {itemset: 0.0 for itemset in items}
    return {
        itemset: calculate_support_count(itemset, data, True) / total
        for itemset in items
    }

In [696]:
def support_elimination(data, items, minimal_support, support_dict):
    """Return the keys whose support reaches ``minimal_support``.

    Args:
        data: rows used to compute supports when none are supplied.
        items: itemsets to evaluate (forwarded to ``calculate_support``).
        minimal_support: inclusive lower bound on the support.
        support_dict: precomputed ``{key: support}`` mapping, or None to have
            it computed from ``data`` and ``items``.

    Returns:
        A list of surviving keys, in the iteration order of the mapping.
    """
    if support_dict is None:
        support_dict = calculate_support(data, items)
    return [
        candidate
        for candidate, support in support_dict.items()
        if support >= minimal_support
    ]

In [506]:
def generate_candidates_fk1_1(previous_step_f_itemset, step_one_f_itemset):
    """Generate (k+1)-item candidates by extending each k-itemset with one item.

    Args:
        previous_step_f_itemset: sequence of frequent k-itemsets (lists or
            tuples), or of plain items when k == 1.
        step_one_f_itemset: iterable of frequent single items.

    Returns:
        A set of sorted tuples, one per distinct candidate. An empty
        ``previous_step_f_itemset`` yields an empty set.
    """
    # Nothing to extend; this also guards the [0] type probe below.
    if not previous_step_f_itemset:
        return set()
    if isinstance(previous_step_f_itemset[0], (list, tuple)):
        return {
            tuple(sorted([*itemset, single]))
            for itemset in previous_step_f_itemset
            for single in step_one_f_itemset
            if single not in itemset
        }
    # Previous step holds plain items: pair every two distinct items.
    return {
        tuple(sorted((left, right)))
        for left in previous_step_f_itemset
        for right in step_one_f_itemset
        if right != left
    }

In [507]:
# Demo: pair up every two distinct items.
# NOTE(review): `unique` is not defined in any cell visible here; presumably it
# held the list of distinct items from an earlier session. Confirm before re-running.
print(generate_candidates_fk1_1(unique, unique), end=" ")

{('Lassi', 'Panner'), ('Bread', 'Cheese'), ('Bread', 'Yougurt'), ('Coffee Powder', 'Yougurt'), ('Ghee', 'Yougurt'), ('Panner', 'Sugar'), ('Bread', 'Butter'), ('Bread', 'Sweet'), ('Bread', 'Tea Powder'), ('Bread', 'Panner'), ('Cheese', 'Sugar'), ('Coffee Powder', 'Sweet'), ('Butter', 'Sugar'), ('Ghee', 'Sweet'), ('Ghee', 'Tea Powder'), ('Milk', 'Yougurt'), ('Coffee Powder', 'Panner'), ('Ghee', 'Panner'), ('Tea Powder', 'Yougurt'), ('Coffee Powder', 'Tea Powder'), ('Cheese', 'Milk'), ('Butter', 'Milk'), ('Milk', 'Sweet'), ('Lassi', 'Sugar'), ('Milk', 'Panner'), ('Sugar', 'Yougurt'), ('Sweet', 'Yougurt'), ('Lassi', 'Milk'), ('Milk', 'Tea Powder'), ('Cheese', 'Ghee'), ('Butter', 'Ghee'), ('Cheese', 'Coffee Powder'), ('Sugar', 'Sweet'), ('Cheese', 'Lassi'), ('Sweet', 'Tea Powder'), ('Butter', 'Lassi'), ('Butter', 'Coffee Powder'), ('Sugar', 'Tea Powder'), ('Panner', 'Yougurt'), ('Bread', 'Sugar'), ('Bread', 'Milk'), ('Coffee Powder', 'Sugar'), ('Ghee', 'Sugar'), ('Butter', 'Cheese'), ('Chee

In [727]:
def apriori_algorithm(filename, max_length, is_fixed_length, min_support):
    """Mine frequent itemsets from a CSV file with the Apriori approach.

    Args:
        filename: CSV file of transactions (its last column is dropped).
        max_length: largest itemset size to generate.
        is_fixed_length: when truthy, return only the itemsets produced by
            the final pass (size ``max_length``); otherwise return all
            frequent itemsets of sizes 2..``max_length``.
        min_support: minimum support (fraction of rows) to keep an itemset.

    Returns:
        A collection of frequent itemsets. It is empty when a pass produces
        no frequent itemset before ``max_length`` is reached in fixed-length
        mode.
    """
    data = read_file(filename, True)
    one_unique_dict = calculate_support(data, None)
    el_one_unique = support_elimination(data, None, min_support, one_unique_dict)
    el_k_unique = tuple(sorted(el_one_unique))
    frequent = []
    for _ in range(max_length - 1):
        # Truthiness covers both the initial tuple and the later lists; the
        # previous comparison against [] never matched an empty tuple.
        if not el_k_unique:
            return [] if is_fixed_length else frequent
        k_unique = generate_candidates_fk1_1(el_k_unique, el_one_unique)
        el_k_unique = support_elimination(data, k_unique, min_support, None)
        frequent.extend(el_k_unique)
    # Fixed-length mode keeps only the last pass; otherwise return every pass.
    return el_k_unique if is_fixed_length else frequent

<span style="color:gray">Testing Apriori</span>

In [728]:
# Apriori run configuration.
dataset_name = 'GroceryStore.csv'
min_support = 0.048  # itemsets whose support is below this fraction are eliminated
max_itemset_length = 4
is_fixed_length = True  # True: only the itemsets produced by the final pass are returned

# Wall-clock timing of one complete run (reading the CSV is included).
start = timer()
apriori_frequent = apriori_algorithm(dataset_name, max_length=max_itemset_length, is_fixed_length=is_fixed_length, min_support=min_support)
end = timer()
print(f'Apriori algorithm took appx. {"{:.2f}".format(end - start)}s to run on the {dataset_name} dataset')

Apriori algorithm took appx. 5.58s to run on the GroceryStore.csv dataset


In [729]:
# Persist the frequent itemsets, written in reverse order (reverse=True).
# NOTE(review): the timestamp embedded in the file name contains ':' characters,
# which some file systems (e.g. on Windows) reject - confirm the target platform.
write_file(apriori_frequent, f'Apriori-min_support={min_support}-{datetime.datetime.now()}.csv', reverse=True)

<h5 style="color: #7d3434">NB: in the next sections some functions from the current section will be reused</h5>

#### ________________________________________________________________________

### Association Rule

In [511]:
def association_rule_props(itemset, data):
    """Compute support, confidence and lift for every rule X => Y.

    For each entry of ``itemset`` every non-empty proper subset X is paired
    with the remaining members Y.

    Args:
        itemset: iterable of hashable itemsets (e.g. tuples of items).
        data: sequence of rows (transactions).

    Returns:
        A dict keyed by ``(X, Y)`` tuples whose values are dicts with the keys
        ``'support'``, ``'confidence'`` and ``'lift'``. Rules whose antecedent
        or consequent never occurs in ``data`` are skipped, because their
        confidence / lift would require a division by zero.
    """
    T = len(data)
    support_counts = {}

    def count_of(items):
        # Memoised support count: every distinct itemset scans the data once.
        if items not in support_counts:
            support_counts[items] = calculate_support_count(items, data, True)
        return support_counts[items]

    rules = {}
    for item in itemset:
        item_count = count_of(item)
        members = set(item)
        # All non-empty proper subsets of the itemset.
        proper_subsets = chain.from_iterable(
            combinations(item, size) for size in range(1, len(item))
        )
        for x in proper_subsets:
            # Sort the consequent: iterating a raw set gives an arbitrary order,
            # which made rule keys differ between runs.
            y = tuple(sorted(members.difference(x)))
            if (x, y) in rules:
                continue
            x_count = count_of(x)
            y_count = count_of(y)
            if not x_count or not y_count:
                continue
            support = item_count / T
            support_x = x_count / T
            rules[(x, y)] = {
                'support': support,
                'confidence': item_count / x_count,
                'lift': support / (support_x * y_count / T)
            }
    return rules

In [513]:
def association_rule(min_support, min_confidence, data_file_name, frequent_file_name):
    """Derive association rules from stored frequent itemsets and filter them.

    Args:
        min_support: inclusive lower bound on a rule's support.
        min_confidence: inclusive lower bound on a rule's confidence.
        data_file_name: CSV file of transactions (its last column is dropped).
        frequent_file_name: CSV file holding one frequent itemset per row.

    Returns:
        A dict ``{(X, Y): properties}`` restricted to the rules meeting both
        thresholds.
    """
    transactions = read_file(data_file_name, skipEmptyColumn=True)
    frequent_itemsets = read_file(frequent_file_name, skipEmptyColumn=False)
    selected = {}
    for rule, metrics in association_rule_props(frequent_itemsets, transactions).items():
        # Keep only the rules that satisfy both supplied thresholds.
        if metrics['support'] >= min_support and metrics['confidence'] >= min_confidence:
            selected[rule] = metrics
    return selected

<span style="color:gray">Testing Association Rule</span>

In [723]:
# These assignments rebind the notebook-level min_support used by the Apriori cell above.
min_support = 0.1
min_confidence = 0.468
# The frequent-itemset file name is hard-coded to the output of an earlier Apriori run.
association_rule(min_support, min_confidence, dataset_name, 'Apriori-min_support=0.1-2021-03-21 00:45:30.864484.csv')

{(('Lassi', 'Panner'), ('Sweet',)): {'support': 0.10098994092288041,
  'confidence': 0.5066079295154186,
  'lift': 1.1573538072424097},
 (('Lassi', 'Sweet'), ('Panner',)): {'support': 0.10098994092288041,
  'confidence': 0.49107142857142855,
  'lift': 1.129897265666002},
 (('Panner', 'Sweet'), ('Lassi',)): {'support': 0.10098994092288041,
  'confidence': 0.5049900199600799,
  'lift': 1.1644891366016128},
 (('Lassi',), ('Sweet',)): {'support': 0.20565224333386556,
  'confidence': 0.4742268041237113,
  'lift': 1.0833786154392868},
 (('Sweet',), ('Lassi',)): {'support': 0.20565224333386556,
  'confidence': 0.4698157942732081,
  'lift': 1.0833786154392864},
 (('Butter',), ('Sugar',)): {'support': 0.20525307360689765,
  'confidence': 0.4690749863163656,
  'lift': 1.071804684166143},
 (('Sugar',), ('Butter',)): {'support': 0.20525307360689765,
  'confidence': 0.4689894199197373,
  'lift': 1.071804684166143},
 (('Panner',), ('Bread',)): {'support': 0.20357656075363245,
  'confidence': 0.46840

#### ________________________________________________________________________

### FP-Growth Algorithm

In [249]:
class node:
    """A single vertex of the FP-tree."""

    def __init__(self, entity, parent, count):
        # entity: item name; parent_node: the node one level up.
        self.entity, self.parent_node = entity, parent
        # child_nodes: children keyed by item name; count: number of passes
        # through this node.
        self.child_nodes, self.count = dict(), count
        # cond_count: per-item counts for the conditional FP-tree, defaulting to 0.
        self.cond_count = defaultdict(int)

In [719]:
def fp_tree(data, min_support):
    """Build an FP-tree from the transactions in ``data``.

    Args:
        data: sequence of rows (transactions).
        min_support: minimum support (fraction of rows) for an item to be
            inserted into the tree.

    Returns:
        A ``(root, node_links)`` tuple. ``root`` is the entity-less root node;
        ``node_links`` maps every frequent item, in descending order of
        support, to the list of tree nodes carrying that item.
    """
    one_unique_dict = calculate_support(data, None)
    el_one_unique = support_elimination(data, None, min_support, one_unique_dict)
    # Frequent items by descending support. This previously read an undefined
    # name (`dct`) and did not restrict the ordering to frequent items.
    ordered_items = sorted(
        el_one_unique, key=lambda name: one_unique_dict[name], reverse=True
    )
    # Constant-time position lookup instead of list.index inside the sort key.
    rank = {name: position for position, name in enumerate(ordered_items)}
    node_links = {name: [] for name in ordered_items}
    null_node = node(None, None, 1)
    for transaction in data:
        # Drop infrequent items, count each item once per transaction, and
        # insert in the global frequency order so shared prefixes share a path.
        kept = sorted(
            {name for name in transaction if name in rank},
            key=rank.__getitem__,
        )
        current = null_node
        for name in kept:
            child = current.child_nodes.get(name)
            if child is None:
                child = node(name, current, 1)
                current.child_nodes[name] = child
                node_links[name].append(child)
            else:
                child.count += 1
            current = child
    # node_links was created in frequency order, so no re-sorting is needed.
    return (null_node, node_links)

In [715]:
def conditional_fp_tree(node_links):
    """Propagate each linked node's count to all of its ancestors.

    For every node referenced in ``node_links`` the node's ``count`` is added,
    under the node's item name, to the ``cond_count`` of each ancestor up to
    and including the entity-less root. Nodes are updated in place; nothing
    is returned.
    """
    for linked_nodes in node_links.values():
        for origin in linked_nodes:
            label, weight = origin.entity, origin.count
            cursor = origin
            # Climb until the node without an entity (the root) is reached.
            while cursor.entity is not None:
                cursor = cursor.parent_node
                cursor.cond_count[label] += weight

In [716]:
## the rest is unfinished

In [717]:
def fp_growth(filename, max_length, min_support):
    """Run the implemented part of FP-Growth on a CSV file.

    Reads the transactions, builds the FP-tree and fills in the conditional
    counts. The mining step is unfinished: ``max_length`` is not used yet and
    nothing is returned.
    """
    transactions = read_file(filename, True)
    _root, header_links = fp_tree(transactions, min_support)
    conditional_fp_tree(header_links)

In [718]:
# Time the (unfinished) FP-Growth pipeline; fp_growth returns nothing, so only
# file reading, tree construction and conditional counting are measured.
start = timer()
fp_growth('GroceryStore.csv', 5, 0.205)
end = timer()
# The message uses the notebook-level dataset_name, not the literal passed above.
print(f'FP-Growth algorithm took appx. {"{:.2f}".format(end - start)}s to run on the {dataset_name} dataset')

['Milk', 'Ghee', 'Coffee Powder', 'Yougurt', 'Bread', 'Sweet', 'Sugar', 'Butter', 'Cheese', 'Panner', 'Lassi', 'Tea Powder']
FP-Growth algorithm took appx. 0.12s to run on the GroceryStore.csv dataset


#### ________________________________________________________________________

### Experiments on the Dataset

<div style="color:gray">
    <span>Judging by the lift, we can see that after purchasing Panner &amp; Sweet, customers are more likely to purchase Lassi.</span><br><br>
    <span>Frequent gen params:</span>
    <ul>
        <li>min_support = 0.1</li>
        <li>max_itemset_length = 3</li>
        <li>is_fixed_length = False</li>
    </ul>
</div>

In [652]:
# min_support=0 disables the support filter, so rules are selected by confidence only.
association_rule(min_support=0, min_confidence=0.49, 
                 data_file_name=dataset_name, frequent_file_name='Apriori-min_support=0.1-2021-03-21 00:45:30.864484.csv')

{(('Lassi', 'Panner'), ('Sweet',)): {'support': 0.10098994092288041,
  'confidence': 0.5066079295154186,
  'lift': 1.1573538072424097},
 (('Lassi', 'Sweet'), ('Panner',)): {'support': 0.10098994092288041,
  'confidence': 0.49107142857142855,
  'lift': 1.129897265666002},
 (('Panner', 'Sweet'), ('Lassi',)): {'support': 0.10098994092288041,
  'confidence': 0.5049900199600799,
  'lift': 1.1644891366016128}}

<div style="color:gray">
    <span>For 4-itemsets we can see that Panner, Sweet and Tea Powder should be placed together, especially if the goal is to increase Lassi sales.</span><br><br>
    <span>Frequent gen params:</span>
    <ul>
        <li>min_support = 0.048</li>
        <li>max_itemset_length = 4</li>
        <li>is_fixed_length = True</li>
    </ul>
</div>

In [645]:
# min_support=0 disables the support filter, so rules are selected by confidence only.
association_rule(min_support=0, min_confidence=0.52, 
                 data_file_name=dataset_name, frequent_file_name='Apriori-min_support=0.048-2021-03-20 23:12:47.216416.csv')

{(('Milk', 'Panner', 'Sweet'), ('Lassi',)): {'support': 0.04941721219862685,
  'confidence': 0.5254668930390493,
  'lift': 1.21170808214417},
 (('Lassi', 'Panner', 'Tea Powder'),
  ('Sweet',)): {'support': 0.04981638192559476, 'confidence': 0.5324232081911263, 'lift': 1.2163292186398043},
 (('Panner', 'Sweet', 'Tea Powder'),
  ('Lassi',)): {'support': 0.04981638192559476, 'confidence': 0.5342465753424658, 'lift': 1.231953719208344}}

<div style="color:gray">
    <span>Tea Powder, Coffee Powder and Milk (ranked from highest to lowest probability) are the items most likely to go in the basket with Panner when a customer buys Lassi and Sweet.</span><br><br>
    <span>Frequent gen params:</span>
    <ul>
        <li>min_support = 0.0235</li>
        <li>max_itemset_length = 5</li>
        <li>is_fixed_length = True</li>
    </ul>
</div>

In [730]:
# Rebinds the notebook-level min_confidence; it is reused for the output file name below.
min_confidence = 0.53
# min_support=0 disables the support filter, so rules are selected by confidence only.
rule = association_rule(min_support=0, min_confidence=min_confidence, 
                 data_file_name=dataset_name, frequent_file_name='Apriori-min_support=0.0235-2021-03-20 23:47:48.888064.csv')
pprint.pprint(rule)

{(('Coffee Powder', 'Lassi', 'Panner', 'Tea Powder'), ('Sweet',)): {'confidence': 0.5376344086021505,
                                                                    'lift': 1.2282342881908694,
                                                                    'support': 0.023950183618074404},
 (('Coffee Powder', 'Panner', 'Sweet', 'Tea Powder'), ('Lassi',)): {'confidence': 0.5494505494505495,
                                                                    'lift': 1.2670135461004386,
                                                                    'support': 0.023950183618074404},
 (('Lassi', 'Milk', 'Panner', 'Tea Powder'), ('Sweet',)): {'confidence': 0.5314183123877917,
                                                           'lift': 1.2140335183238153,
                                                           'support': 0.02363084783650008},
 (('Milk', 'Panner', 'Sweet', 'Tea Powder'), ('Lassi',)): {'confidence': 0.5323741007194245,
                                   

In [671]:
## we can write the acquired rules in a file
# Only the rule keys are written: each CSV row is one (antecedent, consequent)
# pair, and the support / confidence / lift values are not saved.
# NOTE(review): both cells of a row are tuples, which csv.writer stores as their
# str() text - confirm this is the intended file format.
write_file(rule.keys(), f'Association_rule-confidence={min_confidence}-{datetime.datetime.now()}.csv', reverse=False)