In [1]:
from collections import defaultdict

transactions = []
min_util = 25000
parsed_trans = []
TWU = defaultdict(int)
all_candidates = set()
with open('shortened_chainstore.txt', 'r') as file:
    for line in file:
        transactions.append(line.strip())

In [None]:
# where we calculate the TWU value for each unique 1-itemset
def parse_twu(line, tid):
    items, total_util, item_utils = line.split(':')
    items = list(map(int, items.strip().split()))
    total_util = float(total_util.strip())
    item_utils = list(map(float, item_utils.strip().split()))
    # [0] = tid, [1] = list of items, [2] = list of item utilities, [3] = total utility
    parsed_trans.append((tid, items, item_utils, total_util))
    # parse item by item in each transaction such that each item is accounted for
    for item in items:
        TWU[item] += total_util

In [None]:
# high utility 1-itemsets that have TWU >= min_util
def high_utility_one_itemset():
    oneItemset = set()
    for item in TWU:
        # filtering process based on min utility threshold
        # loose upper bound filtering
        if TWU[item] >= min_util:
            oneItemset.add(item)
    return oneItemset

In [None]:
from itertools import combinations
# using Apriori algorithm to generate candidates
def generate_candidates(prev_HU_itemsets, k):
    candidates = set()
    all_items = set()
    for itemset in prev_HU_itemsets:
        if k == 2:
            all_items.add(itemset)
        else:
            for item in itemset:
                all_items.add(item)
    # python combinations() method performs as such:
    # combinations('ABCD', 2) = AB AC AD BC BD CD
    possible_combinations = combinations(sorted(all_items), k)
    
    for combo in possible_combinations:
        all_HU_items = True
        k_minus_1_subsets = combinations(combo, k-1)
    
        for subset in k_minus_1_subsets:
            sorted_subset = tuple(sorted(subset))
            for element in sorted_subset:
                # if the element is not high-utility, then its supersets cannot be high-utility
                if element not in prev_HU_itemsets:
                    all_HU_items = False
                    break   
        # if all subsets are HU, then we can add the itemset to our candidates
        if all_HU_items == True:
            candidates.add(tuple(sorted(combo)))
    return candidates

In [None]:
# using TWU upper-bound to filter low-utility itemsets
def filter_by_twu(candidates, min_util):
    candidate_twu = defaultdict(int)
    hu_candidates = set()
    for transaction in parsed_trans:
        for candidate in candidates:
            contains = True
            for item in candidate:
                if item not in transaction[1]:
                    contains = False
            # checking if each item in candidate itemset is in the transaction
            if contains == True:
                candidate_twu[candidate] += transaction[3]
    for itemset, twu in candidate_twu.items():
        if twu >= min_util:
            hu_candidates.add(itemset)
    return hu_candidates

In [None]:
# going through remaining candidates that passed the TWU >= minutil threshold
# calculating their actual utility to compare against minutil
def compute_exact_utility(candidates, min_util):
    candidate_util = 0
    high_utility_itemsets = dict()
    for candidate in candidates:
        for transaction in parsed_trans:
            contains = True
            if isinstance(candidate, int):
                if candidate not in transaction[1]:
                    contains = False
            else:
                for item in candidate:
                    if item not in transaction[1]:
                        contains = False
            # checking if each item in candidate itemset is in the transaction
            if contains == True:
                if isinstance(candidate, int):
                    item_list = transaction[1]
                    item_index = item_list.index(candidate)
                    item_utility_list = transaction[2]
                    item_util = item_utility_list[item_index]
                    candidate_util += item_util
                else:
                    for item in candidate:
                        item_list = transaction[1]
                        item_index = item_list.index(item)
                        item_utility_list = transaction[2]
                        item_util = item_utility_list[item_index]
                        candidate_util += item_util
        if candidate_util >= min_util:
            high_utility_itemsets[candidate] = candidate_util
            
    return high_utility_itemsets

In [None]:
from datetime import datetime

# running all the necessary parts in order
def run():     
    start_time = datetime.now()
    # PHASE ONE
    # calculating TWU of all 1-itemsets
    for tid, line in enumerate(transactions):
        parse_twu(line, tid)
    
    # filtering out all low-utility 1-itemsets based on TWU threshold
    HU_itemsets = high_utility_one_itemset()
    all_candidates.update(HU_itemsets)
    # apriori implementation
    k = 2
    while HU_itemsets:
        # using (k-1)-high-utility itemsets to create k-itemset candidates
        candidates = generate_candidates(HU_itemsets, k)
        HU_itemsets = filter_by_twu(candidates, min_util)
        all_candidates.update(HU_itemsets)
        k += 1
    print(k)
    
    # PHASE TWO
    final_high_utility_itemsets = compute_exact_utility(all_candidates, min_util)
    print(len(final_high_utility_itemsets))
    print(final_high_utility_itemsets)
    end_time = datetime.now()
    lapsed_time = end_time - start_time
    print("Lapsed time = ", lapsed_time)

In [8]:
run()

4
9845
{(14974, 15969): 25426.0, (28092, 30280): 26164.0, (20459, 21210): 26682.0, (8071, 14588): 27150.0, (9119, 35159): 28088.0, (1782, 6031): 28416.0, (39435, 39610): 29811.0, (14918, 16976): 31347.0, (7224, 38515): 31635.0, (14533, 14627): 32033.0, (8084, 19896): 32637.0, (19894, 19896): 33918.0, (14520, 21747): 34935.0, (7841, 21895): 35259.0, (24521, 44125): 36190.0, (7632, 34553): 36838.0, (13737, 20843): 38456.0, (498, 35289): 38780.0, (4472, 43418): 39504.0, (21929, 28305): 40042.0, (9090, 21747): 40600.0, (3566, 3948): 41039.0, (16852, 38515): 41287.0, (12508, 27177): 41925.0, (3948, 24429): 42673.0, (5634, 23151): 43669.0, (12297, 21746): 44297.0, (9551, 18880): 44673.0, (4472, 12277): 45161.0, (9119, 16971): 45609.0, (16844, 22826): 45987.0, (12508, 21929): 46225.0, (16971, 19898): 46593.0, (16971, 17454): 46881.0, (16124, 21719): 47269.0, (15457, 34465): 47997.0, (14312, 20148): 48515.0, (12297, 14054): 49183.0, (4974, 5735): 49701.0, (12370, 21214): 50619.0, (14627, 27070