In [None]:
'''
CONSTANTS
'''

# Path of the utility-transaction database handed to get_high_utility_itemsets.
# Each non-empty line is parsed there as three ':'-separated fields:
# whitespace-separated items, the transaction utility, and the per-item utilities.
FILE_PATH = "../data/Chicago_Crimes_2001_to_2017_utility.txt"
# Minimum utility threshold: used for TWU pruning during candidate generation
# and again when filtering candidates by their exact utility.
MIN_UTIL = 500000

In [None]:
'''
IHUP-Tree Node Class
    Stores the item, twu, tf, parent, children, and node link
'''
class Node:
    """A single node of the IHUP-tree.

    Attributes:
        item: The item this node stands for (``None`` is used for a root).
        twu: Accumulated transaction-weighted utility of every transaction
            routed through this node.
        tf: Number of transactions routed through this node. It is kept up
            to date but nothing in the mining code reads it.
        parent: The node one level closer to the root, or ``None``.
        children: Child nodes, keyed by their item.
        node_link: Next node in the tree carrying the same item, or ``None``.
    """

    def __init__(self, item, twu, tf=1, parent=None):
        # Position of the node inside the tree.
        self.parent = parent
        self.children = {}
        self.node_link = None

        # Payload carried by the node.
        self.item = item
        self.twu = twu
        self.tf = tf

    def increment_vals(self, twu, tf=1):
        """Add ``twu`` and ``tf`` to the running totals of this node."""
        self.tf += tf
        self.twu += twu
        self.tf += tf

In [None]:
'''
IHUP-Tree Class
    Stores an empty root and a header table which points to the first instance of each
    item in the tree. This allows for fast traversal through all instances of an item
    and easier collection of prefix paths
'''
class IHUPTree:
    """Prefix tree of transactions plus a header table of per-item node chains.

    Attributes:
        root: Sentinel node whose item is ``None``; real items hang below it.
        header_table: Maps each item to the first node created for it. Every
            later node for the same item is reachable by following
            ``node_link`` from that first node, in creation order.
    """

    def __init__(self):
        self.root = Node(item=None, twu=0, tf=0)
        self.header_table = {}
        # item -> last node appended to that item's node-link chain. Lets
        # update_header append in O(1) instead of re-walking the chain from
        # the head for every new node (quadratic in nodes per item).
        self._chain_tails = {}

    def update_header(self, node):
        """Register ``node`` in the header structure.

        The first node seen for an item becomes the header-table entry; any
        later node is appended to the end of that item's node-link chain.

        Args:
            node: Freshly created tree node; only ``node.item`` is read.
        """
        head = self.header_table.get(node.item)
        if head is None:
            self.header_table[node.item] = node
        else:
            # Start from the cached tail. The loop normally runs zero times;
            # it only advances if the chain was extended behind our back, so
            # the new node still ends up at the true end of the chain.
            tail = self._chain_tails.get(node.item, head)
            while tail.node_link is not None:
                tail = tail.node_link
            tail.node_link = node
        self._chain_tails[node.item] = node

    def insert_transaction(self, transaction, transaction_utility):
        """Insert one transaction as a path starting at the root.

        Nodes already present along the path have ``transaction_utility``
        added to their totals; missing nodes are created with it as their
        initial TWU and registered in the header table.

        Args:
            transaction: Iterable of items, in the order they should appear
                on the path from the root.
            transaction_utility: Utility credited to every node on the path.
        """
        current = self.root
        for item in transaction:
            child = current.children.get(item)
            if child is None:
                child = Node(item, transaction_utility, parent=current)
                current.children[item] = child
                self.update_header(child)
            else:
                child.increment_vals(transaction_utility)
            current = child

In [None]:
# CANDIDATE GENERATION AND MINING METHODS

'''
Retrieves the prefix path for a node, traverses from the given node to the root, 
and returns the path
'''
def get_prefix_path(node):
    """Return the items of ``node``'s ancestors, ordered from the top down.

    The walk starts at ``node.parent`` and stops at the first ancestor that
    is missing or whose item is ``None`` (the root sentinel). ``node``'s own
    item is not part of the result.
    """
    ancestors = []
    walker = node.parent
    while walker and walker.item is not None:
        # Collected bottom-up here; flipped once after the walk.
        ancestors.append(walker.item)
        walker = walker.parent
    ancestors.reverse()
    return ancestors


'''
Creates the projected tree for an item. Uses the header-table to traverse all instances of
the given item, and inserts the prefix path for each occurrence of the item into a new
IHUP-tree
'''
def get_projected_tree(full_tree, item):
    """Build the projected (conditional) tree of ``item``.

    Every occurrence of ``item`` in ``full_tree`` is visited through the
    header table and its node links. The prefix path of each occurrence is
    inserted into a fresh IHUPTree, weighted by that occurrence's TWU.
    Occurrences sitting directly under the root have an empty prefix path
    and contribute nothing.
    """
    def occurrences():
        # Follow the node-link chain starting at the header-table entry.
        cursor = full_tree.header_table.get(item)
        while cursor:
            yield cursor
            cursor = cursor.node_link

    projected = IHUPTree()
    for occurrence in occurrences():
        path = get_prefix_path(occurrence)
        if path:
            projected.insert_transaction(path, occurrence.twu)
    return projected


'''
DFS recursive method mining the candidates from IHUP-tree. Iterates through each item in
lexicographic/numeric order, adding each item to the current prefix (initially empty).
Computes the TWU, and prunes if TWU < minutil, else adds to candidates, then generates
projected tree for the item, and recurses
'''
def get_candidates(tree, minutil, prefix, candidates):
    """Recursively collect every itemset whose TWU reaches ``minutil``.

    Args:
        tree: Tree to mine; its ``header_table`` keys must be convertible
            with ``int`` because items are visited in numeric order.
        minutil: TWU threshold; items below it are skipped together with
            everything that would extend them.
        prefix: List of items already fixed by the enclosing calls.
        candidates: Output dict, filled in place. Keys are tuples of items
            sorted numerically, values are the TWU computed for them.
    """
    for item in sorted(tree.header_table, key=int):
        extended = prefix + [item]

        # Total TWU of the item = sum over its whole node-link chain.
        twu_sum = 0
        occurrence = tree.header_table[item]
        while occurrence:
            twu_sum += occurrence.twu
            occurrence = occurrence.node_link

        if twu_sum >= minutil:
            candidates[tuple(sorted(extended, key=int))] = twu_sum
            projected = get_projected_tree(tree, item)
            # An empty projected tree has nothing left to extend with.
            if projected.root.children:
                get_candidates(projected, minutil, extended, candidates)



'''
Reparses the original database to compute the exact utility of each candidate itemset,
pruning those that do not meet the minimum utility threshold.
'''
def exact_high_utils(candidates, transactions, minutil):
    """Compute the exact utility of each candidate and keep the high ones.

    Args:
        candidates: Iterable of candidate itemsets (tuples of items).
        transactions: Sequence of ``(items, utils)`` pairs where ``utils[k]``
            is the utility of ``items[k]`` in that transaction.
        minutil: Minimum exact utility a candidate needs to be kept.

    Returns:
        Dict mapping each surviving candidate to its exact utility, i.e. the
        sum of its items' utilities over every transaction containing all of
        them.
    """
    # One item -> utility lookup per transaction, built once up front.
    utility_lookups = [dict(zip(items, utils)) for items, utils in transactions]

    high_utils = {}
    for cand in candidates:
        required = set(cand)
        total = 0.0
        for lookup in utility_lookups:
            # Only transactions holding every item of the candidate count.
            if lookup.keys() >= required:
                total += sum(lookup[i] for i in cand)
        if total >= minutil:
            high_utils[cand] = total

    return high_utils



'''
Driver method for the mining process. Passes through the original database, and adds each
transaction to a global IHUP tree, then generates candidates and computes exact utilities
'''
def get_high_utility_itemsets(file_path, minutil):
    """Mine the high-utility itemsets of a transaction file.

    Each non-empty line is split on ':' into three fields: whitespace
    separated items, the transaction utility, and whitespace separated
    per-item utilities. Items are ordered numerically before the transaction
    is inserted into the tree. Candidates are then generated by TWU pruning
    and finally filtered by their exact utility.

    Args:
        file_path: Path of the transaction file to read.
        minutil: Minimum utility threshold for both mining phases.

    Returns:
        Dict mapping each high-utility itemset (tuple of items) to its exact
        utility.
    """
    tree = IHUPTree()
    transactions = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            record = raw_line.strip()
            if not record:
                continue

            fields = record.split(':')
            item_tokens = fields[0].split()
            transaction_utility = float(fields[1])
            item_utilities = [float(token) for token in fields[2].split()]

            # Keep every utility attached to its item while sorting by the
            # numeric value of the item.
            ordered = sorted(
                zip(item_tokens, item_utilities),
                key=lambda pair: int(pair[0]),
            )
            items = [pair[0] for pair in ordered]
            utils = [pair[1] for pair in ordered]

            transactions.append((items, utils))
            tree.insert_transaction(items, transaction_utility)

    candidates = {}
    get_candidates(tree, minutil, [], candidates)

    print(f"IHUP cands @{minutil}: {len(candidates)}")

    return exact_high_utils(candidates, transactions, minutil)

In [None]:
import os


def write_results_to_file(results, input_file, minutil, output_folder="../output"):
    """Write the mined itemsets to a text file, one itemset per line.

    The file is named ``<input stem>_high_utility_itemsets_min<minutil>.txt``
    and placed in ``output_folder``, which is created when missing. Every
    line holds the space-separated items followed by the utility.

    Args:
        results: Dict mapping itemsets (tuples of strings) to utilities.
        input_file: Path of the mined input file; only its base name without
            extension is used.
        minutil: Threshold used for the run, embedded in the file name.
        output_folder: Directory receiving the output file.
    """
    os.makedirs(output_folder, exist_ok=True)

    # "dir/name.ext" -> "name"
    stem, _extension = os.path.splitext(os.path.basename(input_file))
    output_path = os.path.join(
        output_folder,
        f"{stem}_high_utility_itemsets_min{minutil}.txt",
    )

    with open(output_path, "w") as out:
        out.writelines(
            f"{' '.join(itemset)} {utility}\n"
            for itemset, utility in results.items()
        )

    print(f"Results have been written to: {output_path}")


In [8]:
def run():
    """Mine FILE_PATH at the MIN_UTIL threshold, report the count, return the itemsets."""
    high_utility_itemsets = get_high_utility_itemsets(FILE_PATH, MIN_UTIL)
    print(f"Run complete: found {len(high_utility_itemsets)} HUIs @ {MIN_UTIL}")
    return high_utility_itemsets


In [9]:
# Run the mining pipeline with the module-level FILE_PATH and MIN_UTIL.
run()

IHUP cands @500000: 38
Run complete: found 9 HUIs @ 500000


{('1',): 1640506.0,
 ('8',): 1442717.0,
 ('1', '8'): 971931.0,
 ('12',): 885431.0,
 ('1', '12'): 516739.0,
 ('8', '12'): 737414.0,
 ('17',): 923000.0,
 ('1', '17'): 616170.0,
 ('8', '17'): 636657.0}