In [1]:
import copy
import random
random.seed(42)
import datetime
import heapq

from collections import namedtuple

In [None]:
class Item:
    """An item identified by its label and its unit utility.

    Equality and hashing are based on the ``(item, utility)`` pair only. The
    ``twu`` value starts at 0, can be reassigned through the property, and
    takes no part in comparisons.
    """

    def __init__(self, item: str, utility: int):
        self._twu = 0
        self.item = item
        self.utility = utility

    def __repr__(self):
        return f"{self.item}"

    def __hash__(self):
        return hash((self.item, self.utility))

    def __eq__(self, other):
        # Anything that is not an Item is simply "not equal".
        if not isinstance(other, Item):
            return False
        return self.item == other.item and self.utility == other.utility

    @property
    def twu(self) -> int:
        """Value currently stored for this item's TWU (0 until assigned)."""
        return self._twu

    @twu.setter
    def twu(self, value: int) -> None:
        self._twu = value


def check_order_condition(a: Item, b: Item) -> bool:
    """Tell whether ``a`` comes after (or ties with) ``b`` in the processing order.

    Args:
        a (Item): the item being tested
        b (Item): the item it is compared against

    Returns:
        bool: True when ``a`` has a non-positive utility and ``b`` a positive
        one; otherwise, when the product of the two utilities is non-negative,
        whether ``a.twu >= b.twu``; False in every remaining case.
    """
    if a.utility <= 0 and b.utility > 0:
        return True
    # Only items whose utilities do not have opposite signs are ranked by TWU.
    if a.utility * b.utility < 0:
        return False
    return a.twu >= b.twu


def check_order_item_and_set(item: Item, item_set: set[Item]) -> bool:
    """Check whether ``item`` is ordered after every other member of ``item_set``.

    Example: ``a > {b, c}`` holds when ``a > b`` and ``a > c``.

    Args:
        item (Item): the item being tested
        item_set (set[Item]): the item set it is compared against

    Returns:
        bool: True if ``check_order_condition(item, other)`` holds for every
        member of ``item_set`` different from ``item`` (vacuously True for an
        empty set), False otherwise.
    """
    return not any(
        other != item and not check_order_condition(item, other)
        for other in item_set
    )


class TransItem:
    """An item as it appears in one transaction, with its quantity and probability."""

    def __init__(self, item: Item, quantity: int, probability: float):
        self.item, self.quantity, self.probability = item, quantity, probability

    def get_total_probability(self):
        """Return ``quantity * probability`` for this entry."""
        total = self.quantity * self.probability
        return total

    def __repr__(self):
        return f"{self.item},{self.quantity},{self.probability}"


class Transaction:
    """A transaction: an id plus a ``(quantity, probability)`` pair per item.

    The items are kept in ``trans_items_dict``, keyed by item, so membership,
    quantity and probability lookups are direct dictionary accesses.
    """

    def __init__(self, id: int, trans_items: list[TransItem]):
        self.id = id
        # item -> (quantity, probability); a repeated item keeps its last entry.
        self.trans_items_dict = {}
        for entry in trans_items:
            self.trans_items_dict[entry.item] = (entry.quantity, entry.probability)

    def __repr__(self):
        return f"Transaction(id={self.id}, items={list(self.trans_items_dict.keys())}, quantities={list(self.trans_items_dict.values())})"

    def get_items(self) -> set[Item]:
        """Return a new set holding the items of this transaction."""
        return set(self.trans_items_dict.keys())

    def contains_item_set(self, item_set: set[Item]) -> bool:
        """Return True when every item of ``item_set`` occurs in this transaction."""
        return item_set.issubset(self.trans_items_dict.keys())

    def get_quantity_of_item(self, item: Item) -> int:
        """Return the stored quantity of ``item``, or 0 when it is absent."""
        quantity, _probability = self.trans_items_dict.get(item, (0, 0))
        return quantity

    def get_probability_of_item(self, item: Item) -> float:
        """Return the stored probability of ``item``, or 0 when it is absent."""
        _quantity, probability = self.trans_items_dict.get(item, (0, 0))
        return probability

    def get_probability_of_item_set(self, item_set: set[Item]) -> float:
        """Return the product of the item probabilities, or 0.0 if the set is not contained."""
        if self.contains_item_set(item_set):
            product = 1.0
            for member in item_set:
                product *= self.get_probability_of_item(member)
            return product
        return 0.0

    def _calculate_utility(self, item_set: set[Item], condition: callable) -> int:
        """Sum ``utility * quantity`` over the members whose utility satisfies ``condition``.

        Returns 0 when the transaction does not contain the whole item set.
        """
        if not self.contains_item_set(item_set):
            return 0
        accumulated = 0
        for member in item_set:
            if not condition(member.utility):
                continue
            accumulated += member.utility * self.get_quantity_of_item(member)
        return accumulated

    def get_utility_of_item_set(self, item_set: set[Item]) -> int:
        """Utility of the item set in this transaction, all signs included."""
        return self._calculate_utility(item_set, lambda value: True)

    def get_positive_utility_of_item_set(self, item_set: set[Item]) -> int:
        """Utility contributed by the members with a strictly positive utility."""
        return self._calculate_utility(item_set, lambda value: value > 0)

    def get_negative_utility_of_item_set(self, item_set: set[Item]) -> int:
        """Utility contributed by the members with a strictly negative utility."""
        return self._calculate_utility(item_set, lambda value: value < 0)

    def get_remaining_utility_of_item_set(self, items: set[Item]) -> int:
        """Sum the utility of the positive items outside ``items`` that are ordered after it.

        An item counts when its utility is positive, it is not a member of
        ``items`` and ``check_order_item_and_set(item, items)`` holds.
        """
        candidates = {
            member
            for member in self.trans_items_dict.keys()
            if member.utility > 0 and member not in items
        }
        remaining = 0
        for member in candidates:
            if not check_order_item_and_set(member, items):
                continue
            remaining += member.utility * self.trans_items_dict[member][0]
        return remaining

class PriorityQueue:
    """Bounded store of the ``max_size`` item sets with the highest utility.

    ``heap`` is a ``heapq`` min-heap of ``(utility, item_set)`` tuples, so the
    smallest retained utility is always ``heap[0][0]``. ``item_sets`` mirrors
    the heap content as frozensets and is used to reject duplicates.
    """

    def __init__(self, max_size: int):
        self.max_size = max_size
        self.heap: list[tuple[int, set[Item]]] = []
        self.item_sets: set[frozenset] = set()

    def push(self, utility: int, item_set: set):
        """Offer an item set to the queue.

        The set is ignored when the queue has no capacity, when an equal item
        set is already stored, or when the queue is full and ``utility`` does
        not exceed the smallest stored utility. Otherwise it is inserted,
        evicting the current minimum if the queue is full.
        """
        # A queue without capacity can never hold anything; without this guard
        # the "full" branch below would index into an empty heap.
        if self.max_size <= 0:
            return

        fs_item_set = frozenset(item_set)
        if fs_item_set in self.item_sets:
            return

        if len(self.heap) < self.max_size:
            heapq.heappush(self.heap, (utility, item_set))
            self.item_sets.add(fs_item_set)
        elif utility > self.heap[0][0]:
            removed = heapq.heappushpop(self.heap, (utility, item_set))
            self.item_sets.remove(frozenset(removed[1]))
            self.item_sets.add(fs_item_set)

    def get_min_utility(self) -> int:
        """Return the smallest stored utility, or ``float("-inf")`` when empty."""
        if self.heap:
            return self.heap[0][0]
        return float("-inf")

    def sort(self):
        """Return the stored entries ordered by decreasing utility.

        Only the utility is used as sort key, so entries with equal utility
        are never ordered by comparing their item sets.
        """
        return sorted(self.heap, key=lambda entry: entry[0], reverse=True)

    def print_items(self):
        """Print every stored item set with its utility, best first."""
        for utility, item_set in self.sort():
            print(f"{item_set}: {utility}")


# Utilities = namedtuple("Utilities", ["tid", "pro", "pu", "nu", "ru"])
from dataclasses import dataclass


@dataclass
class Utilities:
    """One per-transaction entry of a utility list (see ``AbstractList``).

    Instances compare field by field (dataclass default). Because the
    dataclass is not frozen, instances are not hashable.
    """

    tid: int  # transaction id; entries of two lists are matched on this value
    pro: float  # probability value; multiplied together when two lists are joined
    pu: float  # presumably the positive utility part; summed with `nu` to get the utility
    nu: float  # presumably the negative utility part; summed with `pu` to get the utility
    ru: float  # presumably the remaining utility; used in the `pu + ru` bound checks


class AbstractList:
    """An item set together with its per-transaction ``Utilities`` entries.

    The ``get_*`` methods return totals over all entries. Two lists are equal
    when both their item sets and their entry lists are equal.
    """

    def __init__(self, items: set[Item], utility_values: list[Utilities]):
        self.items = items
        self.utility_values = utility_values

    def _total(self, field: str):
        """Add up attribute ``field`` over all entries, starting from 0."""
        total = 0
        for entry in self.utility_values:
            total += getattr(entry, field)
        return total

    def get_ru(self) -> int:
        """Return the sum of the ``ru`` values of all entries."""
        return self._total("ru")

    def get_pu(self) -> int:
        """Return the sum of the ``pu`` values of all entries."""
        return self._total("pu")

    def get_nu(self) -> int:
        """Return the sum of the ``nu`` values of all entries."""
        return self._total("nu")

    def get_pro(self) -> float:
        """Return the sum of the ``pro`` values of all entries."""
        return self._total("pro")

    def get_utility(self) -> int:
        """Return the sum of ``pu + nu`` over all entries."""
        u = 0
        for entry in self.utility_values:
            u += entry.pu + entry.nu
        return u

    def __repr__(self):
        items_str = ", ".join(str(item) for item in self.items)
        utility_values_str = ", \n".join(
            str(utility) for utility in self.utility_values
        )
        return f"AbstractList(\n  Items: [{items_str}]\n  Utility Values: \n[{utility_values_str}]\n)"

    def __eq__(self, other):
        if not isinstance(other, AbstractList):
            return False
        return self.items == other.items and self.utility_values == other.utility_values

    def __hash__(self):
        # The entries themselves cannot be hashed (a non-frozen dataclass with
        # eq=True has no __hash__), so hashing tuple(self.utility_values)
        # raised TypeError for every non-empty list. Hash the item set plus
        # the entries' transaction ids instead: equal lists have equal items
        # and equal entries, hence equal hashes.
        return hash(
            (
                frozenset(self.items),
                tuple(entry.tid for entry in self.utility_values),
            )
        )


class PNUList(AbstractList):
    """Utility list that only differs from ``AbstractList`` by its printed name."""

    def __init__(self, items: set[Item], utility_values: list[Utilities]):
        super().__init__(items, utility_values)

    def __repr__(self):
        # Same layout as the base class representation, labelled "PNUList".
        items_str = ", ".join(map(str, self.items))
        utility_values_str = ", \n".join(map(str, self.utility_values))
        return f"PNUList(\n  Items: [{items_str}]\n  Utility Values: \n[{utility_values_str}]\n)"

class MList(AbstractList):
    """Utility list that also records its true items, its prefix list and two bounds.

    On top of the ``AbstractList`` state it stores ``true_items``, ``prefix``
    and the plain values ``ru`` and ``pu`` exactly as passed in.
    """

    def __init__(
        self,
        items: set[Item],
        true_items: set[Item],
        prefix: PNUList,
        utility_values: list[Utilities],
        ru: int,
        pu: int,
    ):
        super().__init__(items, utility_values)
        self.prefix = prefix
        self.true_items = true_items
        self.pu = pu
        self.ru = ru

    def __repr__(self):
        parts = [
            f"MList(items={repr(self.items)}, ",
            f"true_items={repr(self.true_items)}, ",
            f"utility_values={repr(self.utility_values)}, ",
            f"ru={self.ru}, pu={self.pu})",
        ]
        return "".join(parts)

In [3]:
# Sample items: label and unit utility (d has a negative utility).
a = Item("a", 6)
b = Item("b", 7)
c = Item("c", 1)
d = Item("d", -5)
e = Item("e", 3)

# Each entry is TransItem(item, quantity, probability).
# Lists are used instead of sets: Transaction is annotated to take a list, and
# TransItem hashes by identity, so a set would be iterated in a run-dependent
# order and the transactions' internal dictionaries would change from run to run.
t1_trans_items = [TransItem(b, 3, 0.85), TransItem(c, 1, 1.0), TransItem(d, 2, 0.70)]

t2_trans_items = [
    TransItem(a, 1, 1.0),
    TransItem(b, 1, 0.60),
    TransItem(c, 3, 0.75),
    TransItem(e, 1, 0.40),
]

t3_trans_items = [
    TransItem(a, 1, 0.55),
    TransItem(b, 2, 0.60),
    TransItem(c, 4, 1.0),
    TransItem(d, 1, 0.90),
    TransItem(e, 5, 0.40),
]

t4_trans_items = [TransItem(b, 3, 0.90), TransItem(d, 1, 0.45)]

t5_trans_items = [
    TransItem(a, 4, 1.0),
    TransItem(c, 3, 0.85),
    TransItem(d, 2, 0.70),
    TransItem(e, 2, 0.45),
]

t1 = Transaction(1, t1_trans_items)
t2 = Transaction(2, t2_trans_items)
t3 = Transaction(3, t3_trans_items)
t4 = Transaction(4, t4_trans_items)
t5 = Transaction(5, t5_trans_items)

item_list = [a, b, c, d, e]
db1 = [t1, t2, t3, t4, t5]

# db2..db4 get their own Transaction objects (sharing the same Item instances).
# topk_mining_based_on_EFIM removes pruned items from the transactions it is
# given, so databases that alias the same Transaction objects would silently
# affect each other's runs.
all_trans_items = [
    t1_trans_items,
    t2_trans_items,
    t3_trans_items,
    t4_trans_items,
    t5_trans_items,
]
db2, db3, db4 = (
    [
        Transaction(trans_id, trans_items)
        for trans_id, trans_items in enumerate(all_trans_items, start=1)
    ]
    for _ in range(3)
)

In [4]:
def create_trans_items_dictionary(item_list: list[Item], database: list[Transaction]):
    """Map every item of ``item_list`` to the set of transactions that contain it.

    Args:
        item_list (list[Item]): the items to index; each one gets a key, even
            when no transaction contains it
        database (list[Transaction]): the transactions to scan

    Returns:
        dict[Item, set[Transaction]]: item -> transactions containing it.
        Transaction items that are not in ``item_list`` are skipped (the same
        rule ``create_liu_dict`` applies) instead of raising ``KeyError``.
    """
    trans_dict: dict[Item, set[Transaction]] = {item: set() for item in item_list}
    for trans in database:
        for item in trans.get_items():
            containing = trans_dict.get(item)
            if containing is not None:
                containing.add(trans)
    return trans_dict

In [5]:
# Scratch check of the set intersection operator. Dedicated names are used so
# this cell does not rebind `a` and `b`, which hold the sample Item objects.
left_ids = {1, 2, 3}
right_ids = {2, 3, 4}
print(left_ids & right_ids)

{2, 3}


In [6]:
def find_max_min_avg_periodic_of_item_set(item_set: set[Item], db: list[Transaction], trans_item_dict: dict[Item, set[Transaction]]):
    """Compute the maximum, minimum and average period of an item set.

    The transactions containing the whole item set are found by intersecting
    the per-item transaction sets of ``trans_item_dict``. The periods are the
    gaps between consecutive ids in ``0, id_1, ..., id_m, len(db)``.

    Assumes transaction ids are the 1-based positions of the transactions in
    ``db`` (the last period is measured against ``len(db)``) — TODO confirm.

    Args:
        item_set (set[Item]): an item set
        db (list[Transaction]): database
        trans_item_dict (dict[Item, set[Transaction]]): item -> transactions
            containing it; an item missing from the dictionary is treated as
            occurring in no transaction

    Returns:
        tuple[int, int, float]: ``(max_per, min_per, avg_per)``. When no
        transaction contains the item set the only period is ``len(db)``.
    """
    intersection_trans = None
    for item in item_set:
        containing = trans_item_dict.get(item) or set()
        # Seed with the first item's transactions; starting from an empty set
        # would keep the intersection empty forever.
        if intersection_trans is None:
            intersection_trans = set(containing)
        else:
            intersection_trans = intersection_trans.intersection(containing)
        if not intersection_trans:
            break
    if intersection_trans is None:
        # An empty item set is contained in every transaction.
        intersection_trans = set(db)

    trans_ids = sorted(trans.id for trans in intersection_trans)
    boundaries = [0, *trans_ids, len(db)]
    periods = [later - earlier for earlier, later in zip(boundaries, boundaries[1:])]
    return max(periods), min(periods), len(db) / len(periods)

In [7]:
def sort_items_by_twu_and_utility(items: list[Item]) -> list[Item]:
    """Return a new list of the items in processing order.

    Args:
        items (list[Item]): the items to order (left untouched)

    Returns:
        list[Item]: a sorted copy following two rules:
        1. items with a strictly positive utility come before all others
        2. within each group, ascending twu value
    """
    # False sorts before True, so "not positive" pushes an item to the back.
    return sorted(items, key=lambda entry: (not entry.utility > 0, entry.twu))

In [8]:
def calculate_utility_of_item_set_in_database(
    item_set: set[Item], database: list[Transaction]
) -> int:
    """Total utility of an item set over the whole database.

    Args:
        item_set (set[Item]): an item set
        database (list[Transaction]): database

    Returns:
        int: the sum of the item set's utility (positive and negative parts)
        over every transaction that contains it; 0 when none does
    """
    total = 0
    for trans in database:
        if not trans.contains_item_set(item_set):
            continue
        total += trans.get_utility_of_item_set(item_set)
    return total

**Algorithm 1: Positive Real Item Utility strategy**


In [9]:
def priu_pruning(
    utility_arr: list[int],
    k: int
) -> int:
    """Return the k-th largest value of ``utility_arr``.

    When ``k`` is at least the number of values, the smallest value is
    returned instead. The input list is not modified.
    """
    # Clamp the rank to the last position of the descending ordering.
    position = min(k, len(utility_arr)) - 1
    return sorted(utility_arr, reverse=True)[position]

**Algorithm 2: Positive LIU-Exact strategy**


In [10]:
def pliue_strategy(lius: dict[frozenset[Item], int], k: int, current_min_util: int):
    """Raise the minimum-utility threshold using the utilities stored in ``lius``.

    Args:
        lius (dict[frozenset[Item], int]): item set -> utility (as built by
            ``create_liu_dict``)
        k (int): number of patterns wanted
        current_min_util (int): the threshold reached so far

    Returns:
        int: the larger of ``current_min_util`` and the k-th largest utility
        in ``lius`` (the smallest one when ``lius`` holds fewer than ``k``
        values). ``current_min_util`` is returned unchanged when ``lius`` is
        empty, which previously raised ``IndexError``.
    """
    if not lius:
        return current_min_util

    piqu_liu = sorted(lius.values(), reverse=True)
    max_index = min(k, len(piqu_liu)) - 1
    # The threshold may only go up, never down.
    return max(current_min_util, piqu_liu[max_index])

In [11]:
from itertools import combinations

def create_liu_dict(
    item_list: list[Item], database: list[Transaction]
) -> dict[frozenset[Item], int]:
    """Compute the utility of every 2-item set that can be formed from ``item_list``.

    Args:
        item_list (list[Item]): a sorted item list
        database (list[Transaction]): database

    Returns:
        dict[frozenset[Item], int]: a dictionary whose keys are the 2-item
        sets and whose values are their utilities in the database
        (0 when no transaction contains both items),
        example: {{a, b}:50,..}
    """
    # item -> transactions containing it, built once as sets so each pair below
    # only needs a single intersection.
    item_transaction_map = {item: set() for item in item_list}
    for trans in database:
        for item in trans.get_items():
            containing = item_transaction_map.get(item)
            if containing is not None:
                containing.add(trans)

    liu_dict = {}
    for item1, item2 in combinations(item_list, 2):
        pair = {item1, item2}
        # Every transaction in the intersection contains both items, so no
        # further containment check is needed before summing.
        relevant_transactions = item_transaction_map[item1] & item_transaction_map[item2]
        liu_dict[frozenset(pair)] = sum(
            trans.get_utility_of_item_set(pair) for trans in relevant_transactions
        )
    return liu_dict

**Algorithm 3: PLIU_LB strategy**


In [12]:
def pliu_lb_strategy(
    lius: dict[frozenset[Item], int],
    piqu_liu: set[int],
    ordered_list: list[Item],
    k: int,
    current_min_util: int,
    database: list[Transaction],
) -> int:
    """Raise the minimum-utility threshold using lower bounds derived from ``lius``.

    For every 2-item set of ``lius`` the items lying strictly between its two
    members in ``ordered_list`` are collected. For each combination of one to
    four of those items, the pair's utility minus the database utilities of
    the chosen items is a candidate; candidates above ``current_min_util``
    are kept. The kept candidates are merged with ``piqu_liu``.

    Args:
        lius (dict[frozenset[Item], int]): 2-item set -> utility
        piqu_liu (set[int]): utilities already collected by the caller
        ordered_list (list[Item]): the items in processing order; must contain
            both members of every key of ``lius``
        k (int): number of patterns wanted
        current_min_util (int): the threshold reached so far
        database (list[Transaction]): database

    Returns:
        int: when at least ``k`` distinct values are available, the larger of
        ``current_min_util`` and the k-th largest of them; otherwise
        ``current_min_util`` unchanged.
    """
    single_item_utility: dict[Item, int] = {}

    def utility_of(item: "Item") -> int:
        # The database utility of a single item does not depend on the pair
        # being processed, so it is computed once and reused.
        if item not in single_item_utility:
            single_item_utility[item] = calculate_utility_of_item_set_in_database(
                {item}, database
            )
        return single_item_utility[item]

    piqu_lb_liu: set[int] = set()
    for key, value in lius.items():
        key_list = list(key)
        first, second = key_list[0], key_list[1]
        if check_order_condition(second, first):
            post_start_item, post_end_item = first, second
        else:
            post_start_item, post_end_item = second, first

        start_index = ordered_list.index(post_start_item)
        end_index = ordered_list.index(post_end_item)
        # Items strictly between the two members (empty when they are adjacent
        # or when the end precedes the start).
        between = ordered_list[start_index + 1 : end_index]

        for size in range(1, min(4, len(between)) + 1):
            for removed_items in combinations(between, size):
                util_lb = value - sum(utility_of(item) for item in removed_items)
                if util_lb > current_min_util:
                    piqu_lb_liu.add(util_lb)

    piqu_all = sorted(piqu_lb_liu.union(piqu_liu), reverse=True)
    if 0 < k <= len(piqu_all):
        # Use the k-th largest value (not the largest one) and never lower the
        # threshold that was already reached.
        current_min_util = max(current_min_util, piqu_all[k - 1])
    return current_min_util

In [13]:
def calculate_local_utility(alpha: set[Item], item: Item, database: list[Transaction]) -> int:
    """Local utility of ``item`` with respect to ``alpha``.

    Args:
        alpha (set[Item]): an item set
        item (Item): the item that would be combined with ``alpha``
        database (list[Transaction]): database

    Returns:
        int: over every transaction containing ``alpha`` together with
        ``item``, the sum of the utility of ``alpha`` and the remaining
        utility of ``alpha`` in that transaction
    """
    extended = alpha | {item}
    return sum(
        trans.get_utility_of_item_set(alpha)
        + trans.get_remaining_utility_of_item_set(alpha)
        for trans in database
        if trans.contains_item_set(extended)
    )


def calculate_subtree_utility(alpha: set[Item], item: Item, db: list[Transaction]) -> int:
    """Subtree utility of ``item`` with respect to ``alpha``.

    Args:
        alpha (set[Item]): an item set
        item (Item): the item that would be combined with ``alpha``
        db (list[Transaction]): database

    Returns:
        int: 0 unless ``item`` is ordered after ``alpha``; otherwise, over
        every transaction containing ``alpha`` together with ``item``, the sum
        of the utility of ``alpha``, the utility of ``item`` and the remaining
        utility of the combined set
    """
    if not check_order_item_and_set(item, alpha):
        return 0

    total = 0
    for trans in db:
        if not trans.contains_item_set(alpha | {item}):
            continue
        total += (
            trans.get_utility_of_item_set(alpha)
            + trans.get_utility_of_item_set({item})
            + trans.get_remaining_utility_of_item_set(set(alpha | {item}))
        )
    return total

In [14]:
def calculate_utilities_of_item_set(
    item_set: set[Item], database: list[Transaction]
) -> tuple[float, int, int, int]:
    """Aggregate the measures of an item set over the transactions containing it.

    Args:
        item_set (set[Item]): an item set
        database (list[Transaction]): database

    Returns:
        tuple[float, int, int, int]: ``(probability, utility,
        remaining-utility, twu)``, each summed over the transactions that
        contain the item set; all four are 0 when none does
    """
    totals = [0, 0, 0, 0]  # probability, utility, remaining utility, twu
    for trans in database:
        if not trans.contains_item_set(item_set):
            continue
        totals[0] += trans.get_probability_of_item_set(item_set)
        totals[1] += trans.get_utility_of_item_set(item_set)
        totals[2] += trans.get_remaining_utility_of_item_set(item_set)
        totals[3] += calculate_positive_utility_of_transaction(trans)
    return totals[0], totals[1], totals[2], totals[3]

In [15]:
def find_database_projection(item_set: set[Item], database: list[Transaction]) -> list[Transaction]:
    """Keep only the transactions that contain the item set.

    Args:
        item_set (set[Item]): an item set
        database (list[Transaction]): database

    Returns:
        list[Transaction]: a new list, in database order, holding the
        transactions (the same objects, not copies) that contain ``item_set``
    """
    return [trans for trans in database if trans.contains_item_set(item_set)]

In [16]:
def efim_global_search(
    alpha: set[Item],
    project_db: list[Transaction],
    primary: list[Item],
    secondary: list[Item],
    min_util: int,
    user_prob_threshold: float,
    k: int,
    prob_arr: list[float],
    item_dict: dict[Item, int],
    topk_queue: PriorityQueue,
) -> None:
    """Depth-first search that extends ``alpha`` with each primary item.

    For every item of ``primary`` the extension ``beta = alpha | {item}`` is
    evaluated on ``project_db``; it is offered to ``topk_queue`` when it meets
    both thresholds, and ``min_util`` is raised to the queue's smallest
    utility once the queue holds ``k`` entries. The search then recurses on
    the projection of the database on ``beta`` with the primary/secondary
    items that still pass the probability, local-utility and subtree-utility
    checks.

    Args:
        alpha (set[Item]): the current prefix item set
        project_db (list[Transaction]): transactions to evaluate against
        primary (list[Item]): items used to extend ``alpha``
        secondary (list[Item]): items eligible deeper in the search
        min_util (int): current minimum utility threshold (raised locally;
            the caller's value is not updated)
        user_prob_threshold (float): minimum probability of a pattern
        k (int): number of patterns wanted
        prob_arr (list[float]): not read by this function; the array computed
            for each extension is what gets passed to the recursive call
        item_dict (dict[Item, int]): passed through unchanged
        topk_queue (PriorityQueue): receives the qualifying item sets
    """
    for pri_item in primary:
        beta: set[Item] = alpha | {pri_item}
        prob, util = calculate_probability_and_utility_of_item_set(beta, project_db)
        meets_prob = round(prob, 3) >= user_prob_threshold
        if util >= min_util and meets_prob:
            topk_queue.push(util, beta)
            if len(topk_queue.heap) == k:
                # With k stored entries the heap root is the k-th best utility.
                min_util = topk_queue.get_min_utility()

        # list.index raises ValueError rather than returning -1, so a primary
        # item that is missing from `secondary` is handled explicitly: it has
        # no successor and is not expanded.
        try:
            position = secondary.index(pri_item)
        except ValueError:
            continue
        if position + 1 >= len(secondary):
            continue

        second_item = secondary[position + 1]
        if (util < min_util or not meets_prob) and second_item.utility < 0:
            continue

        beta_dp: list[Transaction] = find_database_projection(beta, project_db)
        lu_arr, su_arr, beta_prob_arr = create_local_and_subtree_utility_bin_array(
            beta, secondary, beta_dp
        )
        new_primary = list()
        new_secondary = list()
        for index, item in enumerate(secondary):
            if item == pri_item or item.twu < min_util:
                continue
            if (
                round(beta_prob_arr[index], 3) >= user_prob_threshold
                and lu_arr[index] >= min_util
            ):
                new_secondary.append(item)
                if su_arr[index] >= min_util:
                    new_primary.append(item)

        # Recursing with no primary item would do nothing; new_primary is a
        # subset of new_secondary, so checking it alone is enough.
        if new_primary:
            efim_global_search(
                beta,
                beta_dp,
                new_primary,
                new_secondary,
                min_util,
                user_prob_threshold,
                k,
                beta_prob_arr,
                item_dict,
                topk_queue,
            )

In [17]:
def generate_mlist(X: AbstractList, Y: AbstractList, P: AbstractList, du: int):
    """Build the ``MList`` for the union of the items of ``X`` and ``Y``.

    The result takes its true items and utility entries from ``Y``, uses ``P``
    as prefix, stores ``du`` as ``ru`` and 0 as ``pu``.
    """
    return MList(
        items=X.items | Y.items,
        true_items=Y.items,
        prefix=P,
        utility_values=Y.utility_values,
        ru=du,
        pu=0,
    )

In [18]:
def topk_mining_based_on_EFIM(
    database: list[Transaction], item_list: list[Item], k: int, min_prob: float
) -> PriorityQueue:
    """Mine the top-k item sets with the EFIM-style search.

    Phase 1 raises the minimum utility with ``priu_pruning`` and
    ``pliue_strategy``, stores each item's TWU on the item, keeps the items
    that pass the probability and TWU checks (``secondary``) and derives the
    ``primary`` items from the subtree utilities. Phase 2 runs
    ``efim_global_search`` from the empty item set.

    Side effects: sets ``twu`` on every item of ``item_list``, removes the
    discarded items from the transactions of ``database`` in place, and prints
    the threshold and duration of both phases.

    Args:
        database (list[Transaction]): database (modified in place, see above)
        item_list (list[Item]): all items of the database
        k (int): number of patterns wanted
        min_prob (float): minimum probability ratio; multiplied by the number
            of transactions to obtain the probability threshold

    Returns:
        PriorityQueue: the queue holding the patterns found, or ``None`` when
        ``k <= 0``.
    """
    phase1_start = datetime.datetime.now()
    if k <= 0:
        return None
    # Init alpha with empty set
    alpha: set[Item] = set()
    topk_queue = PriorityQueue(k)
    user_prob_threshold = min_prob * len(database)
    prob_arr, twu_arr, utility_arr, item_dict = create_prob_twu_utility_bin_array(
        item_list, database
    )
    positive_list: list[Item] = [item for item in item_list if item.utility >= 0]
    min_util: int = priu_pruning(utility_arr, k)
    secondary = list()
    for i in range(len(item_list)):
        # The TWU is stored on every item, kept or not, because the ordering
        # helpers read it from the items.
        item_list[i].twu = twu_arr[i]
        if round(prob_arr[i], 3) >= user_prob_threshold and twu_arr[i] >= min_util:
            secondary.append(item_list[i])
    removed_list = set(item_list).difference(secondary)
    for trans in database:
        for removed_item in removed_list:
            trans.trans_items_dict.pop(removed_item, None)
    lius: dict[frozenset[Item], int] = create_liu_dict(positive_list, database)
    min_util = pliue_strategy(lius, k, min_util)
    _, su_array, _ = create_local_and_subtree_utility_bin_array(alpha, secondary, database)
    # su_array is indexed like `secondary` before it is sorted, so `primary`
    # must be built before the sort below.
    primary: list[Item] = [
        secondary[i] for i in range(len(secondary)) if su_array[i] >= min_util
    ]
    secondary = sort_items_by_twu_and_utility(secondary)

    phase1_end = datetime.datetime.now()
    print("phase 1 util: " + str(min_util))
    print("phase 1 take time: " + str(phase1_end - phase1_start))
    efim_global_search(
        alpha,
        database,
        primary,
        secondary,
        min_util,
        user_prob_threshold,
        k,
        prob_arr,
        item_dict,
        topk_queue,
    )
    phase2_end = datetime.datetime.now()
    # The search raises its threshold internally and does not hand it back, so
    # the final value is read from the queue: once k patterns are stored it is
    # the smallest stored utility, otherwise the phase-1 threshold still holds.
    final_min_util = (
        topk_queue.get_min_utility() if len(topk_queue.heap) == k else min_util
    )
    print("phase 2 util: " + str(final_min_util))
    print("phase 2 take time: " + str(phase2_end - phase1_end))
    return topk_queue

**BASED ON PHUI**


In [19]:
def find_tuple_by_trans_id(P: AbstractList, target_trans_id: int) -> Utilities:
    """Return the first entry of ``P`` whose ``tid`` equals ``target_trans_id``.

    Returns ``None`` when no entry matches.
    """
    matches = (entry for entry in P.utility_values if entry.tid == target_trans_id)
    return next(matches, None)


def utility_list_construct(
    P: PNUList,
    Px: PNUList,
    Py: PNUList,
    min_util: int,
    user_prob_threshold: float,
):
    """Join the lists ``Px`` and ``Py`` into the list of their combined item set.

    Entries are matched on ``tid``. Without a usable prefix ``P`` the matched
    entries are combined directly; with one, the prefix entry of the same
    transaction is divided out of the probability and subtracted from the
    utilities, and transactions missing from the prefix are skipped. Whenever
    an entry of ``Px`` has no partner in ``Py``, its probability and
    ``pu + ru`` are deducted from running totals of ``Px``; the join is
    abandoned as soon as a total drops below its threshold.

    Returns:
        PNUList | None: the joined list, or ``None`` when an input is missing
        or has no entries, or when the join was abandoned.
    """
    if not Px or not Py or not Px.utility_values or not Py.utility_values:
        return None

    joined_entries: list[Utilities] = []
    Pxy = PNUList(Px.items | Py.items, joined_entries)

    has_prefix = bool(P and P.utility_values)
    y_by_tid = {entry.tid: entry for entry in Py.utility_values}
    p_by_tid = {entry.tid: entry for entry in P.utility_values} if has_prefix else {}

    remaining_probability = Px.get_pro()
    remaining_utility = Px.get_pu() + Px.get_ru()

    for x_entry in Px.utility_values:
        y_entry = y_by_tid.get(x_entry.tid, None)

        if not y_entry:
            # This transaction cannot support the joined item set: take its
            # share out of the totals and stop early if they fall too low.
            remaining_probability -= x_entry.pro
            remaining_utility -= x_entry.pu + x_entry.ru
            if remaining_probability < user_prob_threshold or remaining_utility < min_util:
                return None
            continue

        if not has_prefix:
            joined_entries.append(
                Utilities(
                    x_entry.tid,
                    x_entry.pro * y_entry.pro,
                    x_entry.pu + y_entry.pu,
                    x_entry.nu + y_entry.nu,
                    y_entry.ru,
                )
            )
            continue

        p_entry = p_by_tid.get(x_entry.tid, None)
        if p_entry:
            # Guard against dividing by a zero prefix probability.
            divisor = 1e-10 if p_entry.pro == 0 else p_entry.pro
            joined_entries.append(
                Utilities(
                    x_entry.tid,
                    x_entry.pro * y_entry.pro / divisor,
                    x_entry.pu + y_entry.pu - p_entry.pu,
                    x_entry.nu + y_entry.nu - p_entry.nu,
                    y_entry.ru,
                )
            )
    return Pxy


def covl_construct(
    sorted_list: list[Item],
    eucst_dict: dict[frozenset[Item], int],
    utility_arr: list[int],
    item_index_dict: dict[Item, int],
    database: list[Transaction],
):
    """Compute one coverage-based utility per item that has a non-empty coverage.

    The coverage of an item ``x`` is made of the later items ``y`` of
    ``sorted_list`` for which the value stored in ``eucst_dict`` for
    ``{x, y}`` equals ``x.twu``. For such an item the result holds the summed
    database utility of the pairs ``{x, y}`` minus ``(r - 1)`` times the
    value of ``x`` read through ``get_value_from_bin``, ``r`` being the
    coverage size.

    Returns:
        list[int]: the computed values, largest first (items with an empty
        coverage contribute nothing).
    """
    covl_list = []
    for position, x in enumerate(sorted_list):
        coverage = [
            y
            for y in sorted_list[position + 1 :]
            if x.twu == eucst_dict.get(frozenset({x, y}))
        ]
        if not coverage:
            continue
        util: int = 0
        for covered in coverage:
            util += calculate_utility_of_item_set_in_database({x, covered}, database)
        util -= (len(coverage) - 1) * get_value_from_bin(x, utility_arr, item_index_dict)
        covl_list.append(util)
    covl_list.sort(reverse=True)
    return covl_list

In [20]:
def create_eucst_dict(
    sorted_item_list: list[Item], database: list[Transaction], min_util: int
):
    """Build the pair -> TWU dictionary for the 2-item sets of ``sorted_item_list``.

    The TWU of a pair is the sum, over the transactions containing both items,
    of ``calculate_positive_utility_of_transaction``.

    Args:
        sorted_item_list (list[Item]): the items whose pairs are evaluated
        database (list[Transaction]): database
        min_util (int): pairs whose TWU is below this value are left out

    Returns:
        dict[frozenset[Item], int]: 2-item set -> TWU, for the pairs with a
        TWU of at least ``min_util``.
    """
    # Positive utility of every transaction, computed once.
    transaction_twu = {
        trans: calculate_positive_utility_of_transaction(trans) for trans in database
    }
    # item -> transactions containing it, built once as sets so each pair below
    # only needs a single intersection.
    item_transaction_map = {item: set() for item in sorted_item_list}
    for trans in database:
        for item in trans.get_items():
            containing = item_transaction_map.get(item)
            if containing is not None:
                containing.add(trans)

    eucst_dict = {}
    for item1, item2 in combinations(sorted_item_list, 2):
        # Every transaction in the intersection contains both items, so no
        # further containment check is needed before summing.
        relevant_transactions = item_transaction_map[item1] & item_transaction_map[item2]
        twu = sum(transaction_twu[trans] for trans in relevant_transactions)
        if twu >= min_util:
            eucst_dict[frozenset({item1, item2})] = twu
    return eucst_dict

def update_eucst_dict(
    eucst_dict: dict[frozenset, int], min_util: int
) -> dict[frozenset, int]:
    """Return a new dictionary holding only the entries with a TWU of at least ``min_util``.

    The input dictionary is left unchanged.
    """
    kept: dict[frozenset, int] = {}
    for pair, twu in eucst_dict.items():
        if twu >= min_util:
            kept[pair] = twu
    return kept


In [21]:
def phui_searching_procedure(
    PList: AbstractList,
    lists: list[AbstractList],
    current_min_util: int,
    user_prob_threshold: float,
    database: list[Transaction],
    eucs_dict: dict[frozenset[Item], int],
    k: int,
    topk_queue: PriorityQueue,
):
    """Recursive utility-list search that fills ``topk_queue``.

    Every list of ``lists`` whose rounded probability reaches the threshold is
    offered to the queue when its utility reaches ``current_min_util`` (the
    threshold is then raised to the queue's k-th utility once the queue holds
    ``k`` entries). If its ``pu + ru`` bound also reaches the threshold, it is
    joined with each later list whose pair value in ``eucs_dict`` is high
    enough, and the search recurses on the joined lists, using the current
    list as prefix.
    """
    for position, XList in enumerate(lists):
        x_utility = XList.get_utility()
        # Both the reporting step and the extension step require the
        # probability threshold, so a list failing it is skipped outright.
        if round(XList.get_pro(), 3) < user_prob_threshold:
            continue

        if x_utility >= current_min_util:
            topk_queue.push(x_utility, XList.items)
            if len(topk_queue.heap) == k:
                current_min_util = topk_queue.sort()[k - 1][0]

        if XList.get_pu() + XList.get_ru() < current_min_util:
            continue

        extensions: list[AbstractList] = []
        for YList in lists[position + 1 :]:
            x_own = XList.items.difference(PList.items)
            y_own = YList.items.difference(PList.items)
            pair_twu = eucs_dict.get(frozenset(x_own | y_own), -1)
            if pair_twu < current_min_util:
                continue
            ZList = utility_list_construct(
                PList, XList, YList, current_min_util, user_prob_threshold
            )
            if ZList and round(ZList.get_pro(), 3) >= user_prob_threshold:
                extensions.append(ZList)

        if extensions:
            phui_searching_procedure(
                XList,
                extensions,
                current_min_util,
                user_prob_threshold,
                database,
                eucs_dict,
                k,
                topk_queue,
            )

In [22]:
def calculate_dynamic_upper_bound(
    Y: AbstractList, X: AbstractList, database: list[Transaction]
) -> int:
    """Upper bound obtained from ``Y`` and the items of ``X`` that ``Y`` lacks.

    Args:
        Y (AbstractList): the list providing the ``ru`` and ``pu`` totals
        X (AbstractList): the list whose extra items are added
        database (list[Transaction]): database

    Returns:
        int: ``Y.get_ru() + Y.get_pu()`` plus the database utility of the
        item set ``X.items - Y.items``
    """
    base = Y.get_ru() + Y.get_pu()
    extra_items = X.items - Y.items
    return base + calculate_utility_of_item_set_in_database(extra_items, database)


def calculate_dynamic_upper_bound_v2(
    Y: AbstractList,
    X: AbstractList,
    utility_array: list[int],
    item_index_dict: dict[Item, int],
) -> int:
    """Variant of the dynamic upper bound that looks the extra utility up.

    One item of ``X.items - Y.items`` is taken (``StopIteration`` is raised
    when that difference is empty) and its value is read through
    ``get_value_from_bin``; the result is added to
    ``Y.get_ru() + Y.get_pu()``.
    """
    extra_item = next(iter(X.items - Y.items))
    base = Y.get_ru() + Y.get_pu()
    return base + get_value_from_bin(extra_item, utility_array, item_index_dict)


def phui_searching_procedure_plus(
    PList: AbstractList,
    lists: list[AbstractList],
    current_min_util: int,
    user_prob_threshold: float,
    db: list[Transaction],
    utility_array: list[int],
    item_index_dict: dict[Item, int],
    eucs_dict: dict[frozenset[Item], int],
    k: int,
    topk_queue: PriorityQueue,
) -> None:
    """Recursively extend the entries of ``lists`` and fill ``topk_queue``.

    Each entry whose rounded probability reaches ``user_prob_threshold`` and
    whose utility reaches the current minimum utility is pushed to
    ``topk_queue``. An entry is then combined with every entry to its right
    when its probability and its remaining-plus-own utility pass the
    thresholds. A pair is kept only if its key is found in ``eucs_dict`` with a
    value of at least the current minimum utility; the dynamic upper bound then
    decides whether a full list is built (``utility_list_construct``) or a
    placeholder is recorded (``generate_mlist``).

    Args:
        PList (AbstractList): prefix list; its items are subtracted from the
            items of each pair member to build the ``eucs_dict`` key
        lists (list[AbstractList]): candidate extensions, processed left to right
        current_min_util (int): minimum utility in force when the call starts
        user_prob_threshold (float): minimum (rounded) probability
        db (list[Transaction]): forwarded to the recursive calls; not read here
        utility_array (list[int]): passed to ``calculate_dynamic_upper_bound_v2``
        item_index_dict (dict[Item, int]): passed to ``calculate_dynamic_upper_bound_v2``
        eucs_dict (dict[frozenset[Item], int]): pair key -> value compared with
            the minimum utility
        k (int): number of itemsets wanted; the threshold is raised once the
            queue holds ``k`` entries
        topk_queue (PriorityQueue): result queue, mutated in place

    Returns:
        None
    """
    for i, XList in enumerate(lists):
        x_utility = XList.get_utility()
        x_prob = XList.get_pro()
        remaining_utility = XList.get_ru()
        is_probable = round(x_prob, 3) >= user_prob_threshold

        if is_probable and x_utility >= current_min_util:
            topk_queue.push(x_utility, XList.items)
            if len(topk_queue.heap) == k:
                current_min_util = topk_queue.sort()[k - 1][0]

        if not (is_probable and remaining_utility + x_utility >= current_min_util):
            continue

        # Independent of the partner list, so computed once per XList.
        x_suffix = XList.items.difference(PList.items)
        new_lists: list[AbstractList] = []
        for YList in lists[i + 1:]:
            y_suffix = YList.items.difference(PList.items)
            twu_value = eucs_dict.get(frozenset(x_suffix | y_suffix), -1)
            if not (twu_value >= current_min_util):
                continue

            du = calculate_dynamic_upper_bound_v2(
                YList, XList, utility_array, item_index_dict
            )
            y_is_mlist = isinstance(YList, MList)
            if du >= current_min_util:
                # A placeholder carries its own prefix; a regular list shares PList.
                prefix = YList.prefix if y_is_mlist else PList
                ZList = utility_list_construct(
                    prefix, XList, YList, current_min_util, user_prob_threshold
                )
                if ZList:
                    new_lists.append(ZList)
            else:
                new_lists.append(
                    generate_mlist(XList, YList, None if y_is_mlist else PList, du)
                )

        # Same guard as phui_searching_procedure: nothing to explore without extensions.
        if not new_lists:
            continue

        phui_searching_procedure_plus(
            XList,
            new_lists,
            current_min_util,
            user_prob_threshold,
            db,
            utility_array,
            item_index_dict,
            eucs_dict,
            k,
            topk_queue,
        )
        # current_min_util is a plain int, so a threshold raised inside the
        # recursive call is not visible here. Re-read it from the queue so the
        # remaining siblings are pruned with the tightest known bound.
        if k > 0 and len(topk_queue.heap) == k:
            current_min_util = max(current_min_util, topk_queue.sort()[k - 1][0])

In [23]:
def create_pnu_lists(item_list: list[Item], database: list[Transaction]):
    """Build one single-item ``PNUList`` per item of ``item_list``.

    Args:
        item_list (list[Item]): items to build lists for; the output follows
            this order
        database (list[Transaction]): transactions to index

    Returns:
        list: one ``PNUList`` per item, holding a ``Utilities`` record for
            every transaction whose ``get_items()`` yields that item
    """
    # First pass: group the transactions by the items they contain.
    containing_transactions = {}
    for item in item_list:
        containing_transactions[item] = []
    for trans in database:
        for item in trans.get_items():
            bucket = containing_transactions.get(item)
            if bucket is not None:
                bucket.append(trans)

    # Second pass: one list per item. The record list is handed to PNUList
    # first and filled afterwards, as the PNUList is given the same list object.
    result = []
    for item, transactions in containing_transactions.items():
        records = []
        single_item_list = PNUList({item}, records)
        records.extend(
            Utilities(
                trans.id,
                trans.get_probability_of_item_set({item}),
                trans.get_positive_utility_of_item_set({item}),
                trans.get_negative_utility_of_item_set({item}),
                trans.get_remaining_utility_of_item_set({item}),
            )
            for trans in transactions
        )
        result.append(single_item_list)
    return result

In [24]:
def topk_mining_based_on_PHUI(
    database: list[Transaction],
    item_list: list[Item],
    k: int,
    min_prob: float,
    is_plus: bool,
) -> PriorityQueue:
    """Mine the top-k itemsets with the list-based PHUI search.

    Phase 1 raises the minimum-utility threshold (``priu_pruning``,
    ``pliue_strategy``, ``covl_construct``) and discards the items that fail
    the probability or TWU test. Phase 2 runs the recursive search.

    Side effects: sets ``twu`` on the kept items, removes the discarded items
    from every transaction of ``database`` in place, and prints the threshold
    and duration of both phases.

    Args:
        database (list[Transaction]): transactions to mine (mutated, see above)
        item_list (list[Item]): all distinct items
        k (int): number of itemsets wanted
        min_prob (float): minimum probability ratio; multiplied by
            ``len(database)`` to obtain the absolute threshold
        is_plus (bool): True runs ``phui_searching_procedure_plus``, False runs
            ``phui_searching_procedure``

    Returns:
        PriorityQueue: the queue filled by the search procedure
    """
    t1 = datetime.datetime.now()
    # Queue that collects the top-k itemsets
    topk_queue = PriorityQueue(k)
    user_prob_threshold = min_prob * len(database)

    # Probability / TWU / utility arrays and the item -> index dictionary
    prob_arr, twu_arr, utility_arr, item_index_dict = create_prob_twu_utility_bin_array(
        item_list, database
    )

    positive_utility_arr = [u for u in utility_arr if u > 0]
    # Initial min_util derived from the positive utilities (RIU strategy)
    current_min_util: int = priu_pruning(positive_utility_arr, k)

    # Items that fail the probability or the TWU test are discarded
    removed_list: list[Item] = list()
    for i, item in enumerate(item_list):
        if (
            round(prob_arr[i], 3) >= user_prob_threshold
            and twu_arr[i] >= current_min_util
        ):
            item.twu = twu_arr[i]
        else:
            removed_list.append(item)
    # Set lookup instead of scanning removed_list once per item
    removed_lookup = set(removed_list)
    new_distinct_items = [item for item in item_list if item not in removed_lookup]

    # Sort the kept items by the processing order
    new_distinct_items = sort_items_by_twu_and_utility(new_distinct_items)

    # Remove the discarded items from every transaction (in place)
    for trans in database:
        for removed_item in removed_list:
            trans.trans_items_dict.pop(removed_item, None)

    # One single-item list per kept item
    pnu_lists: list[AbstractList] = create_pnu_lists(new_distinct_items, database)

    # EUCS dictionary of 2-item sets, built against the current threshold
    eucst_dict: dict[frozenset[Item], int] = create_eucst_dict(
        new_distinct_items, database, current_min_util
    )

    # CUDM dictionary: utility of 2-item sets
    cudm_dict: dict[frozenset[Item], int] = create_liu_dict(
        new_distinct_items, database
    )

    # Raise current_min_util from the CUDM dictionary
    if cudm_dict:
        cud: int = pliue_strategy(cudm_dict, k, current_min_util)
        current_min_util = max(current_min_util, cud)

    covl: list[int] = covl_construct(new_distinct_items, eucst_dict, utility_arr, item_index_dict, database)
    # Raise current_min_util from the COVL values
    if covl:
        current_min_util = max(current_min_util, covl[min(len(covl), k) - 1])

    eucst_dict = update_eucst_dict(eucst_dict, current_min_util)
    # NOTE(review): `{}` is an empty dict, not an empty set. The search
    # procedures only pass root.items to set.difference(), which accepts any
    # iterable; confirm no helper applies a set operator to it.
    root = AbstractList({}, list())
    t2 = datetime.datetime.now()
    print("phase 1 util: " + str(current_min_util))
    print("phase 1 take time: " + str(t2 - t1))
    if is_plus:
        phui_searching_procedure_plus(
            root,
            pnu_lists,
            current_min_util,
            user_prob_threshold,
            database,
            utility_arr,
            item_index_dict,
            eucst_dict,
            k,
            topk_queue,
        )
    else:
        phui_searching_procedure(
            root,
            pnu_lists,
            current_min_util,
            user_prob_threshold,
            database,
            eucst_dict,
            k,
            topk_queue,
        )
    t3 = datetime.datetime.now()
    # The search procedures take current_min_util by value and return None, so
    # the local still holds the phase 1 threshold. Read the threshold reached
    # by the search from the queue, the same way the procedures do.
    final_min_util = current_min_util
    if k > 0 and len(topk_queue.heap) == k:
        final_min_util = max(final_min_util, topk_queue.sort()[k - 1][0])
    print("phase 2 util: " + str(final_min_util))
    print("phase 2 take time: " + str(t3 - t2))
    # Unlabelled repeat of the phase 1 duration, kept so the output is unchanged
    print(t2 - t1)
    return topk_queue

**Tree-Based Approach**


In [25]:
class PHUNode:
    """Node of the itemset tree used by the tree-based search.

    A node stores an item set, its utility, remaining utility and probability,
    the nodes generated from it (``children``) and the node it was generated
    from (``parent``).
    """

    def __init__(
        self,
        item_set: set[Item] = None,
        utility: int = 0,
        ru: int = 0,
        prob: float = 0.0,
        children: list["PHUNode"] = None,
        parent: "PHUNode" = None,
    ):
        # None defaults are replaced by fresh containers so nodes never share them.
        self.item_set = set() if item_set is None else item_set
        self.utility = utility
        self.ru = ru
        self.prob = prob
        self.children = [] if children is None else children
        self.parent = parent

    def add_child(self, child: "PHUNode") -> None:
        """Append ``child`` to this node's children (its parent is not modified)."""
        self.children.append(child)

    def is_leaf(self) -> bool:
        """Return True when the node has no children."""
        return len(self.children) == 0

    def get_total_utility(self) -> int:
        """Return this node's utility plus the total utility of all descendants."""
        return sum(
            (child.get_total_utility() for child in self.children), self.utility
        )

    def get_right_sibling(self):
        """Return the parent's children that come after this node.

        Returns:
            list[PHUNode]: a new list; empty when the node has no parent, is
                not among its parent's children, or is the last child
        """
        parent = self.parent
        if parent is None:
            return []

        siblings = parent.children
        try:
            position = siblings.index(self)
        except ValueError:
            # Not registered under its parent (also covers an empty child list).
            return []
        return siblings[position + 1 :]

    def is_htwui(
        self, database: list[Transaction], min_util: int, user_prob_threshold: float
    ) -> bool:
        """Check the item set against the TWU and probability thresholds.

        Args:
            database (list[Transaction]): transactions used to compute both values
            min_util (int): minimum transaction-weighted utility
            user_prob_threshold (float): minimum probability (compared after
                rounding to 3 decimals)

        Returns:
            bool: True when both thresholds are reached
        """
        prob, twu = calculate_probability_and_twu_of_item_set(self.item_set, database)
        if not (twu >= min_util):
            return False
        return round(prob, 3) >= user_prob_threshold

    def __repr__(self) -> str:
        return f"{self.item_set}"

    def print_children(self, level=0):
        """Print the subtree rooted here, indenting two spaces per level."""
        indent = "  " * level
        print(indent + f"{self.item_set}")
        next_level = level + 1
        for child in self.children:
            child.print_children(next_level)

In [26]:
def build_subtree(
    node_x: PHUNode,
    database: list[Transaction],
    current_min_util: int,
    user_prob_threshold: float,
    k: int,
    topk_queue: PriorityQueue,
):
    """Grow the subtree below ``node_x`` by joining it with its right siblings.

    For every right sibling that passes ``is_htwui`` and whose differing items
    have a TWU of at least the minimum utility, a child holding the union of
    both item sets is attached to ``node_x``. Children that reach the
    probability threshold are expanded recursively, and those that also reach
    the minimum utility are pushed to ``topk_queue`` unless their item set is
    already queued.

    Args:
        node_x (PHUNode): node to expand; nothing happens if it fails ``is_htwui``
        database (list[Transaction]): transactions used for every measurement
        current_min_util (int): minimum utility at the start of the call; raised
            locally once the queue holds ``k`` entries
        user_prob_threshold (float): minimum (rounded) probability
        k (int): number of itemsets wanted
        topk_queue (PriorityQueue): result queue, mutated in place
    """
    if not node_x.is_htwui(database, current_min_util, user_prob_threshold):
        return

    expandable: list[PHUNode] = []
    for node_y in node_x.get_right_sibling():
        if not node_y.is_htwui(database, current_min_util, user_prob_threshold):
            continue

        # Items that belong to exactly one of the two nodes
        differing_items = node_x.item_set ^ node_y.item_set
        pair_twu = calculate_transaction_weight_utility(differing_items, database)
        # EUCP Pruning
        if not (pair_twu >= current_min_util):
            continue

        z_item_set = node_x.item_set.union(node_y.item_set)
        z_p, z_u, z_ru, twu = calculate_utilities_of_item_set(z_item_set, database)
        node_z = PHUNode(z_item_set, z_u, z_ru, z_p, [], node_x)
        # The child is attached even when it is not expanded further.
        node_x.add_child(node_z)

        if not (round(z_p, 3) >= user_prob_threshold):
            continue
        expandable.append(node_z)

        if not (z_u >= current_min_util):
            continue
        queued_item_sets = {frozenset(entry[1]) for entry in topk_queue.heap}
        if frozenset(z_item_set) in queued_item_sets:
            continue
        topk_queue.push(z_u, z_item_set)
        if len(topk_queue.heap) == k:
            current_min_util = topk_queue.sort()[k - 1][0]

    for child_node in expandable:
        build_subtree(
            child_node, database, current_min_util, user_prob_threshold, k, topk_queue
        )

In [27]:
def build_tree(
    database: list[Transaction], item_list: list[Item], k: int, min_prob: float
):
    """Mine the top-k itemsets with the tree-based search.

    Items failing the probability or TWU test are dropped, the remaining ones
    become the children of an empty root (and top-k candidates when their
    utility is high enough), and every child is expanded with
    ``build_subtree``.

    Side effect: sets ``twu`` on every kept item.

    Args:
        database (list[Transaction]): transactions to mine
        item_list (list[Item]): all distinct items
        k (int): number of itemsets wanted
        min_prob (float): minimum probability ratio; multiplied by
            ``len(database)`` to obtain the absolute threshold

    Returns:
        PriorityQueue: the queue filled during the search
    """
    min_util = 0
    user_prob_threshold = min_prob * len(database)
    topk_queue: PriorityQueue = PriorityQueue(k)
    positive_list: list[Item] = [item for item in item_list if item.utility >= 0]
    # NOTE(review): topk_mining_based_on_PHUI calls priu_pruning(utilities, k)
    # with two arguments; confirm which signature the current definition has.
    current_min_util: int = priu_pruning(positive_list, min_util, database)
    removed_list = set()
    for item in item_list:
        prob, twu = calculate_probability_and_twu_of_item_set({item}, database)
        # Compare the freshly computed TWU: item.twu is only assigned below, so
        # reading it here would test a stale value (0 on a new Item).
        if round(prob, 3) >= user_prob_threshold and twu >= current_min_util:
            item.twu = twu
        else:
            removed_list.add(item)
    # Remove unqualified items
    new_distinct_items = [item for item in item_list if item not in removed_list]
    # Sort the kept items by the processing order
    new_distinct_items = sort_items_by_twu_and_utility(new_distinct_items)
    root: PHUNode = PHUNode()
    for item in new_distinct_items:
        new_node = PHUNode({item}, 0, 0, 0, list(), root)
        root.add_child(new_node)
        utility = calculate_utility_of_item_set_in_database({item}, database)
        if utility >= current_min_util:
            queued_item_sets = {frozenset(entry[1]) for entry in topk_queue.heap}
            if frozenset({item}) not in queued_item_sets:
                topk_queue.push(utility, {item})
                if len(topk_queue.heap) == k:
                    current_min_util = topk_queue.sort()[k - 1][0]

    liu_dict: dict[frozenset[Item], int] = create_liu_dict(new_distinct_items, database)
    # Same guard as topk_mining_based_on_PHUI: skip the strategy on an empty dict.
    if liu_dict:
        pliue_util = pliue_strategy(liu_dict, k, min_util)
        current_min_util = max(current_min_util, pliue_util)
    for child in root.children:
        build_subtree(
            child, database, current_min_util, user_prob_threshold, k, topk_queue
        )
    return topk_queue

In [28]:
# Synthetic data: 35 items with a utility in [-10, 10] and 1000 transactions
# of 16 to 30 distinct items each (quantity in [1, 10], probability in [0.01, 0.99]).
item_list = [Item(f"i{i+1}", random.randint(-10, 10)) for i in range(35)]
db1 = list()
db2 = list()
for transaction_id in range(1, 1001):
    selected_items = random.sample(item_list, random.randint(16, 30))
    # A list keeps the sampled order. TransItem defines neither __eq__ nor
    # __hash__, so a set would iterate in an address-dependent order and the
    # seeded data would not be reproducible. random.sample already returns
    # distinct items, so no de-duplication is needed.
    trans_items = [
        TransItem(item, random.randint(1, 10), round(random.uniform(0.01, 0.99), 2))
        for item in selected_items
    ]
    # Each database gets its own Transaction object: topk_mining_based_on_PHUI
    # removes items from the transactions it receives, which would otherwise
    # alter the other database as well.
    db1.append(Transaction(transaction_id, list(trans_items)))
    db2.append(Transaction(transaction_id, list(trans_items)))

In [29]:
# Number of itemsets to mine; the miners pass it to PriorityQueue(k).
k = 20
# Minimum probability ratio; the miners multiply it by len(database) to get
# the absolute probability threshold.
prob_threshold = 0.0002

In [30]:
# Run the list-based miner with the "plus" search and report size and duration.
t1 = datetime.datetime.now()
rs2 = topk_mining_based_on_PHUI(db1, item_list, k, prob_threshold, is_plus=True)
rs2.print_items()
t2 = datetime.datetime.now()
print(f"topk_mining_based_on_PHUI(true): {len(rs2.heap)}")
print(f"topk_mining_based_on_PHUI(true): {t2 - t1}")

NameError: name 'create_prob_twu_utility_bin_array' is not defined

In [None]:
# t1 = datetime.datetime.now()
# rs1 = topk_mining_based_on_PHUI(db1, item_list, k, prob_threshold, False)
# rs1.print_items()
# t2 = datetime.datetime.now()
# print("topk_mining_based_on_PHUI(false): " + str(len(rs1.heap)))
# print("topk_mining_based_on_PHUI(false): " + str(t2 - t1))

In [None]:
# t0 = datetime.datetime.now()
# rs3 = topk_mining_based_on_EFIM(db1, item_list, k, prob_threshold)
# rs3.print_items()
# t1 = datetime.datetime.now()
# print("topk_mining_based_on_EFIM: " + str(len(rs3.heap)))
# print("topk_mining_based_on_EFIM: " + str(t1 - t0))

In [None]:
# lists = list()
# for item in item_list:
#     utility_values_list: list[tuple] = list()
#     pnu_list = PNUList({item}, utility_values_list)
#     for trans in db1:
#         if trans.contains_item_set({item}):
#             pro = trans.get_probability_of_item_set({item})
#             pu = trans.get_positive_utility_of_item_set({item})
#             nu = trans.get_negative_utility_of_item_set({item})
#             ru = trans.get_remaining_utility_of_item_set({item})
#             utility_values: Utilities = Utilities(trans.id, pro, pu, nu, ru)
#             utility_values_list.append(utility_values)
#     lists.append(pnu_list)

In [None]:
# root = PNUList({}, list())
# t1 = datetime.datetime.now()
# list1: list[PNUList] = list()
# for l1, l2 in combinations(lists, 2):
#     z = utility_list_construct(root, l1, l2, 0, 0)
#     if z:
#         list1.append(z)
# t2 = datetime.datetime.now()
# print(t2 - t1)

In [None]:
# root = PNUList({}, list())
# t1 = datetime.datetime.now()
# list2: list[PNUList] = list()
# for l1, l2 in combinations(lists, 2):
#     z = utility_list_construct_v2(root, l1, l2, 0, 0)
#     if z:
#         list2.append(z)
# print(len(list2))
# t2 = datetime.datetime.now()
# print(t2 - t1)

In [None]:
# print(list1 == list2)