In [12]:
import sys
import time
import tracemalloc


class Tep:
    """A single per-transaction record held inside a pattern's list.

    Attributes:
        TID: identifier of the transaction this record belongs to.
        prob: probability value, stored rounded to two decimals.
        util: utility value of the pattern in this transaction.
        trans_util: utility value recorded for the whole transaction.
    """

    def __init__(self, TID, prob, util, trans_util):
        self.TID = TID
        # Probabilities are always kept at two-decimal precision.
        self.prob = round(prob, 2)
        self.util = util
        self.trans_util = trans_util

    def combine_with(self, other_tep):
        """Return a new Tep merging this record with ``other_tep``.

        The result keeps this record's TID, multiplies the two probabilities
        (rounded to two decimals), adds the two utilities, and takes the
        transaction utility from ``other_tep``.
        """
        return Tep(
            self.TID,
            round(self.prob * other_tep.prob, 2),
            self.util + other_tep.util,
            other_tep.trans_util,
        )

    def __repr__(self):
        # The trailing newline is part of the representation.
        return f"Tep(TID={self.TID}, prob={self.prob}, util={self.util}, trans_util={self.trans_util})\n"


class Cup:
    """A named pattern together with its Tep records and aggregate figures.

    Attributes:
        name: pattern label (item names joined with ", " for combined cups).
        tep_list: the Tep records of the pattern.
        exp_sup: sum of the record probabilities, rounded to two decimals.
        max_prob: largest record probability, rounded to two decimals.
        trans_wei_util: sum of the records' ``trans_util`` values.
        utility: sum of the records' ``util`` values.
        last: empty for a cup built directly; for a combined cup, a list whose
            first element is the cup that was joined in last.
    """

    def __init__(self, name, tep_list=None):
        self.name = name
        if tep_list:
            self.exp_sup = round(sum(entry.prob for entry in tep_list), 2)
            self.tep_list = tep_list
            self.max_prob = round(max(entry.prob for entry in tep_list), 2)
            self.trans_wei_util = sum(entry.trans_util for entry in tep_list)
            self.utility = sum(entry.util for entry in tep_list)
        else:
            # No records supplied (None or empty): start from a fresh list.
            self.exp_sup = 0
            self.tep_list = []
            self.max_prob = 0
            self.trans_wei_util = 0
            self.utility = 0
        self.last = []

    def update(self, probability, TID, trans_util, util_value):
        """Append one more transaction record and refresh every aggregate."""
        rounded_prob = round(probability, 2)
        entry = Tep(TID, rounded_prob, util_value, trans_util)
        # Re-round after the addition so exp_sup stays at two decimals.
        self.exp_sup = round(self.exp_sup + rounded_prob, 2)
        self.utility += util_value
        self.tep_list.append(entry)
        self.max_prob = max(self.max_prob, rounded_prob)
        self.trans_wei_util += trans_util

    def combine_tep(self, tep_list_x, tep_list_y):
        """Join two record lists on TID and return the merged records.

        Both lists are walked in step with one cursor each, so they are
        expected to be ordered by ascending TID. A merged record is produced
        only for TIDs present in both lists: probabilities are multiplied
        (rounded to two decimals), utilities are added, and ``trans_util`` is
        taken from the first list's record.
        """
        merged = []
        pos_x = pos_y = 0
        size_x, size_y = len(tep_list_x), len(tep_list_y)
        while pos_x < size_x and pos_y < size_y:
            left, right = tep_list_x[pos_x], tep_list_y[pos_y]
            if left.TID < right.TID:
                pos_x += 1
                continue
            if left.TID > right.TID:
                pos_y += 1
                continue
            merged.append(
                Tep(
                    left.TID,
                    round(left.prob * right.prob, 2),
                    left.util + right.util,
                    left.trans_util,
                )
            )
            pos_x += 1
            pos_y += 1
        return merged

    def combine_with(self, other_cup):
        """Build the cup obtained by extending this cup with ``other_cup``.

        When ``other_cup`` is itself a combined cup (non-empty ``last``), only
        the cup stored in ``other_cup.last[0]`` is joined in and the ``last``
        list is shared with ``other_cup``; otherwise ``other_cup`` itself is
        joined in and recorded as the new ``last``.
        """
        if other_cup.last:
            joined = other_cup.last[0]
            trailing = other_cup.last
        else:
            joined = other_cup
            trailing = [other_cup]

        result = Cup(
            self.name + ', ' + joined.name,
            self.combine_tep(self.tep_list, joined.tep_list),
        )
        result.last = trailing
        return result

    def __repr__(self):
        return f"Cup(name={self.name}, exp_sup={self.exp_sup}, utility={self.utility})"



class AlgorithmTUHUFP:
    """Top-k pattern search over per-item Cup lists built from two input files.

    Typical use is ``run_TUHUFP_algorithm(...)`` followed by ``print_stats(...)``.

    Attributes:
        start_timestamp / end_timestamp: ``time.time()`` values around a run.
        database_size: number of transactions read.
        database_util: sum of the transaction utilities read.
        candidates: number of cups kept as extension candidates.
        top_UHUFP: result entries, each a dict with the keys ``name``,
            ``exp_sup`` and ``utility``.
        single_cup: mapping item name -> Cup of that single item.
        threshold: ``exp_sup`` a new pattern has to exceed to enter the result.
        min_util: minimum utility a pattern needs to be reported.
        peak_memory_usage: peak traced memory of the search phase, in MB.
    """

    def __init__(self):
        self._reset_state()

    def _reset_state(self):
        """Put every counter, index and result container back to its initial value."""
        self.start_timestamp = 0
        self.end_timestamp = 0
        self.database_size = 0
        self.database_util = 0
        self.candidates = 0
        self.top_UHUFP = []
        self.single_cup = {}
        self.threshold = float('-inf')
        self.min_util = float('-inf')
        self.peak_memory_usage = 0

    def read_data(self, file_path, percentage, k):
        """Load both input files and fill ``single_cup``, the counters and ``min_util``.

        Args:
            file_path: the two paths joined by ``", "`` -- first the probability
                file (whose first line lists the item names), then the utility
                file (one line per transaction, no header).
            percentage: fraction of the total database utility used as ``min_util``.
            k: unused here; kept so existing callers keep working.

        Raises:
            SystemExit: with status 1 when a file is missing or any other error
                occurs while reading or parsing (the error is printed first).
        """
        file_paths = file_path.split(", ")
        try:
            if len(file_paths) < 2:
                raise ValueError(
                    "expected two paths separated by ', ' "
                    "(probability file, utility file)"
                )
            with open(file_paths[0], 'r') as file1, open(file_paths[1], 'r') as file2:
                print("Reading data . . .")
                tlines = file1.readlines()
                ulines = file2.readlines()
                # The first line of the probability file names the item of each column.
                item_name = tlines[0].strip().split(" ")

                # Transaction ids start at 1; the two files are read line by line in step.
                for TID, (prob_line, util_line) in enumerate(zip(tlines[1:], ulines), start=1):
                    self.process_data(item_name, prob_line.strip(), util_line.strip(), TID)
                    self.database_size += 1

                self.min_util = int(self.database_util * percentage)
                print(f"Minimum utility set to: {self.min_util}")
        except FileNotFoundError as e:
            print(f"File not found: {e}")
            print("STOP ALGORITHM !!!")
            # Non-zero status: this is a failure, not a normal termination.
            sys.exit(1)
        except Exception as e:
            print(f"An error occurred: {e}")
            sys.exit(1)

    def process_data(self, item_name, prob_line, util_line, TID):
        """Fold one transaction into ``database_util`` and the single-item cups.

        Args:
            item_name: item names, positionally aligned with ``prob_line``.
            prob_line: space-separated probabilities, one per item column.
            util_line: ``"<items>:<transaction utility>:<item utilities>"`` where
                the first and third fields are space-separated and aligned.
            TID: identifier assigned to this transaction.

        A probability token that cannot be parsed as a number is reported and
        skipped; entries that are empty or not greater than zero are ignored.
        """
        prob_list = prob_line.split(" ")
        trans = util_line.split(":")
        items_util = trans[0].split(" ")
        trans_util = int(trans[1])
        util_list = trans[2].split(" ")

        util_by_item = {item: int(util) for item, util in zip(items_util, util_list)}

        self.database_util += trans_util

        for i, prob in enumerate(prob_list):
            prob = prob.strip()
            if not prob:
                continue
            # Parse inside the try so a malformed token is actually reported
            # instead of aborting the whole load.
            try:
                raw_probability = float(prob)
            except ValueError as ve:
                print(f"Error processing probability '{prob}': {ve}")
                continue
            # The positivity test is made on the unrounded value.
            if not raw_probability > 0:
                continue

            item = item_name[i]
            probability = round(raw_probability, 2)
            util_value = util_by_item[item]
            if item in self.single_cup:
                self.single_cup[item].update(probability, TID, trans_util, util_value)
            else:
                new_tep = Tep(TID, probability, util_value, trans_util)
                self.single_cup[item] = Cup(item, [new_tep])

    def combine_cup(self, cupX, cupY):
        """Return ``cupX`` extended with ``cupY``, with ``exp_sup`` at two decimals."""
        combined_cup = cupX.combine_with(cupY)
        combined_cup.exp_sup = round(combined_cup.exp_sup, 2)
        return combined_cup

    def get_first_UHUFP(self, min_util, k):
        """Seed the result with single items and return the extension candidates.

        Only the ``k`` single-item cups with the highest ``exp_sup`` are looked
        at. Of those, cups whose ``trans_wei_util`` reaches ``min_util`` are
        returned as candidates, and cups whose ``utility`` reaches ``min_util``
        are appended to ``top_UHUFP`` (in descending ``exp_sup`` order).
        """
        candidate_list = []
        cups = sorted(self.single_cup.values(), key=lambda x: x.exp_sup, reverse=True)

        for cup in cups[:k]:
            if cup.trans_wei_util >= min_util:
                candidate_list.append(cup)
            if cup.utility >= min_util:
                self.top_UHUFP.append({'name': cup.name, 'exp_sup': cup.exp_sup, 'utility': cup.utility})
        return candidate_list

    def TUHUFPSearchHelper(self, combined, k):
        """Offer ``combined`` to the result list and refresh ``threshold``.

        The pattern is recorded only when its utility reaches ``min_util``.
        The list is kept sorted by descending ``exp_sup`` and capped at ``k``
        entries; once it holds ``k`` entries, ``threshold`` becomes the
        ``exp_sup`` of its last entry.
        """
        if combined.utility < self.min_util:
            return

        self.top_UHUFP.append({'name': combined.name, 'exp_sup': combined.exp_sup, 'utility': combined.utility})
        # Stable sort: on ties the entry that was already present stays ahead.
        self.top_UHUFP.sort(key=lambda x: x['exp_sup'], reverse=True)
        if len(self.top_UHUFP) > k:
            self.top_UHUFP.pop()
        # Raise the bar as soon as the list is full, not only on overflow;
        # the emptiness guard covers k == 0.
        if self.top_UHUFP and len(self.top_UHUFP) >= k:
            self.threshold = self.top_UHUFP[-1]['exp_sup']

    def TUHUFPSearch(self, currentCup, k):
        """Recursively extend the cups in ``currentCup`` pair by pair.

        For every pair (i, j) with i < j, the pair is skipped when an upper
        bound on the combined ``exp_sup`` is below ``threshold``. Otherwise the
        cups are combined; a result above ``threshold`` is offered to the
        result list and, when its ``trans_wei_util`` reaches ``min_util``,
        kept for the next recursion level.
        """
        if len(currentCup) <= 1:
            return
        for i in range(len(currentCup) - 1):
            left = currentCup[i]
            newCupList = []
            for j in range(i + 1, len(currentCup)):
                right = currentCup[j]
                # The combination joins `left` with the cup stored in
                # right.last[0] when there is one, so the bound must use that
                # cup's max_prob (it can exceed right.max_prob).
                linked = getattr(right, 'last', None)
                joined = linked[0] if linked else right
                overestimate = left.exp_sup * joined.max_prob
                if overestimate < self.threshold:
                    # The bound is not monotone in j (the list is not ordered
                    # by max_prob), so only this pair can be skipped.
                    continue
                combined = self.combine_cup(left, right)
                if combined.exp_sup > self.threshold:
                    self.TUHUFPSearchHelper(combined, k)
                    if combined.trans_wei_util >= self.min_util:
                        newCupList.append(combined)
                        self.candidates += 1
            self.TUHUFPSearch(newCupList, k)

    def run_TUHUFP_algorithm(self, file_path, percentage, k):
        """Run the whole pipeline: load, seed with single items, then search.

        Args:
            file_path: the two input paths joined by ``", "`` (see ``read_data``).
            percentage: fraction of the database utility used as ``min_util``.
            k: maximum number of patterns to keep.

        Raises:
            SystemExit: when the input cannot be read or yields no cup at all.
        """
        # Start from a clean slate so the same instance can be run again
        # without accumulating the previous run's data.
        self._reset_state()
        self.start_timestamp = time.time()

        self.read_data(file_path, percentage, k)
        print({'database util: ': self.database_util})
        if not self.single_cup:
            print("No CUP List was created. STOP ALGORITHM !!!")
            sys.exit(1)

        candidate_list = self.get_first_UHUFP(self.min_util, k)
        self.candidates = len(candidate_list)
        print(self.candidates)
        if self.top_UHUFP and len(self.top_UHUFP) == k:
            self.threshold = self.top_UHUFP[-1]['exp_sup']

        print(f"Top {k} CUPs after sorting by exp_sup:")
        for position, cup in enumerate(candidate_list, start=1):
            print(f" {position}: {cup}")

        # Start memory tracking (covers the search phase only).
        tracemalloc.start()
        try:
            self.TUHUFPSearch(candidate_list, k)
            # Read the peak before tracking is switched off.
            _, peak_bytes = tracemalloc.get_traced_memory()
        finally:
            # Always stop tracking, even when the search raises.
            tracemalloc.stop()
        self.peak_memory_usage = peak_bytes / 10**6  # Convert bytes to MB

        self.end_timestamp = time.time()

    def print_stats(self, path):
        """Write ``min_util``, the ranked patterns and the run statistics to ``path``.

        The result list is sorted in place by descending ``exp_sup`` first.
        """
        with open(path, 'w') as output_file:
            output_file.write(f"minUtil: {self.min_util}\n")
            self.top_UHUFP.sort(key=lambda x: x['exp_sup'], reverse=True)
            for t in self.top_UHUFP:
                output_file.write(f"{t['name']}: {t['exp_sup']}: {t['utility']}\n")
            output_file.write("=============  TOP-K UFPs v1.20 - STATS =============\n")
            output_file.write(f" Transactions count from database : {self.database_size}\n")
            output_file.write(f" Candidates count : {self.candidates}\n")
            output_file.write(f" Algorithm run time : {self.end_timestamp - self.start_timestamp:.0f} seconds\n")
            output_file.write(f" Peak memory usage : {self.peak_memory_usage:.2f} MB\n")  # Record the peak memory usage

# Example usage:
# algorithm = AlgorithmTUHUFP()
# algorithm.run_TUHUFP_algorithm("../../data/example.txt, ../../data/example_utility.txt", 0.3, 5)
# algorithm.print_stats("output.txt")




In [13]:

# algorithm = AlgorithmTUHUFP()
# algorithm.run_TUHUFP_algorithm("../../data/input_foodmart.txt, ../../data/foodmart_utility.txt", 0.0004, 100)
# algorithm.print_stats("../../out/TUHUFP/output_foodmart_top_100.txt")

In [14]:
# Retail dataset run: the first path is the probability file, the second the
# utility file; min_util is 0.001 of the total database utility and at most
# 100 patterns are kept. Results and run statistics are written to the output file.
algorithm = AlgorithmTUHUFP()
algorithm.run_TUHUFP_algorithm("../../data/input_retail.txt, ../../data/retail_utility.txt", 0.001, 100)
algorithm.print_stats("../../out/TUHUFP/output_retail_top_100.txt")

Reading data . . .
Minimum utility set to: 14910
{'database util: ': 14910915}
100
Top 100 CUPs after sorting by exp_sup:
 1: Cup(name=40, exp_sup=25268.8, utility=277044)
 2: Cup(name=49, exp_sup=21135.04, utility=461182)
 3: Cup(name=39, exp_sup=7775.43, utility=169166)
 4: Cup(name=33, exp_sup=7519.53, utility=250500)
 5: Cup(name=42, exp_sup=7426.48, utility=245544)
 6: Cup(name=66, exp_sup=2237.39, utility=49622)
 7: Cup(name=90, exp_sup=1913.99, utility=21008)
 8: Cup(name=226, exp_sup=1626.78, utility=17780)
 9: Cup(name=171, exp_sup=1553.45, utility=67864)
 10: Cup(name=238, exp_sup=1543.19, utility=16788)
 11: Cup(name=37, exp_sup=1470.53, utility=128112)
 12: Cup(name=111, exp_sup=1398.23, utility=15317)
 13: Cup(name=311, exp_sup=1294.12, utility=28166)
 14: Cup(name=102, exp_sup=1139.52, utility=24494)
 15: Cup(name=476, exp_sup=1102.97, utility=23850)
 16: Cup(name=272, exp_sup=1038.03, utility=22948)
 17: Cup(name=414, exp_sup=948.79, utility=61290)
 18: Cup(name=439, exp

In [15]:
# algorithm = AlgorithmTUHUFP()
# algorithm.run_TUHUFP_algorithm("../../data/input_chess.txt, ../../data/chess_utility.txt", 0.002, 900)
# algorithm.print_stats("../../out/TUHUFP/output_chess_top_900.txt")