In [1]:
%load_ext Cython

In [2]:
%%cython
import itertools
import copy
import time
import json

from memory_profiler import memory_usage
from pyfpgrowth import find_frequent_patterns as fpgrowth
from collections import Counter

In [3]:
support = 0
input_file = "contextPasquier99.txt"
output_file = open('outfile', 'w')
n_list = {}

In [4]:
def read_db_as_list(filename):
    global support
    transactions = []
    n_transactions = 0
    with open(filename) as finput:
        f = finput.read().split('\n')
        transactions = list(map(lambda x: x.strip().split(' '), f))
        finput.close()
    transactions = list(filter(lambda x: x!=[''], transactions))
        
    return transactions

In [5]:
def make_itemset_1(filename, minsup):
    global support
    itemset_1 = {}
    n_transactions = 0
    with open(filename) as finput:
        f = finput.read().split('\n')
        for line in f:
            n_transactions += 1
            items = line.strip().split(' ')
            for item in items:
                if item not in itemset_1:
                    itemset_1[item] = 0
                itemset_1[item] += 1
    support = n_transactions * minsup
    
    for k, v in list(itemset_1.items()):
        if v < support:
            del itemset_1[k]
    return itemset_1

In [6]:
class PPCNode:
    def __init__(self):
        self.pre_order = None
        self.post_order = None
        self.count = None
        self.label = None
        self.parent = None
        self.child = None
        self.sibling = None


class FPTNode:
    def __init__(self):
        self.equivalent_items = None
        self.child_nodes = None
        self.label = None
        self.itemset = None
        self.support = None

In [7]:
def create_node(root, label):
    node = PPCNode()
    node.label = label
    node.count = 1
    node.parent = root
    return node


def insert_ppc_node(root, item):
    if root.child is None:
        node = create_node(root, item)
        root.child = node
        return root.child
    elif root.child.label == item:
        root.child.count += 1
        return root.child
    elif root.child.sibling is None:
        node = create_node(root, item)
        root.child.sibling = node
        return root.child.sibling
    else:
        current_sibling = root.child.sibling
        last_sibling = None
        while current_sibling is not None:
            if current_sibling.label == item:
                current_sibling.count += 1
                return current_sibling
            else:
                last_sibling = current_sibling
                current_sibling = current_sibling.sibling
        node = create_node(root, item)
        last_sibling.sibling = node        
        return node

In [8]:
def pre_post(root, pre=0, post=0):
    root.pre_order = pre
    post_ = post
    if root.child is not None:
        root.child.pre_order = pre + 1
        pre, post_ = pre_post(root.child, root.child.pre_order, post)
    root.post_order = post_
    if root.sibling is not None:
        root.sibling.pre_order = pre + 1
        pre, post_ = pre_post(root.sibling, root.sibling.pre_order, root.post_order + 1)
        return pre, post_
    return pre, post_ + 1    

In [9]:
def build_ppc_tree(filename, itemset_1):
    root = PPCNode()
    with open(filename) as finput:
        f = finput.read().split('\n')
        for line in f:
            items = line.strip().split(' ')            
            transaction = []
            for item in items:
                if item in list(itemset_1.keys()):
                    transaction.append((item, itemset_1[item]))
            transaction.sort(key=lambda x: x[0])
            transaction.sort(key=lambda x: x[1], reverse=True)
            root_aux = root
            for item in transaction:
                root_aux = insert_ppc_node(root_aux, item[0])
    pre_post(root)
    return root

In [10]:
def print_tree(root):
    print(root.label, root.pre_order, root.post_order)
    if root.child:
        print_tree(root.child)
    if root.sibling:
        print_tree(root.sibling)

In [11]:
def make_n_list(root, n_list={}):
    if root.label:
        if root.label not in n_list:
            n_list[root.label] = []
        n_list[root.label].append(((root.pre_order, root.post_order), root.count))
    if root.child:
        make_n_list(root.child, n_list)
    if root.sibling:
        make_n_list(root.sibling, n_list)

In [12]:
def NL_interserction(n_list1, n_list2, minsup):
    n_list_result = []
    for k in n_list1:
        for l in n_list2:
            if k[0][0] < l[0][0] and k[0][1] > l[0][1]:                 
                n_list_result.append((k[0], l[1]))
    d = {x:0 for x, _ in n_list_result} 
    for name, num in n_list_result: d[name] += num 
    n_list_result = list(map(tuple, d.items()))
    return n_list_result

In [13]:
def find_subsets(items, n):
    return list(itertools.combinations(items, n))


def get_all_subsets(items):
    subsets = [find_subsets(items, i) for i in range(len(items)+1)]
    subsets = ['-'.join(map(str, item)) for item in list(itertools.chain.from_iterable(subsets))]
    return subsets[1:]

In [14]:
def get_n_list(key):
    global n_list
    return n_list[key] if key in n_list else []

In [15]:
def building_pattern_tree(cur_no, next_nos, father_no=None):
    global support
    global n_list
    global output_file
    
    if cur_no.equivalent_items is None:
        cur_no.equivalent_items = []
        
    cur_no.child_nodes = []
    if father_no is not None:
        p1 = get_n_list('-'.join([father_no.label, cur_no.label]))
    else:
        p1 = get_n_list(cur_no.label)
        
    for i in next_nos:
        if father_no is not None:
            p2 = get_n_list('-'.join([father_no.label, i.label]))
        else:
            p2 = get_n_list(i.label)
            
        p = NL_interserction(p2, p1, support)
        p_support = sum([item[1] for item in p])
        
        if p_support == cur_no.support:
            cur_no.equivalent_items += [i.label]
        elif p_support >= support:
            child = FPTNode()
            child.label = i.label
            child.support = p_support
            cur_no.child_nodes += [child]
            if father_no is not None:
                n_list['-'.join([father_no.label, cur_no.label, child.label])] = p
            else:
                n_list['-'.join([cur_no.label, child.label])] = p

    if father_no is not None:
        cur_no.label = '-'.join([father_no.label, cur_no.label])
        
    output_file.write(cur_no.label + " #SUP: " + str(cur_no.support) + "\n")
    
    if len(cur_no.equivalent_items) > 0:
        subsets = get_all_subsets(cur_no.equivalent_items)
        cand_itemsets = [('-'.join([cur_no.label, item]), cur_no.support) for item in subsets]
        for itemset in cand_itemsets:
            output_file.write(itemset[0] + " #SUP: " + str(itemset[1]) + "\n")
    
    if len(cur_no.child_nodes) > 0:
        for child in cur_no.child_nodes:
            aheads_ = [i for i in cur_no.child_nodes[cur_no.child_nodes.index(child)+1:]]
            child.equivalent_items = list(cur_no.equivalent_items)
            building_pattern_tree(child, aheads_, cur_no)

In [16]:
def prepostplus(minsup=0):
    global support
    global output_file
    global n_list
    
    initial_time = time.time()
    
    itemset_1 = make_itemset_1(input_file, minsup)
    tree = build_ppc_tree(input_file, itemset_1)
    make_n_list(tree, n_list)

    items_ordered = list(itemset_1.items())
    items_ordered.sort(key=lambda x: x[0], reverse=True)
    items_ordered.sort(key=lambda x: x[1])
    items_ordered = [x[0] for x in items_ordered]

    nodes = []
    for key in list(items_ordered):
        node = FPTNode()
        node.label = key
        node.support = sum([item[1] for item in n_list[key]])
        nodes.append(node)

    for node in list(nodes):
        aheads = nodes[nodes.index(node)+1:]
        building_pattern_tree(node, aheads)
    final_time = time.time()
    
    output_file.close()
    print("Execution time (s):", final_time - initial_time)

In [17]:
memory_usage((prepostplus))

Execution time (s): 0.0


ValueError: I/O operation on closed file.