In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import os
import tarfile
from six.moves import urllib
import pandas as pd
# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = ""
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## FP-Growth

In [2]:
class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode  # needs to be updated
        self.children = {}

    def inc(self, numOccur):
        self.count += numOccur

    def disp(self, ind=1):
        print('  ' * ind, self.name, ':', self.count)
        for child in self.children.values():
            child.disp(ind + 1)

"""
Create FP tree
"""
def createTree(dataSet, minSup=1):  # create FP-tree from dataset but don't mine
    headerTable = {}
    # go over dataSet twice
    for trans in dataSet:  # first pass counts frequency of occurance
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    for k in headerTable.keys():  # remove items not meeting minSup
        if headerTable[k] < minSup:
            del (headerTable[k])
    #print len(headerTable)
    freqItemSet = set(headerTable.keys())
    # print 'freqItemSet: ',freqItemSet
    if len(freqItemSet) == 0: return None, None  # if no items meet min support -->get out
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]  # reformat headerTable to use Node link
    # print 'headerTable: ',headerTable
    retTree = treeNode('Null Set', 1, None)  # create tree
    for tranSet, count in dataSet.items():  # go through dataset 2nd time
        #print "create tree loop"
        localD = {}
        for item in tranSet:  # put transaction items in order
            if item in freqItemSet:
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
            updateTree(orderedItems, retTree, headerTable, count)  # populate tree with ordered freq itemset
    return retTree, headerTable  # return tree and header table

# 让FP树生长
def updateTree(items, inTree, headerTable, count):
    if items[0] in inTree.children:  # check if orderedItems[0] in retTree.children
        inTree.children[items[0]].inc(count)  # incrament count
    else:  # add items[0] to inTree.children
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] == None:  # update header table
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:  # call updateTree() with remaining ordered items
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)

# 增加节点后更新头指针表
def updateHeader(nodeToTest, targetNode):  # this version does not use recursion
    while (nodeToTest.nodeLink != None):  # Do not use recursion to traverse a linked list!
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode

"""
Find Conditional pattern base
"""
def ascendTree(leafNode, prefixPath):  # ascends from leaf node to root
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)


def findPrefixPath(basePat, treeNode):  # treeNode comes from header table
    condPats = {}
    while treeNode != None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1:
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats


"""
Create Mine tree
"""
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    #print "mine tree loop"
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]  # (sort header table)
    for basePat in bigL:  # start from bottom of header table
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        # print 'finalFrequent Item: ',newFreqSet    #append to set
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        # print 'condPattBases :',basePat, condPattBases
        # 2. construct cond FP-tree from cond. pattern base
        myCondTree, myHead = createTree(condPattBases, minSup)
        # print 'head from conditional tree: ', myHead
        if myHead != None:  # 3. mine cond. FP-tree
            # print 'conditional tree for: ',newFreqSet
            # myCondTree.disp(1)
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)

def loadSimpDat():
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat

# 将数据集转换为字典格式
# frozen set：不可变的set，有哈希值
def createInitSet(dataSet):
    retDict = {}
    for trans in dataSet:
        retDict[frozenset(trans)] = 1
    return retDict

def createInitSet_dict(dataSet):
    retDict = {}
    for subDict in dataSet.values():
        retDict[frozenset(subDict.keys())] = 1
    return retDict

In [3]:
simpDat = loadSimpDat()
initSet = createInitSet(simpDat)
fpTree, headerTab = createTree(initSet, 3)
freqItems = []
mineTree(fpTree, headerTab, 3, set([]), freqItems)
print(freqItems)

[set([u'y']), set([u'y', u'x']), set([u'y', u'z']), set([u'y', u'x', u'z']), set([u'r']), set([u't']), set([u'x', u't']), set([u'y', u'x', u't']), set([u'z', u't']), set([u'x', u'z', u't']), set([u'y', u'x', u'z', u't']), set([u'y', u'z', u't']), set([u'y', u't']), set([u's']), set([u'x', u's']), set([u'x']), set([u'x', u'z']), set([u'z'])]


In [4]:
path = './birch6_validate/'
train_data = np.load(path + 'user_train_data.npy').item()

In [5]:
results = pd.DataFrame(columns=['cluster', 'n_learners', 'minSup', 'n_freqsets'])
for cluster in range(0, 6):
    print(cluster)
    initSet = createInitSet_dict(train_data[cluster])
    for minSup in range(12, 30, 1):
        print('minSup:', float(minSup)/100)
        minCount = int(float(minSup)/100*len(initSet))
        fpTree, headerTab = createTree(initSet, minCount)
        if headerTab is None:
            results.loc[len(results)] = [cluster, len(initSet), float(minSup)/100, 0]
            break
        freqItems = []
        mineTree(fpTree, headerTab, minCount, set([]), freqItems)
        results.loc[len(results)] = [cluster, len(initSet), float(minSup)/100, len(freqItems)]
results.to_csv(path+'param_minSup.csv')

0
minSup: 0.12
minSup: 0.13
minSup: 0.14
minSup: 0.15
1
minSup: 0.12
minSup: 0.13
2
minSup: 0.12
minSup: 0.13
minSup: 0.14
minSup: 0.15
minSup: 0.16
minSup: 0.17
minSup: 0.18
minSup: 0.19
minSup: 0.2
minSup: 0.21
minSup: 0.22
minSup: 0.23
minSup: 0.24
minSup: 0.25
minSup: 0.26
minSup: 0.27
minSup: 0.28
minSup: 0.29
3
minSup: 0.12
minSup: 0.13
minSup: 0.14
minSup: 0.15
minSup: 0.16
minSup: 0.17
4
minSup: 0.12
minSup: 0.13
minSup: 0.14
minSup: 0.15
minSup: 0.16
minSup: 0.17
minSup: 0.18
minSup: 0.19
minSup: 0.2
minSup: 0.21
minSup: 0.22
minSup: 0.23
minSup: 0.24
minSup: 0.25
minSup: 0.26
minSup: 0.27
minSup: 0.28
minSup: 0.29
5
minSup: 0.12
minSup: 0.13
minSup: 0.14
minSup: 0.15
minSup: 0.16
minSup: 0.17
minSup: 0.18
minSup: 0.19
minSup: 0.2
minSup: 0.21
minSup: 0.22
minSup: 0.23
minSup: 0.24
minSup: 0.25
minSup: 0.26
minSup: 0.27
minSup: 0.28
minSup: 0.29


In [6]:
results

Unnamed: 0,cluster,n_learners,minSup,n_freqsets
0,0.0,8816.0,0.12,75122.0
1,0.0,8816.0,0.13,1029.0
2,0.0,8816.0,0.14,7.0
3,0.0,8816.0,0.15,0.0
4,1.0,6387.0,0.12,5.0
5,1.0,6387.0,0.13,0.0
6,2.0,1.0,0.12,1.0
7,2.0,1.0,0.13,1.0
8,2.0,1.0,0.14,1.0
9,2.0,1.0,0.15,1.0
