In [1]:
from datetime import datetime, timedelta
import pandas as pd
import csv
import requests
import os
import re
from itertools import count
import numpy as np

from itertools import accumulate

from spmf import Spmf

# Event Representations

In [2]:
# A common class for all Events

class Event:
    def __init__(self, eventtype):
        self.type=eventtype
    
    #Return Attribute value given attribute name
    def getAttrVal(self, attrName):
        return self.attributes.get(attrName,None)

    
# A class that represents a point event
class PointEvent(Event):
    def __init__(self, timestamp, attributes):
        Event.__init__(self,"point")
        #self.type = "point"
        self.timestamp = timestamp 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 
        
    

# class to represent an interval event
class IntervalEvent(Event):
    def __init__(self, t1, t2, attributes):
        Event.__init__(self,"interval")
        #self.type = "interval"
        self.time = [t1,t2] 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 

# Sequence Representations

In [3]:
class Sequence:
    _ids = count(0)
    
    attrdict={}
    reverseatttrdict={}
    def __init__(self, events, sid=None):
        # sequence id
        if sid is None:
            self.sid = next(self._ids)
        else:
            self.sid = sid
        
        self.events = events
        self.volume=1
        self.seqAttributes={}
    def getEventPosition(self, attr, hash_val):
        for count,event in enumerate(self.events):
            #if event.getAttrVal(attr)==hash_val:
            if Sequence.attrdict[attr][event.getAttrVal(attr)]==hash_val:
                return count
        return -1
    
    def setVolume(self, intValue):
        self.volume=intValue
        
    def getVolume(self):
        return self.volume
    
    def increaseVolume(self):
        self.volume += 1 
    
    
    def getUniqueValues(self, attr):
        l=list(set(event.getAttrVal(attr) for event in self.events))
        return l
    
    def getUniqueValueHashes(self, attr):
        l=list(set(event.getAttrVal(attr) for event in self.events))
        uniquelist=[Sequence.attrdict[attr][elem] for elem in l]
        return uniquelist
    
    #Not sure this will always result in same index, will change if 
    #dictionary is updated
    #since python is unordered
    
    def getHashList(self, attr):
        #l=list(list(event.attributes.keys()).index(attr) for event in self.events)
        l=[event.getAttrVal(attr) for event in self.events]
        hashlist=[Sequence.attrdict[attr][elem] for elem in l]
        
        return hashlist
    
    def getValueHashes(self, attr):
        l=list(event.getAttrVal(attr) for event in self.events)
        hashlist=[Sequence.attrdict[attr][elem] for elem in l]
        
        return hashlist
        
    
    def getEventsHashString(self, attr):
        s=attr+": "
        l=list(event.getAttrVal(attr) for event in self.events)
        #for count,event in enumerate(self.events):
        #    s+=str(event.getAttrVal(attr))+" "
        s+="".join(str(Sequence.attrdict[attr][elem]) for elem in l)
        return s
    
    def convertToVMSPReadablenum(self, attr):
        l=list(event.getAttrVal(attr) for event in self.events)
        s=" -1 ".join(str(Sequence.attrdict[attr][elem]) for elem in l)
        #s=""
        #for count,event in enumerate(self.events):
        #    s+=str(event.getAttrVal(attr))+" -1 "
        s+=" -2"
        
        return s
    
    def convertToVMSPReadable(self, attr):
        l=list(event.getAttrVal(attr) for event in self.events)
        s=" ".join(Sequence.attrdict[attr][elem] for elem in l)
        #s=""
        #for count,event in enumerate(self.events):
        #    s+=str(event.getAttrVal(attr))+" -1 "
        s+="."
        
        return s
    
    def getPathID(self):
        return self.sid
    
    def matchPathAttribute(self, attr):
        # should i use eq?!
        if this.seqAttributes.get(attr)==(val):
            return True
        else:
            return False
        
    def setSequenceAttribute(self,attr, value):
        self.seqAttributes[attr]=value
        
         

    # equivalent to method signature public static int getVolume(List<Sequence> seqs)    
    def getSeqVolume(seqlist):
        return sum(seq.getVolume() for seq in seqlist)
    
    
    # Method equivalent to public String getEvtAttrValue(String attr, int hash) in DataManager.java
    def getEvtAttrValue(attr, hashval):
        return Sequence.reverseatttrdict[attr][hashval]
        
    # Method equivalent to public List<String> getEvtAttrValues(String attr) in DataManager.java    
    def getEvtAttrValues(attr):
        return list(Sequence.reverseatttrdict[attr].values())
    
    # Method equivalent to int getEvtAttrValueCount(String attr) in DataManager.java    
    def getEvtAttrValueCount(attr):
        return len(Sequence.reverseatttrdict[attr])
    
    

# Pattern Representation

In [4]:
class Pattern:
    _pids = count(1)

    def __init__(self, events=[]):
        #pattern id
        self.id = next(self._pids)
        
        self.keyEvts = events
        
        self.medianPos=[]
        self.meanPos=[]
        
        self.sids=[]
        
        self.support=0
        self.supPercent=None
        self.cluster=None
        self.medianPathLength=0
        self.meanPathLength=0
        
        self.parnetSegment=None
        self.segSizes=None
        
    def filterPaths(self, paths, evtType):
        print("filtering "+ str(len(paths))+" paths by "+str(len(self.keyEvts))+" checkpoints")
        
        for sequences in paths:
            if(self.matchMilestones(sequences.getValueHashes(evtType),self.keyEvts)==False):
                continue
            self.sids.append(sequences)
            
        print(str(len(self.sids))+" matching paths")

        
    def matchMilestones(self, arr, milestones):
        ja=arr
        idx=-1
        for elems in milestones:
            try:
                idx=arr[idx+1:].index(elems)
                #print(idx)
            except ValueError:
                return False
        return True
    
    def getMedianSpacing(self):
        l=[y - x for x,y in zip(self.medianPos,self.medianPos[1:])]
        if(len(l)<=1):
            return 100
        l=l.sort()
        middle=int(len(l)/2)
        if(len(l)%2==0):
            return ((l[middle-1]+l[middle])/2.0)
        else:
            return l[middle]
        return np.median(np.asarray(l))
    
    def addKeyEvent(self, hashval):
        self.keyEvts.append(hashval)
        
    def addToSupportSet(self, seq):
        #print(seq.sid)
        self.sids.append(seq)
        self.support+=seq.getVolume()
        
    def getSequences(self):
        return self.sids
    
    def setMedianPathLength(self, median):
        self.medianPathLength=median
    
    def setMeanPathLength(self, mean):
        self.meanPathLength=mean
        
    def getMedianPathLength(self):
        return self.medianPathLength
    
    def getMeanPathLength(self):
        return self.meanPathLength
    
    def getEvents(self):
        return self.keyEvts
    
    def getEventMeanPos(self):
        return self.meanPos
    
    def getEventMedianPos(self):
        return self.medianPos
    
    #Do we need to preserve order here??

    def getUniqueEventsString(self):
        #return "-".join(str(x) for x in list(set(self.keyEvts)))
        return "-".join(str(x) for x in list(dict.fromkeys(self.keyEvts)))
    
    def getPositions(self, events, path):
        sequence=path
        pos=[]
        idx=-1
        offset=0
        
        for elems in events:
            
            offset+=idx+1
            try:
                idx=path[offset:].index(elems)
            except ValueError:
                continue
            pos.append(offset+idx)
        return pos
    
    def getMedian(self, data):
        #middle=len(data)/2
        #if(len(data)%2==0 and len(data)>1):
        #    return (data[middle-1]+data[middle])/2.0
        #else: 
        #    return data[middle]
        return np.median(data)
    
    def computePatternStats(self, evtAttr):
        pathsOfStrings=[]
        #print(f' sids {self.sids}')
        for path in self.sids:
            pageSequence=path.getHashList(evtAttr)
            pathsOfStrings.append(pageSequence)
        
        #print(f'path of string {pathsOfStrings}')
        medians=[]
        means=[]
        
        ## swap the loops for better readability
        for i,events in enumerate(self.keyEvts):
            numSteps=[]
            
            for idx,paths in enumerate(pathsOfStrings):
                if(self.matchMilestones(paths, self.keyEvts[0:i+1])):
                    pos=self.getPositions(self.keyEvts[0:i+1], paths)
                    if i==0:
                        #add position value of first element id sequence
                        numSteps.append(pos[i])
                    else:
                        #in other cases add the difference
                        numSteps.append(pos[i]-pos[i-1])
            sum_steps=sum(numSteps)
            
            median= self.getMedian(numSteps)
            
            medians.append(median)
            means.append(sum_steps*1.0/ len(numSteps))
                
            
                
        #list(accumulate(means))
        means=np.cumsum(np.asarray(means))
        medians=np.cumsum(np.asarray(medians))
        
        self.setMedianPositions(medians)
        self.setMeanPositions(means)
        
        trailingSteps=[0]*len(self.sids)
        for i,path in enumerate(self.sids):
            pos=self.getPositions(self.keyEvts, path.getHashList(evtAttr))
            #the difference between the last event in thesequence and the last key event
            trailingSteps[i]= len(path.events)- pos[-1]-1
        
        trailStepSum=sum(trailingSteps)
        median= self.getMedian(trailingSteps)
        mean= trailStepSum/len(trailingSteps)
        
        self.setMedianPathLength(median+medians[-1])
        self.setMeanPathLength(mean+means[-1])
                                  
    def getMedianPositions(self, allPos, pids):
        median=[]
        for k in range(0, len(pid)):
            posInPaths=allPos[k]
            median.append(self.getMedian(posInPaths))
        #return list(self.getMedian(posInPaths) for posInPaths in allPos)
        return median
    
    def getMeanPositions(self, allPos, pids):
        mean=[]
        for k in range(0, len(allPos)):
            mean.append(sum(allPos[k])*1.0/(len(allPos[k])))
        return mean
    
    def setMedianPositions(self, median):
        self.medianPos=median
        
    def setMeanPositions(self, mean):
        self.meanPos=mean
        
    def toJson(self):
        return json.dumps(self, default=lambda o: o.__dict__)#,sort_keys=True, indent=4)
    
    def getSupport(self):
        return self.support
    
    def setCluster(self, cluster):
        self.cluster=cluster
        
    def setParent(self, parent, segment):
        self.parent=parent
        self.parentSegment=segment
    
    
    # How to implement this with BitArray?
    #def getEventBitSet(self)
    
    def getParent(self):
        return self.parent
    
    def getParentSegment(self):
        return self.parentSegment
    
    def setMeanPathLength(self,d):
        self.meanPathLength=d
    
    def getMeanPathLength(self):
        return self.meanPathLength
        
    def setSupport(self, sup, total):
        self.support=sup
        self.supPercent= sup*1.0/total
    

# TreeNode Representation

In [5]:
class TreeNode:
    NID=count(1)
    nodeHash={}
    
    
    def __init__(self, name="", count=0, value=""):
        
        self.nid=next(self.NID)
        self.name=name
        self.seqCount=count
        ## What's the difference between name and value?
        self.value=value
        self.hash=-1
        self.pos=[]
        self.meanStep=0
        self.medianStep=0
        #self.zipCompressRatio=0
        self.incomingBranchUniqueEvts=None
        #self.incomingBranchSimMean=None
        #self.incomingBranchSimMedian=None
        #self.incomingBranchSimVariance=None
        
        self.incomingSequences=[]
        self.outgoingSequences=[]
        
        self.meanRelTimestamp=0
        self.medianRelTimestamp=0
        
        TreeNode.nodeHash[self.nid]=self
        self.children = []
        
    def getNode(self, node_id):
        return nodeHash[node_id]
    
    def clearHash(self):
        nodeHash.clear()
        
    def getIncomingSequences(self):
        return self.incomingSequences
    
    def getSeqCount(self):
        return self.seqCount
    
    def setSeqCount(self, seqCount):
        self.seqCount=seqCount
        
    def getName(self):
        return self.name
    
    def setName(self, name):
        self.name=name
        
    def getMeanStep(self):
        return self.meanStep
    
    #need a better implementation
    def toJSONObject(self):
        return json.dumps(self, default=lambda o: o.__dict__)#,sort_keys=True, indent=4) 
    
    def toString(self):
        return self.name+": "+self.seqCount
    
    def setPositions(self, l):
        self.pos=l
        self.pos.sort()
        d=sum(self.pos)+len(self.pos)
        mid=len(self.pos)/2
        
        if len(self.pos)==0:
            self.meanStep=0
            slf.medianStep=0
        else:
            #WHY WE ARE ADDING 1 to mean and medianStep?
            self.meanStep=d/len(self.pos)
            self.medianStep= np.median(self.pos)+1#((self.pos[mid-1]+self.pos[mid])/2.0)+1 if len(self.pos)%2==0 else self.pos[mid]+1
            
    def getValue(self):
        return self.value
    
    def setValue(self, value):
        self.value=value
        
    def getMedianStep(self):
        return self.medianStep
    
    #def getZipCompressRatio(self):
    #    return self.zipCompressRatio
    
    #def setZipCompressRatio(self, zipcompressratio):
    #    self.zipCompressRatio=zipcompressratio
        
    def getIncomingBranchUniqueEvts(self):
        return self.incomingBranchUniqueEvts
    
    def setIncomingBranchUniqueEvts(self, incomingbranchuniqueevts):
        self.incomingBranchUniqueEvts=incomingbranchuniqueevts
        
    #def setIncomingBranchSimilarityStats(self, mean, median, variance):
    #    self.incomingBranchSimMean=mean
    #    self.incomingBranchSimMedian=median
    #    self.incomingBranchSimVariance=variance
        
    
    def setIncomingSequences(self, incomingbrancseqs, evtattr):
        self.incomingSequences=incomingbrancseqs
        
    def setRelTimeStamps(self, reltimestamps):
        #print(f'Time Stamp {reltimestamps}')
        #print(f'Time Stamp {type(reltimestamps[0])}')
        reltimestamps.sort()
        #print(f'Time Stamp {reltimestamps}')
        #print(f'Time Stamp {type(reltimestamps[0])}')
        
        d=sum(reltimestamps, timedelta())
        
        mid=len(reltimestamps)/2
        
        if(len(reltimestamps)==0):
            self.meanRelTimestamp=0
            self.medianRelTimestamp=0
            
        else:
        
            self.meanRelTimestamp=d*1.0/len(reltimestamps)
            self.medianRelTimestamp=np.median(reltimestamps) #(reltimestamps[mid-1]+reltimestamps[mid])/2.0 if len(reltimestamps%2==0) else reltimestamps[mid]
        
        #print(f'Time Stamp {self.meanRelTimestamp}')
        #print(f'Time Stamp {self.meanRelTimestamp}')
        
    def getHash():
        return self.hash
        
    def setHash(self, value):
        self.hash=value
        
        
        
    

In [6]:
class OcccurrencesMeanRankingFunction:
    def __init__(self):
        self.topRankedEvtValues=[]
        self.evtAttr=""
        
    def setEvtAttr(self, evtAttr):
        self.evtAttr=evtAttr
        #print(f'evtattr {self.evtAttr}')

    def getTopEventSet(self):
        if not self.topRankedEvtValues:
            return None
        elif len(self.topRankedEvtValues)==1:
            return self.topRankedEvtValues[0]
        else:
            #for k in self.topRankedEvtValues:
            #    print(f'top key {k.keyEvts}')

            for p in self.topRankedEvtValues:
                p.computePatternStats(self.evtAttr)
            
            #for k in self.topRankedEvtValues:
            #    print(f'sorted key {k.keyEvts}')

            self.topRankedEvtValues=sorted(self.topRankedEvtValues, key=lambda x: x.getEventMeanPos()[0])
        
        return self.topRankedEvtValues[0]
    
    def performRanking(self, seqs, maxSup, excludedEvts):
        result={}
        evtHashes=[]
        evtValueKey=""
        
        for seq in seqs:
            # get hashlist for each individual sequence
            evtHashes= seq.getHashList(self.evtAttr)
            #print(f'evthash {evtHashes}')
            for hashval in evtHashes:
                
                if hashval in excludedEvts:
                    continue
                evtValueKey=str(hashval)
                
                #create a pattern for all hash values
                if evtValueKey  not in result.keys():
                    #print(f'evtValueKey {evtValueKey}')
                    p=Pattern([evtValueKey])
                    #p.addKeyEvent(hashval)
                    result[evtValueKey]=p
                    
                result[evtValueKey].addToSupportSet(seq)
        #print(result.keys())
        #print(result.values())
        
        
        s=[]
        #print(f'maxSup {maxSup}')
        
        for itr in result.values():
            #print(itr.keyEvts)
            if(itr.getSupport()>maxSup):
                continue
            s.append(itr)
        
        if not s:
            return
        
        #for patterns in s:
            #print(f'pat before sort {patterns.keyEvts}')
        s=sorted(s, key= lambda x: x.getSupport())
        
        self.topRankedEvtValues=[]
        
        maxval= s[0].getSupport()
        #print(f'maxval {maxval}')
        
        for patterns in s:
            #print(f'pat {patterns.keyEvts}')
            #print(f'support {patterns.getSupport()}')
            if patterns.getSupport() < maxval:
                break
            self.topRankedEvtValues.append(patterns)
        #print(len(s))
        #print(len(self.topRankedEvtValues))
        #for k in self.topRankedEvtValues:
        #    print(f'key {k.keyEvts}')

        #print(f'top ranked {*(k.keyEvts for k in self.topRankedEvtValues)}')


In [7]:
class FrequencyMedianRankingFunction:
    def __init__(self):
        self.topRankedEvtValues=[]
        self.evtAttr=""
        
    def setEvtAttr(self, evtAttr):
        self.evtAttr=evtAttr
        #print(f'evtattr {self.evtAttr}')

    def getTopEventSet(self):
        if not self.topRankedEvtValues:
            return None
        elif len(self.topRankedEvtValues)==1:
            return self.topRankedEvtValues[0]
        else:
            #for k in self.topRankedEvtValues:
            #    print(f'top key {k.keyEvts}')

            for p in self.topRankedEvtValues:
                p.computePatternStats(self.evtAttr)
            
            #for k in self.topRankedEvtValues:
            #    print(f'sorted key {k.keyEvts}')

            self.topRankedEvtValues=sorted(self.topRankedEvtValues, key=lambda x: x.getEventMedianPos()[0])
        
        return self.topRankedEvtValues[0]
    
    def performRanking(self, seqs, maxSup, excludedEvts):
        result={}
        
        evtValueKey=""
        uniqueHashes=[]
        
        for seq in seqs:
            # get hashlist for each individual sequence
            uniqueHashes= seq.getUniqueValueHashes(self.evtAttr)
            #print(f'evthash {evtHashes}')
            for hashval in uniqueHashes:
                
                if hashval in excludedEvts:
                    continue
                evtValueKey=str(hashval)
                
                #create a pattern for all hash values
                if evtValueKey  not in result.keys():
                    #print(f'evtValueKey {evtValueKey}')
                    p=Pattern([evtValueKey])
                    #p.addKeyEvent(hashval)
                    result[evtValueKey]=p
                    
                result[evtValueKey].addToSupportSet(seq)
        #print(result.keys())
        #print(result.values())
        
        
        s=[]
        #print(f'maxSup {maxSup}')
        
        for itr in result.values():
            #print(itr.keyEvts)
            if(itr.getSupport()>maxSup):
                continue
            s.append(itr)
        
        if not s:
            return
        
        #for patterns in s:
            #print(f'pat before sort {patterns.keyEvts}')
        s=sorted(s, key= lambda x: x.getSupport(), reverse=True)
        
        self.topRankedEvtValues=[]
        
        maxval= s[0].getSupport()
        #print(f'maxval {maxval}')
        
        for patterns in s:
            #print(f'pat {patterns.keyEvts}')
            #print(f'support {patterns.getSupport()}')
            if patterns.getSupport() < maxval:
                break
            self.topRankedEvtValues.append(patterns)
        #print(len(s))
        #print(len(self.topRankedEvtValues))
        #for k in self.topRankedEvtValues:
        #    print(f'key {k.keyEvts}')

        #print(f'top ranked {*(k.keyEvts for k in self.topRankedEvtValues)}')


# Coreflow Miner

In [8]:
class CoreFlowMiner:
    rf=FrequencyMedianRankingFunction()
    # Implement CoreFlow algo which takes a list of sequences, a TreeNode (root), and a bunch of CoreFlow parameters 

    def __init__(self):
        self.branchSequences={}
        
    
    def checkForStop(seqs, minval, checkpoints):
        pass
    
    def adjustMin(seqs, minval):
        if minval<50 :
            return minval
        
        while(Sequence.getSeqVolume(seqs)<minval and minval>50):
            minval=minval/2
        
        return minval
    
    def bundleToExit(self, seqs, parent, attr, exitNodeHash):
        if len(seqs)==0:
            return
        
        node=TreeNode()
        
        if exitNodeHash==-1:
            node.setName("Exit")
            node.setValue("Exit")
            node.setHash(-1)
            
        else:
            node.setName(str(Sequence.getEvtAttrValue(attr,exitNodeHash)))
            
            #set attribute value for this sequence
            node.setValue(Sequence.getEvtAttrValue(attr,exitNodeHash))
            node.setHash(exitNodeHash)
        
        node.setIncomingSequences(seqs, attr)
        node.setSeqCount(Sequence.getSeqVolume(seqs))
        print(f'exit node seq count {node.seqCount}')
        lengths=[]
        for s in seqs:
            for i in range(0,s.getVolume()):
                lengths.append(len(s.events)-1)
        node.setPositions(lengths)
        parent.children.append(node)
        
    #needs properimplementation    
    def getNewRootNode(self, numPaths, seqlist):
        return TreeNode("Start of all "+ str(len(seqlist))+" visits", numPaths, "-1")
    
    
    def truncateSequences(self, seqs, hashval, evtAttr, node,trailingSeqSegs, notContain):
        indices=[]
        uniqueEvts=[]
        relTimestamps=[]
        incomingBranchSeqs=[]
        
        print(f'hashval {hashval}')
        for seq in seqs:
            i=seq.getEventPosition(evtAttr, hashval)
            print(f'Position {i}')
            if i<0:
                notContain.append(seq)
                print(f'not contain {seq.getHashList(evtAttr)}')
            else:
                if i>=1:
                    incomingSeq= Sequence(seq.events[0:i])
                    self.branchSequences[incomingSeq.getPathID()]= incomingSeq
                    incomingSeq.setVolume(seq.getVolume())
                    incomingBranchSeqs.append(incomingSeq)
                    print(f'previous {incomingSeq.getHashList(evtAttr)}')
                    uniqueEvts.extend(incomingSeq.getUniqueValueHashes(evtAttr))
                    
                if len(seq.events)>i+1:
                    outgoingSeq= Sequence(seq.events[i+1:len(seq.events)])
                    self.branchSequences[outgoingSeq.getPathID()]= outgoingSeq
                    
                    outgoingSeq.setVolume(seq.getVolume())
                    trailingSeqSegs.append(outgoingSeq)
                    print(f'next {outgoingSeq.getHashList(evtAttr)}')
                    
                for k in range(0, seq.getVolume()):
                    indices.append(i)
                relTimestamps.append(seq.events[i].timestamp-seq.events[0].timestamp)
        #print(f'Time Stamp {relTimestamps}')
        #print(f'unique {uniqueEvts}')
        #print(f'unique {set(uniqueEvts)}')
        #print(f'unique {len(set(uniqueEvts))}')
                
        node.setIncomingBranchUniqueEvts( len(set(uniqueEvts)) )
        node.setSeqCount(Sequence.getSeqVolume(incomingBranchSeqs))
        node.setPositions(indices)
        node.setRelTimeStamps(relTimestamps)
        node.setIncomingSequences(incomingBranchSeqs, evtAttr)
        print(f'seq count {node.getSeqCount()}')
        print(f' pos {node.pos}')
        print(f'Seq len trailing {len(trailingSeqSegs)}')
        print(f'Seq len not contain {len(notContain)}')

            
    def run(self, seqs, evtAttr, parent, minval, maxval, checkpoints, excludedEvts, exitNodeHash ):
        if len(checkpoints)>0:
            containSegs=[]
            notContain=[]
            
            node= TreeNode()
            
            #First integer event
            hashval=checkpoints[0]
            print(f'hashval {hashval}')
            eVal=Sequence.getEvtAttrValue(evtAttr, hashval)
            print(f'eVal {eVal}')
            node.setName(str(eVal)) #NOT sure
            node.setValue(eVal)
            node.setHash(hashval)
            del checkpoints[0]
            self.truncateSequences(seqs, hashval, evtAttr, node, containSegs, notContain)
            
            parent.children.append(node)
            
            self.run(containSegs, evtAttr, node, minval, maxval, checkpoints, excludedEvts, exitNodeHash)
            self.run(notContain, evtAttr, parent, minval, maxval, checkpoints, excludedEvts, exitNodeHash)
            
        else:
            #print(f'minval {minval}')
            #print(f'Seq volume {Sequence.getSeqVolume(seqs)}')
            if Sequence.getSeqVolume(seqs)<minval:
                self.bundleToExit(seqs, parent, evtAttr, exitNodeHash)
                return 
            
            else:
                self.rf.setEvtAttr(evtAttr)
                #print(f'maxval {maxval}')
                self.rf.performRanking(seqs, maxval, excludedEvts)
                topPattern=self.rf.getTopEventSet()
                print(f'topPattern {topPattern.keyEvts}')
        
                if topPattern is None:
                    print("no patterns found")
                    self.bundleToExit(seqs, parent, evtAttr, exitNodeHash)
                
                containSegs=[]
                notContain=[]
                
                node= TreeNode()
                hashval=topPattern.getEvents()[0]
                eVal=Sequence.getEvtAttrValue(evtAttr, hashval)
                node.setName(str(eVal)) #NOT sure
                node.setValue(eVal)
                node.setHash(hashval)

                self.truncateSequences(seqs, hashval, evtAttr, node, containSegs, notContain)
                node.setSeqCount(Sequence.getSeqVolume(containSegs))
                
                if node.getSeqCount()>minval:
                    parent.children.append(node)
                    self.run(containSegs, evtAttr, node, minval, maxval, checkpoints, excludedEvts, exitNodeHash)
                    self.run(notContain, evtAttr, parent, minval, maxval, checkpoints, excludedEvts, exitNodeHash)
                
                else:
                    self.bundleToExit(seqs, parent, evtAttr, exitNodeHash)
                    return

In [9]:
class TreeAnalyzer:
    def __init__(self):
        self.root=root
        self.numnodes=0
        self.uniqueNodes={}
        self.outDegree=[]
        self.depth=0
    def run(currentnode, numnodes, uniquenodes,outdegree, depth):
        numnodes=numnodes+1
        uniquenodes[currentnode.getValue()]=True
        outdegree.append(len(currentnode.children))
        print(f'depth {depth}')
        print(f'numnodes {numnodes}')
        print(f'children {len(currentnode.children)}')
            
        for node in currentnode.children:
            depth=depth+1
            
            TreeAnalyzer.run(node, numnodes, uniquenodes, outdegree, depth)
        #self.numnodes+=1
        #uniqueNodes[node.getValue()]=True
        return numnodes, uniquenodes, outdegree, depth
    def traverse(root, path):
        path="{\n"
        path+=root.name
        for node in root.children:
            TreeAnalyzer.traverse(node, path)
        if len(root.children)==0:
            print(path+" ")

In [10]:
# Helper function to return a data frame
# Local is boolean, if local then source should be path to the file
# Otherwise it should be a URL to the the file
def get_dataframe( src, local=False, sep="\t", header=[]):
    if not local:
        # To force a dropbox link to download change the dl=0 to 1
        if "dropbox" in src:
            src = src.replace('dl=0', 'dl=1')
        # Download the CSV at url
        req = requests.get(src)
        url_content = req.content
        csv_file = open('data.txt', 'wb') 
        csv_file.write(url_content)
        csv_file.close()
        # Read the CSV into pandas
        # If header list is empty, the dataset provides header so ignore param
        if not header:
            df = pd.read_csv("data.txt", sep)
        #else use header param for column names
        else:
            df = pd.read_csv("data.txt", sep, names=header)
        # Delete the csv file
        os.remove("data.txt")
        return df
    # Dataset is local
    else:
        # If header list is empty, the dataset provides header so ignore param
        if not header:
            print(src)
            df = pd.read_csv(src, sep)
        # else use header param for column names
        else:
            df = pd.read_csv(src, sep, names=header)
        return df
    
    
# Helper function for generateSequence to use when sorting events to get what time field to sort by
# Also used in splitSequences to give the time of an event when splitting the events up

def get_time_to_sort_by(e):
    # Sort by starting time of event if its an interval event
    if type(e) == IntervalEvent:
        return e.time[0]
    # Otherwise use the timestamp
    else:
        return e.timestamp


    
# Helper to insert an event into a map
# Params are key=unique id for that time, map of key to event list, event object
def insert_event_into_dict(key, dictionary, event):
    if key in dictionary:
        dictionary[key].append(event)
    else:
        dictionary[key] = [event]



# Importing events functions

In [11]:
class EventStore:
    #should be moved to EventStore
    # hold the list of events, also the dictionaries
    
    # Returns a list of event objects
    # src is a url or directory path, if local is false its url else its path
    # header is list of column names if they are not provided in the dataset
    # The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
    # I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
    # for those cases
    @staticmethod
    def importPointEvents(src, timestampColumnIdx, timeFormat, sep='\t', local=False, header=[], df=None):
        events = []
        # if the df is not provided
        if df is None:
            df = get_dataframe(src, local, sep, header)
        cols = df.columns
        # For each event in the csv construct an event object
        for row in df.iterrows():
            data = row[1]
            attribs = {}
            timestamp = datetime.strptime(data[timestampColumnIdx], timeFormat)
            # for all attributes other tahn time, add them to attributes dict
            for i in range(len(data)):
                if i != timestampColumnIdx:
                    attribs[cols[i]] = data[i]
            # use time stamp and attributes map to construct event object
            e = PointEvent(timestamp, attribs)
            events.append(e)
        sequence=Sequence(events)
        EventStore.create_attr_dict(sequence)
        return sequence

    # Returns a list of event objects
    # src is a url or directory path, if local is false its url else its path
    # The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
    # I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
    # for those cases
    @staticmethod
    def importIntervalEvents(src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep="\t", local=False, header=[], df=None):
        events = []
        # if the df is not provided
        if df is None:
            df = get_dataframe(src, local, sep, header)
        cols = df.columns
        # For each event in the csv construct an event object
        for row in df.iterrows():
            data = row[1]
            attribs = {}
            # create datetime object for the start and end times of the event
            t1 = datetime.strptime(data[startTimeColumnIdx], timeFormat)
            t2 = datetime.strptime(data[endTimeColumnIdx], timeFormat)
            # for all attributes other than times, add them to attributes dict
            for i in range(len(data)):
                if i != startTimeColumnIdx and i != endTimeColumnIdx:
                    attribs[cols[i]] = data[i]
            # use time stamp and attributes map to construct event object
            e = IntervalEvent(t1, t2, attribs)
            events.append(e)
        sequence=Sequence(events)
        EventStore.create_attr_dict(sequence)
        return sequence

    # Import a dataset that has both interval and point events
    # Returns a list of event objects
    # src is a url or directory path, if local is false its url else its path
    # The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
    # I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
    @staticmethod
    def importMixedEvents(src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep="\t", local=False, header=[], df=None):
        events = []
        # if the df is not provided
        if df is None:
            df = get_dataframe(src, local, sep, header)
        cols = df.columns
        # For each event in the csv construct an event object
        for row in df.iterrows():
            data = row[1]
            attribs = {}
            # create datetime object for timestamp (if point events) or t1 and t2 (if interval event)
            # If the endTimeColumnIdx value is NaN ie a float instead of a time string then its a point event
            if type(data[endTimeColumnIdx]) is float:
                t = datetime.strptime(data[startTimeColumnIdx], timeFormat)
                event_type = "point"
            # Otherwise its an interval event
            else:
                t1 = datetime.strptime(data[startTimeColumnIdx], timeFormat)
                t2 = datetime.strptime(data[endTimeColumnIdx], timeFormat)
                event_type = "interval"
            # for all attributes other than times, add them to attributes dict
            ignore=[startTimeColumnIdx, endTimeColumnIdx] # list of indices to be ignored
            attribute_columns = [ind for ind in range(len(data)) if ind not in ignore]
            for i in attribute_columns:
                attribs[cols[i]] = data[i]
            # use time stamp (or t1 and t2) and attributes map to construct event object
            if event_type == "point":
                e = PointEvent(t, attribs)
            else:
                e = IntervalEvent(t1, t2, attribs)
            events.append(e)
        sequence=Sequence(events)
        EventStore.create_attr_dict(sequence)
        return sequence

    #should take an eventlist as input
    # Group events by attributeName, and order them by timestamp
    @staticmethod
    #should return a list of sequences
    def generateSequence(sequence, attributeName):
        eventList=sequence.events
        grouped_by = {}
        # Sort the event list
        eventList = sorted(eventList, key=get_time_to_sort_by)
        for event in eventList:
            value = event.attributes[attributeName]
            # If have seen this value before, append it the list of events in grouped_by for value
            if value in grouped_by:
                grouped_by[value].append(event)
            # otherwise store a new list with just that event
            else:
                grouped_by[value] = [event]
        return list(grouped_by.values())
    
    # Split a long sequence into shorter ones by timeUnit. For example, a sequence may span several days and we want to 
    # break it down into daily sequences. The argument timeUnit can be one of the following strings: “hour”, “day”, 
    # “week”, “month”, “quarter”, and “year”.
    # For interval events I used the start time of the event to determine its category when splitting it
    
    #ZINAT- changes
    #SequenceList represents a list of objects of type Sequence. The sequences are further splitted into
    #sequence objects, this way we can use generate sequences and then splitSequences 
    @staticmethod
    def splitSequences(sequenceLists, timeUnit, record=None):
        if not isinstance(sequenceLists, list):
            sequenceLists=[sequenceLists]
        results = []
        resultlist=[]
        timeUnit = timeUnit.lower()
        # Check if the time unit is a valid argument
        valid_time_units = ["hour", "day", "week", "month", "quarter", "year"]
        if timeUnit not in valid_time_units:
            raise ValueError("timeUnit must be hour, day, week, month, quarter, or year")
        
        for sequence in sequenceLists:
            # Sort the events by the timestamp or event start time
            sequenceList= sequence.events
            sequenceList = sorted(sequenceList, key=get_time_to_sort_by)

            # Process the event sequence based on the given time unit
            # Generally, create a map for that time unit and then add each event into that map 
            # (key=time such as May 2021 in case of month, value=sequence) and then return the values of the map as a list
            if timeUnit == "hour":
                hours = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    key = (time.hour, time.day, time.month, time.year)
                    insert_event_into_dict(key,hours,event)
                    if record is None:
                        event.attributes["record"]=' '.join([str(k) for k in key])
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+' '.join([str(k) for k in key])
                results = list(hours.values())

            elif timeUnit == "day":
                days = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    key = (time.day, time.month, time.year)
                    insert_event_into_dict(key,days,event)
                    #print(days)
                    if record is None:
                        event.attributes["record"]=datetime(*(key[::-1])).strftime("%Y%m%d")
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+datetime(*(key[::-1])).strftime("%Y%m%d")
                results = list(days.values())

            elif timeUnit == "month":
                months = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    key = (time.month,time.year)
                    insert_event_into_dict(key,months,event)
                    if record is None:
                        event.attributes["record"]=str(key[0])+str(key[1])
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+str(key[1])
                results = list(months.values())

            elif timeUnit == "week":
                weeks = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    year = time.year
                    week_num = time.isocalendar()[1]
                    key = (year,week_num)
                    insert_event_into_dict(key,weeks,event)
                    if record is None:
                        event.attributes["record"]=str(key[0])+"W"+str(key[1])
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+"W"+str(key[1])
                results = list(weeks.values())

            elif timeUnit == "year":
                years = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    key = time.year
                    insert_event_into_dict(key,years,event)
                    if record is None:
                        event.attributes["record"]=str(key)
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+str(key)
                results = list(years.values())

            elif timeUnit == "quarter":
                quarters = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    year = time.year
                    month = time.month
                    # Determine the year, quarter pair/key for quarter dict
                    # January, February, and March (Q1)
                    if month in range(1, 4):
                        key = (year, "Q1")
                    # April, May, and June (Q2)
                    elif month in range(4, 7):
                        key = (year, "Q2")
                    # July, August, and September (Q3)
                    elif month in range(7,10):
                        key = (year, "Q3")
                    # October, November, and December (Q4)
                    elif month in range(10,13):
                        key = (year, "Q4")
                    # Put the event in the dictionary
                    insert_event_into_dict(key,quarters,event)
                    if record is None:
                        event.attributes["record"]=str(key[0])+str(key[1])
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+str(key[1])
                results = list(quarters.values())
            resultlist.extend(results)
        resultlists= [Sequence(x) for x in resultlist]

        return resultlists
    
    #Assuming we are given a list of events and from those events we create 
    #the mapping and reverse mapping dictionary
    def create_attr_dict(seqList):
        attr_list=seqList.events[0].attributes.keys()
        print(attr_list)
        
        for attr in attr_list:
            a=48
            unique_list=[]
            unique_list.extend(seqList.getUniqueValues(attr))
            unique_list=list(set(unique_list))
            #unique_list.clear()
            
            unicode_dict={}
            reverse_dict={}
            for uniques in unique_list:
                unicode_dict[uniques]=chr(a)
                reverse_dict[chr(a)]=uniques
                a=a+1
            Sequence.attrdict[attr]=unicode_dict
            Sequence.reverseatttrdict[attr]=reverse_dict
            #unicode_dict.clear()                    
   

# Generating Sequences

# Event Aggregation
For aggregateEventsRegex and aggregateEventsDict, see what the files are expected to look like in the repo in DataModel/testFiles

In [12]:
# Helper function to run the mappings file as a dictionary
def give_dictionary_of_mappings_file(fileName):
    # Open the file and split the contents on new lines
    file = open(fileName, "r")
    mappings = file.read().split("\n")
    file.close()
    # Remove any empty strings from the list of mappings
    mappings = list(filter(None, mappings))
    # Raise an error if there is an odd number of items in mapping
    if (len(mappings) % 2) != 0:
        raise ValueError("There must be an even number of lines in the mappings file.")
    # Create a dictionary based on read in mappings
    aggregations = {}
    for i in range(len(mappings)):
        if i % 2 == 0:
            aggregations[mappings[i]] = mappings[i+1]
    #print(aggregations)
    return aggregations

# NOTE: this current modifies the events in eventList argument
# merge events by rules expressed in regular expressions. For example, in the highway incident dataset, we can 
# replace all events with the pattern “CHART Unit [number] departed” by “CHART Unit departed”. The argument 
# regexMapping can be a path pointing to a file defining such rules. We can assume each rule occupies two lines: 
# first line is the regular expression, second line is the merged event name 
def aggregateEventsRegex(eventList, regexMapping, attributeName): 
    aggregations = give_dictionary_of_mappings_file(regexMapping)
    for event in eventList:
        # Get the attribute value of interest
        attribute_val = event.attributes[attributeName]
        # For all the regexes
        for regex in aggregations.keys():
            # If its a match then replace the attribute value for event with
            if re.match(regex, attribute_val):
                event.attributes[attributeName] = aggregations[regex]
                break
    return eventList
    
# NOTE: this current modifies the events in eventList argument
# merge events by a dictionary mapping an event name to the merged name. The argument nameDict can be a path 
# pointing to a file defining such a dictionary. We can assume each mapping occupies two lines: first line is the 
# original name, second line is the merged event name.    
def aggregateEventsDict(eventList, nameDict, attributeName):
    aggregations = give_dictionary_of_mappings_file(nameDict)
    # Iterate over all events and replace evevnts in event list with updated attribute name
    # if directed to by given mappings
    for event in eventList:
        # Get the attribute value of interest
        attribute_val = event.attributes[attributeName]
        # If the attribute value has a mapping then replace the event's current value with the one in give map
        if attribute_val in aggregations:
            
            event.attributes[attributeName] = aggregations[attribute_val]
    return eventList

In [13]:
sequence_braiding = EventStore.importPointEvents('../datasets/sequence_braiding_refined.csv', 0, "%m/%d/%y", sep=',', local=True)
#print(type(sequence_braiding))
seq=sequence_braiding
#Sequence.create_attr_dict([seq])
#seq.getEventPosition('Meal','Lunch')
#print(seq.getUniqueValueHashes('Meal'))
#print(seq.getHashList('Glucose'))
print(seq.getValueHashes('Glucose'))
#print(seq.getEventsHashString('Glucose'))
raw_seq=seq.convertToVMSPReadable('Meal')
print(seq.convertToVMSPReadable('Glucose'))
#print(seq.getPathID())
#sequence_braiding[0].attributes.keys()
#print(sequence_braiding[0].getAttrVal('Meals'))
#print(sequence_braiding[0].type)
#for events in sequence_braiding:
#    print(events.getAttrVal('Meal'))


../datasets/sequence_braiding_refined.csv
dict_keys(['Glucose', 'Meal'])
['0', 'Í', '9', 'H', 'â', 'G', 'U', '¦', '§', '=', 'v', 'E', '¨', 'I', 'J', '\x84', 'O', '\x9f', '´', '¯', '©', 'º', 'Ä', 'Y', 'a', '\x8e', '<', 'N', '¶', 'W', 'h', 'Ý', 'J', 'Ñ', '»', '^', 'E', 'D', 'S', 'R', 'j', 'H', 'h', 'Q', 'Q', '\x91', 'J', 'B', 'I', '2', '¨', 'O', 'E', 'F', '@', '@', 'Q', ']', 'N', 'b', '?', 'J', '^', '\x93', '«', '\x8b', '\x98', 'D', 'H', '§', 'A', 'o', 'L', 'M', 'W', '=', 'U', 'U', 'F', '\x87', 'r', 'F', 'P', 'h', '\x97', 'i', 'X', 'Q', '\x87', 'V', 'P', 'e', 'Q', 's', 'J', 'S', 'U', 'G', 'V', 'V', '{', 'R', 'l', 'c', 'N', '?', '=', '\\', 'ª', '\x91', 't', '\x97', 'u', '\x7f', 'Q', 'H', '\x81', 'F', '=', 'd', 'F', 'L', 'o', '?', '¬', '\x95', '¨', 'Ã', 'S', '»', 'Î', '>', '|', '\x8d', 'Ç', '\x92', 'J', '\x85', 'G', '6', '3', '@', 'G', 'æ', '\x8a', '\x99', '\x8f', '\x9a', 'e', 'Y', 'x', 'q', 'n', 'Q', 'v', '\x8a', '_', 'f', '\\', 'k', '?', 's', '\x8d', 'º', '\xa0', '®', '_', 'G', 'X', '8',

In [14]:
seq_list=EventStore.splitSequences(sequence_braiding, "week")
#seq_list=[]
#for seqs in sequence_braiding_split:
#    seq_list.append(Sequence(seqs))
    
#Sequence.create_attr_dict(seq_list)
raw_seq="\n".join( seqs.convertToVMSPReadable('Meal') for seqs in seq_list)

In [15]:
pat=Pattern(['2','6'])
print(pat.keyEvts)
s=pat.filterPaths(seq_list, 'Meal')

['2', '6']
filtering 22 paths by 2 checkpoints
11 matching paths


In [16]:
print(raw_seq)

0 0 5 2 7 0 0 6 1 5 0 0 6 0 0 0 1 0 0 0 1.
0 6 1 0 7 0 1 5 2 0 5 4 4 4 6 5 6 0 6 1 1.
0 1 5 6 0 1 5 6 1 3 5 6 1 5 2 0 4 1 5 6 1.
5 2 6 1 5 6 1 6 1 5 0 6 1 5 6 1 1 6.
7 8 6 1 5 2 6 4 2 6 4 1 0 0 0 6 1 3 0 6 6 1 5 4.
8 8 1 5 7 6 4 1 5 0 2 6 4 1 5 6 4 1 3 5 6 1 5 1 5 6 1.
2 8 6 1 0 0 0 0 6 4 1 2 7 6 4 1 0 0 5 0 8 0 6 1 0 0 0 3 5 0 0 6 0 1 5 6 0 0 1 5 1.
5 4 8 6 5 0 6 7 0 1 0 6 1 5 0 7 8 6 1 3 5 0 0 6 1 5 0 1 5 1.
0 4 4 6 1 5 0 6 1 5 6 1 0 0 1 3 5 0 1 0 0 1 4.
0 5 2 4 4 6 1 4 4 6 1 5 2 6 1 0 4 8 6 1 3 5 6 1 0 5 6 0 0 0 5 6 1.
5 4 6 1 5 2 6 1 6 1 5 4 4 6 1 0 0 6 1 0 6 0 5 6 0 0.
0 0 5 0 0 6 0 1 0 5 8 0 6 0 1 5 4 0 1 1 0 5 6 1 0 6 1 5 2 6 1 0 1.
5 8 6 1 0 3 0 5 1 5 1 6 3 5 8 0 0 5 6 1 6 0 0 1.
0 0 6 0 1 0 0 0 6 5 4 8 6 1 3 0 1 6 1 0.
7 6 1 5 6 1 6 4 0 4 8 6 0 0 0 0 4 0 2 1 6.
4 8 6 8 0 6 4 8 6 1 0 5 0 6 4 0 1 0 0 1 0 6 1.
4 8 6 1 0 0 0 6 6 1 6 1 0 5 7 0 6 1 0 1 2.
6 1 6 1 5 6 1 4 0 6 1 0 5 0 5 5 0 6 0 1 0.
4 6 4 0 0 1 0 6 1 6 0 0 0 8 6 1 5 0 6 0 0 1 0.
5 0 7 6 0 4 0 0 6 7 4 0 5 0 6 1 0 5 5 6

In [17]:
pat=Pattern([233,309,106,166])
print(pat.keyEvts)
#print(pat.filterPaths([seq],'Glucose'))
#print(pat.getUniqueEventsString())
print(pat.getPositions([233,309,80,168],seq.getValueHashes('Glucose')))

[233, 309, 106, 166]
[]


In [18]:
input_example_raw = """1 -1 1 2 3 -1 1 3 -1 4 -1 3 6 -1 -2
1 4 -1 3 -1 2 3 -1 1 5 -1 -2
5 6 -1 1 2 -1 4 6 -1 3 -1 2 -1 -2
5 -1 7 -1 1 6 -1 3 -1 2 -1 3 -1 -2
"""

In [19]:
spmf = Spmf("VMSP", spmf_bin_location_dir="../../Tools/coreflow/CoreFlow-backend-src/src/datastructure_python/test_files/", input_direct=raw_seq,
           input_type="text", output_filename="output.txt", arguments=[0.5])

In [20]:
spmf.run()

>/home/zinat/Documents/Research/Tools/coreflow/CoreFlow-backend-src/src/datastructure_python/test_files/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 110 ms
 Frequent sequences count : 955
 Max memory (mb) : 22.680580139160156955
minsup 11
Intersection count 12912 

Post-processing to show result in terms of string values.
Post-processing completed.



In [21]:
print(spmf.to_pandas_dataframe(pickle=True))

                         pattern  sup
0                         [2, 1]   11
1                         [2, 6]   11
2                         [5, 2]   11
3                      [5, 6, 4]   11
4                      [0, 6, 4]   12
..                           ...  ...
950  [6, 1, 6, 1, 1, 5, 0, 6, 1]   11
951  [6, 1, 6, 1, 6, 1, 5, 0, 1]   11
952  [6, 1, 6, 1, 6, 1, 0, 1, 5]   11
953  [6, 1, 0, 6, 1, 0, 0, 0, 1]   11
954  [6, 1, 0, 6, 0, 0, 0, 0, 1]   12

[955 rows x 2 columns]


In [22]:
# pip install spmf

In [23]:
cfm= CoreFlowMiner()
root=cfm.getNewRootNode(Sequence.getSeqVolume(seq_list), seq_list)
cfm.run(seq_list, "Meal", root, 5 * Sequence.getSeqVolume(seq_list)/100.0, Sequence.getSeqVolume(seq_list), [], {}, -1)

topPattern ['0']
hashval 0
Position 0
next ['0', '5', '2', '7', '0', '0', '6', '1', '5', '0', '0', '6', '0', '0', '0', '1', '0', '0', '0', '1']
Position 0
next ['6', '1', '0', '7', '0', '1', '5', '2', '0', '5', '4', '4', '4', '6', '5', '6', '0', '6', '1', '1']
Position 0
next ['1', '5', '6', '0', '1', '5', '6', '1', '3', '5', '6', '1', '5', '2', '0', '4', '1', '5', '6', '1']
Position 10
previous ['5', '2', '6', '1', '5', '6', '1', '6', '1', '5']
next ['6', '1', '5', '6', '1', '1', '6']
Position 12
previous ['7', '8', '6', '1', '5', '2', '6', '4', '2', '6', '4', '1']
next ['0', '0', '6', '1', '3', '0', '6', '6', '1', '5', '4']
Position 9
previous ['8', '8', '1', '5', '7', '6', '4', '1', '5']
next ['2', '6', '4', '1', '5', '6', '4', '1', '3', '5', '6', '1', '5', '1', '5', '6', '1']
Position 4
previous ['2', '8', '6', '1']
next ['0', '0', '0', '6', '4', '1', '2', '7', '6', '4', '1', '0', '0', '5', '0', '8', '0', '6', '1', '0', '0', '0', '3', '5', '0', '0', '6', '0', '1', '5', '6', '0', '0

In [24]:
#currentnode, numnodes, uniquenodes,outdegree, depth
w,x,y,z= TreeAnalyzer.run(root,0,{},[],0)
print(w)
print(x)
print(y)
print(z)

depth 0
numnodes 1
children 1
depth 1
numnodes 2
children 1
depth 2
numnodes 3
children 1
depth 3
numnodes 4
children 1
depth 4
numnodes 5
children 2
depth 5
numnodes 6
children 2
depth 6
numnodes 7
children 2
depth 7
numnodes 8
children 1
depth 8
numnodes 9
children 2
depth 9
numnodes 10
children 2
depth 10
numnodes 11
children 1
depth 11
numnodes 12
children 0
depth 11
numnodes 11
children 0
depth 10
numnodes 10
children 0
depth 8
numnodes 8
children 0
depth 7
numnodes 7
children 0
depth 6
numnodes 6
children 0
1
{'-1': True, 'Sugar to treat': True, 'Lunch': True, 'Breakfast': True, 'Dinner': True, 'Exit': True}
[1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 0, 0, 0, 0, 0, 0]
1


In [25]:
pat=Pattern(['3','0'])
print(pat.keyEvts)
s=pat.filterPaths(seq_sublist, 'Meal')
pat.computePatternStats('Meal')

['3', '0']


NameError: name 'seq_sublist' is not defined

In [None]:
print(pat.medianPos)
print(pat.meanPos)
        
print(pat.medianPathLength)
print(pat.meanPathLength)


In [None]:
indices=[0,1,5]
seq_sublist=[seq_list[index] for index in indices]

In [None]:
cfm= CoreFlowMiner()
#cfm.truncateSequences(self, seqs, hashval, evtAttr, node,trailingSeqSegs, notContain)
root=cfm.getNewRootNode(Sequence.getSeqVolume(seq_sublist), seq_sublist)
cfm.truncateSequences(seq_sublist, '3', 'Meal', root,[], [])
#cfm.run(seq_sublist, "Meal", cfm.getNewRootNode(Sequence.getSeqVolume(seq_sublist), seq_sublist), 5 * Sequence.getSeqVolume(seq_sublist)/100.0, Sequence.getSeqVolume(seq_sublist), [], {}, -1);

In [None]:
cfm.run(seq_sublist, "Meal", root, 5 * Sequence.getSeqVolume(seq_sublist)/100.0, Sequence.getSeqVolume(seq_sublist), [], {}, -1)

In [None]:
TreeAnalyzer.traverse(root, "")