In [1]:
from datetime import datetime, timedelta
import pandas as pd
import csv
import requests
import os
import re
from itertools import count
import numpy as np

from itertools import accumulate

from spmf import Spmf
import json
import jsonpickle
import heapq

import pprint
from collections import defaultdict

import jsonpickle

In [2]:
import sys
sys.setrecursionlimit(10000)

# Event Representations

In [3]:
# A common class for all Events

class Event:
    def __init__(self, eventtype):
        self.type=eventtype
    
    #Return Attribute value given attribute name
    def getAttrVal(self, attrName):
        return self.attributes.get(attrName,None)

    
# A class that represents a point event
class PointEvent(Event):
    def __init__(self, timestamp, attributes):
        super().__init__("point")
        #self.type = "point"
        self.timestamp = timestamp 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 
        
    

# class to represent an interval event
class IntervalEvent(Event):
    def __init__(self, t1, t2, attributes):
        super().__init__("interval")
        #self.type = "interval"
        self.time = [t1,t2] 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 

In [4]:
class EventStore:
    
    def __init__(self, eventlist=[]):
        self.attrdict={}
        self.reverseatttrdict={}
        self.events=eventlist

    #should be moved to EventStore
    # hold the list of events, also the dictionaries
    
    # Returns a list of event objects
    # src is a url or directory path, if local is false its url else its path
    # header is list of column names if they are not provided in the dataset
    # The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
    # I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
    # for those cases
    #@staticmethod
    def importPointEvents(self, src, timestampColumnIdx, timeFormat, sep='\t', local=False, header=[], df=None):
        events = []
        # if the df is not provided
        if df is None:
            df = get_dataframe(src, local, sep, header)
        cols = df.columns
        # For each event in the csv construct an event object
        for row in df.iterrows():
            data = row[1]
            attribs = {}
            timestamp = datetime.strptime(data[timestampColumnIdx], timeFormat)
            # for all attributes other tahn time, add them to attributes dict
            for i in range(len(data)):
                if i != timestampColumnIdx:
                    attribs[cols[i]] = data[i]
            # use time stamp and attributes map to construct event object
            e = PointEvent(timestamp, attribs)
            events.append(e)
        self.events=events
        #sequence=Sequence(events)
        self.create_attr_dict()
        #return sequence

    # Returns a list of event objects
    # src is a url or directory path, if local is false its url else its path
    # The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
    # I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
    # for those cases
    #@staticmethod
    def importIntervalEvents(self, src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep="\t", local=False, header=[], df=None):
        events = []
        # if the df is not provided
        if df is None:
            df = get_dataframe(src, local, sep, header)
        cols = df.columns
        # For each event in the csv construct an event object
        for row in df.iterrows():
            data = row[1]
            attribs = {}
            # create datetime object for the start and end times of the event
            t1 = datetime.strptime(data[startTimeColumnIdx], timeFormat)
            t2 = datetime.strptime(data[endTimeColumnIdx], timeFormat)
            # for all attributes other than times, add them to attributes dict
            for i in range(len(data)):
                if i != startTimeColumnIdx and i != endTimeColumnIdx:
                    attribs[cols[i]] = data[i]
            # use time stamp and attributes map to construct event object
            e = IntervalEvent(t1, t2, attribs)
            events.append(e)
        self.events=events    
        #sequence=Sequence(events)
        self.create_attr_dict()
        #return sequence

    # Import a dataset that has both interval and point events
    # Returns a list of event objects
    # src is a url or directory path, if local is false its url else its path
    # The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
    # I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
    #@staticmethod
    def importMixedEvents(self, src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep="\t", local=False, header=[], df=None):
        events = []
        # if the df is not provided
        if df is None:
            df = get_dataframe(src, local, sep, header)
        cols = df.columns
        # For each event in the csv construct an event object
        for row in df.iterrows():
            data = row[1]
            attribs = {}
            # create datetime object for timestamp (if point events) or t1 and t2 (if interval event)
            # If the endTimeColumnIdx value is NaN ie a float instead of a time string then its a point event
            if type(data[endTimeColumnIdx]) is float:
                t = datetime.strptime(data[startTimeColumnIdx], timeFormat)
                event_type = "point"
            # Otherwise its an interval event
            else:
                t1 = datetime.strptime(data[startTimeColumnIdx], timeFormat)
                t2 = datetime.strptime(data[endTimeColumnIdx], timeFormat)
                event_type = "interval"
            # for all attributes other than times, add them to attributes dict
            ignore=[startTimeColumnIdx, endTimeColumnIdx] # list of indices to be ignored
            attribute_columns = [ind for ind in range(len(data)) if ind not in ignore]
            for i in attribute_columns:
                attribs[cols[i]] = data[i]
            # use time stamp (or t1 and t2) and attributes map to construct event object
            if event_type == "point":
                e = PointEvent(t, attribs)
            else:
                e = IntervalEvent(t1, t2, attribs)
            events.append(e)
        self.events=events   
        #sequence=Sequence(events)
        self.create_attr_dict()
        #return sequence

    #should take an eventlist as input
    # Group events by attributeName, and order them by timestamp
    #@staticmethod
    #should return a list of sequences
    def generateSequence(self, attributeName):
        eventList=self.events
        grouped_by = {}
        # Sort the event list
        eventList = sorted(eventList, key=get_time_to_sort_by)
        for event in eventList:
            value = event.attributes[attributeName]
            # If have seen this value before, append it the list of events in grouped_by for value
            if value in grouped_by:
                grouped_by[value].append(event)
            # otherwise store a new list with just that event
            else:
                grouped_by[value] = [event]
        sequences= list(grouped_by.values())
        seqlist=[]
        for seq in sequences:
            seqlist.append(Sequence(seq, self))
        return seqlist
    
    # Split a long sequence into shorter ones by timeUnit. For example, a sequence may span several days and we want to 
    # break it down into daily sequences. The argument timeUnit can be one of the following strings: “hour”, “day”, 
    # “week”, “month”, “quarter”, and “year”.
    # For interval events I used the start time of the event to determine its category when splitting it
    
    #ZINAT- changes
    #SequenceList represents a list of objects of type Sequence. The sequences are further splitted into
    #sequence objects, this way we can use generate sequences and then splitSequences 
    @staticmethod
    def splitSequences(sequenceLists, timeUnit, record=None):
        if not isinstance(sequenceLists, list):
            sequenceLists=[sequenceLists]
        eventstore=sequenceLists[0].eventstore
        results = []
        resultlist=[]
        timeUnit = timeUnit.lower()
        # Check if the time unit is a valid argument
        valid_time_units = ["hour", "day", "week", "month", "quarter", "year"]
        if timeUnit not in valid_time_units:
            raise ValueError("timeUnit must be hour, day, week, month, quarter, or year")
        
        for sequence in sequenceLists:
            # Sort the events by the timestamp or event start time
            sequenceList= sequence.events
            sequenceList = sorted(sequenceList, key=get_time_to_sort_by)

            # Process the event sequence based on the given time unit
            # Generally, create a map for that time unit and then add each event into that map 
            # (key=time such as May 2021 in case of month, value=sequence) and then return the values of the map as a list
            if timeUnit == "hour":
                hours = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    key = (time.hour, time.day, time.month, time.year)
                    insert_event_into_dict(key,hours,event)
                    if record is None:
                        event.attributes["record"]=' '.join([str(k) for k in key])
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+' '.join([str(k) for k in key])
                results = list(hours.values())

            elif timeUnit == "day":
                days = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    key = (time.day, time.month, time.year)
                    insert_event_into_dict(key,days,event)
                    #print(days)
                    if record is None:
                        event.attributes["record"]=datetime(*(key[::-1])).strftime("%Y%m%d")
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+datetime(*(key[::-1])).strftime("%Y%m%d")
                results = list(days.values())

            elif timeUnit == "month":
                months = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    key = (time.month,time.year)
                    insert_event_into_dict(key,months,event)
                    if record is None:
                        event.attributes["record"]=str(key[0])+str(key[1])
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+str(key[1])
                results = list(months.values())

            elif timeUnit == "week":
                weeks = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    year = time.year
                    week_num = time.isocalendar()[1]
                    key = (year,week_num)
                    insert_event_into_dict(key,weeks,event)
                    if record is None:
                        event.attributes["record"]=str(key[0])+"W"+str(key[1])
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+"W"+str(key[1])
                results = list(weeks.values())

            elif timeUnit == "year":
                years = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    key = time.year
                    insert_event_into_dict(key,years,event)
                    if record is None:
                        event.attributes["record"]=str(key)
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+str(key)
                results = list(years.values())

            elif timeUnit == "quarter":
                quarters = {}
                for event in sequenceList:
                    time = get_time_to_sort_by(event)
                    year = time.year
                    month = time.month
                    # Determine the year, quarter pair/key for quarter dict
                    # January, February, and March (Q1)
                    if month in range(1, 4):
                        key = (year, "Q1")
                    # April, May, and June (Q2)
                    elif month in range(4, 7):
                        key = (year, "Q2")
                    # July, August, and September (Q3)
                    elif month in range(7,10):
                        key = (year, "Q3")
                    # October, November, and December (Q4)
                    elif month in range(10,13):
                        key = (year, "Q4")
                    # Put the event in the dictionary
                    insert_event_into_dict(key,quarters,event)
                    if record is None:
                        event.attributes["record"]=str(key[0])+str(key[1])
                    else:
                        event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+str(key[1])
                results = list(quarters.values())
            resultlist.extend(results)
        resultlists= [Sequence(x, eventstore) for x in resultlist]

        return resultlists
    
    def getUniqueValues(self, attr):
        l=list(set(event.getAttrVal(attr) for event in self.events))
        return l
    
    #Assuming we are given a list of events and from those events we create 
    #the mapping and reverse mapping dictionary
    def create_attr_dict(self):
        attr_list=self.events[0].attributes.keys()
        print(attr_list)
        
        for attr in attr_list:
            a=48
            unique_list=[]
            unique_list.extend(self.getUniqueValues(attr))
            unique_list=list(set(unique_list))
            #unique_list.clear()
            
            unicode_dict={}
            reverse_dict={}
            for uniques in unique_list:
                unicode_dict[uniques]=chr(a)
                reverse_dict[chr(a)]=uniques
                a=a+1
            self.attrdict[attr]=unicode_dict
            self.reverseatttrdict[attr]=reverse_dict
            #unicode_dict.clear()                    
   

# Sequence Representations

In [5]:
class Sequence():
    _ids = count(0)
    

    def __init__(self,  eventlist, eventstore,sid=None):
        # sequence id
        if sid is None:
            self.sid = next(self._ids)
        else:
            self.sid = sid
        
        self.events = eventlist
        self.eventstore=eventstore
        self.volume=1
        self.seqAttributes={}
        self.seqIndices=[]
    def getEventPosition(self, attr, hash_val):
        for count,event in enumerate(self.events):
            #if event.getAttrVal(attr)==hash_val:
            if self.eventstore.attrdict[attr][event.getAttrVal(attr)]==hash_val:
                return count
        return -1
    
    def setVolume(self, intValue):
        self.volume=intValue
        
    def getVolume(self):
        return self.volume
    
    def increaseVolume(self):
        self.volume += 1 
    
    
    def getUniqueValueHashes(self, attr):
        l=list(set(event.getAttrVal(attr) for event in self.events))
        uniquelist=[self.eventstore.attrdict[attr][elem] for elem in l]
        return uniquelist
    
    #Not sure this will always result in same index, will change if 
    #dictionary is updated
    #since python is unordered
    
    def getHashList(self, attr):
        #l=list(list(event.attributes.keys()).index(attr) for event in self.events)
        l=[event.getAttrVal(attr) for event in self.events]
        hashlist=[self.eventstore.attrdict[attr][elem] for elem in l]
        
        return hashlist
    
    def getValueHashes(self, attr):
        l=list(event.getAttrVal(attr) for event in self.events)
        hashlist=[self.eventstore.attrdict[attr][elem] for elem in l]
        
        return hashlist
        
    
    def getEventsHashString(self, attr):
        s=""
        l=list(event.getAttrVal(attr) for event in self.events)
        #for count,event in enumerate(self.events):
        #    s+=str(event.getAttrVal(attr))+" "
        s+="".join(str(self.eventstore.attrdict[attr][elem]) for elem in l)
        #print(s)
        return s
    
    def convertToVMSPReadablenum(self, attr):
        l=list(event.getAttrVal(attr) for event in self.events)
        s=" -1 ".join(str(self.eventstore.attrdict[attr][elem]) for elem in l)
        #s=""
        #for count,event in enumerate(self.events):
        #    s+=str(event.getAttrVal(attr))+" -1 "
        s+=" -2"
        
        return s
    
    def convertToVMSPReadable(self, attr):
        l=list(event.getAttrVal(attr) for event in self.events)
        s=" ".join(self.eventstore.attrdict[attr][elem] for elem in l)
        #s=""
        #for count,event in enumerate(self.events):
        #    s+=str(event.getAttrVal(attr))+" -1 "
        s+="."
        
        return s
    
    def getPathID(self):
        return self.sid
    
    def matchPathAttribute(self, attr, val):
        # should i use eq?!
        if this.seqAttributes.get(attr)==(val):
            return True
        else:
            return False
        
    def setSequenceAttribute(self,attr, value):
        self.seqAttributes[attr]=value
        
         

    # equivalent to method signature public static int getVolume(List<Sequence> seqs)    
    def getSeqVolume(seqlist):
        return sum(seq.getVolume() for seq in seqlist)
    
    
    # Method equivalent to public String getEvtAttrValue(String attr, int hash) in DataManager.java
    def getEvtAttrValue(self, attr, hashval):
        return self.eventstore.reverseatttrdict[attr][hashval]
        
    # Method equivalent to public List<String> getEvtAttrValues(String attr) in DataManager.java    
    def getEvtAttrValues(self, attr):
        return list(self.eventstore.reverseatttrdict[attr].values())
    
    # Method equivalent to int getEvtAttrValueCount(String attr) in DataManager.java    
    def getEvtAttrValueCount(self, attr):
        return len(self.eventstore.reverseatttrdict[attr])
    
    @staticmethod
    
    def getUniqueEvents(seqlist):
        l=list(set(event.getAttrVal(attr) for event in seq for seq in seqlist))
        return l
    

# SentenTreeModel

In [6]:
class Node():
    NID=count(1)
    nodeHash={}
    
    
    def __init__(self, name="", count=0, value=""):
        self.nid=next(self.NID)
        self.name=name
        self.seqCount=count
        ## What's the difference between name and value?
        self.value=value
        self.hash=-1
        self.pos=[]
        self.meanStep=0
        self.medianStep=0
        #self.zipCompressRatio=0
        self.incomingBranchUniqueEvts=None
        #self.incomingBranchSimMean=None
        #self.incomingBranchSimMedian=None
        #self.incomingBranchSimVariance=None
        self.keyevts=[]
        self.incomingSequences=[]
        self.outgoingSequences=[]
        
        self.meanRelTimestamp=0
        self.medianRelTimestamp=0
        
        TreeNode.nodeHash[self.nid]=self
        
        
    def getNode(self, node_id):
        return nodeHash[node_id]
    
    def clearHash(self):
        nodeHash.clear()
        
    def getIncomingSequences(self):
        return self.incomingSequences
    
    def getSeqCount(self):
        return self.seqCount
    
    def setSeqCount(self, seqCount):
        self.seqCount=seqCount
        
    def getName(self):
        return self.name
    
    def setName(self, name):
        self.name=name
        
    def getMeanStep(self):
        return self.meanStep
    
    #need a better implementation
    def toJSONObject(self):
        return json.dumps(self, default=lambda o: o.__dict__)#,sort_keys=True, indent=4) 
    
    def toString(self):
        return self.name+": "+self.seqCount
    
    def setPositions(self, l):
        self.pos=l
        self.pos.sort()
        d=sum(self.pos)+len(self.pos)
        mid=len(self.pos)/2
        
        if len(self.pos)==0:
            self.meanStep=0
            slf.medianStep=0
        else:
            #WHY WE ARE ADDING 1 to mean and medianStep?
            self.meanStep=d/len(self.pos)
            self.medianStep= np.median(self.pos)+1#((self.pos[mid-1]+self.pos[mid])/2.0)+1 if len(self.pos)%2==0 else self.pos[mid]+1
            
    def getValue(self):
        return self.value
    
    def setValue(self, value):
        self.value=value
        
    def getMedianStep(self):
        return self.medianStep
    
    #def getZipCompressRatio(self):
    #    return self.zipCompressRatio
    
    #def setZipCompressRatio(self, zipcompressratio):
    #    self.zipCompressRatio=zipcompressratio
        
    def getIncomingBranchUniqueEvts(self):
        return self.incomingBranchUniqueEvts
    
    def setIncomingBranchUniqueEvts(self, incomingbranchuniqueevts):
        self.incomingBranchUniqueEvts=incomingbranchuniqueevts
        
    #def setIncomingBranchSimilarityStats(self, mean, median, variance):
    #    self.incomingBranchSimMean=mean
    #    self.incomingBranchSimMedian=median
    #    self.incomingBranchSimVariance=variance
        
    
    def setIncomingSequences(self, incomingbrancseqs, evtattr):
        self.incomingSequences=incomingbrancseqs
        
    def setRelTimeStamps(self, reltimestamps):
        #print(f'Time Stamp {reltimestamps}')
        #print(f'Time Stamp {type(reltimestamps[0])}')
        reltimestamps.sort()
        #print(f'Time Stamp {reltimestamps}')
        #print(f'Time Stamp {type(reltimestamps[0])}')
        
        mid=len(reltimestamps)/2
        
        if(len(reltimestamps)==0):
            self.meanRelTimestamp=0
            self.medianRelTimestamp=0
            
        else:
        
            self.meanRelTimestamp=d*1.0/len(reltimestamps)
            self.medianRelTimestamp=np.median(reltimestamps) #(reltimestamps[mid-1]+reltimestamps[mid])/2.0 if len(reltimestamps%2==0) else reltimestamps[mid]
        
        #print(f'Time Stamp {self.meanRelTimestamp}')
        #print(f'Time Stamp {self.meanRelTimestamp}')
        
    def getHash():
        return self.hash
        d=sum(reltimestamps, timedelta())
        
        
    def setHash(self, value):
        self.hash=value
        
        
        
    #def json_serialize(self):
    #    json.dump(self, indent=4, default= TreeNode.json_default_dump)
    def json_default_dump(self)-> dict:
        pass
    
    def json_serialize(self) -> None:
    
        pass
    
    @staticmethod
    def json_serialize_dump(obj):
    
        pass
    

In [7]:
class TreeNode(Node):
    def __init__(self, name="", count=0, value=""):
        super().__init__(name, count, value)
        self.children = []
        
    def json_default_dump(self)-> dict:
        return {
            "event_attribute": self.hash,
            "value": self.seqCount,
            "median_index": self.medianStep,
            "average_index":self.meanStep,

            "children":[TreeNode.json_serialize_dump(x) for x in self.children]
            
        }
    
    def json_serialize(self) -> None:
    
        json.dump(self,  indent=4, default=TreeNode.json_serialize_dump)
    
    @staticmethod
    def json_serialize_dump(obj):
    
        if hasattr(obj, "json_default_dump"):
            
            return obj.json_default_dump()
        return None

    

In [8]:
class GraphNode(Node):
    def __init__(self, name="", count=0, value=""):
        super().__init__(name, count, value)
        self.before = []
        self.after = []
        
    def json_default_dump(self)-> dict:
        return {
            "before": GraphNode.json_serialize_dump(self.before),
            "event_attribute": self.value,
            "Pattern": self.keyevts,
            "value": self.seqCount,
            "After": GraphNode.json_serialize_dump(self.after)

        }

    def json_serialize(self) -> None:
    
        json.dump(self,  indent=4, default=GraphNode.json_serialize_dump)
    
    @staticmethod
    def json_serialize_dump(obj):
    
        if hasattr(obj, "json_default_dump"):
            
            return obj.json_default_dump()
        return None

    

In [9]:
class Rawnode:
    def __init__ (self, node):
        self.nid=node.nid
        self.seqCount=node.seqCount
        self.value=node.value
        
class Graph():
    
    def __init__ (self):
        self.links= defaultdict(set)
        self.nodes=[]
    
    
    def add(self, node1, node2):
        self.links[node1].add(node2)
        #self.links[node2].add(node1)
        
    def json_default_dump(self)-> dict:
        return {
            "nodes": self.nodes,
            "links":self.links

        }

    def json_serialize(self) -> None:
    
        json.dump(self,  indent=4, default=Graph.json_serialize_dump)
    
    @staticmethod
    def json_serialize_dump(obj):
    
        if hasattr(obj, "json_default_dump"):
            
            return obj.json_default_dump()
        return obj

    
        
    def print_graph(self):
        for node in self.nodes:
            print(self.nid)

# SentenTree Miner

In [10]:
class SentenTreeMiner:
    
    def expandSeqTree(self, attr, rootNode,  expandCnt, minSupport, maxSupport, graph):
        
        #if len(rootSeq.eventlist>0):
        expandCnt-=len(rootNode.keyevts)
        
        seqs = []
        seqs.append(rootNode)
        rootNode.setSeqCount(Sequence.getSeqVolume(rootNode.incomingSequences))
        leafSeqs = []
        
        graph.nodes.append(Rawnode(rootNode))
        while seqs and expandCnt > 0:
            s = max(seqs,key=lambda x: x.seqCount) 
            print(f'seqCount: {s.seqCount}')
            #print(f'this: {s}')

            s0 = s.after
            s1 = s.before
            
            #print(f' s : {s}')
            #print(f' s0 : {s0}')
            #print(f' s1: {s1}')
            
            print(f'this.pattern s : {s.keyevts}')
            #print(f'this.pattern s0 : {s0.keyevts}')
            #print(f'this.pattern s1: {s1.keyevts}')
        
        
            if not s1 and not s0:
                word, pos, count, s0, s1= self.growSeq(attr, s,  minSupport, maxSupport)
                print(f'word: {word}, pos: {pos}, count: {count}')
                
                
                if count < minSupport:
                    leafSeqs.append(s)
                else:
                    
                    s1.setHash(word)
                    s1.setValue(s.incomingSequences[0].getEvtAttrValue(attr, word))
                    s1.keyevts=s.keyevts[:] #deep copy
                    s0.keyevts=s.keyevts[:]
                    #for i,x in enumerate(s.pattern.keyEvts):
                    #    print(s.pattern.keyEvts)
                    #    s1.pattern.addKeyEvent(x)
                    #    s0.pattern.addKeyEvent(x)
                        
                    s1.keyevts.append(word) 
                
                #print(f'this.pattern s after: {s.keyevts}')
                #print(f'this.pattern s0 after: {s0.keyevts}')
                #print(f'this.pattern s1 after: {s1.keyevts}')
        
                    
            if s1 and s1.seqCount>= minSupport:
                expandCnt-=1
                seqs.append(s1)
                #s1.after=s
            s.before=s1
            s.after=s0
            
            graph.nodes.append(Rawnode(s1))
            graph.nodes.append(Rawnode(s0))
            
            graph.add(s.nid,s1.nid)
            graph.add(s.nid,s0.nid)
            
            if s0 and s0.seqCount>= minSupport:
                seqs.append(s0)
                #s0.before=s
            print(f'seqCount: {[s.seqCount for s in seqs]}')
            #print(f'before: {s.before}')
            #print(f'after: {s.after}')
            #print(f'this: {s}')
            #print(f' s after: {s}')
            #print(f' s0 after: {s0}')
            #print(f' s1 after: {s1}')
        
            del seqs[seqs.index(s)]
            #print(f'seqCount: {[s.seqCount for s in seqs]}')
            #print(f'before: {seqs[0].before}')
            #print(f'after: {seqs[0].after}')
            #print(f'this: {seqs[0]}')
            
            #print(f' s : {s}')
            #print(f' s0 : {s0}')
            #print(f' s1: {s1}')
        
        return leafSeqs.append(seqs)
    
    
    def growSeq(self, attr, seq,  minSupport, maxSupport) :
        #this is not right
        pos=-1
        word=""
        count=0
        #print(f'this.pattern in growseq: {seq.pattern}')
        #eventcol=Sequence.getUniqueEvents(seq.incomingSequences)
        #print(f'seq pattern len {seq.keyevts}')
        for i in range (0,len(seq.keyevts)+1):
            fdist={}
            #print(f'i: {i}, len {len(seq.keyevts)}')
            for  ind, s in enumerate(seq.incomingSequences):
                #print(f's.seqIndices: {s.seqIndices}')
                evtHashes= s.getHashList(attr)
                l=0 if i==0 else   s.seqIndices[i - 1] + 1
                r=len(evtHashes) if i==len(seq.keyevts) else s.seqIndices[i]
                
                
                #print(f'l index: {l}, r index {r}')
                #print(f'evt Hash: {evtHashes}')
                duplicate=[]
                for j in range (l,r):
                    w=evtHashes[j]
                    #print(w)
                    if w in duplicate:
                        continue
                    duplicate.append(w)
                    if w not in fdist:
                        fdist[w] = s.getVolume()
                    else:
                        fdist[w]+= s.getVolume()
                
                maxw=""
                maxc=0
                for w in fdist.keys():
                    value= fdist[w]
                    
                    if value < maxSupport and value > maxc:
                        maxw= str(w)
                        maxc= value
                
                if maxc > count:
                    pos=i
                    word=maxw
                    count=maxc
        #print(f'{word}: word')
        #print(f'{maxc}: count')
                    
        s0=GraphNode()
        s1=GraphNode()
        
        #print(f'this.pattern s0 in growseq: {s0.pattern}')
        #print(f'this.pattern s1 in growseq: {s1.pattern}')
        
        #print(f'minSupport {minSupport} count {count}')    
        if count >= minSupport:
            words=seq.keyevts
            for t in seq.incomingSequences:
                l=0 if pos==0 else t.seqIndices[pos - 1] + 1
                r= len(t.events) if pos == len(words) else  t.seqIndices[pos]
                try:
                    i = t.getHashList(attr).index(word,l,r)
                    #print(f'position: {i}')
                    #i+=l
                    
                    t.seqIndices.insert(pos,i)
                    s1.incomingSequences.append(t)
                    s1.seqCount+=t.getVolume()

                except ValueError:
                    #print(f'Value error')
                    s0.incomingSequences.append(t)
                    s0.seqCount+=t.getVolume()
                
        s0.setSeqCount(Sequence.getSeqVolume(s0.incomingSequences))
        s1.setSeqCount(Sequence.getSeqVolume(s1.incomingSequences))
        print(f'Not contain: {len(s0.incomingSequences)}')
        print(f'contain: {len(s1.incomingSequences)}')
        return word, pos, count, s0, s1    

# Helper Functions

In [11]:
# Helper function to return a data frame
# Local is boolean, if local then source should be path to the file
# Otherwise it should be a URL to the the file
def get_dataframe( src, local=False, sep="\t", header=[]):
    if not local:
        # To force a dropbox link to download change the dl=0 to 1
        if "dropbox" in src:
            src = src.replace('dl=0', 'dl=1')
        # Download the CSV at url
        req = requests.get(src)
        url_content = req.content
        csv_file = open('data.txt', 'wb') 
        csv_file.write(url_content)
        csv_file.close()
        # Read the CSV into pandas
        # If header list is empty, the dataset provides header so ignore param
        if not header:
            df = pd.read_csv("data.txt", sep)
        #else use header param for column names
        else:
            df = pd.read_csv("data.txt", sep, names=header)
        # Delete the csv file
        os.remove("data.txt")
        return df
    # Dataset is local
    else:
        # If header list is empty, the dataset provides header so ignore param
        if not header:
            print(src)
            df = pd.read_csv(src, sep)
        # else use header param for column names
        else:
            df = pd.read_csv(src, sep, names=header)
        return df
    
    
# Helper function for generateSequence to use when sorting events to get what time field to sort by
# Also used in splitSequences to give the time of an event when splitting the events up

def get_time_to_sort_by(e):
    # Sort by starting time of event if its an interval event
    if type(e) == IntervalEvent:
        return e.time[0]
    # Otherwise use the timestamp
    else:
        return e.timestamp


    
# Helper to insert an event into a map
# Params are key=unique id for that time, map of key to event list, event object
def insert_event_into_dict(key, dictionary, event):
    if key in dictionary:
        dictionary[key].append(event)
    else:
        dictionary[key] = [event]



# Event Aggregation
For aggregateEventsRegex and aggregateEventsDict, see what the files are expected to look like in the repo in DataModel/testFiles

In [12]:
# Helper function to run the mappings file as a dictionary
def give_dictionary_of_mappings_file(fileName):
    # Open the file and split the contents on new lines
    file = open(fileName, "r")
    mappings = file.read().split("\n")
    file.close()
    # Remove any empty strings from the list of mappings
    mappings = list(filter(None, mappings))
    # Raise an error if there is an odd number of items in mapping
    if (len(mappings) % 2) != 0:
        raise ValueError("There must be an even number of lines in the mappings file.")
    # Create a dictionary based on read in mappings
    aggregations = {}
    for i in range(len(mappings)):
        if i % 2 == 0:
            aggregations[mappings[i]] = mappings[i+1]
    #print(aggregations)
    return aggregations

# NOTE: this current modifies the events in eventList argument
# merge events by rules expressed in regular expressions. For example, in the highway incident dataset, we can 
# replace all events with the pattern “CHART Unit [number] departed” by “CHART Unit departed”. The argument 
# regexMapping can be a path pointing to a file defining such rules. We can assume each rule occupies two lines: 
# first line is the regular expression, second line is the merged event name 
def aggregateEventsRegex(eventList, regexMapping, attributeName): 
    aggregations = give_dictionary_of_mappings_file(regexMapping)
    for event in eventList:
        # Get the attribute value of interest
        attribute_val = event.attributes[attributeName]
        # For all the regexes
        for regex in aggregations.keys():
            # If its a match then replace the attribute value for event with
            if re.match(regex, attribute_val):
                event.attributes[attributeName] = aggregations[regex]
                break
    return eventList
    
# NOTE: this current modifies the events in eventList argument
# merge events by a dictionary mapping an event name to the merged name. The argument nameDict can be a path 
# pointing to a file defining such a dictionary. We can assume each mapping occupies two lines: first line is the 
# original name, second line is the merged event name.    
def aggregateEventsDict(eventList, nameDict, attributeName):
    aggregations = give_dictionary_of_mappings_file(nameDict)
    # Iterate over all events and replace evevnts in event list with updated attribute name
    # if directed to by given mappings
    for event in eventList:
        # Get the attribute value of interest
        attribute_val = event.attributes[attributeName]
        # If the attribute value has a mapping then replace the event's current value with the one in give map
        if attribute_val in aggregations:
            
            event.attributes[attributeName] = aggregations[attribute_val]
    return eventList

# Importing events functions

# Generating Sequences

In [13]:
sequence_braiding_Es= EventStore()
sequence_braiding_Es.importPointEvents('../datasets/sequence_braiding_refined.csv', 0, "%m/%d/%y", sep=',', local=True)
#print(type(sequence_braiding))
seq=Sequence(sequence_braiding_Es.events, sequence_braiding_Es)
#Sequence.create_attr_dict([seq])
#seq.getEventPosition('Meal','Lunch')
#print(seq.getUniqueValueHashes('Meal'))
#print(seq.getHashList('Glucose'))
#print(seq.getValueHashes('Glucose'))
#print(seq.getEventsHashString('Glucose'))
#raw_seq=seq.convertToVMSPReadable('Meal')
#print(raw_seq)
#print(seq.getPathID())
#sequence_braiding[0].attributes.keys()
#print(sequence_braiding[0].getAttrVal('Meals'))
#print(sequence_braiding[0].type)
#for events in sequence_braiding:
#    print(events.getAttrVal('Meal'))


../datasets/sequence_braiding_refined.csv
dict_keys(['Glucose', 'Meal'])


In [14]:
seq_list=sequence_braiding_Es.splitSequences(seq, "week")
#seq_list=[]
#for seqs in sequence_braiding_split:
#    seq_list.append(Sequence(seqs))
    
#Sequence.create_attr_dict(seq_list)
raw_seq="\n".join( seqs.getEventsHashString('Meal') for seqs in seq_list)

In [15]:
print(sequence_braiding_Es.reverseatttrdict['Meal'])

{'0': 'Lunch', '1': 'Sugar to treat', '2': 'Afternoon snack', '3': 'Other', '4': 'Bedtime snack', '5': 'Breakfast', '6': 'Exercise snack', '7': 'Nothing', '8': 'Dinner'}


In [16]:
print(raw_seq)

118231105811011151115
105131582187770801055
158015805480582175805
820580505810580550
360582072075111054100587
665830758120758075480585805
26051111075230751181610511148110158011585
876081031510581360548110581585
17705810580511548151157
182770577058205176054805180111805
87058205058770511051018011
118110151861015871551805105820515
860514185850486118050115
11015111087605415051
305805071760111171250
76061076051810715115105
760511100505183105152
050580571051818810151
70711510501116058101151
813017110371810518805480858
1605105158151051580
1


In [17]:
#vocabularies- can be emulated from attrdict
# itemset- keys of vocabularies
#count- seq volume


In [18]:
seq_list[0].events[0].getAttrVal('Meal')

'Sugar to treat'

In [19]:
indices=[0,1,5]
seq_sublist=[seq_list[index] for index in indices]

In [20]:
seq_sublist[2].events[7].getAttrVal('Meal')

'Breakfast'

In [21]:
raw_seq= [seqs.getEventsHashString('Meal') for seqs in seq_list]

In [22]:
stm= SentenTreeMiner()
#cfm.truncateSequences(self, seqs, hashval, evtAttr, node,trailingSeqSegs, notContain)
root=GraphNode()
root.incomingSequences=seq_list
graph=Graph()
visibleGroups=stm.expandSeqTree("Meal",root,  expandCnt=30, minSupport=5, maxSupport=len(raw_seq),graph=graph)

seqCount: 22
this.pattern s : []
Not contain: 0
contain: 22
word: 1, pos: 0, count: 21
seqCount: [22, 22]
seqCount: 22
this.pattern s : ['1']
Not contain: 1
contain: 21
word: 0, pos: 1, count: 21
seqCount: [22, 21]
seqCount: 21
this.pattern s : ['1', '0']
Not contain: 0
contain: 21
word: 5, pos: 2, count: 21
seqCount: [21, 21]
seqCount: 21
this.pattern s : ['1', '0', '5']
Not contain: 0
contain: 21
word: 0, pos: 3, count: 21
seqCount: [21, 21]
seqCount: 21
this.pattern s : ['1', '0', '5', '0']
Not contain: 2
contain: 19
word: 5, pos: 4, count: 19
seqCount: [21, 19]
seqCount: 19
this.pattern s : ['1', '0', '5', '0', '5']
Not contain: 3
contain: 16
word: 5, pos: 5, count: 16
seqCount: [19, 16]
seqCount: 16
this.pattern s : ['1', '0', '5', '0', '5', '5']
Not contain: 5
contain: 11
word: 1, pos: 6, count: 11
seqCount: [16, 11, 5]
seqCount: 11
this.pattern s : ['1', '0', '5', '0', '5', '5', '1']
Not contain: 2
contain: 9
word: 5, pos: 7, count: 9
seqCount: [11, 5, 9]
seqCount: 9
this.patter

In [23]:
x=json.dumps(root, ensure_ascii=False, default=GraphNode.json_serialize_dump, indent=1)
print(x)

{
 "before": {
  "before": {
   "before": {
    "before": {
     "before": {
      "before": {
       "before": {
        "before": {
         "before": {
          "before": {
           "before": {
            "before": {
             "before": {
              "before": null,
              "event_attribute": "",
              "Pattern": [],
              "value": 0,
              "After": null
             },
             "event_attribute": "Breakfast",
             "Pattern": [
              "1",
              "0",
              "5",
              "0",
              "5",
              "5",
              "1",
              "5",
              "8",
              "0",
              "8",
              "5"
             ],
             "value": 5,
             "After": {
              "before": null,
              "event_attribute": "",
              "Pattern": [],
              "value": 0,
              "After": null
             }
            },
            "event_attribute": "Dinner",
 

In [24]:
empJSON = jsonpickle.encode(graph, unpicklable=False)

print("Writing JSON Encode data into Python String")
employeeJSONData = json.dumps(empJSON, indent=4)
print(employeeJSONData)

Writing JSON Encode data into Python String
"{\"links\": {\"1\": [2, 3], \"3\": [4, 5], \"5\": [6, 7], \"7\": [8, 9], \"9\": [10, 11], \"11\": [12, 13], \"13\": [14, 15], \"15\": [16, 17], \"17\": [18, 19], \"19\": [20, 21], \"21\": [22, 23], \"14\": [24, 25], \"23\": [26, 27], \"25\": [28, 29], \"27\": [30, 31], \"default_factory\": {\"py/type\": \"builtins.set\"}}, \"nodes\": [{\"nid\": 1, \"seqCount\": 22, \"value\": \"\"}, {\"nid\": 3, \"seqCount\": 22, \"value\": \"Sugar to treat\"}, {\"nid\": 2, \"seqCount\": 0, \"value\": \"\"}, {\"nid\": 5, \"seqCount\": 21, \"value\": \"Lunch\"}, {\"nid\": 4, \"seqCount\": 1, \"value\": \"\"}, {\"nid\": 7, \"seqCount\": 21, \"value\": \"Breakfast\"}, {\"nid\": 6, \"seqCount\": 0, \"value\": \"\"}, {\"nid\": 9, \"seqCount\": 21, \"value\": \"Lunch\"}, {\"nid\": 8, \"seqCount\": 0, \"value\": \"\"}, {\"nid\": 11, \"seqCount\": 19, \"value\": \"Breakfast\"}, {\"nid\": 10, \"seqCount\": 2, \"value\": \"\"}, {\"nid\": 13, \"seqCount\": 16, \"value\

In [25]:
x=json.dumps(graph, ensure_ascii=False, default=Graph.json_serialize_dump, indent=1)
print(x)

ValueError: Circular reference detected

In [None]:
json.dumps(graph, default=lambda o: o.__dict__)

In [None]:
json.dumps(vars(graph))

In [31]:
graph.links


defaultdict(set,
            {32: {33, 34},
             34: {35, 36},
             33: {37, 38},
             36: {39, 40},
             35: {41, 42},
             38: {43, 44},
             40: {45, 46},
             44: {47, 48},
             48: {49, 50}})

In [32]:
graph.nodes

[<__main__.Rawnode at 0x7fc9df1489d0>,
 <__main__.Rawnode at 0x7fc9def9dcd0>,
 <__main__.Rawnode at 0x7fc9def9dd10>,
 <__main__.Rawnode at 0x7fc9def9e510>,
 <__main__.Rawnode at 0x7fc9def9e550>,
 <__main__.Rawnode at 0x7fc9def9ed10>,
 <__main__.Rawnode at 0x7fc9def9ed50>,
 <__main__.Rawnode at 0x7fc9defa0590>,
 <__main__.Rawnode at 0x7fc9defa05d0>,
 <__main__.Rawnode at 0x7fc9df0666d0>,
 <__main__.Rawnode at 0x7fc9df066ad0>,
 <__main__.Rawnode at 0x7fca083c9390>,
 <__main__.Rawnode at 0x7fca083c9410>,
 <__main__.Rawnode at 0x7fc9df143e90>,
 <__main__.Rawnode at 0x7fc9df143e50>,
 <__main__.Rawnode at 0x7fc9df069790>,
 <__main__.Rawnode at 0x7fc9df069390>,
 <__main__.Rawnode at 0x7fc9defa0f90>,
 <__main__.Rawnode at 0x7fc9defa0fd0>]

In [None]:
h=[]
#merge(h,key=lambda e:e[0],reverse=True)
heapq.heappush(h, (200, 1))
heapq.heappush(h, (300,2))
heapq.heappush(h, (400,3))
print(heapq.heappop(h))


In [None]:

"abcd".index('c',2,5)

In [26]:
sequence_braiding_Es= EventStore()
sequence_braiding_Es.importPointEvents('../Sample_Dataset.csv', 0, "%m/%d/%Y", sep=',', local=True)
#print(type(sequence_braiding))
seq=Sequence(sequence_braiding_Es.events, sequence_braiding_Es)


../Sample_Dataset.csv
dict_keys(['Events', 'Counts'])


In [27]:
seq_list=sequence_braiding_Es.splitSequences(seq, "day")
raw_seq="\n".join( seqs.getEventsHashString('Events') for seqs in seq_list)

In [28]:
print(sequence_braiding_Es.reverseatttrdict['Events'])

{'0': 'B', '1': 'E', '2': 'F', '3': 'A', '4': 'K', '5': 'C', '6': 'G', '7': 'H', '8': 'L', '9': 'I', ':': 'J', ';': 'D'}


In [29]:
stm= SentenTreeMiner()
#cfm.truncateSequences(self, seqs, hashval, evtAttr, node,trailingSeqSegs, notContain)
root=GraphNode()
root.incomingSequences=seq_list
graph=Graph()
visibleGroups=stm.expandSeqTree("Events",root,  expandCnt=30, minSupport=2, maxSupport=len(seq_list),graph=graph)

seqCount: 6
this.pattern s : []
Not contain: 2
contain: 4
word: 3, pos: 0, count: 4
seqCount: [6, 4, 2]
seqCount: 4
this.pattern s : ['3']
Not contain: 2
contain: 2
word: 5, pos: 1, count: 2
seqCount: [4, 2, 2, 2]
seqCount: 2
this.pattern s : []
Not contain: 0
contain: 2
word: 5, pos: 0, count: 2
seqCount: [2, 2, 2, 2]
seqCount: 2
this.pattern s : ['3', '5']
Not contain: 0
contain: 2
word: 5, pos: 2, count: 2
seqCount: [2, 2, 2, 2]
seqCount: 2
this.pattern s : ['3']
Not contain: 0
contain: 0
word: ;, pos: 0, count: 1
seqCount: [2, 2, 2]
seqCount: 2
this.pattern s : ['5']
Not contain: 0
contain: 2
word: 2, pos: 1, count: 2
seqCount: [2, 2, 2]
seqCount: 2
this.pattern s : ['3', '5', '5']
Not contain: 0
contain: 0
word: 5, pos: 0, count: 1
seqCount: [2, 2]
seqCount: 2
this.pattern s : ['5', '2']
Not contain: 0
contain: 2
word: 6, pos: 2, count: 2
seqCount: [2, 2]
seqCount: 2
this.pattern s : ['5', '2', '6']
Not contain: 0
contain: 0
word: 8, pos: 1, count: 1
seqCount: [2]


In [30]:
x=json.dumps(root, ensure_ascii=False, default=GraphNode.json_serialize_dump, indent=1)
print(x)

{
 "before": {
  "before": {
   "before": {
    "before": {
     "before": null,
     "event_attribute": "",
     "Pattern": [],
     "value": 0,
     "After": null
    },
    "event_attribute": "C",
    "Pattern": [
     "3",
     "5",
     "5"
    ],
    "value": 2,
    "After": {
     "before": null,
     "event_attribute": "",
     "Pattern": [],
     "value": 0,
     "After": null
    }
   },
   "event_attribute": "C",
   "Pattern": [
    "3",
    "5"
   ],
   "value": 2,
   "After": {
    "before": null,
    "event_attribute": "",
    "Pattern": [
     "3",
     "5"
    ],
    "value": 0,
    "After": null
   }
  },
  "event_attribute": "A",
  "Pattern": [
   "3"
  ],
  "value": 4,
  "After": {
   "before": {
    "before": null,
    "event_attribute": "",
    "Pattern": [],
    "value": 0,
    "After": null
   },
   "event_attribute": "",
   "Pattern": [
    "3"
   ],
   "value": 2,
   "After": {
    "before": null,
    "event_attribute": "",
    "Pattern": [],
    "value": 0,
  