In [None]:
from datetime import datetime, timedelta
import pandas as pd
import csv
import requests
import os
import re
from itertools import count
import numpy as np

from itertools import accumulate

from spmf import Spmf
import json
import jsonpickle
import heapq

import pprint
from collections import defaultdict

import jsonpickle

# Event Representations

In [None]:
class Event:
    """Common base of every event: a type label plus a free-form attribute map."""

    def __init__(self, eventtype):
        # The type label is supplied by the caller (subclasses in this file
        # pass "point" or "interval"); attributes start out empty.
        self.type = eventtype
        self.attributes = {}

    def addAttribute(self, attr, value):
        """Store ``value`` under the name ``attr``, overwriting any earlier value."""
        self.attributes.update({attr: value})

    def getAttrVal(self, attrName):
        """Return the value stored for ``attrName``, or None when it was never set."""
        if attrName in self.attributes:
            return self.attributes[attrName]
        return None


class PointEvent(Event):
    """Event that happens at a single instant."""

    def __init__(self, timestamp):
        # The base initialiser sets type to "point" and creates the attribute map.
        super().__init__("point")
        self.timestamp = timestamp


class IntervalEvent(Event):
    """Event that spans a period between a start time and an end time."""

    def __init__(self, t1, t2):
        # The base initialiser sets type to "interval" and creates the attribute map.
        super().__init__("interval")
        # Stored as a two-element list: [start, end].
        self.time = [t1, t2]


In [None]:
class EventStore:
    """Hold all events of a dataset plus per-attribute value encodings.

    Attributes:
        events: list of event objects currently loaded.
        attrDict: attribute name -> {original value -> one-character code}.
        reverseAttrDict: attribute name -> {one-character code -> original value}.
    """

    def __init__(self, eventlist=None):
        if eventlist is None:
            eventlist = []
        self.attrDict = {}
        self.reverseAttrDict = {}
        self.events = eventlist

    @staticmethod
    def _cell(row, column):
        """Return the value of ``row`` (a pandas Series) at ``column``.

        Integer columns are read positionally through ``iloc``: plain
        ``row[int]`` on a label-indexed Series relies on a deprecated pandas
        fallback. Any other column reference is looked up by label, exactly as
        the plain subscript did before.
        """
        if isinstance(column, int):
            return row.iloc[column]
        return row[column]

    @staticmethod
    def _copyAttributes(evt, cols, row, skipped):
        """Add every column of ``row`` except the ``skipped`` positions to ``evt``."""
        for i in range(len(row)):
            if i not in skipped:
                evt.addAttribute(cols[i], row.iloc[i])

    def importPointEvents(self, src, timestampColumnIdx, timeFormat,
                          sep='\t', local=False, header=None, dataFrame=None):
        """Load point events and store them in ``self.events``.

        Args:
            src: URL or path of the dataset (``local`` False means URL, True
                means path). Only used when ``dataFrame`` is None.
            timestampColumnIdx: position of the timestamp column.
            timeFormat: ``datetime.strptime`` format of the timestamp strings.
            sep: column separator handed to the loader.
            local: see ``src``.
            header: list of column names when the dataset does not provide them.
            dataFrame: already-loaded frame. Datasets whose encoding pandas
                cannot detect (the Foursquare ones) can be read by the caller
                and passed in here instead of going through the loader.

        Returns:
            None. The events replace ``self.events`` and the attribute
            dictionaries are rebuilt.
        """
        if dataFrame is None:
            # getDataframe is a module-level helper defined outside this class.
            dataFrame = getDataframe(src, local, sep, header)
        cols = dataFrame.columns
        events = []
        for _, data in dataFrame.iterrows():
            timestamp = datetime.strptime(
                self._cell(data, timestampColumnIdx), timeFormat)
            evt = PointEvent(timestamp)
            # Every column other than the timestamp becomes an attribute.
            self._copyAttributes(evt, cols, data, (timestampColumnIdx,))
            events.append(evt)
        self.events = events
        self.createAttrDict()

    def importIntervalEvents(self, src, startTimeColumnIdx, endTimeColumnIdx,
                             timeFormat, sep="\t", local=False, header=None, dataFrame=None):
        """Load interval events and store them in ``self.events``.

        Args:
            src: URL or path of the dataset (``local`` False means URL, True
                means path). Only used when ``dataFrame`` is None.
            startTimeColumnIdx: position of the start-time column.
            endTimeColumnIdx: position of the end-time column.
            timeFormat: ``datetime.strptime`` format of both time columns.
            sep: column separator handed to the loader.
            local: see ``src``.
            header: list of column names when the dataset does not provide them.
            dataFrame: already-loaded frame, used instead of the loader (for
                datasets whose encoding pandas cannot detect).

        Returns:
            None. The events replace ``self.events`` and the attribute
            dictionaries are rebuilt.
        """
        if dataFrame is None:
            dataFrame = getDataframe(src, local, sep, header)
        cols = dataFrame.columns
        events = []
        for _, data in dataFrame.iterrows():
            start = datetime.strptime(
                self._cell(data, startTimeColumnIdx), timeFormat)
            end = datetime.strptime(
                self._cell(data, endTimeColumnIdx), timeFormat)
            evt = IntervalEvent(start, end)
            # Every column other than the two time columns becomes an attribute.
            self._copyAttributes(
                evt, cols, data, (startTimeColumnIdx, endTimeColumnIdx))
            events.append(evt)
        self.events = events
        self.createAttrDict()

    def importMixedEvents(self, src, startTimeColumnIdx, endTimeColumnIdx,
                          timeFormat, sep="\t", local=False, header=None, dataFrame=None):
        """Load a dataset containing both point and interval events.

        A row whose end-time cell is None or a float (pandas represents a
        missing cell as the float NaN) becomes a point event at the start
        time; every other row becomes an interval event.

        Args:
            src: URL or path of the dataset (``local`` False means URL, True
                means path). Only used when ``dataFrame`` is None.
            startTimeColumnIdx: position of the start-time / timestamp column.
            endTimeColumnIdx: position of the end-time column.
            timeFormat: ``datetime.strptime`` format of the time columns.
            sep: column separator handed to the loader.
            local: see ``src``.
            header: list of column names when the dataset does not provide them.
            dataFrame: already-loaded frame, used instead of the loader (for
                datasets whose encoding pandas cannot detect).

        Returns:
            None. The events replace ``self.events`` and the attribute
            dictionaries are rebuilt.
        """
        if dataFrame is None:
            dataFrame = getDataframe(src, local, sep, header)
        cols = dataFrame.columns
        timeColumns = (startTimeColumnIdx, endTimeColumnIdx)
        events = []
        for _, data in dataFrame.iterrows():
            endRaw = self._cell(data, endTimeColumnIdx)
            start = datetime.strptime(
                self._cell(data, startTimeColumnIdx), timeFormat)
            if endRaw is None or isinstance(endRaw, float):
                evt = PointEvent(start)
            else:
                evt = IntervalEvent(start, datetime.strptime(endRaw, timeFormat))
            self._copyAttributes(evt, cols, data, timeColumns)
            events.append(evt)
        self.events = events
        self.createAttrDict()

    def generateSequence(self, attributeName):
        """Group the events by ``attributeName`` and order each group by time.

        Returns:
            A list of Sequence objects, one per distinct attribute value, in
            the order in which each value first occurs in the time-sorted
            event list.

        Raises:
            KeyError: if an event has no attribute called ``attributeName``.
        """
        groupedBy = {}
        # Sorting first means every group is built already time-ordered.
        for event in sorted(self.events, key=getTimeToSortBy):
            groupedBy.setdefault(
                event.attributes[attributeName], []).append(event)
        return [Sequence(group, self) for group in groupedBy.values()]

    def getUniqueValues(self, attr):
        """Return the distinct values of ``attr`` across all events.

        Values are listed in first-seen order so the result is reproducible
        between runs; events lacking the attribute contribute None.
        """
        return list(dict.fromkeys(
            event.getAttrVal(attr) for event in self.events))

    def getEventValue(self, attr, hashlist):
        """Translate a list of character codes back into original ``attr`` values."""
        return [self.reverseAttrDict[attr][val] for val in hashlist]

    def createAttrDict(self):
        """Build ``attrDict`` and ``reverseAttrDict`` from the stored events.

        The attribute names are taken from the first event. For each
        attribute, its distinct values receive consecutive one-character
        codes starting at ``chr(48)`` (the character "0"), assigned in
        first-seen order so the encoding is the same on every run. An empty
        store leaves both dictionaries untouched instead of raising.
        """
        if not self.events:
            return
        firstCode = 48
        for attr in list(self.events[0].attributes):
            forward = {
                value: chr(firstCode + offset)
                for offset, value in enumerate(self.getUniqueValues(attr))
            }
            self.attrDict[attr] = forward
            self.reverseAttrDict[attr] = {
                code: value for value, code in forward.items()}


# Sequence Representations

In [None]:
class Sequence():
    """Ordered collection of events that share some property.

    Attributes:
        sid: sequence id; when no ``sid`` is given the sequence object itself
            is used as its own id.
        events: list of event objects.
        eventstore: store providing ``attrDict`` / ``reverseAttrDict``.
        volume: integer weight of the sequence, initially 1.
        seqAttributes: sequence-level attributes (name -> value).
        seqIndices: list, created empty.
    """

    _ids = count(0)

    def __init__(self, eventlist, eventstore, sid=None):
        # sequence id
        if sid is None:
            self.sid = self  # next(self._ids)
        else:
            self.sid = sid

        self.events = eventlist
        self.eventstore = eventstore
        self.volume = 1
        self.seqAttributes = {}
        self.seqIndices = []

    def _encoded(self, attr):
        """Return the code of ``attr`` for every event, in event order."""
        codes = self.eventstore.attrDict[attr]
        return [codes[event.getAttrVal(attr)] for event in self.events]

    def getEventPosition(self, attr, hashVal):
        """Return the index of the first event whose encoded ``attr`` value
        equals ``hashVal``, or -1 when no event matches.
        """
        for pos, event in enumerate(self.events):
            if self.eventstore.attrDict[attr][event.getAttrVal(attr)] == hashVal:
                return pos
        return -1

    def setVolume(self, intValue):
        """Set the volume to ``intValue``."""
        self.volume = intValue

    def getVolume(self):
        """Return the volume of this sequence."""
        return self.volume

    def increaseVolume(self):
        """Increase the volume by 1."""
        self.volume += 1

    def getUniqueValueHashes(self, attr):
        """Return the codes of the distinct ``attr`` values occurring in this
        sequence (order unspecified).
        """
        codes = self.eventstore.attrDict[attr]
        distinct = {event.getAttrVal(attr) for event in self.events}
        return [codes[value] for value in distinct]

    def getHashList(self, attr):
        """Return the code of ``attr`` for every event in the sequence, in
        event order. Codes come from the event store, so they change whenever
        the store's dictionaries are rebuilt.
        """
        return self._encoded(attr)

    def getValueHashes(self, attr):
        """Return the code of ``attr`` for every event in the sequence, in
        event order (same result as ``getHashList``).
        """
        return self._encoded(attr)

    def getEventsHashString(self, attr):
        """Return ``"<attr>: "`` followed by the concatenated codes of ``attr``
        for all events in the sequence.
        """
        return attr + ": " + "".join(str(code) for code in self._encoded(attr))

    def convertToVMSPReadablenum(self, attr):
        """Return the codes of ``attr`` joined by ``" -1 "`` and terminated by
        ``" -2"``, the item/sequence separators of the VMSP input format.
        """
        return " -1 ".join(str(code) for code in self._encoded(attr)) + " -2"

    def convertToVMSPReadable(self, attr):
        """Return the codes of ``attr`` joined by single spaces and terminated
        by a full stop.
        """
        return " ".join(self._encoded(attr)) + "."

    def getPathID(self):
        """Return the sequence id."""
        return self.sid

    def matchPathAttribute(self, attr, val):
        """Return True if the sequence attribute ``attr`` equals ``val``."""
        return bool(self.seqAttributes.get(attr) == (val))

    def setSequenceAttribute(self, attr, value):
        """Set the sequence attribute ``attr`` to ``value``."""
        self.seqAttributes[attr] = value

    # equivalent to method signature public static int getVolume(List<Sequence> seqs)
    @staticmethod
    def getSeqVolume(seqlist):
        """Return the total volume of all sequences in ``seqlist``."""
        return sum(seq.getVolume() for seq in seqlist)

    @staticmethod
    def getUniqueEvents(seqlist, attr):
        """Return the distinct values of ``attr`` over every event of every
        sequence in ``seqlist`` (order unspecified).
        """
        # Outer loop first: the sequences, then the events each one holds.
        return list({event.getAttrVal(attr)
                     for seq in seqlist for event in seq.events})

    # Method equivalent to public String getEvtAttrValue(String attr, int hash) in DataManager.java

    def getEvtAttrValue(self, attr, hashVal):
        """Return the original value of ``attr`` that is encoded as ``hashVal``."""
        return self.eventstore.reverseAttrDict[attr][hashVal]

    # Method equivalent to public List<String> getEvtAttrValues(String attr) in DataManager.java
    def getEvtAttrValues(self, attr):
        """Return the value of ``attr`` for every event in this sequence, in
        event order (duplicates are kept).
        """
        return [event.getAttrVal(attr) for event in self.events]

    # Method equivalent to int getEvtAttrValueCount(String attr) in DataManager.java
    def getEvtAttrValueCount(self, attr):
        """Return the number of distinct values the event store knows for ``attr``."""
        return len(self.eventstore.reverseAttrDict[attr])

    def getEventsString(self, attr):
        """Return the ``attr`` values of all events joined by single spaces.

        Values are converted with ``str`` so non-string attributes (numbers,
        None) do not make the join fail.
        """
        return " ".join(str(elem) for elem in self.getEvtAttrValues(attr))

    @staticmethod
    def _timeBucket(time, timeUnit):
        """Return ``(grouping key, record label)`` of ``time`` for ``timeUnit``.

        ``timeUnit`` must already be validated and lower-cased.
        """
        if timeUnit == "hour":
            key = (time.hour, time.day, time.month, time.year)
            return key, ' '.join(str(part) for part in key)
        if timeUnit == "day":
            key = (time.day, time.month, time.year)
            return key, datetime(time.year, time.month, time.day).strftime("%Y%m%d")
        if timeUnit == "month":
            return (time.month, time.year), str(time.month) + str(time.year)
        if timeUnit == "week":
            # Pair the ISO week number with the ISO year, not the calendar
            # year: e.g. 2019-12-30 belongs to ISO week 1 of 2020 and must not
            # share a bucket with the first week of January 2019.
            isoYear, isoWeek = time.isocalendar()[:2]
            return (isoYear, isoWeek), str(isoYear) + "W" + str(isoWeek)
        if timeUnit == "year":
            return time.year, str(time.year)
        # quarter: months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4
        quarter = "Q" + str((time.month - 1) // 3 + 1)
        return (time.year, quarter), str(time.year) + quarter

    # The input is a Sequence or a list of Sequence objects; each one is split
    # further into shorter Sequence objects, so generateSequence can be followed
    # by splitSequences.
    @staticmethod
    def splitSequences(sequenceLists, timeUnit, record=None):
        """Split long sequences into shorter ones, one per ``timeUnit``.

        For example, a sequence spanning several days can be broken into daily
        sequences. For interval events the sort time returned by
        ``getTimeToSortBy`` decides the bucket.

        Side effect: every event gets a label describing its bucket. With
        ``record`` None the label is written to ``event.attributes["record"]``;
        otherwise it is appended (after ``"_"``) to the string form of
        ``event.attributes[record]``.

        Args:
            sequenceLists: a Sequence or a list of Sequence objects.
            timeUnit: "hour", "day", "week", "month", "quarter" or "year"
                (case-insensitive).
            record: name of an existing event attribute to extend with the
                bucket label, or None.

        Returns:
            A list of new Sequence objects that all use the event store of the
            first input sequence; an empty list when no sequence is given.

        Raises:
            ValueError: if ``timeUnit`` is not one of the supported units.
        """
        if not isinstance(sequenceLists, list):
            sequenceLists = [sequenceLists]
        timeUnit = timeUnit.lower()
        validTimeUnits = ["hour", "day", "week", "month", "quarter", "year"]
        if timeUnit not in validTimeUnits:
            raise ValueError(
                "timeUnit must be hour, day, week, month, quarter, or year")
        if not sequenceLists:
            return []

        eventstore = sequenceLists[0].eventstore
        resultlist = []
        for sequence in sequenceLists:
            # One bucket map per input sequence: key -> events in time order.
            buckets = {}
            for event in sorted(sequence.events, key=getTimeToSortBy):
                key, label = Sequence._timeBucket(
                    getTimeToSortBy(event), timeUnit)
                insertEventIntoDict(key, buckets, event)
                if record is None:
                    event.attributes["record"] = label
                else:
                    event.attributes[record] = str(
                        event.attributes[record]) + "_" + label
            resultlist.extend(buckets.values())
        return [Sequence(x, eventstore) for x in resultlist]


# Node

In [None]:
class Node():
    """Base node holding information about branching patterns in sequences.

    Class attributes:
        nodeCounter: shared counter handing out node ids, starting at 1.
        nodeHash: shared registry mapping node id -> node.
    """

    nodeCounter = count(1)
    nodeHash = {}

    def __init__(self, name="", count_=0, value="", attr=""):
        super().__init__()
        self.nid = next(self.nodeCounter)
        self.name = name
        self.seqCount = count_
        self.value = value
        self.hash = -1
        self.pos = []
        self.meanStep = 0
        self.medianStep = 0
        self.incomingBranchUniqueEvts = None
        self.keyevts = []
        self.sequences = []
        self.incomingSequences = []
        self.outgoingSequences = []

        self.meanRelTimestamp = 0
        self.medianRelTimestamp = 0

        self.attr = attr

        # Register on the base class itself so that creating a node does not
        # depend on a subclass having been defined already; subclasses see the
        # same dictionary through inheritance.
        Node.nodeHash[self.nid] = self

    def getNode(self, nodeId):
        """Return the registered node with id ``nodeId`` (KeyError if unknown)."""
        return self.nodeHash[nodeId]

    def clearHash(self):
        """Empty the shared node registry."""
        self.nodeHash.clear()

    def getIncomingSequences(self):
        """Return the list of incoming sequences."""
        return self.incomingSequences

    def getSeqCount(self):
        """Return the sequence count."""
        return self.seqCount

    def setSeqCount(self, seqCount):
        """Set the sequence count."""
        self.seqCount = seqCount

    def getName(self):
        """Return the name of the node."""
        return self.name

    def setName(self, name):
        """Set the name of the node."""
        self.name = name

    def getMeanStep(self):
        """Return the value of meanStep."""
        return self.meanStep

    # need a better implementation
    def toJSONObject(self):
        """Return a JSON string built from the instance ``__dict__`` of this
        node and of every nested object json cannot serialise directly.
        """
        return json.dumps(self, default=lambda o: o.__dict__)

    def toString(self):
        """Return ``"<name>: <seqCount>"``."""
        # seqCount is normally an int, so it has to be formatted rather than
        # concatenated to the name.
        return f"{self.name}: {self.seqCount}"

    def setPositions(self, lst):
        """Store a sorted copy of ``lst`` in ``pos`` and derive ``meanStep``
        and ``medianStep`` from it (both 0 for an empty list).

        The caller's list is left unmodified.
        """
        self.pos = sorted(lst)
        if len(self.pos) == 0:
            self.meanStep = 0
            self.medianStep = 0
        else:
            # (sum + n) / n - 1 is algebraically the plain mean of the
            # positions; the expression is kept as it was so results stay
            # bit-for-bit identical.
            sumVal = sum(self.pos) + len(self.pos)
            self.meanStep = sumVal / (len(self.pos)) - 1
            self.medianStep = np.median(self.pos)

    def getValue(self):
        """Return the value of the node."""
        return self.value

    def setValue(self, value):
        """Set the value of the node."""
        self.value = value

    def getMedianStep(self):
        """Return the value of medianStep."""
        return self.medianStep

    def getIncomingBranchUniqueEvts(self):
        """Return the unique events of the incoming branch."""
        return self.incomingBranchUniqueEvts

    def setIncomingBranchUniqueEvts(self, incomingBranchUniqueEvts):
        """Set the unique events of the incoming branch."""
        self.incomingBranchUniqueEvts = incomingBranchUniqueEvts

    def setIncomingSequences(self, incomingBranchSeqs):
        """Set the list of incoming sequences."""
        self.incomingSequences = incomingBranchSeqs

    def setRelTimeStamps(self, relTimeStamps):
        """Derive ``meanRelTimestamp`` and ``medianRelTimestamp`` from a list
        of ``timedelta`` values (both are set to 0 for an empty list).

        The caller's list is left unmodified.
        """
        ordered = sorted(relTimeStamps)
        if len(ordered) == 0:
            self.meanRelTimestamp = 0
            self.medianRelTimestamp = 0
        else:
            # Start the sum from a zero timedelta so the elements add up.
            sumVal = sum(ordered, timedelta())
            self.meanRelTimestamp = sumVal * 1.0 / len(ordered)
            self.medianRelTimestamp = np.median(ordered)

    def getPatternString(self):
        """Return the key events of this node, decoded through the event store
        of the first incoming sequence and joined by ``"-"``.

        Returns an empty string when the node has no incoming sequences or no
        key events.
        """
        return "-".join(str(
            self.incomingSequences[0].eventstore.reverseAttrDict[self.attr][hashVal])
                        for hashVal in self.keyevts if self.incomingSequences)

    def getHash(self):
        """Return the hash value of this node."""
        return self.hash

    def setHash(self, value):
        """Set the hash value of this node."""
        self.hash = value

    def jsonDefaultDump(self) -> dict:
        """Placeholder, implemented in derived classes."""

    def jsonSerialize(self) -> None:
        """Placeholder, implemented in derived classes."""

    @staticmethod
    def jsonSerializeDump(obj):
        """Placeholder, implemented in derived classes."""


In [None]:
class TreeNode(Node):
    """Node of a Coreflow-like tree: a Node plus a list of child nodes."""

    def __init__(self, name="", count_val=0, value="", attr=""):
        # Forward attr as well: getPatternString looks the key events up under
        # self.attr, which stays "" if the argument is dropped here.
        super().__init__(name, count_val, value, attr)
        self.children = []

    def jsonDefaultDump(self) -> dict:
        """Return this node and, recursively, its children as a plain dict."""
        return {
            "event_attribute": self.value,
            "Pattern": self.getPatternString(),
            "value": self.seqCount,
            "median_index": self.medianStep,
            "average_index": self.meanStep,

            "children": [TreeNode.jsonSerializeDump(x) for x in self.children]

        }

    def jsonSerialize(self) -> str:
        """Return the subtree rooted at this node as an indented JSON string.

        The string used to be built and then discarded; returning it is
        harmless for callers that ignore the result.
        """
        return json.dumps(self, indent=4, default=TreeNode.jsonSerializeDump)

    @staticmethod
    def jsonSerializeDump(obj):
        """``json.dumps`` hook: use ``obj.jsonDefaultDump()`` when the object
        provides it, otherwise serialise the object as null.
        """
        if hasattr(obj, "jsonDefaultDump"):
            return obj.jsonDefaultDump()
        return None


In [None]:
class GraphNode(Node):
    """Node of a graph in which a node can have several predecessors
    (``before``) and several successors (``after``).
    """

    def __init__(self, name="", count_val=0, value="", attr=""):
        super().__init__(name, count_val, value, attr)
        self.before = []
        self.after = []

    def jsonDefaultDump(self) -> dict:
        """Return this node as a plain dict.

        NOTE(review): ``before`` and ``after`` are lists, and
        ``jsonSerializeDump`` returns None for anything that lacks a
        ``jsonDefaultDump`` method, so both entries are currently always
        None. Dumping the neighbours themselves would recurse endlessly when
        two nodes reference each other, so confirm the intended output
        before changing this.
        """
        return {
            "before": GraphNode.jsonSerializeDump(self.before),
            "event_attribute": self.value,
            "Pattern": self.getPatternString(),
            "value": self.seqCount,
            "After": GraphNode.jsonSerializeDump(self.after)

        }

    def jsonSerialize(self) -> str:
        """Return this node as an indented JSON string.

        The string used to be built and then discarded; returning it is
        harmless for callers that ignore the result.
        """
        return json.dumps(self, indent=4, default=GraphNode.jsonSerializeDump)

    @staticmethod
    def jsonSerializeDump(obj):
        """``json.dumps`` hook: use ``obj.jsonDefaultDump()`` when the object
        provides it, otherwise serialise the object as null.
        """
        if hasattr(obj, "jsonDefaultDump"):
            return obj.jsonDefaultDump()
        return None


In [None]:
""" Creates the RawNode, Links and Graph class."""

import json


class RawNode:
    """Flat snapshot of the Node fields that are exported to JSON."""

    def __init__(self, node):
        # Copy the exported fields; the pattern string is computed once, here,
        # through node.getPatternString().
        self.nid = node.nid
        self.seqCount = node.seqCount
        self.value = node.value
        self.pattern = node.getPatternString()
        self.meanStep = node.meanStep
        self.medianStep = node.medianStep

    def jsonDefaultDump(self) -> dict:
        """Return the snapshot as a dict keyed by the exported JSON field names."""
        fieldNames = ("node_id", "event_attribute", "Pattern",
                      "value", "median_index", "average_index")
        fieldValues = (self.nid, self.value, self.pattern,
                       self.seqCount, self.medianStep, self.meanStep)
        return dict(zip(fieldNames, fieldValues))

    def printNode(self):
        """Print a one-line summary of this node."""
        summary = f'node {self.nid}, value {self.value}, Pattern {self.pattern}, meanStep {self.meanStep} seqcount {self.seqCount}'
        print(summary)

    @staticmethod
    def printNodes(nodeList):
        """Print the summary line of each node in ``nodeList``, in order."""
        for rawNode in nodeList:
            rawNode.printNode()



class Links:
    """Directed connection between two nodes together with a count."""

    def __init__(self, node1, node2, count):
        # node1 is stored as the source end, node2 as the target end.
        self.source, self.target, self.count = node1, node2, count

    def jsonDefaultDump(self) -> dict:
        """Return the link as a dict with the keys source, target and count."""
        return dict(source=self.source, target=self.target, count=self.count)


class Graph:
    """Graph class consisting of Links and nodes."""

    def __init__(self):
        self.links = []  # link objects exposing source, target and count
        self.nodes = []  # node objects exposing at least nid, value and meanStep

    def jsonDefaultDump(self) -> dict:
        """Return the dictionary used when the graph is written as JSON."""
        return {
            "nodes": self.nodes,
            "links": self.links
        }

    def jsonSerialize(self) -> str:
        """Serialize the graph to an indented JSON string.

        Nested custom objects are converted through ``jsonSerializeDump``.

        Returns:
            The JSON text. (Previously the encoded string was discarded and
            the method always returned ``None``.)
        """
        return json.dumps(self, indent=4, default=Graph.jsonSerializeDump)

    @staticmethod
    def jsonSerializeDump(obj):
        """``default`` hook for ``json.dumps``.

        Objects with a ``jsonDefaultDump`` method are converted through it,
        sets become lists, and anything else becomes ``None``.
        """
        if hasattr(obj, "jsonDefaultDump"):
            return obj.jsonDefaultDump()
        if isinstance(obj, set):
            return list(obj)
        return None

    def printGraph(self):
        """Print every node id and every link with its list index."""
        for i, node in enumerate(self.nodes):
            print(f'node {node.nid}, index {i}')
        for i, link in enumerate(self.links):
            print(f'links {link.source} {link.target}, index {i}')

    def collapseNode(self):
        """Remove placeholder nodes (``value == -2``) and bridge their links.

        For each placeholder node, every incoming link is paired with every
        outgoing link and replaced by a direct link that carries the outgoing
        link's count. The placeholder node and the links touching it are then
        deleted in place, and the new links are appended to ``self.links``.

        Raises:
            ValueError: if a placeholder node has more than one incoming
                link and more than one outgoing link.
        """
        RawNode.printNodes(self.nodes)
        delNodes = []
        delLinks = []
        newLinks = []

        for node in self.nodes:
            if node.value != -2:
                continue

            # Ideally there is exactly one link pointing at the placeholder.
            incoming = [x for x in self.links if x.target == node.nid]
            print(f'source1 {[lnk.source for lnk in incoming]}')
            print(f'target1 {[lnk.target for lnk in incoming]}')

            outgoing = [x for x in self.links if x.source == node.nid]
            print(f'source2 {[lnk.source for lnk in outgoing]}')
            print(f'target2 {[lnk.target for lnk in outgoing]}')

            if len(incoming) > 1 and len(outgoing) > 1:
                print(f'len source {len(incoming)}, len target {len(outgoing)}')
                raise ValueError('multiple source and target')

            for inLink in incoming:
                for outLink in outgoing:
                    newLinks.append(
                        Links(inLink.source, outLink.target, outLink.count))
                    delLinks.append(outLink)
                delLinks.append(inLink)

            delNodes.append(node)

        delNodeIndices = [self.nodes.index(x) for x in delNodes]
        delLinkIndices = [self.links.index(x) for x in delLinks]

        print(delLinkIndices)
        # Delete from the highest index down so earlier indices stay valid.
        for idx in sorted(delNodeIndices, reverse=True):
            del self.nodes[idx]

        # A link can be collected more than once; the set removes duplicates.
        for idx in sorted(set(delLinkIndices), reverse=True):
            print(f'index {idx}')
            del self.links[idx]

        self.links.extend(newLinks)

    def allignNodes(self):
        """Print the nodes ordered by ``meanStep``.

        The sorted order is only printed; ``self.nodes`` is left unchanged.
        """
        print("nodes sorted")
        node = sorted(self.nodes, key=lambda x: x.meanStep)
        RawNode.printNodes(node)
        

# Ranking Function

In [None]:


class RankingFunction:
    """Ranks candidate events and breaks ties between them.

    ``fdist`` maps an event hash to its accumulated volume and ``fdistInd``
    maps the same hash to the list of indices where it was recorded. The
    chosen event is kept in ``word``/``pos``/``count``.
    """

    def __init__(self, maxSup):
        self.fdist = {}
        self.fdistInd = {}
        self.pos = -1
        self.word = ""
        self.count = 0
        self.maxSupport = maxSup
        self.rankingFunc = self.numberOfSequence
        self.tieBreaker = self.performRankingMedianIndex

    def setRankingFunc(self, method1):
        """Set ranking function."""
        self.rankingFunc = method1

    def setTieBreaker(self, method1):
        """Set tie breaker."""
        self.tieBreaker = method1

    def clearfdists(self):
        """Clear fdist and fdistInd."""
        self.fdist.clear()
        self.fdistInd.clear()

    def initValues(self):
        """Reset pos, word and count to their initial values."""
        self.pos = -1
        self.word = ""
        self.count = 0

    def performRankingNaive(self, index, _minpos):
        """Pick the event with the highest count; indices are ignored.

        Only counts no greater than ``maxSupport`` are eligible. The stored
        choice is replaced only when the new count is strictly greater.
        """
        maxWord = ""
        maxCount = 0
        for word, value in self.fdist.items():
            if maxCount < value <= self.maxSupport:
                maxWord = str(word)
                maxCount = value

        if maxCount > self.count:
            self.pos = index
            self.word = maxWord
            self.count = maxCount

    def _performRankingByIndex(self, index, minPos, summarize):
        """Shared body of the mean/median rankers.

        ``summarize`` turns an event's recorded index list into one number.
        Among events with the same count, the one with the smaller summary
        value wins.
        """
        maxWord = ""
        maxCount = 0

        for word, value in self.fdist.items():
            wordPos = summarize(self.fdistInd[word])

            if maxCount < value <= self.maxSupport:
                maxWord = str(word)
                maxCount = value
                minPos = wordPos
            elif value == maxCount and wordPos < minPos:
                maxWord = str(word)
                maxCount = value
                minPos = wordPos

        # An equal count at a later position replaces the stored choice.
        if maxCount > self.count or (maxCount == self.count and self.pos < index):
            self.pos = index
            self.word = maxWord
            self.count = maxCount

    def performRankingMeanIndex(self, index, minPos):
        """If two events have the same number of occurrences, tie break
        based on minimum mean index value.
        """
        self._performRankingByIndex(
            index, minPos, lambda indices: sum(indices) / len(indices))

    def performRankingMedianIndex(self, index, minPos):
        """If two events have the same number of occurrences, tie break
        based on minimum median index value.
        """
        self._performRankingByIndex(index, minPos, np.median)

    def _record(self, word, position, seq):
        """Add ``seq``'s volume and ``position`` to the entry for ``word``."""
        if word not in self.fdist:
            self.fdist[word] = seq.getVolume()
            self.fdistInd[word] = [position]
        else:
            self.fdist[word] += seq.getVolume()
            self.fdistInd[word].append(position)

    def numberOfSequence(self, evtHashes, startPos, endPos, seq):
        """Choose the event present in maximum number of sequences
        as the next Pattern event.

        Each distinct hash in ``evtHashes[startPos:endPos]`` is counted once
        for this sequence, at its first index in the range.
        """
        seen = set()  # hashes already counted for this sequence
        for j in range(startPos, endPos):
            word = evtHashes[j]
            if word in seen:
                continue
            seen.add(word)
            self._record(word, j, seq)

    def allOccurrence(self, evtHashes, startPos, endPos, seq):
        """Choose the event present maximum number of time across sequences
        as the next Pattern event.

        Every occurrence in ``evtHashes[startPos:endPos]`` is counted.
        """
        for j in range(startPos, endPos):
            self._record(evtHashes[j], j, seq)


# SentenTree Miner

In [None]:
class SentenTreeMiner:
    """Grows a tree of event patterns one event at a time.

    ``minSup`` is the smallest count a pattern needs to be kept, ``maxSup``
    the largest count the ranking function accepts for a candidate event.
    """

    def __init__(self, minSup, maxSup):
        self.minSupport = minSup
        self.maxSupport = maxSup

        self.ranker = RankingFunction(maxSup)
        self.ranker.setRankingFunc(self.ranker.numberOfSequence)
        self.ranker.setTieBreaker(self.ranker.performRankingMedianIndex)

    def expandSeqTree(self, attr, rootNode, expandCnt, graph):
        """Repeatedly expand the pending node with the largest seqCount.

        Each expansion splits a node into a child whose sequences contain
        the newly chosen event (``before``) and one whose sequences do not
        (``after``). Every kept child is added to ``graph`` as a RawNode
        plus a link from its parent; the "does not contain" child is marked
        with value -2.

        Args:
            attr: name of the event attribute being mined.
            rootNode: node whose ``incomingSequences`` hold the input.
            expandCnt: expansion budget; reduced by the root's key events
                and by one for every "contains" child that is kept.
            graph: object with ``nodes`` and ``links`` lists to fill.

        Returns:
            A list with the leaf nodes (whose best extension fell below
            ``minSupport``) followed by the nodes still pending when the
            loop stopped. (Previously ``list.append``'s result, i.e. always
            ``None``, was returned.)
        """
        expandCnt -= len(rootNode.keyevts)
        seqs = [rootNode]
        rootNode.setSeqCount(Sequence.getSeqVolume(rootNode.incomingSequences))
        rootNode.attr = attr
        leafSeqs = []

        graph.nodes.append(RawNode(rootNode))
        while seqs and expandCnt > 0:
            currentSeq = max(seqs, key=lambda x: x.seqCount)
            print(f'seqCount: {currentSeq.seqCount}')

            seq0 = currentSeq.after
            seq1 = currentSeq.before

            print(f'this.pattern currentSeq : {currentSeq.keyevts}')

            if not seq1 and not seq0:
                word, pos, count, seq0, seq1 = self.growSeq(
                    attr, currentSeq)
                print(f'event: {word}, pos: {pos}, count: {count}')

                if count < self.minSupport:
                    leafSeqs.append(currentSeq)
                else:
                    seq1.setHash(word)
                    seq1.setValue(
                        currentSeq.incomingSequences[0].getEvtAttrValue(attr, word))
                    # Each child gets its own copy of the key-event list.
                    seq1.keyevts = currentSeq.keyevts[:]
                    seq0.keyevts = currentSeq.keyevts[:]

                    seq1.keyevts.insert(pos, word)

            if seq1 and seq1.seqCount >= self.minSupport:
                expandCnt -= 1
                seqs.append(seq1)
                graph.nodes.append(RawNode(seq1))
                graph.links.append(
                    Links(currentSeq.nid, seq1.nid, seq1.seqCount))

            currentSeq.before = seq1
            currentSeq.after = seq0

            if seq0 and seq0.seqCount >= self.minSupport:
                seqs.append(seq0)
                graph.nodes.append(RawNode(seq0))
                graph.nodes[-1].value = -2  # dummy node value
                graph.links.append(
                    Links(currentSeq.nid, seq0.nid, seq0.seqCount))

            print(f'seqCount: {[s.seqCount for s in seqs]}')

            seqs.remove(currentSeq)

        return leafSeqs + seqs

    def growSeq(self, attr, seq):
        """Expand the current pattern of ``seq`` by one more event.

        Every gap of the pattern (before, between and after the key events)
        is scanned with the ranking function, and the tie breaker keeps the
        best candidate in ``self.ranker``. If that candidate reaches
        ``minSupport``, the incoming sequences are split into those that
        contain the event in the chosen gap and those that do not.

        Returns:
            ``(word, pos, count, seq0, seq1)``: the chosen event hash, its
            position in the pattern, its count, the node of sequences
            without the event and the node of sequences with it. Both nodes
            are left empty when the count is below ``minSupport``.
        """
        self.ranker.initValues()

        # Upper bound handed to the tie breaker; it is the same for every gap.
        minPos = max(len(x.events) for x in seq.incomingSequences)

        for i in range(0, len(seq.keyevts)+1):
            self.ranker.clearfdists()

            for elem in seq.incomingSequences:
                evtHashes = elem.getHashList(attr)
                startPos = 0 if i == 0 else elem.seqIndices[i - 1] + 1
                endPos = len(evtHashes) if i == len(
                    seq.keyevts) else elem.seqIndices[i]

                self.ranker.rankingFunc(evtHashes, startPos,
                                        endPos, elem)

            self.ranker.tieBreaker(i, minPos)

        seq0 = GraphNode(attr=attr)
        seq1 = GraphNode(attr=attr)

        if self.ranker.count >= self.minSupport:
            words = seq.keyevts
            for elem in seq.incomingSequences:
                startPos = 0 if self.ranker.pos == 0 else elem.seqIndices[self.ranker.pos - 1] + 1
                endPos = len(elem.events) if self.ranker.pos == len(
                    words) else elem.seqIndices[self.ranker.pos]
                try:
                    foundAt = elem.getHashList(attr).index(
                        self.ranker.word, startPos, endPos)
                except ValueError:
                    # The event is not in this gap of the sequence.
                    seq0.incomingSequences.append(elem)
                    seq0.seqCount += elem.getVolume()
                else:
                    # sequence index value for the word being inserted. e.g. A-C-G seq indice 1,4,8
                    elem.seqIndices.insert(self.ranker.pos, foundAt)
                    seq1.incomingSequences.append(elem)
                    seq1.seqCount += elem.getVolume()
            # Positions of the new event, used for the index statistics.
            posArr = [matched.seqIndices[self.ranker.pos]
                      for matched in seq1.incomingSequences]
            seq1.setPositions(posArr)
            seq0.setSeqCount(Sequence.getSeqVolume(seq0.incomingSequences))
            seq1.setSeqCount(Sequence.getSeqVolume(seq1.incomingSequences))
            seq0.sequences = seq0.incomingSequences
            seq1.sequences = seq1.incomingSequences

            print(f'Not contain: {len(seq0.incomingSequences)}')
            print(f'contain: {len(seq1.incomingSequences)}')

        return self.ranker.word, self.ranker.pos, self.ranker.count, seq0, seq1

# Helper Functions

In [None]:
# Helper function to return a data frame
# Local is boolean, if local then source should be path to the file
# Otherwise it should be a URL to the file
def getDataframe(src, local=False, sep="\t", header=None):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        src: path to the file when ``local`` is true, otherwise a URL.
        local: whether ``src`` is a local path.
        sep: column separator passed to ``pandas.read_csv``.
        header: list of column names to use when the file has no header
            row. ``None`` or an empty list means the file supplies its own.

    Returns:
        The parsed DataFrame.
    """
    import io

    # sep must be passed by keyword: pandas 2.x rejects it positionally.
    readKwargs = {"sep": sep}
    if header:
        readKwargs["names"] = header

    if local:
        if not header:
            print(src)
        return pd.read_csv(src, **readKwargs)

    # To force a dropbox link to download change the dl=0 to 1
    if "dropbox" in src:
        src = src.replace('dl=0', 'dl=1')
    req = requests.get(src)
    # Parse the download from memory instead of a temporary 'data.txt', so
    # nothing in the working directory is overwritten or left behind.
    return pd.read_csv(io.BytesIO(req.content), **readKwargs)
    
# Helper function for generateSequence to use when sorting events to get what time field to sort by
# Also used in splitSequences to give the time of an event when splitting the events up

def getTimeToSortBy(evt):
    """Return the time value an event is ordered by.

    Interval events are ordered by the first entry of their ``time`` pair;
    every other event is ordered by its ``timestamp``.
    """
    return evt.time[0] if isinstance(evt, IntervalEvent) else evt.timestamp


    
# Helper to insert an event into a map
# Params are key=unique id for that time, map of key to event list, event object
def insertEventIntoDict(key, dictionary, event):
    """Append ``event`` to the list stored under ``key``, creating the list
    when the key is not present yet.
    """
    dictionary.setdefault(key, []).append(event)



# Event Aggregation
For aggregateEventsRegex and aggregateEventsDict, see what the files are expected to look like in the repo in DataModel/testFiles

In [None]:
# Helper function to run the mappings file as a dictionary
def give_dictionary_of_mappings_file(fileName):
    """Read a mappings file into a dictionary.

    Non-empty lines are taken in pairs: the first line of each pair is the
    key and the second is its value. Empty lines are ignored.

    Args:
        fileName: path of the mappings file.

    Returns:
        dict mapping each odd non-empty line to the line after it.

    Raises:
        ValueError: if the number of non-empty lines is odd.
    """
    # The with-block closes the file even when reading fails.
    with open(fileName, "r") as file:
        mappings = [line for line in file.read().split("\n") if line]
    if len(mappings) % 2 != 0:
        raise ValueError("There must be an even number of lines in the mappings file.")
    # Pair lines 0,2,4,... with lines 1,3,5,...
    return dict(zip(mappings[0::2], mappings[1::2]))

# NOTE: this currently modifies the events in the eventList argument in place
# merge events by rules expressed in regular expressions. For example, in the highway incident dataset, we can 
# replace all events with the pattern “CHART Unit [number] departed” by “CHART Unit departed”. The argument 
# regexMapping can be a path pointing to a file defining such rules. We can assume each rule occupies two lines: 
# first line is the regular expression, second line is the merged event name 
def aggregateEventsRegex(eventList, regexMapping, attributeName):
    """Rename attribute values that match a regular expression.

    The rules are read from the file at ``regexMapping``. For every event,
    the first rule whose pattern matches the start of the attribute value
    replaces that value with the rule's merged name. Events are modified in
    place and the same list is returned.
    """
    rules = give_dictionary_of_mappings_file(regexMapping)
    for evt in eventList:
        attrs = evt.attributes
        current = attrs[attributeName]
        for pattern, mergedName in rules.items():
            if re.match(pattern, current) is None:
                continue
            # Only the first matching rule is applied.
            attrs[attributeName] = mergedName
            break
    return eventList
    
# NOTE: this currently modifies the events in the eventList argument in place
# merge events by a dictionary mapping an event name to the merged name. The argument nameDict can be a path 
# pointing to a file defining such a dictionary. We can assume each mapping occupies two lines: first line is the 
# original name, second line is the merged event name.    
def aggregateEventsDict(eventList, nameDict, attributeName):
    """Rename attribute values through an exact-name mapping.

    The mapping is read from the file at ``nameDict``. Every event whose
    attribute value is a key of the mapping gets the mapped name instead.
    Events are modified in place and the same list is returned.
    """
    renames = give_dictionary_of_mappings_file(nameDict)
    for evt in eventList:
        attrs = evt.attributes
        current = attrs[attributeName]
        if current not in renames:
            continue
        attrs[attributeName] = renames[current]
    return eventList

# Importing events functions

# Generating Sequences

In [None]:
# Load the sequence-braiding CSV as point events and wrap all of them in a single Sequence.
# NOTE(review): the positional 0 and "%m/%d/%y" are presumably the timestamp column index
# and its date format; confirm against EventStore.importPointEvents.
sequence_braiding_Es= EventStore()
sequence_braiding_Es.importPointEvents('../datasets/sequence_braiding/sequence_braiding_refined.csv', 0, "%m/%d/%y", sep=',', local=True)
#print(type(sequence_braiding))
seq=Sequence(sequence_braiding_Es.events, sequence_braiding_Es)
#Sequence.create_attr_dict([seq])
#seq.getEventPosition('Meal','Lunch')
#print(seq.getUniqueValueHashes('Meal'))
#print(seq.getHashList('Glucose'))
#print(seq.getValueHashes('Glucose'))
#print(seq.getEventsHashString('Glucose'))
#raw_seq=seq.convertToVMSPReadable('Meal')
#print(raw_seq)
#print(seq.getPathID())
#sequence_braiding[0].attributes.keys()
#print(sequence_braiding[0].getAttrVal('Meals'))
#print(sequence_braiding[0].type)
#for events in sequence_braiding:
#    print(events.getAttrVal('Meal'))


In [None]:
# Split the full sequence by "week", then collect each piece's 'Meal' event string:
# raw_seq joins them with newlines, raw_seq2 keeps them as a list.
seq_list=Sequence.splitSequences(seq, "week")
#seq_list=[]
#for seqs in sequence_braiding_split:
#    seq_list.append(Sequence(seqs))
    
#Sequence.create_attr_dict(seq_list)
raw_seq="\n".join( seqs.getEventsString('Meal') for seqs in seq_list)
raw_seq2=[seqs.getEventsString('Meal') for seqs in seq_list]

In [None]:
# Show the reverse attribute mapping stored for 'Meal'.
print(sequence_braiding_Es.reverseAttrDict['Meal'])

In [None]:
import csv
# Write one row per sequence string (id, text, count=1) to demo3.tsv.
# newline='' lets the csv module control line endings; without it, extra
# blank rows appear on platforms that translate '\n' on write.
with open('demo3.tsv', 'w', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    writer.writerow(["id", "text", "count"])
    for index, elem in enumerate(raw_seq2):
        writer.writerow([index, elem, 1])


In [None]:
# Show the newline-joined 'Meal' event strings.
print(raw_seq)

In [None]:
#vocabularies- can be emulated from attrdict
# itemset- keys of vocabularies
#count- seq volume


In [None]:
# Inspect the 'Meal' value of the first event of the first split sequence.
seq_list[0].events[0].getAttrVal('Meal')

In [None]:
# Pick the split sequences at positions 0, 1 and 5 for closer inspection.
indices=[0,1,5]
seq_sublist=[seq_list[index] for index in indices]

In [None]:
# Inspect the 'Meal' value of the eighth event of the third selected sequence.
seq_sublist[2].events[7].getAttrVal('Meal')

In [None]:
# Rebind raw_seq to the list of per-sequence 'Meal' hash strings.
raw_seq= [seqs.getEventsHashString('Meal') for seqs in seq_list]

In [None]:
# Mine the 'Meal' attribute: minimum support 5, maximum support = number of sequences.
# The root node receives every split sequence; expandSeqTree fills `graph`.
stm= SentenTreeMiner(minSup=5, maxSup=len(raw_seq))
#cfm.truncateSequences(self, seqs, hashval, evtAttr, node,trailingSeqSegs, notContain)
root=GraphNode()
root.incomingSequences=seq_list
graph=Graph()
visibleGroups=stm.expandSeqTree("Meal",root,  expandCnt=30, graph=graph)

In [None]:
# Dump the tree starting at root as JSON, converting nodes via GraphNode.jsonSerializeDump.
x=json.dumps(root, ensure_ascii=False, default=GraphNode.jsonSerializeDump, indent=1)
print(x)

In [None]:
# jsonpickle.encode already returns JSON text. Passing that text straight to
# json.dumps would encode it a second time (one quoted, escaped string), so
# parse it first and then pretty-print the resulting structure.
empJSON = jsonpickle.encode(graph, unpicklable=False)
#empJSON = jsonpickle.encode(graph)
print("Writing JSON Encode data into Python String")
employeeJSONData = json.dumps(json.loads(empJSON), indent=4)
print(employeeJSONData)

In [None]:
# Dump the graph (nodes and links) as JSON, converting objects via Graph.jsonSerializeDump.
x=json.dumps(graph, ensure_ascii=False, default=Graph.jsonSerializeDump, indent=1)
print(x)

In [None]:
"""
my_dict = {'name': 'flare',
           'children': [{'name': k,
                         'children': [{'name': child} for child in v]}
                            for k, v in my_defaultdict.items()]}

json_data = json.dumps(my_dict, indent=2)
"""

In [None]:
# Alternative dump: fall back to each object's __dict__ instead of jsonDefaultDump.
json.dumps(graph, default=lambda o: o.__dict__)

In [None]:
# Display the graph's list of links.
graph.links


In [None]:
# Display the graph's list of nodes.
graph.nodes

In [None]:
# Scratch check of heapq: it is a min-heap, so the pop returns the smallest tuple, (200, 1).
h=[]
#merge(h,key=lambda e:e[0],reverse=True)
heapq.heappush(h, (200, 1))
heapq.heappush(h, (300,2))
heapq.heappush(h, (400,3))
print(heapq.heappop(h))


In [None]:

# Scratch check of str.index with start/end bounds; 'c' is found at index 2.
"abcd".index('c',2,5)

In [None]:
# Reload with the sample dataset (four-digit-year dates) and wrap all events in one Sequence.
# This rebinds sequence_braiding_Es and seq from the earlier cells.
sequence_braiding_Es= EventStore()
sequence_braiding_Es.importPointEvents('../Sample_Dataset.csv', 0, "%m/%d/%Y", sep=',', local=True)
#print(type(sequence_braiding))
seq=Sequence(sequence_braiding_Es.events, sequence_braiding_Es)


In [None]:
# Split the sequence by "day" and join the per-sequence 'Events' hash strings with newlines.
# NOTE(review): an earlier cell calls Sequence.splitSequences; confirm that EventStore
# also exposes splitSequences.
seq_list=sequence_braiding_Es.splitSequences(seq, "day")
raw_seq="\n".join( seqs.getEventsHashString('Events') for seqs in seq_list)

In [None]:
# Show the reverse attribute mapping stored for 'Events'.
print(sequence_braiding_Es.reverseAttrDict['Events'])

In [None]:
# Mine the 'Events' attribute. The support bounds belong to the SentenTreeMiner
# constructor (minSup, maxSup); expandSeqTree only takes attr, rootNode,
# expandCnt and graph, so passing them there raised a TypeError.
stm = SentenTreeMiner(minSup=2, maxSup=len(seq_list) * 5)
#cfm.truncateSequences(self, seqs, hashval, evtAttr, node,trailingSeqSegs, notContain)
root = GraphNode()
root.incomingSequences = seq_list
graph = Graph()
visibleGroups = stm.expandSeqTree("Events", root, expandCnt=30, graph=graph)

In [None]:
# Dump the tree starting at root as JSON. The hook is named jsonSerializeDump;
# json_serialize_dump does not exist on GraphNode.
x = json.dumps(root, ensure_ascii=False, default=GraphNode.jsonSerializeDump, indent=1)
print(x)

In [None]:
# Reload with the coreflow paper test dataset and build sequences via generateSequence("Category").
# NOTE(review): generateSequence is defined outside this section; presumably it groups
# events by the given attribute. Confirm.
sequence_braiding_Es= EventStore()
sequence_braiding_Es.importPointEvents('../corelow_paper_test.csv', 1, "%m/%d/%y", sep=',', local=True)
#print(type(sequence_braiding))
seq=Sequence(sequence_braiding_Es.events, sequence_braiding_Es)
seq_list=sequence_braiding_Es.generateSequence("Category")

In [None]:
#seq_list=sequence_braiding_Es.splitSequences(seq, "day")
# Join each sequence's 'Event' string with newlines.
raw_seq="\n".join( seqs.getEventsString('Event') for seqs in seq_list)

In [None]:
# Show the newline-joined 'Event' strings.
print(raw_seq)

In [None]:
# Rebind raw_seq to the newline-joined 'Event' hash strings.
raw_seq="\n".join( seqs.getEventsHashString('Event') for seqs in seq_list)

In [None]:
# Show the reverse attribute mapping stored for 'Event'.
# The attribute is reverseAttrDict (set in EventStore.__init__); "reverseAttrDct" was a typo.
print(sequence_braiding_Es.reverseAttrDict['Event'])

In [None]:
# Mine the 'Event' attribute. The support bounds belong to the SentenTreeMiner
# constructor (minSup, maxSup); expandSeqTree only takes attr, rootNode,
# expandCnt and graph, so passing them there raised a TypeError.
stm = SentenTreeMiner(minSup=1, maxSup=len(seq_list) * 5)
#cfm.truncateSequences(self, seqs, hashval, evtAttr, node,trailingSeqSegs, notContain)
root = GraphNode()
root.incomingSequences = seq_list
graph = Graph()
visibleGroups = stm.expandSeqTree("Event", root, expandCnt=30, graph=graph)

In [None]:
# Dump the tree starting at root as JSON. The hook is named jsonSerializeDump;
# json_serialize_dump does not exist on GraphNode.
x = json.dumps(root, ensure_ascii=False, default=GraphNode.jsonSerializeDump, indent=1)
print(x)

In [None]:
# Dump the graph as JSON. The hook is named jsonSerializeDump;
# json_serialize_dump does not exist on Graph.
x = json.dumps(graph, ensure_ascii=False, default=Graph.jsonSerializeDump, indent=1)
print(x)

In [None]:
# graph.nodes holds RawNode objects, which json cannot encode on its own;
# the default hook converts them through jsonDefaultDump.
json.dumps(graph.nodes, default=Graph.jsonSerializeDump)

In [None]:
# vars(graph) is a dict of the links and nodes lists; their elements are custom
# objects, so the default hook is required to encode them.
json.dumps(vars(graph), default=Graph.jsonSerializeDump)