In [3]:
from datetime import datetime
import pandas as pd
import csv
import requests
import os
import re
from itertools import count
import numpy as np

from itertools import accumulate

from spmf import Spmf

# Event Representations

In [4]:
# A common class for all Events

class Event:
    def __init__(self, eventtype):
        self.type=eventtype
    
    #Return Attribute value given attribute name
    def getAttrVal(self, attrName):
        return self.attributes.get(attrName,"Not found")

    
# A class that represents a point event
class PointEvent(Event):
    def __init__(self, timestamp, attributes):
        Event.__init__(self,"point")
        #self.type = "point"
        self.timestamp = timestamp 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 
        
    

# class to represent an interval event
class IntervalEvent(Event):
    def __init__(self, t1, t2, attributes):
        Event.__init__(self,"interval")
        #self.type = "interval"
        self.time = [t1,t2] 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 

# Sequence Representations

In [17]:
class Sequence:
    _ids = count(0)
    attrdict={}
    def __init__(self, events, sid=None):
        if sid is None:
            self.sid = next(self._ids)
        else:
            self.sid = sid
        
        self.events = events
        self.volume=1
        self.seqAttributes={}
    def getEventPosition(self, attr, hash_val):
        for count,event in enumerate(self.events):
            if event.getAttrVal(attr)==hash_val:
                return count
        return -1
    
    def setVolume(self, intValue):
        self.volume=intValue
        
    def getVolume():
        return self.volume
    
    def increaseVolume():
        self.volume += 1 
        
    def getUniqueValueHashes(self, attr):
        l=list(set(event.getAttrVal(attr) for event in self.events))
        return l
    
    #Not sure this will always result in same index, will change if 
    #dictionary is updated
    #since python is unordered
    
    def getHashList(self, attr):
        l=list(list(event.attributes.keys()).index(attr) for event in self.events)
        return l
    
    def getValueHashes(self, attr):
        l=list(event.getAttrVal(attr) for event in self.events)
        return l
    
    def getEventsHashString(self, attr):
        s=attr+": "
        #for count,event in enumerate(self.events):
        #    s+=str(event.getAttrVal(attr))+" "
        s+=" ".join(str(event.getAttrVal(attr)) for event in self.events)
        return s
    
    def convertToVMSPReadable(self, attr):
        s=" -1 ".join(str(event.getAttrVal(attr)) for event in self.events)
        #s=""
        #for count,event in enumerate(self.events):
        #    s+=str(event.getAttrVal(attr))+" -1 "
        s+=" -2"
        
        return s
    
    def getPathID(self):
        return self.sid
    
    def matchPathAttribute(self, attr, val):
        # should i use eq?!
        if this.seqAttributes.get(attr)==(val):
            return True
        else:
            return False
        
    def setSequenceAttribute(self,attr, value):
        self.seqAttributes[attr]=value
        
    
    def create_attr_dict(seqList):
        attr_list=seqList[0].events[0].attributes.keys()
        print(attr_list)
        
        for attr in attr_list:
            a=0
            unique_list=[]
            for sequences in seqList:
                unique_list.extend(sequences.getUniqueValueHashes(attr))
            unique_list=list(set(unique_list))
            unique_list.clear()
            
            unicode_dict={}
            for uniques in unique_list:
                unicode_dict[uniques]=chr(a)
                a=a+1
            self.attrdict[attr]=unicode_dict
            unicode_dict.clear()    
            

    # equivalent to method signature public static int getVolume(List<Sequence> seqs)    
    def getSeqVolume(seqlist):
        return sum(seq.getVolume() for seq in seqlist)
    

# Pattern Representation

In [6]:
class Pattern:
    _pids = count(1)

    def __init__(self, events=[]):
        #pattern id
        self.id = next(self._pids)
        
        self.keyEvts = events
        
        self.medianPos=[]
        self.meanPos=[]
        
        self.sids=[]
        
        self.support=0
        self.supPercent=None
        self.cluster=None
        self.medianPathLength=0
        self.meanPathLength=0
        
        self.parentSegment=None
        self.segSizes=None
        
    def filterPaths(self, paths, evtType):
        print("filtering "+ str(len(paths))+" paths by "+str(len(self.keyEvts))+" checkpoints")
        
        for sequences in paths:
            if(self.matchMilestones(sequences.getValueHashes(evtType),self.keyEvts)==False):
                continue
            self.sids.append(sequences)
            
        print(str(len(self.sids))+" matching paths")

        
    def matchMilestones(self, arr, milestones):
        ja=arr
        idx=-1
        for elems in milestones:
            try:
                idx=arr[idx+1:].index(elems)
                print(idx)
            except ValueError:
                return False
        return True
    
    def getMedianSpacing(self):
        l=[y - x for x,y in zip(self.medianPos,self.medianPos[1:])]
        if(len(l)<=1):
            return 100
        l=l.sort()
        middle=int(len(l)/2)
        if(len(l)%2==0):
            return ((l[middle-1]+l[middle])/2.0)
        else:
            return l[middle]
        return np.median(np.asarray(l))
    
    def addKeyEvent(self, hashval):
        self.keyEvts.append(hashval)
        
    def addToSupportSet(self, seq):
        self.sids.append(seq)
        self.support+=seq.getVolume()
        
    def getSequences(self):
        return self.sids
    
    def setMedianPathLength(self, median):
        self.medianPathLength=median
    
    def setMeanPathLength(self, mean):
        self.meanPathLength=mean
        
    def getMedianPathLength(self):
        return self.medianPathLength
    
    def getMeanPathLength(self):
        return self.meanPathLength
    
    def getEvents(self):
        return self.keyEvts
    
    def getEventMeanPos(self):
        return self.meanPos
    
    def getEventMedianPos(self):
        return self.medianPos
    
    #Do we need to preserve order here??

    def getUniqueEventsString(self):
        #return "-".join(str(x) for x in list(set(self.keyEvts)))
        #return "-".join(str(x) for x in list(dict.fromkeys(self.keyEvts)))
        return "-".join(str(x) for x in self.keyEvts)
    
    def getPositions(self, events, path):
        sequence=path
        pos=[]
        idx=-1
        offset=0
        
        for elems in events:
            
            offset+=idx+1
            try:
                idx=path[offset:].index(elems)
            except ValueError:
                continue
            pos.append(offset+idx)
        return pos
    
    def getMedian(self, data):
        #middle=len(data)/2
        #if(len(data)%2==0 and len(data)>1):
        #    return (data[middle-1]+data[middle])/2.0
        #else: 
        #    return data[middle]
        return np.median(data)
    
    def computePatternStats(self, evtAttr):
        pathsOfStrings=[]
        for path in self.sids:
            pageSequence=path.getHashList(evtAttr)
            pathsOfStrings.append(pageSequence)
            
        medians=[]
        means=[]
        
        for i,events in enumerate(self.keyEvts):
            numSteps=[]
            
            for idx,paths in enumerate(pathsOfStrings):
                if(self.matchMilestones(paths, self.keyEvts[0,i+1])):
                    pos=self.getPositions(self.keyEvts[0,i+1], paths)
                    if i==0:
                        #add position value of first element id sequence
                        numSteps.append(pos[i])
                    else:
                        #in other cases add the difference
                        numSteps.append(pos[i]-pos[i-1])
            sum_steps=sum(numSteps)
            
            median= self.getMedian(numsteps)
            
            medians.append(median)
            means.append(sum_steps*1.0/ numSteps.size())
                
            
                
        #list(accumulate(means))
        means=np.cumsum(np.asarray(means))
        medians=np.cumsum(np.asarray(medians))
        
        self.setMedianPositions(medians)
        self.setMeanPositions(means)
        
        trailingSteps=[0]*len(self.sids)
        for i,path in enumerate(self.sids):
            pos=self.getPositions(self.keyEvts, path.getHashList(evtAttr))
            trailingSteps[i]= len(path.events)- pos[-1]
        
        trailStepSum=sum(trailingSteps)
        median= self.getMedian(trailingSteps)
        mean= trailStepSum/len(trailingSteps)
        
        self.setMedianPathLength(median+medians[-1])
        self.setMeanPathLength(mean+means[-1])
                                  
    def getMedianPositions(self, allPos, pids):
        median=[]
        for k in range(0, len(pid)):
            posInPaths=allPos[k]
            median.append(self.getMedian(posInPaths))
        #return list(self.getMedian(posInPaths) for posInPaths in allPos)
        return median
    
    def getMeanPositions(self, allPos, pids):
        mean=[]
        for k in range(0, len(allPos)):
            mean.append(sum(allPos[k])*1.0/(len(allPos[k])))
        return mean
    
    def setMedianPositions(self, median):
        self.medianPos=median
        
    def setMeanPositions(self, mean):
        self.meanPos=mean
        
    def toJson(self):
        return json.dumps(self, default=lambda o: o.__dict__)#,sort_keys=True, indent=4)
    
    def getSupport(self):
        return this.support
    
    def setCluster(self, cluster):
        self.cluster=cluster
        
    def setParent(self, parent, segment):
        self.parent=parent
        self.parentSegment=segment
    
    
    # How to implement this with BitArray?
    #def getEventBitSet(self)
    
    def getParent(self):
        return self.parent
    
    def getParentSegment(self):
        return self.parentSegment
    
    def setMeanPathLength(self,d):
        self.meanPathLength=d
    
    def getMeanPathLength(self):
        return self.meanPathLength
        
    def setSupport(self, sup, total):
        self.support=sup
        self.supPercent= sup*1.0/total
    

# FlowNode Representation

In [7]:
class FlowNode:
    NID=count(1)
    nodeHash={}
    
    def __init__(self, name="", count=0, value=""):
        self.nid=next(self.NID)
        self.name=name
        self.seqCount=count
        self.value=value
        self.hash=-1
        self.pos=[]
        self.meanStep=0
        self.medianStep=0
        self.zipCompressRatio=0
        self.incomingBranchUniqueEvts=None
        self.incomingBranchSimMean=None
        self.incomingBranchSimMedian=None
        self.incomingBranchSimVariance=None
        
        self.incomingSequences=[]
        self.outgoingSequences=[]
        
        self.meanRelTimestamp=0
        self.medianRelTimestamp=0
        
        nodeHash[self.nid]=self
        
        
    def getNode(self, node_id):
        return nodeHash[node_id]
    
    def clearHash(self):
        nodeHash.clear()
        
    def getIncomingSequences(self):
        return self.incomingSequences
    
    def getSeqCount(self):
        return self.seqCount
    
    def setSeqCount(self, seqCount):
        self.seqCount=seqCount
        
    def getName(self):
        return self.name
    
    def setName(self, name):
        self.name=name
        
    def getMeanStep(self):
        return self.meanStep
    
    def toString(self):
        return self.name+": "+self.seqCount
    
    def setPositions(self, l):
        self.pos=l
        self.sort(l)
        
        
    

# Importing events functions

In [8]:
class EventStore:

    # Helper function to return a data frame
    # Local is boolean, if local then source should be path to the file
    # Otherwise it should be a URL to the the file

    def get_dataframe( src, local=False, sep="\t", header=[]):
        if not local:
            # To force a dropbox link to download change the dl=0 to 1
            if "dropbox" in src:
                src = src.replace('dl=0', 'dl=1')
            # Download the CSV at url
            req = requests.get(src)
            url_content = req.content
            csv_file = open('data.txt', 'wb') 
            csv_file.write(url_content)
            csv_file.close()
            # Read the CSV into pandas
            # If header list is empty, the dataset provides header so ignore param
            if not header:
                df = pd.read_csv("data.txt", sep)
            #else use header param for column names
            else:
                df = pd.read_csv("data.txt", sep, names=header)
            # Delete the csv file
            os.remove("data.txt")
            return df
        # Dataset is local
        else:
            # If header list is empty, the dataset provides header so ignore param
            if not header:
                df = pd.read_csv(src, sep)
            # else use header param for column names
            else:
                df = pd.read_csv(src, sep, names=header)
            return df



    # Returns a list of event objects
    # src is a url or directory path, if local is false its url else its path
    # header is list of column names if they are not provided in the dataset
    # The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
    # I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
    # for those cases
    
    def importPointEvents(src, timestampColumnIdx, timeFormat, sep='\t', local=False, header=[], df=None):
        events = []
        # if the df is not provided
        if df is None:
            df = EventStore.get_dataframe(src, local, sep, header)
        cols = df.columns
        # For each event in the csv construct an event object
        for row in df.iterrows():
            data = row[1]
            attribs = {}
            timestamp = datetime.strptime(data[timestampColumnIdx], timeFormat)
            # for all attributes other tahn time, add them to attributes dict
            for i in range(len(data)):
                if i != timestampColumnIdx:
                    attribs[cols[i]] = data[i]
            # use time stamp and attributes map to construct event object
            e = PointEvent(timestamp, attribs)
            events.append(e)
        return events

    # Returns a list of event objects
    # src is a url or directory path, if local is false its url else its path
    # The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
    # I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
    # for those cases
    
    def importIntervalEvents( src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep="\t", local=False, header=[], df=None):
        events = []
        # if the df is not provided
        if df is None:
            df = get_dataframe(src, local, sep, header)
        cols = df.columns
        # For each event in the csv construct an event object
        for row in df.iterrows():
            data = row[1]
            attribs = {}
            # create datetime object for the start and end times of the event
            t1 = datetime.strptime(data[startTimeColumnIdx], timeFormat)
            t2 = datetime.strptime(data[endTimeColumnIdx], timeFormat)
            # for all attributes other than times, add them to attributes dict
            for i in range(len(data)):
                if i != startTimeColumnIdx and i != endTimeColumnIdx:
                    attribs[cols[i]] = data[i]
            # use time stamp and attributes map to construct event object
            e = IntervalEvent(t1, t2, attribs)
            events.append(e)
        return events

    # Import a dataset that has both interval and point events
    # Returns a list of event objects
    # src is a url or directory path, if local is false its url else its path
    # The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
    # I thought the simplest thing was just to give this function the df and then use that instead of calling my helper

    def importMixedEvents(src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep="\t", local=False, header=[], df=None):
        events = []
        # if the df is not provided
        if df is None:
            df = get_dataframe(src, local, sep, header)
        cols = df.columns
        # For each event in the csv construct an event object
        for row in df.iterrows():
            data = row[1]
            attribs = {}
            # create datetime object for timestamp (if point events) or t1 and t2 (if interval event)
            # If the endTimeColumnIdx value is NaN ie a float instead of a time string then its a point event
            if type(data[endTimeColumnIdx]) is float:
                t = datetime.strptime(data[startTimeColumnIdx], timeFormat)
                event_type = "point"
            # Otherwise its an interval event
            else:
                t1 = datetime.strptime(data[startTimeColumnIdx], timeFormat)
                t2 = datetime.strptime(data[endTimeColumnIdx], timeFormat)
                event_type = "interval"
            # for all attributes other than times, add them to attributes dict
            ignore=[startTimeColumnIdx, endTimeColumnIdx] # list of indices to be ignored
            attribute_columns = [ind for ind in range(len(data)) if ind not in ignore]
            for i in attribute_columns:
                attribs[cols[i]] = data[i]
            # use time stamp (or t1 and t2) and attributes map to construct event object
            if event_type == "point":
                e = PointEvent(t, attribs)
            else:
                e = IntervalEvent(t1, t2, attribs)
            events.append(e)
        return events
    
    
    # Group events by attributeName, and order them by timestamp
    
    def generateSequence(eventList, attributeName):
        grouped_by = {}
        # Sort the event list
        eventList = sorted(eventList, key=get_time_to_sort_by)
        for event in eventList:
            value = event.attributes[attributeName]
            # If have seen this value before, append it the list of events in grouped_by for value
            if value in grouped_by:
                grouped_by[value].append(event)
            # otherwise store a new list with just that event
            else:
                grouped_by[value] = [event]
        return list(grouped_by.values())
    
    # Helper to insert an event into a map
    # Params are key=unique id for that time, map of key to event list, event object
    
    def insert_event_into_dict(key, dictionary, event):
        if key in dictionary:
            dictionary[key].append(event)
        else:
            dictionary[key] = [event]
            
    # Helper function for generateSequence to use when sorting events to get what time field to sort by
    # Also used in splitSequences to give the time of an event when splitting the events up

    def get_time_to_sort_by(e):
        # Sort by starting time of event if its an interval event
        if type(e) == IntervalEvent:
            return e.time[0]
        # Otherwise use the timestamp
        else:
            return e.timestamp


    # Split a long sequence into shorter ones by timeUnit. For example, a sequence may span several days and we want to 
    # break it down into daily sequences. The argument timeUnit can be one of the following strings: “hour”, “day”, 
    # “week”, “month”, “quarter”, and “year”.
    # For interval events I used the start time of the event to determine its category when splitting it
    
    def splitSequences(sequenceList, timeUnit, record=None):
        results = []
        timeUnit = timeUnit.lower()
        # Check if the time unit is a valid argument
        valid_time_units = ["hour", "day", "week", "month", "quarter", "year"]
        if timeUnit not in valid_time_units:
            raise ValueError("timeUnit must be hour, day, week, month, quarter, or year")
        # Sort the events by the timestamp or event start time
        sequenceList = sorted(sequenceList, key=EventStore.get_time_to_sort_by)

        # Process the event sequence based on the given time unit
        # Generally, create a map for that time unit and then add each event into that map 
        # (key=time such as May 2021 in case of month, value=sequence) and then return the values of the map as a list
        if timeUnit == "hour":
            hours = {}
            for event in sequenceList:
                time = EventStore.get_time_to_sort_by(event)
                key = (time.hour, time.day, time.month, time.year)
                EventStore.insert_event_into_dict(key,hours,event)
                if record is None:
                    event.attributes["record"]=' '.join([str(k) for k in key])
                else:
                    event.attributes[record]=str(event.attributes[record])+"_"+' '.join([str(k) for k in key])
            results = list(hours.values())

        elif timeUnit == "day":
            days = {}
            for event in sequenceList:
                time = EventStore.get_time_to_sort_by(event)
                key = (time.day, time.month, time.year)
                EventStore.insert_event_into_dict(key,days,event)
                #print(days)
                if record is None:
                    event.attributes["record"]=datetime(*(key[::-1])).strftime("%Y%m%d")
                else:
                    event.attributes[record]=str(event.attributes[record])+"_"+datetime(*(key[::-1])).strftime("%Y%m%d")
            results = list(days.values())

        elif timeUnit == "month":
            months = {}
            for event in sequenceList:
                time = EventStore.get_time_to_sort_by(event)
                key = (time.month,time.year)
                EventStore.insert_event_into_dict(key,months,event)
                if record is None:
                    event.attributes["record"]=str(key[0])+str(key[1])
                else:
                    event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+str(key[1])
            results = list(months.values())

        elif timeUnit == "week":
            weeks = {}
            for event in sequenceList:
                time = EventStore.get_time_to_sort_by(event)
                year = time.year
                week_num = time.isocalendar()[1]
                key = (year,week_num)
                EventStore.insert_event_into_dict(key,weeks,event)
                if record is None:
                    event.attributes["record"]=str(key[0])+"W"+str(key[1])
                else:
                    event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+"W"+str(key[1])
            results = list(weeks.values())

        elif timeUnit == "year":
            years = {}
            for event in sequenceList:
                time = EventStore.get_time_to_sort_by(event)
                key = time.year
                EventStore.insert_event_into_dict(key,years,event)
                if record is None:
                    event.attributes["record"]=str(key)
                else:
                    event.attributes[record]=str(event.attributes[record])+"_"+str(key)
            results = list(years.values())

        elif timeUnit == "quarter":
            quarters = {}
            for event in sequenceList:
                time = EventStore.get_time_to_sort_by(event)
                year = time.year
                month = time.month
                # Determine the year, quarter pair/key for quarter dict
                # January, February, and March (Q1)
                if month in range(1, 4):
                    key = (year, "Q1")
                # April, May, and June (Q2)
                elif month in range(4, 7):
                    key = (year, "Q2")
                # July, August, and September (Q3)
                elif month in range(7,10):
                    key = (year, "Q3")
                # October, November, and December (Q4)
                elif month in range(10,13):
                    key = (year, "Q4")
                # Put the event in the dictionary
                EventStore.insert_event_into_dict(key,quarters,event)
                if record is None:
                    event.attributes["record"]=str(key[0])+str(key[1])
                else:
                    event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+str(key[1])
            results = list(quarters.values())

        return results   

# Generating Sequences

# Event Aggregation
For aggregateEventsRegex and aggregateEventsDict, see what the files are expected to look like in the repo in DataModel/testFiles

In [9]:
# Helper function to run the mappings file as a dictionary
def give_dictionary_of_mappings_file(fileName):
    # Open the file and split the contents on new lines
    file = open(fileName, "r")
    mappings = file.read().split("\n")
    file.close()
    # Remove any empty strings from the list of mappings
    mappings = list(filter(None, mappings))
    # Raise an error if there is an odd number of items in mapping
    if (len(mappings) % 2) != 0:
        raise ValueError("There must be an even number of lines in the mappings file.")
    # Create a dictionary based on read in mappings
    aggregations = {}
    for i in range(len(mappings)):
        if i % 2 == 0:
            aggregations[mappings[i]] = mappings[i+1]
    #print(aggregations)
    return aggregations

# NOTE: this current modifies the events in eventList argument
# merge events by rules expressed in regular expressions. For example, in the highway incident dataset, we can 
# replace all events with the pattern “CHART Unit [number] departed” by “CHART Unit departed”. The argument 
# regexMapping can be a path pointing to a file defining such rules. We can assume each rule occupies two lines: 
# first line is the regular expression, second line is the merged event name 
def aggregateEventsRegex(eventList, regexMapping, attributeName): 
    aggregations = give_dictionary_of_mappings_file(regexMapping)
    for event in eventList:
        # Get the attribute value of interest
        attribute_val = event.attributes[attributeName]
        # For all the regexes
        for regex in aggregations.keys():
            # If its a match then replace the attribute value for event with
            if re.match(regex, attribute_val):
                event.attributes[attributeName] = aggregations[regex]
                break
    return eventList
    
# NOTE: this current modifies the events in eventList argument
# merge events by a dictionary mapping an event name to the merged name. The argument nameDict can be a path 
# pointing to a file defining such a dictionary. We can assume each mapping occupies two lines: first line is the 
# original name, second line is the merged event name.    
def aggregateEventsDict(eventList, nameDict, attributeName):
    aggregations = give_dictionary_of_mappings_file(nameDict)
    # Iterate over all events and replace evevnts in event list with updated attribute name
    # if directed to by given mappings
    for event in eventList:
        # Get the attribute value of interest
        attribute_val = event.attributes[attributeName]
        # If the attribute value has a mapping then replace the event's current value with the one in give map
        if attribute_val in aggregations:
            
            event.attributes[attributeName] = aggregations[attribute_val]
    return eventList

In [10]:
sequence_braiding = EventStore.importPointEvents('../datasets/sequence_braiding_refined.csv', 0, "%m/%d/%y", sep=',', local=True)
#print(type(sequence_braiding))
seq=Sequence(sequence_braiding)
#seq.getEventPosition('Meal','Lunch')
#print(seq.getUniqueValueHashes('Meal'))
#print(seq.getHashList('Glucose'))
print(seq.getValueHashes('Glucose'))
#print(seq.getEventsHashString('Glucose'))
raw_seq=seq.convertToVMSPReadable('Glucose')
print(seq.convertToVMSPReadable('Glucose'))
#print(seq.getPathID())
#sequence_braiding[0].attributes.keys()
#print(sequence_braiding[0].getAttrVal('Meals'))
#print(sequence_braiding[0].type)
#for events in sequence_braiding:
#    print(events.getAttrVal('Meal'))


[38, 233, 52, 67, 309, 66, 80, 168, 171, 56, 116, 64, 172, 68, 69, 131, 74, 161, 189, 181, 173, 198, 217, 84, 93, 141, 55, 73, 191, 82, 102, 288, 69, 239, 199, 89, 64, 63, 78, 77, 104, 67, 102, 76, 76, 147, 69, 61, 68, 44, 172, 74, 64, 65, 59, 59, 76, 88, 73, 94, 58, 69, 89, 149, 175, 138, 154, 63, 67, 171, 60, 109, 71, 72, 82, 56, 80, 80, 65, 134, 112, 65, 75, 102, 153, 103, 83, 76, 134, 81, 75, 98, 76, 113, 69, 78, 80, 66, 81, 81, 122, 77, 106, 95, 73, 58, 56, 87, 174, 147, 114, 153, 115, 126, 76, 67, 128, 65, 56, 96, 65, 71, 109, 58, 176, 151, 172, 214, 78, 199, 234, 57, 123, 140, 224, 148, 69, 132, 66, 49, 46, 59, 66, 357, 137, 155, 142, 156, 98, 84, 118, 111, 108, 76, 116, 137, 90, 100, 87, 105, 58, 113, 140, 198, 162, 180, 90, 66, 83, 51, 158, 57, 85, 57, 50, 204, 121, 112, 54, 62, 72, 172, 178, 84, 145, 77, 198, 175, 68, 107, 88, 125, 58, 175, 172, 174, 124, 205, 69, 66, 56, 94, 264, 237, 136, 121, 66, 173, 113, 49, 151, 84, 135, 162, 259, 67, 93, 318, 48, 132, 76, 72, 75, 71, 2

In [11]:
sequence_braiding_split=EventStore.splitSequences(sequence_braiding, "week")
seq_list=[]
for seqs in sequence_braiding_split:
    seq_list.append(Sequence(seqs))
    
raw_seq="\n".join( seqs.convertToVMSPReadable('Glucose') for seqs in seq_list)

In [12]:
print(raw_seq)

63 -1 62 -1 161 -1 176 -1 81 -1 68 -1 53 -1 76 -1 258 -1 124 -1 74 -1 70 -1 106 -1 78 -1 46 -1 54 -1 152 -1 61 -1 68 -1 68 -1 164 -2
66 -1 102 -1 297 -1 59 -1 104 -1 58 -1 81 -1 140 -1 196 -1 53 -1 174 -1 232 -1 306 -1 306 -1 227 -1 190 -1 240 -1 63 -1 76 -1 135 -1 135 -2
82 -1 86 -1 200 -1 61 -1 61 -1 117 -1 122 -1 182 -1 90 -1 193 -1 82 -1 82 -1 94 -1 150 -1 131 -1 75 -1 217 -1 74 -1 130 -1 113 -1 175 -2
133 -1 231 -1 70 -1 86 -1 154 -1 58 -1 275 -1 74 -1 227 -1 118 -1 52 -1 122 -1 156 -1 103 -1 64 -1 226 -1 160 -1 145 -2
77 -1 90 -1 219 -1 161 -1 241 -1 141 -1 87 -1 318 -1 109 -1 158 -1 321 -1 258 -1 55 -1 63 -1 61 -1 108 -1 239 -1 82 -1 67 -1 73 -1 86 -1 70 -1 292 -1 148 -2
82 -1 159 -1 246 -1 119 -1 74 -1 235 -1 290 -1 248 -1 210 -1 74 -1 91 -1 154 -1 174 -1 190 -1 100 -1 113 -1 378 -1 253 -1 167 -1 126 -1 139 -1 155 -1 154 -1 153 -1 105 -1 165 -1 132 -2
142 -1 155 -1 191 -1 192 -1 69 -1 60 -1 56 -1 42 -1 85 -1 248 -1 114 -1 137 -1 94 -1 105 -1 261 -1 174 -1 60 -1 60 -1 123 -1 63 

In [13]:
pat=Pattern([233,309,106,166])
print(pat.keyEvts)
#print(pat.filterPaths([seq],'Glucose'))
#print(pat.getUniqueEventsString())
print(pat.getPositions([233,309,80,168],seq.getValueHashes('Glucose')))

[233, 309, 106, 166]
[1, 4, 6, 7]


In [14]:
input_example_raw = """1 -1 1 2 3 -1 1 3 -1 4 -1 3 6 -1 -2
1 4 -1 3 -1 2 3 -1 1 5 -1 -2
5 6 -1 1 2 -1 4 6 -1 3 -1 2 -1 -2
5 -1 7 -1 1 6 -1 3 -1 2 -1 3 -1 -2
"""

In [15]:
spmf = Spmf("VMSP", spmf_bin_location_dir="./test_files/", input_direct=raw_seq,
            output_filename="output.txt", arguments=[0.2])

FileNotFoundError: spmf.jar not found. Please use the spmf_bin_location_dir argument.

In [None]:
spmf.run()

In [None]:
print(spmf.to_pandas_dataframe(pickle=True))

In [None]:
# pip install spmf