In [1]:
import csv
import os
import re
from datetime import datetime

import pandas as pd
import requests

In [2]:
from ipynb.fs.full.Data_Model import get_dataframe

KeyboardInterrupt: 

# Event Representations

In [3]:
# A class that represents a point event
class PointEvent:
    def __init__(self, timestamp, attributes):
        self.type = "point"
        self.timestamp = timestamp 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 

# class to represent an interval event
class IntervalEvent:
    def __init__(self, t1, t2, attributes):
        self.type = "interval"
        self.time = [t1,t2] 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 

# Importing events functions

In [4]:
# Helper function to return a data frame
# Local is boolean, if local then source should be path to the file
# Otherwise it should be a URL to the the file
def get_dataframe(src, local=False, sep="\t", header=[]):
    if not local:
        # To force a dropbox link to download change the dl=0 to 1
        if "dropbox" in src:
            src = src.replace('dl=0', 'dl=1')
        # Download the CSV at url
        req = requests.get(src)
        url_content = req.content
        csv_file = open('data.txt', 'wb') 
        csv_file.write(url_content)
        csv_file.close()
        # Read the CSV into pandas
        # If header list is empty, the dataset provides header so ignore param
        if not header:
            df = pd.read_csv("data.txt", sep)
        #else use header param for column names
        else:
            df = pd.read_csv("data.txt", sep, names=header)
        # Delete the csv file
        os.remove("data.txt")
        return df
    # Dataset is local
    else:
        # If header list is empty, the dataset provides header so ignore param
        if not header:
            df = pd.read_csv(src, sep)
        # else use header param for column names
        else:
            df = pd.read_csv(src, sep, names=header)
        return df

# Returns a list of event objects
# src is a url or directory path, if local is false its url else its path
# header is list of column names if they are not provided in the dataset
# The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
# I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
# for those cases
def importPointEvents(src, timestampColumnIdx, timeFormat, sep='\t', local=False, header=[], df=None):
    events = []
    # if the df is not provided
    if df is None:
        df = get_dataframe(src, local, sep, header)
    cols = df.columns
    # For each event in the csv construct an event object
    for row in df.iterrows():
        data = row[1]
        attribs = {}
        timestamp = datetime.strptime(data[timestampColumnIdx], timeFormat)
        # for all attributes other tahn time, add them to attributes dict
        for i in range(len(data)):
            if i != timestampColumnIdx:
                attribs[cols[i]] = data[i]
        # use time stamp and attributes map to construct event object
        e = PointEvent(timestamp, attribs)
        events.append(e)
    return events

# Returns a list of event objects
# src is a url or directory path, if local is false its url else its path
# The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
# I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
# for those cases
def importIntervalEvents(src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep="\t", local=False, header=[], df=None):
    events = []
    # if the df is not provided
    if df is None:
        df = get_dataframe(src, local, sep, header)
    cols = df.columns
    # For each event in the csv construct an event object
    for row in df.iterrows():
        data = row[1]
        attribs = {}
        # create datetime object for the start and end times of the event
        t1 = datetime.strptime(data[startTimeColumnIdx], timeFormat)
        t2 = datetime.strptime(data[endTimeColumnIdx], timeFormat)
        # for all attributes other than times, add them to attributes dict
        for i in range(len(data)):
            if i != startTimeColumnIdx and i != endTimeColumnIdx:
                attribs[cols[i]] = data[i]
        # use time stamp and attributes map to construct event object
        e = IntervalEvent(t1, t2, attribs)
        events.append(e)
    return events

# Import a dataset that has both interval and point events
# Returns a list of event objects
# src is a url or directory path, if local is false its url else its path
# The foursquare datasets are all using a differnet encoding that pandas cannot auto identify so for those
# I thought the simplest thing was just to give this function the df and then use that instead of calling my helper
def importMixedEvents(src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep="\t", local=False, header=[], df=None):
    events = []
    # if the df is not provided
    if df is None:
        df = get_dataframe(src, local, sep, header)
    cols = df.columns
    #print(df)
    # For each event in the csv construct an event object
    for row in df.iterrows():
        data = row[1]
        #print(data)
        attribs = {}
        # create datetime object for timestamp (if point events) or t1 and t2 (if interval event)
        # If the endTimeColumnIdx value is NaN ie a float instead of a time string then its a point event
        if type(data[endTimeColumnIdx]) is float:
            t = datetime.strptime(data[startTimeColumnIdx], timeFormat)
            event_type = "point"
        # Otherwise its an interval event
        else:
            t1 = datetime.strptime(data[startTimeColumnIdx], timeFormat)
            t2 = datetime.strptime(data[endTimeColumnIdx], timeFormat)
            event_type = "interval"
        # for all attributes other than times, add them to attributes dict
        ignore=[startTimeColumnIdx, endTimeColumnIdx] # list of indices to be ignored
        attribute_columns = [ind for ind in range(len(data)) if ind not in ignore]
        for i in attribute_columns:
            attribs[cols[i]] = data[i]
        # use time stamp (or t1 and t2) and attributes map to construct event object
        if event_type == "point":
            e = PointEvent(t, attribs)
        else:
            e = IntervalEvent(t1, t2, attribs)
        events.append(e)
    return events

# Loading in datasets

In [14]:
# VAST mini challenge dataset
url = "http://vacommunity.org/tiki-download_file.php?fileId=492"
vast = importPointEvents(url, 0, '%Y-%m-%d %H:%M:%S', sep=',', local=False)

# Sequence braiding
# NOTE: I deleted the last 4 rows of the dataset before loading it in since they did not look like evevnts 
# NOTE: this data set only had dates not times
sequence_braiding = importPointEvents('../datasets/sequence_braiding_refined.csv', 0, "%m/%d/%y", sep=',', local=True)

# Foursquare NYC
header = ["User ID", "Venue ID", "Venue category ID", "Venue category name", "Latitude", "Longitude", 
          "Timezone offset (minutes)", "UTC time"]
foursquare_time_format = "%a %b %d %H:%M:%S +0000 %Y"

df = pd.read_csv('../datasets/foursquare/nyc.txt', '\t', names=header, encoding="latin1")
fs_nyc = importPointEvents('datasets/foursquare/nyc.txt', 7, foursquare_time_format, df=df)
    
# Foursquare tokyo
df = pd.read_csv('../datasets/foursquare/tokyo.txt', names=header, encoding='latin1', sep='\t')
fs_tokyo = importPointEvents('datasets/foursquare/tokyo.txt', 7, foursquare_time_format, df=df)

# CHICAGO-SeasonD2O.txt
time_format = "%H:%M:%S.%f"
header = ["Game/Points", "EventType", "Start time", "End time"]
chicago_season_d2o = importMixedEvents("../datasets/Chicago_Bulls/CHICAGO-SeasonD2O.txt", 2, 3, time_format, sep='\t', local=True, header=header)

KeyboardInterrupt: 

# Generating Sequences

In [5]:
# Helper function for generateSequence to use when sorting events to get what time field to sort by
# Also used in splitSequences to give the time of an event when splitting the events up
def get_time_to_sort_by(e):
    # Sort by starting time of event if its an interval event
    if type(e) == IntervalEvent:
        return e.time[0]
    # Otherwise use the timestamp
    else:
        return e.timestamp

# Group events by attributeName, and order them by timestamp
def generateSequence(eventList, attributeName):
    grouped_by = {}
    # Sort the event list
    eventList = sorted(eventList, key=get_time_to_sort_by)
    for event in eventList:
        value = event.attributes[attributeName]
        # If have seen this value before, append it the list of events in grouped_by for value
        if value in grouped_by:
            grouped_by[value].append(event)
        # otherwise store a new list with just that event
        else:
            grouped_by[value] = [event]
    return list(grouped_by.values())

In [49]:
# Helper to insert an event into a map
# Params are key=unique id for that time, map of key to event list, event object
def insert_event_into_dict(key, dictionary, event):
    if key in dictionary:
        dictionary[key].append(event)
    else:
        dictionary[key] = [event]

# Split a long sequence into shorter ones by timeUnit. For example, a sequence may span several days and we want to 
# break it down into daily sequences. The argument timeUnit can be one of the following strings: “hour”, “day”, 
# “week”, “month”, “quarter”, and “year”.
# For interval events I used the start time of the event to determine its category when splitting it
def splitSequences(sequenceList, timeUnit, record=None):
    results = []
    timeUnit = timeUnit.lower()
    # Check if the time unit is a valid argument
    valid_time_units = ["hour", "day", "week", "month", "quarter", "year"]
    if timeUnit not in valid_time_units:
        raise ValueError("timeUnit must be hour, day, week, month, quarter, or year")
    # Sort the events by the timestamp or event start time
    sequenceList = sorted(sequenceList, key=get_time_to_sort_by)
    
    # Process the event sequence based on the given time unit
    # Generally, create a map for that time unit and then add each event into that map 
    # (key=time such as May 2021 in case of month, value=sequence) and then return the values of the map as a list
    if timeUnit == "hour":
        hours = {}
        for event in sequenceList:
            time = get_time_to_sort_by(event)
            key = (time.hour, time.day, time.month, time.year)
            insert_event_into_dict(key,hours,event)
            if record is None:
                event.attributes["record"]=datetime(*(key[::-1])).strftime("%Y%m%d%H")
            else:
                event.attributes[record]=str(event.attributes[record])+"_"+datetime(*(key[::-1])).strftime("%Y%m%d%H")
        results = list(hours.values())
    
    elif timeUnit == "day":
        days = {}
        for event in sequenceList:
            time = get_time_to_sort_by(event)
            key = (time.day, time.month, time.year)
            insert_event_into_dict(key,days,event)
            #print(days)
            if record is None:
                event.attributes["record"]=datetime(*(key[::-1])).strftime("%Y%m%d")
            else:
                event.attributes[record]=str(event.attributes[record])+"_"+datetime(*(key[::-1])).strftime("%Y%m%d")
        results = list(days.values())
        
    elif timeUnit == "month":
        months = {}
        for event in sequenceList:
            time = get_time_to_sort_by(event)
            key = (time.month,time.year)
            insert_event_into_dict(key,months,event)
            if record is None:
                event.attributes["record"]=str(key[0])+str(key[1])
            else:
                event.attributes[record]=str(event.attributes[record])+"_"+str(key[0])+str(key[1])
        results = list(months.values())
        
    elif timeUnit == "week":
        weeks = {}
        for event in sequenceList:
            time = get_time_to_sort_by(event)
            year = time.year
            week_num = time.isocalendar()[1]
            key = (year,week_num)
            insert_event_into_dict(key,weeks,event)
            if record is None:
                event.attributes["record"]=datetime(*(key[::-1])).strftime("%U")
            else:
                event.attributes[record]=str(event.attributes[record])+"_"+datetime(*(key[::-1])).strftime("%U")
        results = list(weeks.values())
                                   
    elif timeUnit == "year":
        years = {}
        for event in sequenceList:
            time = get_time_to_sort_by(event)
            key = time.year
            insert_event_into_dict(key,years,event)
            if record is None:
                event.attributes["record"]=str(key)
            else:
                event.attributes[record]=str(event.attributes[record])+"_"+str(key)
        results = list(years.values())
            
    elif timeUnit == "quarter":
        quarters = {}
        for event in sequenceList:
            time = get_time_to_sort_by(event)
            year = time.year
            month = time.month
            # Determine the year, quarter pair/key for quarter dict
            # January, February, and March (Q1)
            if month in range(1, 4):
                key = (year, "Q1")
            # April, May, and June (Q2)
            elif month in range(4, 7):
                key = (year, "Q2")
            # July, August, and September (Q3)
            elif month in range(7,10):
                key = (year, "Q3")
            # October, November, and December (Q4)
            elif month in range(10,13):
                key = (year, "Q4")
            # Put the event in the dictionary
            insert_event_into_dict(key,quarters,event)
            if record is None:
                event.attributes["record"]=Period(freq='Q-JAN',year=key[0], quarter=key[1][1]).strftime("%YQ%q")
            else:
                event.attributes[record]=str(event.attributes[record])+"_"+Period(freq='Q-JAN',year=key[0], quarter=key[1][1]).strftime("%Y-Q%q")
        results = list(quarters.values())
        
    return results

# Event Aggregation
`aggregateEventsRegex` and `aggregateEventsDict` read their rules from a mapping file; see the example files in the repo under DataModel/testFiles for the expected format.

In [7]:
# Helper function to run the mappings file as a dictionary
def give_dictionary_of_mappings_file(fileName):
    # Open the file and split the contents on new lines
    file = open(fileName, "r")
    mappings = file.read().split("\n")
    file.close()
    # Remove any empty strings from the list of mappings
    mappings = list(filter(None, mappings))
    # Raise an error if there is an odd number of items in mapping
    if (len(mappings) % 2) != 0:
        raise ValueError("There must be an even number of lines in the mappings file.")
    # Create a dictionary based on read in mappings
    aggregations = {}
    for i in range(len(mappings)):
        if i % 2 == 0:
            aggregations[mappings[i]] = mappings[i+1]
    return aggregations

# NOTE: this current modifies the events in eventList argument
# merge events by rules expressed in regular expressions. For example, in the highway incident dataset, we can 
# replace all events with the pattern “CHART Unit [number] departed” by “CHART Unit departed”. The argument 
# regexMapping can be a path pointing to a file defining such rules. We can assume each rule occupies two lines: 
# first line is the regular expression, second line is the merged event name 
def aggregateEventsRegex(eventList, regexMapping, attributeName): 
    aggregations = give_dictionary_of_mappings_file(regexMapping)
    for event in eventList:
        # Get the attribute value of interest
        attribute_val = event.attributes[attributeName]
        # For all the regexes
        for regex in aggregations.keys():
            # If its a match then replace the attribute value for event with
            if re.match(regex, attribute_val):
                event.attributes[attributeName] = aggregations[regex]
                break
    return eventList
    
# NOTE: this current modifies the events in eventList argument
# merge events by a dictionary mapping an event name to the merged name. The argument nameDict can be a path 
# pointing to a file defining such a dictionary. We can assume each mapping occupies two lines: first line is the 
# original name, second line is the merged event name.    
def aggregateEventsDict(eventList, nameDict, attributeName):
    aggregations = give_dictionary_of_mappings_file(nameDict)
    # Iterate over all events and replace evevnts in event list with updated attribute name
    # if directed to by given mappings
    for event in eventList:
        # Get the attribute value of interest
        attribute_val = event.attributes[attributeName]
        # If the attribute value has a mapping then replace the event's current value with the one in give map
        if attribute_val in aggregations:
            event.attributes[attributeName] = aggregations[attribute_val]
    return eventList

# Exporting to EventFlow Format

In [8]:
def createEventflowFormatdataModel(eventlist, record_id, event_category, file_name="Eventflow.txt"):
    #columns in EventFlow
    column_names=["record_id","event_category","Start_time", "end_time", "event_attributes"]
    
    Start_time=None
    End_time=None
    data=[]
    
    #which keys will go to attributes list
    key_list=[keys for keys in eventlist[0].attributes.keys() if keys not in [record_id,event_category]]
    
    for event in eventlist:
        # Get the attribute value of record    
        if(event.type=="point"):    
            Start_time=event.timestamp
            End_time=""
        else:
            Start_time=event.time[0]
            End_time=event.time[1]
        attr_str=";".join([str(keys).replace(" ","_")+"=\""+str(event.attributes[keys])+"\"" for keys in key_list])
        
        data.append([(re.sub('[(),]','',str(event.attributes[record_id]))).replace(" ","_"),str(event.attributes[event_category]).replace(" ","_"),Start_time, End_time,attr_str])
        
    event_df=pd.DataFrame(data,columns = column_names)
    
    event_df.to_csv("../datasets/"+file_name,sep="\t",header=False,index=False, quoting=csv.QUOTE_NONE)

In [9]:
### Alternate method signature
#def createEventflowFormatdataframe(src, record_id_idx, event_category_idx, startTimeColumnIdx, endTimeColumnIdx, sep="\t", local=False, header=[],file_name="Eventflow.txt"):
    #events = []
    # if the df is not provided
    #if df is None:
    #    df = get_dataframe(src, local, sep, header)
    #cols = df.columns
    
    

In [23]:
def createEventflowFormatdataframe(df, record_id_idx, event_category_idx, start_time_idx, end_time_idx, file_name="Eventflow.txt"):
    

    #columns in EventFlow
    column_names=["record_id","event_category","Start_time", "end_time", "event_attributes"]
    data=[]
    key_list=[]
    cols = df.columns
    
    for i in range(0,len(cols)):
        if i not in [record_id_idx, event_category_idx, start_time_idx, end_time_idx]:
            key_list.append(cols[i])
    print(key_list)
    for row in df.iterrows():
        event = row[1]
        if type(event[end_time_idx]) is float:
            End_time = ""
        else:
            End_time = event[end_time_idx]
    
        attr_str=";".join([(re.sub('[(),]','',str(keys))).replace(" ","_")+"=\""+str(event[keys])+"\"" for keys in key_list])
        #for keys in key_list:
        #    attr_str+=str(keys)+"=\""+str(event[keys])+"\""
        
        data.append([(re.sub('[(),]','',str(event[record_id_idx]))).replace(" ","_"),(re.sub('[(),]','',str(event[event_category_idx]))).replace(" ","_"),event[start_time_idx], End_time,attr_str])
        
    event_df=pd.DataFrame(data,columns = column_names)
    
    event_df.to_csv("../datasets/"+file_name,sep="\t",header=False,index=False, quoting=csv.QUOTE_NONE)

## Example

In [38]:
# in this case, the record distinctions are not present, so we add record_id

sequence_braiding = importPointEvents('../datasets/sequence_braiding_refined.csv', 0, "%m/%d/%y", sep=',', local=True)
splitSequences(sequence_braiding, "day")
event_df=createEventflowFormatdataModel(sequence_braiding, "record", "Meal", "EventFlow.txt")


In [32]:
# In this case data doesn't have header, so we add headers
header = ["record_id", "EventType", "Start time", "End time","attribute"]
children_hospital = importMixedEvents('../datasets/Children Hospital/DND-ChildrensDemo-06-26-13.txt', 2,3, "%Y-%m-%d %H:%M:%S.%f", sep='\t', local=True, header= header)
event_df=createEventflowFormatdataModel(children_hospital, "record_id", "EventType", "EventFlow_CH.txt")

In [28]:
#Loading from dataframe
endomodo = get_dataframe("../datasets/endomondo/Endo_time_clean.txt", local=True, sep=" ", header=[])
#print(endomodo)
event_df=createEventflowFormatdataframe(endomodo, 0, 1,2,3, "EventFlow_Endo_test.txt")

['Gender', 'Heart Rate', 'Speed', 'SportID']


In [32]:
corona_net = get_dataframe('../coronanet_shortened.csv', local=True, sep=",", header=[])
#print(corona_net)
#splitSequences(sequence_braiding, "day")
event_df=createEventflowFormatdataframe(corona_net,0,1,2,3,  "EventFlow_corona.txt")

['policy_id', 'entry_type', 'correct_type', 'update_type', 'update_level', 'date_announced', 'init_country_level', 'type_sub_cat']


In [25]:
#header = ["record_id", "EventType", "Start time", "End time","attribute"]
invention_trajectories=get_dataframe('../datasets/Invention Trajectories/invention_expanded.csv', local=True, sep=",")
invention_trajectories=invention_trajectories[invention_trajectories["docsource"]!='""']
#invention_trajectories=invention_trajectories["mainclass"!=""]
event_df=createEventflowFormatdataframe(invention_trajectories,0,4,2,3,  "EventFlow_invention_docsrc.txt")

['Event Category', 'srcdocnum', 'apptype', 'mainclass', 'docval']


In [50]:
# in this case, the record distinctions are not present, so we add record_id

endomodo = importMixedEvents('../Endo_time_clean.txt', 2,3, "%Y-%m-%d %H:%M:%S", sep=' ', local=True)
splitSequences(endomodo, "month","UserID")
event_df=createEventflowFormatdataModel(endomodo, "UserID", "Sport", "EventFlow_endo.txt")


In [30]:
corona_net.iloc[8]

country                                                     Afghanistan
type                                  Closure and Regulation of Schools
date_start                                                   2020-03-14
date_end                                                     2020-04-21
policy_id                                                       3558044
entry_type                                                       update
correct_type                                                   original
update_type                                            Change of Policy
update_level                                              Strengthening
date_announced                                               2020-03-14
init_country_level                                             National
type_sub_cat          Primary Schools (generally for children ages 1...
Name: 8, dtype: object

In [47]:
for key_ in sequence_braiding[0].attributes.keys():
    print(key_)

Glucose
Meal
record


In [8]:
# Example usage of aggregateEventsDict with test file
# Rules in the test file:
# Dinner will become Supper 
# Lunch and Breakfast become Not Dinner
sb = aggregateEventsDict(sequence_braiding, "testFiles/aggregateEventsDictTestFileSequenceBraidings.txt", "Meal")
for e in sb[:10]:
    print(e.attributes)

{'Glucose': 38, 'Meal': 'Sugar to treat'}
{'Glucose': 233, 'Meal': 'Supper'}
{'Glucose': 52, 'Meal': 'Not Dinner'}
{'Glucose': 67, 'Meal': 'Sugar to treat'}
{'Glucose': 309, 'Meal': 'Not Dinner'}
{'Glucose': 66, 'Meal': 'Sugar to treat'}
{'Glucose': 80, 'Meal': 'Not Dinner'}
{'Glucose': 168, 'Meal': 'Not Dinner'}
{'Glucose': 171, 'Meal': 'Supper'}
{'Glucose': 56, 'Meal': 'Sugar to treat'}


In [9]:
# Example usage of aggregateEventsRegex with test file
# Rules in the test file:
# Lunch|Dinner|Breakfast -> Must eat times
# S.* -> SNACK TIME!
sequence_braiding = importPointEvents('../datasets/sequence_braiding_refined.csv', 0, "%m/%d/%y", sep=',', local=True)
sb = aggregateEventsRegex(sequence_braiding, "testFiles/aggregateEventsRegexSequenceBraidings.txt", "Meal")
for e in sb[:10]:
    print(e.attributes)

{'Glucose': 38, 'Meal': 'SNACK TIME!'}
{'Glucose': 233, 'Meal': 'Must eat times'}
{'Glucose': 52, 'Meal': 'Must eat times'}
{'Glucose': 67, 'Meal': 'SNACK TIME!'}
{'Glucose': 309, 'Meal': 'Must eat times'}
{'Glucose': 66, 'Meal': 'SNACK TIME!'}
{'Glucose': 80, 'Meal': 'Must eat times'}
{'Glucose': 168, 'Meal': 'Must eat times'}
{'Glucose': 171, 'Meal': 'Must eat times'}
{'Glucose': 56, 'Meal': 'SNACK TIME!'}
