In [1]:
import csv
import io
import os
from datetime import datetime

import pandas as pd
import requests

# Event Representations

In [2]:
# A class that represents a point event
class PointEvent:
    """An event tied to a single timestamp.

    Attributes:
        timestamp: the time value handed to the constructor, stored as-is.
        attributes: dictionary mapping attribute name -> attribute value,
            stored by reference (not copied).
        type: always the string "point".
    """

    def __init__(self, timestamp, attributes):
        # When the event happened
        self.timestamp = timestamp
        # Everything else known about the event, keyed by attribute name
        self.attributes = attributes
        # Tag identifying the kind of event
        self.type = "point"

# class to represent an interval event
class IntervalEvent:
    """An event described by two time values.

    Attributes:
        time: two-element list ``[t1, t2]`` built from the constructor
            arguments, in that order.
        attributes: dictionary mapping attribute name -> attribute value,
            stored by reference (not copied).
        type: always the string "interval".
    """

    def __init__(self, t1, t2, attributes):
        # Both time values are kept together in one list: [t1, t2]
        self.time = [t1, t2]
        # Everything else known about the event, keyed by attribute name
        self.attributes = attributes
        # Tag identifying the kind of event
        self.type = "interval"

# Importing events

In [4]:
# Helper function to return a data frame
# local is a boolean: if it is True, src should be the path to the file
# Otherwise src should be a URL to the file
def get_dataframe(src, local=False, sep="\t", header=[]):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    src : str
        Path to the file when ``local`` is true, otherwise a URL to download.
    local : bool
        Whether ``src`` is a local path (True) or a URL (False, the default).
    sep : str
        Field delimiter handed to ``pd.read_csv``.
    header : list
        Column names to use. When empty, the first row of the data is used
        as the header instead. The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    requests.HTTPError
        If the download answers with an HTTP error status.
    """
    # ``sep`` must be passed by keyword: recent pandas versions accept only
    # the file argument positionally.
    read_kwargs = {"sep": sep}
    if header:
        # The dataset has no header row of its own; use the supplied names.
        read_kwargs["names"] = header

    if local:
        return pd.read_csv(src, **read_kwargs)

    # To force a dropbox link to download change the dl=0 to 1
    if "dropbox" in src:
        src = src.replace('dl=0', 'dl=1')
    response = requests.get(src)
    # Fail loudly instead of trying to parse an error page as data.
    response.raise_for_status()
    # Parse straight from memory: no scratch file is written to (or removed
    # from) the working directory, so nothing can be clobbered or left behind.
    return pd.read_csv(io.BytesIO(response.content), **read_kwargs)

# Returns a list of event objects
# src is a URL or a file path: if local is False it's a URL, otherwise it's a path
# header is a list of column names, for datasets that do not provide them
# The foursquare datasets all use a different encoding that pandas cannot auto-identify, so for those
# I thought the simplest thing was to give this function the df and use that instead of calling my helper
# for those cases
def importPointEvents(src, timestampColumnIdx, timeFormat, sep='\t', local=False, header=[], df=None):
    """Build one PointEvent per row of a tabular dataset.

    Parameters
    ----------
    src : str
        URL or file path of the dataset; only used when ``df`` is None.
    timestampColumnIdx : int
        Zero-based position of the column holding the timestamp string.
    timeFormat : str
        ``datetime.strptime`` format used to parse the timestamp column.
    sep, local, header
        Forwarded to ``get_dataframe`` when ``df`` is None.
    df : pandas.DataFrame, optional
        Already-loaded data. When given, nothing is read from ``src``.

    Returns
    -------
    list
        PointEvent objects in row order. Each event's attributes dict maps
        column name -> cell value for every column except the timestamp one.
    """
    # if the df is not provided
    if df is None:
        df = get_dataframe(src, local, sep, header)
    cols = df.columns
    events = []
    # For each event in the csv construct an event object
    for _, row in df.iterrows():
        # .iloc keeps the lookups strictly positional. Plain row[i] treats i
        # as a label when the column labels are integers, and its positional
        # fallback for other labels is deprecated in pandas.
        timestamp = datetime.strptime(row.iloc[timestampColumnIdx], timeFormat)
        # for all attributes other than time, add them to attributes dict
        attribs = {
            cols[i]: row.iloc[i]
            for i in range(len(cols))
            if i != timestampColumnIdx
        }
        # use time stamp and attributes map to construct event object
        events.append(PointEvent(timestamp, attribs))
    return events

# Returns a list of event objects
# src is a URL or a file path: if local is False it's a URL, otherwise it's a path
# The foursquare datasets all use a different encoding that pandas cannot auto-identify, so for those
# I thought the simplest thing was to give this function the df and use that instead of calling my helper
# for those cases
def importIntervalEvents(src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep="\t", local=False, header=[], df=None):
    """Build one IntervalEvent per row of a tabular dataset.

    Parameters
    ----------
    src : str
        URL or file path of the dataset; only used when ``df`` is None.
    startTimeColumnIdx : int
        Zero-based position of the column holding the start-time string.
    endTimeColumnIdx : int
        Zero-based position of the column holding the end-time string.
    timeFormat : str
        ``datetime.strptime`` format used to parse both time columns.
    sep, local, header
        Forwarded to ``get_dataframe`` when ``df`` is None.
    df : pandas.DataFrame, optional
        Already-loaded data. When given, nothing is read from ``src``.

    Returns
    -------
    list
        IntervalEvent objects in row order. Each event's attributes dict maps
        column name -> cell value for every column except the two time ones.
    """
    # if the df is not provided
    if df is None:
        df = get_dataframe(src, local, sep, header)
    cols = df.columns
    time_positions = (startTimeColumnIdx, endTimeColumnIdx)
    events = []
    # For each event in the csv construct an event object
    for _, row in df.iterrows():
        # .iloc keeps the lookups strictly positional. Plain row[i] treats i
        # as a label when the column labels are integers, and its positional
        # fallback for other labels is deprecated in pandas.
        # create datetime object for the start and end times of the event
        t1 = datetime.strptime(row.iloc[startTimeColumnIdx], timeFormat)
        t2 = datetime.strptime(row.iloc[endTimeColumnIdx], timeFormat)
        # for all attributes other than times, add them to attributes dict
        attribs = {
            cols[i]: row.iloc[i]
            for i in range(len(cols))
            if i not in time_positions
        }
        # use the two times and attributes map to construct event object
        events.append(IntervalEvent(t1, t2, attribs))
    return events

# Loading in datasets

In [5]:
# VAST mini challenge dataset
url = "http://vacommunity.org/tiki-download_file.php?fileId=492"
vast = importPointEvents(url, 0, '%Y-%m-%d %H:%M:%S', sep=',', local=False)

# Sequence braiding
# NOTE: I deleted the last 4 rows of the dataset before loading it in since they did not look like events;
# here is what one looked like: "Stirfry 100 120"
# NOTE: this data set only had dates not times
sequence_braiding = importPointEvents('datasets/sequence_braiding_refined.csv', 0, "%m/%d/%y", sep=',', local=True)

# Foursquare NYC
header = ["User ID", "Venue ID", "Venue category ID", "Venue category name", "Latitude", "Longitude", 
          "Timezone offset (minutes)", "UTC time"]
foursquare_time_format = "%a %b %d %H:%M:%S +0000 %Y"

# The Foursquare files are read here with an explicit latin1 encoding and the resulting frame is
# passed in through df=, so importPointEvents does not load the file itself.
# sep is given by keyword: recent pandas versions accept only the file argument positionally.
df = pd.read_csv('datasets/foursquare/nyc.txt', sep='\t', names=header, encoding="latin1")
fs_nyc = importPointEvents('datasets/foursquare/nyc.txt', 7, foursquare_time_format, df=df)

# Foursquare tokyo
df = pd.read_csv('datasets/foursquare/tokyo.txt', names=header, encoding='latin1', sep='\t')
fs_tokyo = importPointEvents('datasets/foursquare/tokyo.txt', 7, foursquare_time_format, df=df)

# Basketball, this dataset also has the issue where some events are interval and some are point, i.e. some have one
# and some have two times
# CHICAGO-SeasonD2O.txt
time_format = "%H:%M:%S.%f"
header = ["Game/Points", "EventType", "Start time", "End time"]
#chicago_season_d2o = importIntervalEvents("datasets/Chicago_Bulls/CHICAGO-SeasonD2O.txt", 2, 3, time_format, sep='\t', local=True, header=header)

# Load in dummy data set to test interval import/class code
# fake = importIntervalEvents("dummy_interval_data.csv", 0, 3, "%m/%d/%y", local=True, sep=',')

# Generating Sequences

In [30]:
# Helper function for generateSequence to use when sorting events to get what time field to sort by
def get_time_to_sort_by(e):
    """Return the time value by which event ``e`` should be ordered.

    Interval events are ordered by the first element of their ``time`` list
    (the start of the interval); every other event is ordered by its
    ``timestamp`` attribute.
    """
    # isinstance (rather than an exact type comparison) so that subclasses of
    # IntervalEvent are also ordered by their start time instead of failing
    # on a missing ``timestamp`` attribute.
    if isinstance(e, IntervalEvent):
        return e.time[0]
    # Otherwise use the timestamp
    return e.timestamp

# Group events by attributeName, and order them by timestamp
def generateSequence(eventList, attributeName):
    """Bucket events by the value of one attribute, time-ordered per bucket.

    The events are first sorted with ``get_time_to_sort_by`` as the key, then
    distributed into a dictionary keyed by ``event.attributes[attributeName]``.
    The input list itself is not modified.

    Returns a dict mapping each distinct attribute value to the list of
    events carrying that value, in sorted order.
    """
    # Work on a time-ordered copy so every bucket ends up ordered as well
    ordered = sorted(eventList, key=get_time_to_sort_by)
    sequences = {}
    for ev in ordered:
        key = ev.attributes[attributeName]
        # Start a new bucket the first time a value shows up, then append
        sequences.setdefault(key, []).append(ev)
    return sequences

# Split a long sequence into shorter ones by timeUnit. For example, a sequence may span several days and we want to 
# break it down into daily sequences. The argument timeUnit can be one of the following strings: “hour”, “day”, 
# “week”, “month”, “quarter”, and “year”.
#def splitSequences (sequenceList, timeUnit):
#    results = {}
#    timeUnit = timeUnit.lower()
#    # Check if the time unit is a valid argument
#    valid_time_units = ["hour", "day", "week", "month", "quarter", "year"]
#    if timeUnit not in valid_time_units:
#        raise ValueError("timeUnit must be hour, day, week, month, quarter, or year")
#        
#    if timeUnit == "hour":
#        
#    elif timeUnit == "day":
#        
#    elif timeUnit == "month":
#        
#            
#    elif timeUnit == "year":
#        for event in sequenceList:
#            
#    elif timeUnit == "quarter":
#        
#    elif timeUnit == "week":