In [1]:
import csv
import io
import os
from datetime import datetime

import pandas as pd
import requests

In [2]:
# A class that represents a point event
class PointEvent:
    def __init__(self, timestamp, attributes):
        self.type = "point"
        self.timestamp = timestamp 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 

# class to represent an interval event
class IntervalEvent:
    def __init__(self, t1, t2, attributes):
        self.type = "interval"
        self.time = [t1,t2] 
        # dictionary: key=attribute value=attribute value
        self.attributes = attributes 

In [3]:
# Helper function to return a data frame
# Local is boolean, if local then source should be path to the file
# Otherwise it should be a URL to the the file
def get_dataframe(src, local, sep, header=[], encoding="UTF-8"):
    if local == False:
        # To force a dropbox link to download change the dl=0 to 1
        if "dropbox" in src:
            src = src.replace('dl=0', 'dl=1')
        # Download the CSV at url
        req = requests.get(src)
        url_content = req.content
        csv_file = open('data.txt', 'wb') # was data.csv
        csv_file.write(url_content)
        csv_file.close()
        # Read the CSV into pandas
        # If header list is empty, the dataset provides header so ignore param
        if not header:
            df = pd.read_csv("data.txt", sep, encoding=encoding)
        # else use header param for column names
        else:
            df = pd.read_csv("data.txt", sep, names=header, encoding=encoding)
        # Delete the csv file
        os.remove("data.txt")
        return df
    # Dataset is local
    else:
        # If header list is empty, the dataset provides header so ignore param
        if not header:
            df = pd.read_csv(src, sep, encoding=encoding)
        # else use header param for column names
        else:
            df = pd.read_csv(src, sep, names=header, encoding=encoding)
        return df

In [4]:
# Returns a list of event objects
# src is a url or directory path, if local is false its url else its path
# header is list of column names if they are not provided in the dataset
def importPointEvents(src, timestampColumnIdx, timeFormat, sep, local, header=[], encoding="UTF-8"):
    events = []
    df = get_dataframe(src, local, sep, header, encoding)
    cols = df.columns
    # For each event in the csv construct an event object
    for row in df.iterrows():
        data = row[1]
        attribs = {}
        timestamp = datetime.strptime(data[timestampColumnIdx], timeFormat)
        # for all attributes other tahn time, add them to attributes dict
        for i in range(len(data)):
            if i != timestampColumnIdx:
                attribs[cols[i]] = data[i]
        # use time stamp and attributes map to construct event object
        e = PointEvent(timestamp, attribs)
        events.append(e)
    return events

# Returns a list of event objects
# src is a url or directory path, if local is false its url else its path
def importIntervalEvents(src, startTimeColumnIdx, endTimeColumnIdx, timeFormat, sep, local, header=[], encoding="UTF-8"):
    events = []
    df = get_dataframe(src, local, sep, header, encoding)
    cols = df.columns
    # For each event in the csv construct an event object
    for row in df.iterrows():
        data = row[1]
        attribs = {}
        # create datetime object for the start and end times of the event
        t1 = datetime.strptime(data[startTimeColumnIdx], timeFormat)
        t2 = datetime.strptime(data[endTimeColumnIdx], timeFormat)
        # for all attributes other than times, add them to attributes dict
        for i in range(len(data)):
            if i != startTimeColumnIdx and i != endTimeColumnIdx:
                attribs[cols[i]] = data[i]
        # use time stamp and attributes map to construct event object
        e = IntervalEvent(t1, t2, attribs)
        events.append(e)
    return events

# Given a list of events and an attribute, group the events by the value of said attribute
# Returns a dictionary, keys are attribute values and values are lists of events that have that attribute value
def groupEventsBy(eventList, attributeName):
    grouped_by = {}
    for event in eventList:
        value = event.attributes[attributeName]
        # If have seen this value before, append it the list of events in grouped_by for value
        if value in grouped_by:
            grouped_by[value].append(event)
        # otherwise store a new list with just that event
        else:
            grouped_by[value] = [event]
    return grouped_by

In [5]:
# VAST mini challenge dataset
# local=False, so `url` is downloaded rather than read from disk. Column 0 is
# parsed as the timestamp. No header list is passed, so the column names are
# taken from the dataset itself.
url = "http://vacommunity.org/tiki-download_file.php?fileId=492"
vast = importPointEvents(url, 0, '%Y-%m-%d %H:%M:%S', ',', local=False)

# Print the first five events as a quick check of the import
for e in vast[:5]:
    print(e.timestamp, e.attributes)

2015-05-01 00:43:28 {'car-id': '20154301124328-262', 'car-type': '4', 'gate-name': 'entrance3'}
2015-05-01 01:03:48 {'car-id': '20154301124328-262', 'car-type': '4', 'gate-name': 'general-gate1'}
2015-05-01 01:06:24 {'car-id': '20154301124328-262', 'car-type': '4', 'gate-name': 'ranger-stop2'}
2015-05-01 01:09:25 {'car-id': '20154301124328-262', 'car-type': '4', 'gate-name': 'ranger-stop0'}
2015-05-01 01:12:36 {'car-id': '20154301124328-262', 'car-type': '4', 'gate-name': 'general-gate2'}


In [6]:
# Sequence braiding
# NOTE: I deleted the last 4 rows of the dataset before loading it, since they did not look like events.
# Here is what one of them looked like: "Stirfry 100 120"
# NOTE: this dataset only has dates, not times. The format string therefore has
# no time directives, and every parsed timestamp falls at 00:00:00.
sequence_braiding = importPointEvents('data/sequence_braiding_refined.csv', 0, "%m/%d/%y", sep=',', local=True)

# Print the first ten events as a quick check of the import
for e in sequence_braiding[:10]:
    print(e.timestamp, e.attributes)

2019-06-25 00:00:00 {'Glucose': 38, 'Meal': 'Sugar to treat'}
2019-06-23 00:00:00 {'Glucose': 233, 'Meal': 'Dinner'}
2019-06-23 00:00:00 {'Glucose': 52, 'Meal': 'Lunch'}
2019-06-22 00:00:00 {'Glucose': 67, 'Meal': 'Sugar to treat'}
2019-06-22 00:00:00 {'Glucose': 309, 'Meal': 'Breakfast'}
2019-06-21 00:00:00 {'Glucose': 66, 'Meal': 'Sugar to treat'}
2019-06-21 00:00:00 {'Glucose': 80, 'Meal': 'Lunch'}
2019-06-21 00:00:00 {'Glucose': 168, 'Meal': 'Breakfast'}
2019-06-20 00:00:00 {'Glucose': 171, 'Meal': 'Dinner'}
2019-06-20 00:00:00 {'Glucose': 56, 'Meal': 'Sugar to treat'}


In [8]:
# Foursquare NYC
# Column names are supplied explicitly and passed as `header`, which
# importPointEvents uses when the dataset does not provide them.
# `header` and `foursquare_time_format` are reused by the Tokyo cell below.
header = ["User ID", "Venue ID", "Venue category ID", "Venue category name", "Latitude", "Longitude", 
          "Timezone offset (minutes)", "UTC time"]
# "+0000" is matched as literal text (the format has no %z directive), so the
# parsed datetimes carry no timezone information.
foursquare_time_format = "%a %b %d %H:%M:%S +0000 %Y"

# Index 7 selects the "UTC time" column as the event timestamp.
fs_nyc = importPointEvents('data/foursquare/cities/nyc.txt', 7, foursquare_time_format, sep='\t', local=True, header=header, encoding='latin1')

# Print the first five events as a quick check of the import
for e in fs_nyc[:5]:
    print(e.timestamp, e.attributes)

2012-04-03 18:00:09 {'User ID': 470, 'Venue ID': '49bbd6c0f964a520f4531fe3', 'Venue category ID': '4bf58dd8d48988d127951735', 'Venue category name': 'Arts & Crafts Store', 'Latitude': 40.71981037548853, 'Longitude': -74.00258103213994, 'Timezone offset (minutes)': -240}
2012-04-03 18:00:25 {'User ID': 979, 'Venue ID': '4a43c0aef964a520c6a61fe3', 'Venue category ID': '4bf58dd8d48988d1df941735', 'Venue category name': 'Bridge', 'Latitude': 40.606799581406435, 'Longitude': -74.04416981025437, 'Timezone offset (minutes)': -240}
2012-04-03 18:02:24 {'User ID': 69, 'Venue ID': '4c5cc7b485a1e21e00d35711', 'Venue category ID': '4bf58dd8d48988d103941735', 'Venue category name': 'Home (private)', 'Latitude': 40.71616168484322, 'Longitude': -73.88307005845945, 'Timezone offset (minutes)': -240}
2012-04-03 18:02:41 {'User ID': 395, 'Venue ID': '4bc7086715a7ef3bef9878da', 'Venue category ID': '4bf58dd8d48988d104941735', 'Venue category name': 'Medical Center', 'Latitude': 40.7451638, 'Longitude': -

In [9]:
# Foursquare Tokyo
# Reuses `header` and `foursquare_time_format` defined in the Foursquare NYC
# cell, so that cell must be run first. Index 7 is the "UTC time" column.
fs_tokyo = importPointEvents('data/foursquare/cities/tokyo.txt', 7, foursquare_time_format, sep='\t', local=True, header=header, encoding='latin1')

# Print the first five events as a quick check of the import
for e in fs_tokyo[:5]:
    print(e.timestamp, e.attributes)

2012-04-03 18:17:18 {'User ID': 1541, 'Venue ID': '4f0fd5a8e4b03856eeb6c8cb', 'Venue category ID': '4bf58dd8d48988d10c951735', 'Venue category name': 'Cosmetics Shop', 'Latitude': 35.705101088587135, 'Longitude': 139.61959004402158, 'Timezone offset (minutes)': 540}
2012-04-03 18:22:04 {'User ID': 868, 'Venue ID': '4b7b884ff964a5207d662fe3', 'Venue category ID': '4bf58dd8d48988d1d1941735', 'Venue category name': 'Ramen /  Noodle House', 'Latitude': 35.715581120393146, 'Longitude': 139.8003172874451, 'Timezone offset (minutes)': 540}
2012-04-03 19:12:07 {'User ID': 114, 'Venue ID': '4c16fdda96040f477cc473a5', 'Venue category ID': '4d954b0ea243a5684a65b473', 'Venue category name': 'Convenience Store', 'Latitude': 35.714542173995646, 'Longitude': 139.4800649934587, 'Timezone offset (minutes)': 540}
2012-04-03 19:12:13 {'User ID': 868, 'Venue ID': '4c178638c2dfc928651ea869', 'Venue category ID': '4bf58dd8d48988d118951735', 'Venue category name': 'Food & Drink Shop', 'Latitude': 35.72559198

In [16]:
# Foursquare global
# There are 33 million events in this dataset, so I was not able to get importPointEvents to run to completion
#header = ["User ID", "Venue ID", "UTC time", "Timezone offset in minutes"]
#fs_gloabl = importPointEvents("data/foursquare/global_scale/checkins.txt", 2, foursquare_time_format, sep='\t', local=True, header=header, encoding='latin1')