In [27]:
from collections import OrderedDict
import csv
import json
import xml.etree.ElementTree as ET 
from datetime import datetime

import osmnx as ox
from shapely.geometry import Point
from geopy.distance import distance, lonlat

In [28]:
## Utils

def convert_timestamp_offset(timestamp):
    """Return the number of seconds elapsed since UTC midnight for a timestamp.

    Args:
        timestamp: POSIX timestamp in seconds, as an int or anything
            ``int()`` accepts (e.g. a numeric string).

    Returns:
        An int in the range 0..86399: the seconds between the most recent
        UTC midnight and ``timestamp``.
    """
    seconds_per_day = 24 * 60 * 60
    # POSIX time has no leap seconds and UTC has no DST, so the time of day is
    # simply the remainder after removing whole days. Python's floor-modulo
    # keeps the result non-negative for pre-1970 timestamps as well.
    return int(timestamp) % seconds_per_day


def is_valid_point(current_point, previous_point, geo_boundary,
                   min_dist=20, min_delta_t=20, max_speed=22.35):
    """Decide whether a GPS sample should be kept in a trip.

    A point is rejected when it lies outside ``geo_boundary``, or when it is
    too close in space or time to the previously kept point, or when reaching
    it from that point would imply an implausibly high speed.

    Args:
        current_point: Mapping with 'long', 'lat' and 'time' keys.
        previous_point: The last accepted point (same keys), or a falsy value
            when no point has been accepted yet.
        geo_boundary: Geometry passed to ``Point.within`` as the allowed area.
        min_dist: Minimum distance from ``previous_point`` in meters.
        min_delta_t: Minimum difference between the two 'time' values.
        max_speed: Maximum distance / time-difference ratio. With 'time' in
            seconds (as the parsers in this notebook produce) the default
            22.35 is meters per second, i.e. roughly 50 mph.

    Returns:
        True if the point should be kept, False otherwise.
    """
    coords1 = (current_point['long'], current_point['lat'])
    time1 = int(current_point['time'])

    if not Point(coords1).within(geo_boundary):
        return False
    if not previous_point:
        # First point of a trip: nothing to compare against.
        return True

    coords2 = (previous_point['long'], previous_point['lat'])
    time2 = int(previous_point['time'])
    # lonlat() converts the (lon, lat) pairs into geopy's (lat, lon) ordering.
    dist = distance(lonlat(*coords1), lonlat(*coords2)).meters
    delta_t = time1 - time2
    # Guard the division; non-positive deltas count as zero speed.
    vel = dist / delta_t if delta_t > 0 else 0

    return not (dist < min_dist or delta_t < min_delta_t or vel > max_speed)

In [29]:
## Strava data

# Extension used to pick GPX activity files out of the activities listing.
VALID_FILE_EXT = '.gpx'
# Directory prefix for activities.csv and for the filenames listed in it.
BASE_PATH = 'raw/strava_data/'
# strptime format of the <time> text inside GPX track points.
# NOTE: the Scoot section further down rebinds DATE_FMT to a different format,
# and parse_gpx reads this name at call time.
DATE_FMT = '%Y-%m-%dT%H:%M:%SZ'

def parse_gpx(filename):
    """Parse the track points of a GPX 1.1 file into point dictionaries.

    Args:
        filename: Path of the GPX file to read.

    Returns:
        A list with one dict per ``<trkpt>`` element, in document order, with
        keys 'lat' and 'long' (floats), 'time' (int epoch seconds), 'offset'
        (result of ``convert_timestamp_offset``) and 'type' (always
        'strava'). Track points without a ``<time>`` value are skipped.
    """
    NS = '{http://www.topografix.com/GPX/1/1}'
    root = ET.parse(filename).getroot()

    points = []
    for point in root.iter(NS + 'trkpt'):
        time_text = point.findtext(NS + 'time')
        if not time_text:
            # A point without a time cannot be time-filtered downstream.
            continue

        # DATE_FMT is the module-level format, looked up at call time.
        # datetime.timestamp() replaces the non-portable strftime('%s').
        # NOTE(review): the 'Z' suffix denotes UTC, but the parsed datetime is
        # naive, so it is interpreted in the machine's local timezone (exactly
        # as strftime('%s') did). Confirm whether UTC conversion is wanted.
        timestamp = int(datetime.strptime(time_text.strip(), DATE_FMT).timestamp())
        points.append({
            'lat': float(point.get('lat')),
            'long': float(point.get('lon')),
            'time': timestamp,
            'offset': convert_timestamp_offset(timestamp),
            'type': 'strava'
        })

    return points

# Load the Strava activity index. newline='' lets the csv module handle
# newlines embedded in quoted fields itself.
with open(BASE_PATH + 'activities.csv', 'r', newline='') as infile:
    reader = csv.DictReader(infile)
    strava_lookup_data = list(reader)

# Keep only commute activities stored as plain GPX. endswith() (rather than a
# substring test) excludes names such as 'x.gpx.gz', which ET.parse cannot read.
valid_filenames = [
    row['filename']
    for row in strava_lookup_data
    if row['filename'].endswith(VALID_FILE_EXT) and row['commute'] == 'true'
]

# Map a running index to the list of parsed points of each activity.
strava_parsed_data = OrderedDict()
for index, filename in enumerate(valid_filenames):
    strava_parsed_data[index] = parse_gpx(BASE_PATH + filename)
    


In [30]:
## Scoot data

# CSV of Scoot ride traces; the code below reads its ride_id, scu_timestamp,
# latitude and longitude columns.
FILEPATH = 'raw/scoot-data/pthaas_ride_traces.csv'
# strptime format of the scu_timestamp column (rebinds the Strava DATE_FMT).
DATE_FMT = '%Y-%m-%d %H:%M:%S'

# newline='' is the csv module's recommended way to open input files.
with open(FILEPATH, 'r', newline='') as infile:
    reader = csv.DictReader(infile)
    scoot_data = list(reader)

# Group the trace rows by ride, preserving first-seen ride order and row order.
scoot_parsed_data = OrderedDict()
for row in scoot_data:
    ride_id = row['ride_id']
    # datetime.timestamp() replaces the non-portable strftime('%s'); like it,
    # a naive datetime is interpreted in the machine's local timezone.
    # NOTE(review): confirm which timezone scu_timestamp is recorded in.
    timestamp = int(datetime.strptime(row['scu_timestamp'], DATE_FMT).timestamp())
    point_data = {
        'lat': float(row['latitude']),
        'long': float(row['longitude']),
        'time': timestamp,
        'offset': convert_timestamp_offset(timestamp),
        'type': 'scoot'
    }
    scoot_parsed_data.setdefault(ride_id, []).append(point_data)

In [31]:
# Every trip (list of points) from both sources, Strava first, then Scoot.
segments = list(strava_parsed_data.values()) + list(scoot_parsed_data.values())

# Boundary polygon used to discard points outside San Francisco.
# OSMnx 1.0 renamed gdf_from_place to geocode_to_gdf; use whichever exists.
_geocode_to_gdf = getattr(ox, 'geocode_to_gdf', None) or ox.gdf_from_place
SF_GEO = _geocode_to_gdf('San Francisco, CA, USA').iloc[0]['geometry']

# Numeric code written to the output for each data source.
type_dict = {
    'strava': 0,
    'scoot': 1
}

parsed_trips = []
for trip in segments:
    if not trip:
        # A file without usable track points has no first point to read the
        # source type from; skip it instead of raising IndexError.
        continue

    segment_data = []
    previous_point = None
    for this_point in trip:
        if not is_valid_point(this_point, previous_point, SF_GEO):
            continue
        segment_data.append(
            [this_point['long'], this_point['lat'], this_point['offset']]
        )
        # Only accepted points become the reference for the next comparison.
        previous_point = this_point

    parsed_trips.append({
        'type': type_dict[trip[0]['type']],
        'segments': segment_data,
    })

with open('trip_data.json', 'w') as outfile:
    json.dump(parsed_trips, outfile)