In [1]:
import pandas as pd
from collections import OrderedDict
import csv
import json
import xml.etree.ElementTree as ET 
from datetime import datetime
import pytz

In [2]:
## Utils

def convert_timestamp_offset(timestamp):
    """Return how many seconds past midnight (UTC) a Unix timestamp falls.

    Args:
        timestamp: Seconds since the Unix epoch, as an int or any value
            ``int()`` accepts (the callers in this notebook pass digit
            strings).

    Returns:
        int: Seconds elapsed since 00:00:00 UTC of the same day, in the
        range 0..86399.

    Raises:
        ValueError: If ``timestamp`` is a string ``int()`` cannot parse.
    """
    # Unix time has no leap seconds, so every UTC day is exactly 86400 s long
    # and the time of day is simply the remainder.  Python's % always returns
    # a non-negative result for a positive modulus, so pre-1970 values work
    # too.  This avoids datetime.utcfromtimestamp(), which is deprecated
    # since Python 3.12.
    seconds_per_day = 24 * 60 * 60
    return int(timestamp) % seconds_per_day


In [3]:
## Strava data

# Extension an activity filename must contain to be treated as a GPX track.
VALID_FILE_EXT = '.gpx'
# Prefix for every Strava path: activities.csv and the filenames it lists.
BASE_PATH = 'raw/strava_data/'
# strptime layout for GPX <time> text, e.g. 2017-07-14T02:40:00Z.
# NOTE: the Scoot cell further down rebinds DATE_FMT to a different layout,
# so anything that reads DATE_FMT after that cell has run sees the Scoot value.
DATE_FMT = '%Y-%m-%dT%H:%M:%SZ'

def parse_gpx(filename, date_fmt='%Y-%m-%dT%H:%M:%SZ'):
    """Read every track point out of a GPX 1.1 file.

    Args:
        filename: Path (or file object) handed straight to ``ET.parse``.
        date_fmt: ``strptime`` layout of each ``<time>`` value.  It is a
            keyword default rather than a read of the module-level
            ``DATE_FMT`` because the notebook rebinds that name for the
            Scoot data; pinning it here keeps the function correct no
            matter which cells have run.

    Returns:
        list[dict]: One dict per ``<trkpt>`` element, in document order:
            'lat' / 'long' -- the ``lat`` / ``lon`` attributes, unconverted
            'time'         -- Unix epoch seconds, as a digit string
            'offset'       -- ``convert_timestamp_offset`` of that epoch
            'type'         -- always ``'strava'``

    Raises:
        AttributeError: If a ``<trkpt>`` has no ``<time>`` child.
        ValueError: If a ``<time>`` value does not match ``date_fmt``.
    """
    NS = '{http://www.topografix.com/GPX/1/1}'
    unix_epoch = datetime(1970, 1, 1)

    root = ET.parse(filename).getroot()

    points = []
    for point in root.iter(NS + 'trkpt'):
        moment = datetime.strptime(point.find(NS + 'time').text, date_fmt)
        # The trailing 'Z' marks the value as UTC, so measure it against the
        # UTC epoch directly.  strftime('%s') is a non-portable libc
        # extension and would treat this naive datetime as *local* time,
        # shifting every point by the machine's UTC offset.
        timestamp = str(int((moment - unix_epoch).total_seconds()))
        points.append({
            'lat': point.get('lat'),
            'long': point.get('lon'),
            'time': timestamp,
            'offset': convert_timestamp_offset(timestamp),
            'type': 'strava'
        })

    return points

# Load the Strava activity index.  newline='' is what the csv module asks for
# so that quoted fields containing line breaks are read back intact.
with open(BASE_PATH + 'activities.csv', 'r', newline='') as infile:
    reader = csv.DictReader(infile)
    strava_lookup_data = list(reader)

# Keep only commute activities stored as plain GPX.  endswith() rather than a
# substring test so that compressed exports such as "123.gpx.gz" are left out
# instead of being handed to the XML parser, which cannot read them.
valid_filenames = [
    row['filename']
    for row in strava_lookup_data
    if row['filename'].endswith(VALID_FILE_EXT) and row['commute'] == 'true'
]

# Parse each selected track; keys are the 0-based position in valid_filenames.
strava_parsed_data = OrderedDict(
    (index, parse_gpx(BASE_PATH + filename))
    for index, filename in enumerate(valid_filenames)
)

In [4]:
## Scoot data

FILEPATH = 'raw/scoot-data/pthaas_ride_traces.csv'
# This replaces the value DATE_FMT was given in the Strava cell; anything
# that reads DATE_FMT after this point sees the Scoot layout.
DATE_FMT = '%Y-%m-%d %H:%M:%S'

# newline='' is what the csv module asks for when reading from a file object.
with open(FILEPATH, 'r', newline='') as infile:
    reader = csv.DictReader(infile)
    scoot_data = list(reader)

unix_epoch = datetime(1970, 1, 1)

# Group the trace points by ride, keeping rides in first-seen order.
scoot_parsed_data = OrderedDict()
for row in scoot_data:
    moment = datetime.strptime(row['scu_timestamp'], DATE_FMT)
    # Measure against the epoch explicitly instead of strftime('%s'): that
    # directive is a non-portable libc extension and interprets the naive
    # datetime in the machine's local zone, so results changed from host to
    # host.
    # NOTE(review): scu_timestamp carries no zone marker; it is read as UTC
    # here -- confirm against the data source.
    timestamp = str(int((moment - unix_epoch).total_seconds()))
    point_data = {
        'lat': row['latitude'],
        'long': row['longitude'],
        'time': timestamp,
        'offset': convert_timestamp_offset(timestamp),
        'type': 'scoot'
    }
    scoot_parsed_data.setdefault(row['ride_id'], []).append(point_data)

In [8]:
# One entry per trip (a list of point dicts): Strava trips first, then Scoot.
segments = list(strava_parsed_data.values()) + list(scoot_parsed_data.values())

# Reshape every trip into {'type': ..., 'segments': [[long, lat, offset], ...]}.
parsed_trips = []
for trip in segments:
    if not trip:
        # A trip with no points (e.g. a GPX file without any <trkpt>) has
        # nothing to export, and trip[0] below would raise IndexError.
        continue
    parsed_trips.append({
        # every point of a trip carries the same 'type', so the first decides
        'type': trip[0]['type'],
        'segments': [
            [point['long'], point['lat'], point['offset']] for point in trip
        ],
    })

with open('trip_data.json', 'w') as outfile:
    json.dump(parsed_trips, outfile)