# Set up

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import lag, unix_timestamp

from classes import StravaLoader

# Load data

In [None]:
# Build the Strava activity loader on top of the notebook's existing Spark
# context (`sc`) and SQL/Hive context (`sqlContext`).
# NOTE(review): the meaning of the two positional arguments is defined by
# classes.StravaLoader and is not visible from this notebook - confirm there.
sl = StravaLoader(
    'local',
    'strava-activities',
    sc=sc,
    hiveContext=sqlContext,
)

# Fetch the activity data as a DataFrame; every later cell works on `df`.
df = sl.get_dataset()

In [None]:
# Show the number of rows for each (athlete, activity_type) combination.
(
    df.groupBy('athlete', 'activity_type')
    .count()
    .show()
)

# Analysis

In [None]:
# Parse the textual 'time' column into whole seconds since the Unix epoch.
# NOTE(review): the trailing 'Z' is quoted in the pattern, so it is matched as
# a literal character rather than interpreted as a UTC designator; the value
# is presumably parsed in the Spark session time zone - confirm this is
# acceptable. Row-to-row differences are only affected across DST changes.
df = df.withColumn('unix_time', unix_timestamp(df['time'], "yyyy-MM-dd'T'HH:mm:ss'Z'"))

# Order the points of every (athlete, activity_type) pair chronologically so
# that lag() can look exactly one row back in time.
window = Window.partitionBy('athlete', 'activity_type').orderBy('unix_time')

# The offset is passed positionally on purpose: the keyword was named `count`
# in older PySpark releases and was renamed `offset` in newer ones (3.x), so
# `count=1` raises TypeError there. The positional form works on both.
df = df.withColumn('unix_time_prev', lag('unix_time', 1).over(window))

# Seconds elapsed since the previous point in the same partition. The first
# row of each partition has no predecessor, so its difference is null.
df = df.withColumn('unix_time_diff', df['unix_time'] - df['unix_time_prev'])

In [None]:
# Project the columns of interest under short, flat names. Each pair is
# (source column, output name), listed in output order; the nested GPX
# track-point extension fields are pulled up to top-level columns. The
# intermediate 'unix_time_prev' column is intentionally not carried over.
column_aliases = (
    ('@lat', 'lat'),
    ('@lon', 'long'),
    ('ele', 'ele'),
    ('extensions.gpxtpx:TrackPointExtension.gpxtpx:atemp', 'atemp'),
    ('extensions.gpxtpx:TrackPointExtension.gpxtpx:cad', 'cad'),
    ('extensions.gpxtpx:TrackPointExtension.gpxtpx:hr', 'hr'),
    ('time', 'time'),
    ('unix_time', 'unix_time'),
    ('unix_time_diff', 'unix_time_diff'),
    ('athlete', 'athlete'),
    ('activity_type', 'activity_type'),
)
dff = df.select(*(df[source].alias(alias) for source, alias in column_aliases))

# Preview the flattened result.
dff.show()

In [None]:
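# Disabled exploratory query: list rows by largest gap to the previous point,
# with the gap also expressed in hours ('hrs' = unix_time_diff / 3600).
#df.orderBy('unix_time_diff', ascending=False).withColumn('hrs', df['unix_time_diff']/(60*60)).show()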