# Set up

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import coalesce, concat, lag, lit, lower, sum, unix_timestamp

from classes import StravaLoader

# Load data

In [None]:
# Create the Strava activity loader. `sc` and `sqlContext` are not defined in
# this notebook; presumably they are provided by the PySpark session -- verify.
sl = StravaLoader(
    'local',
    'strava-activities-subset',
    sc=sc,
    hiveContext=sqlContext,
)

# Fetch the dataset as the DataFrame that the analysis below works on
df = sl.get_dataset()

# Analysis
## Parameters

In [None]:
# Gap threshold in minutes: a tracking point whose time difference to the
# previous point is at least this long is flagged as starting a new activity
# block (the value is multiplied by 60 and compared against seconds).
stage_gap_min = 10

## Calculate additional columns

In [None]:
# Window over the tracking points of one <athlete> and <activity_type>,
# ordered by time. 'unix_time' is referenced by name only; the column itself
# is added just below, before the window is first used.
# NOTE(review): partitions are keyed on the raw activity_type, while the block
# id built further down lower-cases it -- ids could collide if the same type
# occurs with different casing. Confirm against the data.
window = Window.partitionBy('athlete', 'activity_type').orderBy('unix_time')

# Timestamp in seconds since the epoch.
# The 'time' strings are assumed to end in the UTC designator "Z". It is
# parsed with the `X` zone pattern rather than matched as a quoted literal:
# a quoted 'Z' discards the zone, so the wall-clock time is interpreted in the
# session time zone, which distorts the time differences around DST changes.
df = df.withColumn(
    'unix_time',
    unix_timestamp(df['time'], "yyyy-MM-dd'T'HH:mm:ssX")
)

# Time difference in seconds between a tracking point and the previous
# tracking point of the same window partition (null when there is no previous
# point). The lag offset is passed positionally because its keyword name
# differs between PySpark versions (`count` vs. `offset`).
df = df.withColumn(
    'unix_time_diff',
    df['unix_time'] - lag('unix_time', 1).over(window)
)

# New-block flag (0 or 1): 1 when the gap to the previous tracking point is at
# least `stage_gap_min` minutes. A null comparison result (no previous point,
# hence a null time difference) is replaced by 0.
gap_threshold_sec = stage_gap_min * 60
starts_new_block = (df['unix_time_diff'] >= gap_threshold_sec).cast('integer')
df = df.withColumn('CALC_block_isnew', coalesce(starts_new_block, lit(0)))

# Sequence number of the activity block per athlete and activity type: the sum
# of the new-block flags over the ordered window. `sum` is the
# pyspark.sql.functions aggregate imported at the top, not the Python builtin.
block_seqnum = sum('CALC_block_isnew').over(window)
df = df.withColumn('CALC_block_seqnum', block_seqnum)

# Activity block id of the form "<athlete>_<activity_type>_<integer>": the
# athlete, the lower-cased activity type and the block sequence number (cast
# to string), joined with underscores.
block_id_parts = [
    df['athlete'],
    lit('_'),
    lower(df['activity_type']),
    lit('_'),
    df['CALC_block_seqnum'].cast('string'),
]
df = df.withColumn('CALC_block_id', concat(*block_id_parts))

## Flatten DataFrame

In [None]:
# (source column, output name) pairs, listed in output order. Nested
# extension fields are addressed by their dotted path and given short names;
# the calculated block id is exposed as 'block_id'.
flat_columns = [
    ('@lat', 'lat'),
    ('@lon', 'long'),
    ('ele', 'ele'),
    ('extensions.gpxtpx:TrackPointExtension.gpxtpx:atemp', 'atemp'),
    ('extensions.gpxtpx:TrackPointExtension.gpxtpx:cad', 'cad'),
    ('extensions.gpxtpx:TrackPointExtension.gpxtpx:hr', 'hr'),
    ('time', 'time'),
    ('athlete', 'athlete'),
    ('activity_type', 'activity_type'),
    ('CALC_block_id', 'block_id'),
]

# Flat DataFrame containing only the aliased columns above
dff = df.select(*(df[source].alias(name) for source, name in flat_columns))

# Output

In [None]:
# Number of rows (tracking points) per activity block, printed to the output
block_counts = dff.groupBy('block_id').count()
block_counts.show()