# Set up

In [None]:
import pandas as pd
from pyspark.sql import Window
from pyspark.sql.functions import coalesce, concat, lag, lit, lower, sum, unix_timestamp, max, min, \
                                  sin, cos, atan, sqrt, atan2, toRadians, round

from classes import StravaLoader

%matplotlib inline
from matplotlib import pyplot as plt
from IPython.core.pylabtools import figsize

# Load data

In [None]:
# Strava activity loader reading the 'strava-activities-subset' dataset from
# 's3', using the notebook's existing Spark context and SQL context
sl = StravaLoader(
    's3',
    'strava-activities-subset',
    sc=sc,
    hiveContext=sqlContext
)

# Tracking points as a Spark DataFrame (one row per tracking point is
# assumed by the window calculations further down -- TODO confirm)
df = sl.get_dataset()

# Analysis
### Parameters

In [None]:
# Minimum pause, in minutes, between two consecutive tracking points for the
# later point to start a new activity block
stage_gap_min = 10

# Lower speed limit in km/h per activity type; tracking points at or below
# the limit are filtered out
speed_lower_lim = {'Ride': 3}

## Calculate additional columns

In [None]:
def calculate_speed(df):
    """Add per-point distance and momentary speed columns to ``df``.

    Relies on two module-level names that must be defined before the call:
    ``window`` (window spec used to find the previous tracking point) and
    ``R`` (sphere radius used by the haversine formula).

    Reads the columns ``@lat``, ``@lon`` and ``unix_time``. Adds, or
    overwrites when they already exist, the columns ``@latR``, ``@lonR``,
    ``@latR_prev``, ``@lonR_prev``, ``@latR_diff``, ``@lonR_diff``, ``a``,
    ``c``, ``dist_diff`` and ``speed_kmh``.

    ``dist_diff`` is expressed in the unit of ``R``; ``speed_kmh`` is
    ``3.6 * dist_diff / seconds``, i.e. km/h provided ``R`` is in metres.
    Rows without a previous point in their window get 0 for both.

    Returns the extended DataFrame.
    """
    # Latitude and longitude in radians
    for degrees_col in ('@lat', '@lon'):
        df = df.withColumn(degrees_col + 'R', toRadians(df[degrees_col]))

    # Latitude and longitude of the previous tracking point
    for radians_col in ('@latR', '@lonR'):
        df = df.withColumn(
            radians_col + '_prev',
            lag(radians_col, count=1).over(window)
        )

    # Change in latitude and longitude since the previous tracking point
    # (0 when there is no previous point)
    for radians_col in ('@latR', '@lonR'):
        delta = df[radians_col] - df[radians_col + '_prev']
        df = df.withColumn(radians_col + '_diff', coalesce(delta, lit(0)))

    # Haversine distance between the two tracking points
    sin_half_dlat = sin(df['@latR_diff']/2)
    sin_half_dlon = sin(df['@lonR_diff']/2)
    df = df.withColumn(
        'a',
        sin_half_dlat * sin_half_dlat
        + cos(df['@latR']) * cos(df['@latR_prev'])
        * sin_half_dlon * sin_half_dlon
    )
    central_angle = 2 * atan2(sqrt(df['a']), sqrt(1 - df['a']))
    df = df.withColumn('c', central_angle)
    # 'a' is null for the first point of a window (no previous latitude),
    # so the distance falls back to 0 there
    df = df.withColumn('dist_diff', coalesce(R * df['c'], lit(0)))

    # Momentary speed in km/h; a null quotient (e.g. no previous point)
    # becomes 0
    seconds_elapsed = df['unix_time'] - lag('unix_time', count=1).over(window)
    df = df.withColumn(
        'speed_kmh',
        coalesce(3.6 * df['dist_diff'] / seconds_elapsed, lit(0))
    )

    return df

In [None]:
# Earth radius used by the haversine distance in calculate_speed; the 3.6
# factor in the speed formula implies metres
R = 6371000

# Tracking points are ordered in time per <athlete> and <activity_type>
window = Window.partitionBy('athlete', 'activity_type').orderBy('unix_time')

# Timestamp in seconds, parsed from the textual 'time' column
time_pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'"
df = df.withColumn('unix_time', unix_timestamp(df['time'], time_pattern))

# First pass: distance and speed between consecutive tracking points
df = calculate_speed(df)

### Filter tracking points
* Remove cycling (`Ride`) tracking points whose speed is <= 3 km/h

In [None]:
# Drop tracking points at or below the lower speed limit configured for their
# activity type in speed_lower_lim. A limit only applies to rows of its own
# activity type: rows of activity types without a configured limit are kept
# (previously every non-'Ride' row was discarded together with the slow
# cycling points).
keep_point = lit(True)
for limited_activity, min_speed_kmh in speed_lower_lim.items():
    keep_point = keep_point & (
        (df['activity_type'] != limited_activity)
        | (df['speed_kmh'] > min_speed_kmh)
    )
df = df.filter(keep_point)

# Removing points changes the predecessor of the remaining points, so the
# distance and speed columns are recomputed on the filtered data
df = calculate_speed(df)

### Calculate more additional columns

In [None]:
# Seconds elapsed since the previous tracking point of the same athlete and
# activity type (null for the first point of each partition)
previous_unix_time = lag('unix_time', count=1).over(window)
df = df.withColumn('unix_time_diff', df['unix_time'] - previous_unix_time)

# 0/1 flag: 1 when the pause since the previous point reaches the threshold,
# i.e. the point opens a new activity block; a null difference counts as 0
gap_threshold_seconds = stage_gap_min * 60
opens_new_block = (df['unix_time_diff'] >= gap_threshold_seconds).cast('integer')
df = df.withColumn('CALC_block_isnew', coalesce(opens_new_block, lit(0)))

# Running total of the flag = sequence number of the activity block per
# athlete and activity type, starting at 0
df = df.withColumn('CALC_block_seqnum', sum('CALC_block_isnew').over(window))

# Activity block id "<athlete>_<activity_type>_<integer>"
block_id_parts = [
    df['athlete'],
    lit('_'),
    lower(df['activity_type']),
    lit('_'),
    df['CALC_block_seqnum'].cast('string'),
]
df = df.withColumn('CALC_block_id', concat(*block_id_parts))

### Calculate block specific columns

In [None]:
# Tracking points ordered in time within each activity block
window_block = Window.partitionBy('CALC_block_id').orderBy('unix_time')

# Cumulative distance in km within the block.
# dist_diff of the point that opens a block (CALC_block_isnew == 1) is
# measured from the last point of the *previous* block, i.e. it spans the
# gap between two blocks. That jump is not part of the block, so it is
# excluded from the running total; all other points contribute fully.
dist_within_block = df['dist_diff'] * (lit(1) - df['CALC_block_isnew'])
df = df.withColumn(
    'dist_cum_km',
    sum(dist_within_block).over(window_block) / 1000
)

## Flatten DataFrame

In [None]:
# Flat projection of the tracking points: nested GPX extension fields are
# lifted to top-level columns and the '@'/'CALC_' prefixes are dropped
position_cols = [
    df['@lat'].alias('lat'),
    df['@lon'].alias('lon'),
    df['ele'].alias('ele'),
]
extension_cols = [
    df['extensions.gpxtpx:TrackPointExtension.gpxtpx:atemp'].alias('atemp'),
    df['extensions.gpxtpx:TrackPointExtension.gpxtpx:cad'].alias('cad'),
    df['extensions.gpxtpx:TrackPointExtension.gpxtpx:hr'].alias('hr'),
]
identity_cols = [
    df['time'].alias('time'),
    df['unix_time'].alias('time_seconds'),
    df['athlete'].alias('athlete'),
    df['activity_type'].alias('activity_type'),
    df['CALC_block_id'].alias('block_id'),
]
metric_cols = [
    df['speed_kmh'],
    df['dist_diff'],
    df['dist_cum_km'],
]

dff = df.select(*(position_cols + extension_cols + identity_cols + metric_cols))

# Output

In [None]:
# Histogram of cycling speeds: number of tracking points per rounded km/h,
# limited to speeds below 80
ride_speeds = dff.filter('activity_type="Ride"').select(
    round(dff['speed_kmh']).alias('speed_int')
)
speed_counts = ride_speeds.groupBy('speed_int').count().filter('speed_int<80')
sp = speed_counts.toPandas()
plt.bar(sp['speed_int'], sp['count'])

In [None]:
# Per-block summary: earliest timestamp, duration in minutes and distance
block_start = min(dff['time'])
block_minutes = (max(dff['time_seconds']) - min(dff['time_seconds']))/60
block_dist = max(dff['dist_cum_km'])

dfa = dff.groupBy('block_id').agg(
    block_start,
    block_minutes.alias('block_minutes'),
    block_dist.alias('block_dist')
)

# Average speed: block distance divided by block duration in hours
block_hours = dfa['block_minutes'] / 60
dfa = dfa.withColumn('avg_speed', dfa['block_dist'] / block_hours)

# Collect the summary to the driver as a pandas DataFrame and display it
pdf = dfa.toPandas()

pdf

In [None]:
# Inspect the tracking points of one block, most recent first
inspect_cols = [
    'athlete',
    'activity_type',
    'time',
    'dist_diff',
    'dist_cum_km',
    'speed_kmh',
]
block_points = dff.filter('block_id="akrogvig_ride_1"').select(inspect_cols)
block_points.orderBy('time_seconds', ascending=False).show(1000)

In [None]:
# Speed profile of one block in chronological order.
# Only 'speed_kmh' is selected; the stray empty string literal that used to
# precede it (and was silently concatenated with it) has been removed.
dff.filter('block_id="akrogvig_ride_1"').select([
        'speed_kmh'
    ]).orderBy('time_seconds', ascending=True).toPandas().plot(figsize=(15,5))