# Preprocessing Notebook
This notebook prepares a flat dataset of GPS tracking points for further analysis.
* Data is loaded from .gpx files to a Spark SQL `DataFrame` using the custom `StravaLoader` class
* Nested columns are flattened 
* Additional attributes such as speed and accumulated distance are calculated
* Tracking points where the athlete is at rest are filtered out to better calculate pause times

## Input parameters
- `dataset` - The name of the dataset to load. Use "strava-activities" or "strava-activities-subset"
- `pause_threshold_minutes` - The minimum gap in minutes between two *activity_blocks*. If the time gap between two consecutive tracking points is greater than or equal to this limit, the two points belong to different blocks
- `rest_speed_threshold_kmh[activity_type]` - The upper limit for when to consider the athlete to be *at rest* for each `activity_type`. If the speed is not above this limit, the athlete is at rest and the tracking point will be dropped

In [None]:
# Name of the dataset to load
dataset = 'strava-activities-subset'

# Minimum time gap, in minutes, that separates two activity blocks
pause_threshold_minutes = 10

# Speed limit in km/h, per activity type, used to decide that the athlete is at rest
rest_speed_threshold_kmh = {'Ride': 3}

## Script
Import libraries

In [None]:
from pyspark.sql import Window
from pyspark.sql.functions import coalesce, concat, lag, lit, lower, sum, unix_timestamp, max, min, \
                                  sin, cos, atan, sqrt, atan2, toRadians, round
import pandas as pd
from classes import StravaLoader

%matplotlib inline
from matplotlib import pyplot as plt
from IPython.core.pylabtools import figsize

Load data with `StravaLoader`

In [None]:
# Set up the custom loader for the selected dataset, reusing the notebook's
# Spark context and SQL context
sl = StravaLoader(
    's3',
    dataset,
    sc=sc,
    hiveContext=sqlContext,
)

# Fetch the tracking points as a DataFrame
df = sl.get_dataset()

Compute time gap between consecutive points

In [None]:
# Timestamp in seconds, parsed from the ISO-like <time> string
df = df.withColumn(
    'TIME_unix_time',
    unix_timestamp(df['time'], "yyyy-MM-dd'T'HH:mm:ss'Z'")
)

# Window over the tracking points of one <athlete> and <activity_type>, in
# chronological order. It is ordered by the 'TIME_unix_time' column created
# above, the same column the lag() calls and the per-block window rely on.
# The window spec is only resolved when it is applied to a DataFrame.
window = Window.partitionBy('athlete', 'activity_type').orderBy('TIME_unix_time')

Define function for calculation of speed

In [None]:
def calculate_speed(df):
    """Add distance and momentary speed columns to a DataFrame of tracking points.

    For every tracking point, the distance to the previous point in the same
    partition of the notebook-level ``window`` is computed with the haversine
    formula and divided by the elapsed time.

    Args:
        df: Spark DataFrame with ``@lat`` and ``@lon`` columns (converted with
            ``toRadians``, i.e. treated as degrees) and a ``TIME_unix_time``
            column in seconds.

    Returns:
        The DataFrame with the intermediate ``DIST_*`` columns added, plus
        ``DIST_diff_meters`` (distance to the previous point) and
        ``SPEED_kmh`` (momentary speed). Both are 0 where the underlying
        expression is null, e.g. for the first point of a partition.
    """

    R = 6371000 # Earth radius in meters

    # Latitude and longitude in radians
    df = df.withColumn(
        'DIST_@latR',
        toRadians(df['@lat'])
    )
    df = df.withColumn(
        'DIST_@lonR',
        toRadians(df['@lon'])
    )

    # Latitude and longitude in previous tracking point
    df = df.withColumn(
        'DIST_@latR_prev',
        lag('DIST_@latR', count=1).over(window)
    )
    df = df.withColumn(
        'DIST_@lonR_prev',
        lag('DIST_@lonR', count=1).over(window)
    )

    # Difference in latitude and longitude since previous tracking point
    df = df.withColumn(
        'DIST_@latR_diff',
        coalesce(df['DIST_@latR'] - df['DIST_@latR_prev'], lit(0))
    )
    # The longitude columns carry the same 'DIST_' prefix as the latitude ones
    df = df.withColumn(
        'DIST_@lonR_diff',
        coalesce(df['DIST_@lonR'] - df['DIST_@lonR_prev'], lit(0))
    )

    # Haversine distance calculation between two tracking points
    df = df.withColumn(
        'DIST_a',
        sin(df['DIST_@latR_diff']/2) * sin(df['DIST_@latR_diff']/2)
        + cos(df['DIST_@latR']) * cos(df['DIST_@latR_prev'])
        * sin(df['DIST_@lonR_diff']/2) * sin(df['DIST_@lonR_diff']/2)
    )
    df = df.withColumn(
        'DIST_c',
        2 * atan2(sqrt(df['DIST_a']), sqrt(1 - df['DIST_a']))
    )

    # Distance between consecutive points (angular distance 'DIST_c' times radius)
    df = df.withColumn(
        'DIST_diff_meters',
        coalesce(R * df['DIST_c'], lit(0))
    )

    # Momentary speed in km/h (3.6 converts m/s to km/h)
    df = df.withColumn(
        'SPEED_kmh',
        coalesce(
            3.6 * df['DIST_diff_meters'] / (df['TIME_unix_time'] - lag('TIME_unix_time', count=1).over(window)),
            lit(0)
        )
    )

    return df

Calculate speed, filter points at rest and recalculate speed.

In [None]:
# First pass: speed between every pair of consecutive raw tracking points
df = calculate_speed(df)

# Keep only the points where the athlete is moving, i.e. the speed is above the
# rest threshold configured for the point's activity type. Points whose
# activity type has no configured threshold are dropped.
moving = None
for activity_type, threshold_kmh in rest_speed_threshold_kmh.items():
    above_threshold = (df['activity_type'] == activity_type) & (df['SPEED_kmh'] > threshold_kmh)
    moving = above_threshold if moving is None else (moving | above_threshold)
if moving is None:
    raise ValueError('rest_speed_threshold_kmh must contain at least one activity type')
df = df.filter(moving)

# Second pass: the "previous point" changed for the neighbours of the removed
# points, so distance and speed are recomputed on the filtered data
df = calculate_speed(df)

Derive activity blocks by checking the time gap between consecutive tracking points

In [None]:
# Seconds elapsed since the previous tracking point of the same athlete and activity type
previous_time = lag('TIME_unix_time', count=1).over(window)
df = df.withColumn('TIME_unix_time_diff', df['TIME_unix_time'] - previous_time)

# 1 when the gap reaches the pause threshold (a new activity block starts), otherwise 0.
# A null comparison result (no previous point) is mapped to 0.
pause_threshold_seconds = pause_threshold_minutes * 60
starts_new_block = (df['TIME_unix_time_diff'] >= pause_threshold_seconds).cast('integer')
df = df.withColumn('BLOCK_isnew', coalesce(starts_new_block, lit(0)))

# Block sequence number per athlete and activity type: sum of the indicators over the window
df = df.withColumn('BLOCK_seqnum', sum('BLOCK_isnew').over(window))

# Block identifier of the form "<athlete>_<activity_type>_<integer>"
block_id_parts = [
    df['athlete'],
    lit('_'),
    lower(df['activity_type']),
    lit('_'),
    df['BLOCK_seqnum'].cast('string'),
]
df = df.withColumn('BLOCK_id', concat(*block_id_parts))

Compute activity block specific metrics, like accumulated distance within each block

In [None]:
# Tracking points of a single activity block, ordered by timestamp
window_block = Window.partitionBy('BLOCK_id').orderBy('TIME_unix_time')

# Distance in km summed over the block window (meters / 1000)
block_distance_meters = sum('DIST_diff_meters').over(window_block)
df = df.withColumn('DIST_BLOCK_km', block_distance_meters / 1000)

Finally, flatten the `DataFrame` and remove redundant columns 

In [None]:
# (source column, output name) pairs; nested extension fields are pulled up to
# top-level columns
renamed_columns = [
    ('@lat', 'lat'),
    ('@lon', 'lon'),
    ('ele', 'ele'),
    ('extensions.gpxtpx:TrackPointExtension.gpxtpx:atemp', 'atemp'),
    ('extensions.gpxtpx:TrackPointExtension.gpxtpx:cad', 'cad'),
    ('extensions.gpxtpx:TrackPointExtension.gpxtpx:hr', 'hr'),
    ('time', 'time'),
    ('TIME_unix_time', 'time_seconds'),  # Block time?
    ('athlete', 'athlete'),
    ('activity_type', 'activity_type'),
    ('BLOCK_id', 'block_id'),
]

# Derived columns that keep their name
kept_columns = ['SPEED_kmh', 'DIST_diff_meters', 'DIST_BLOCK_km']

dff = df.select(
    *(
        [df[source].alias(target) for source, target in renamed_columns]
        + [df[name] for name in kept_columns]
    )
)

Save flat dataset in parquet file format for later use

In [None]:
# Write the flat dataset as parquet, replacing any previous output for this dataset
output_path = 's3n://larsbk/parquet/%s/' % dataset
dff.write.mode('overwrite').parquet(output_path) # sl.path -> sl.root_path, sl.dataset