In [1]:
import os

import numpy as np
import pandas as pd
import sqlalchemy as sa

In [2]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import tools
import plotly.graph_objs as go
# Enable inline plotly rendering for the iplot() calls used in later cells.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

In [3]:
# SQLAlchemy engine for the Postgres database that holds the `activities` table.
# The URL can be overridden with the STRAVA_DB_URL environment variable so the
# notebook is not tied to a single machine; without it, the original local URL
# is used, so existing behaviour is unchanged.
DEFAULT_DB_URL = "postgresql://jakekirsch:@localhost/jakekirsch"
engine = sa.create_engine(os.environ.get("STRAVA_DB_URL", DEFAULT_DB_URL))


The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use "pip install psycopg2-binary" instead. For details see: <http://initd.org/psycopg/docs/install.html#binary-install-from-pypi>.



In [4]:
# Sanity check: total number of rows in the activities table.
engine.execute("SELECT COUNT(*) FROM activities;").fetchall()

[(328,)]

In [5]:
# List every column of the activities table together with its Postgres type.
schema_sql = """
SELECT column_name, data_type 
FROM INFORMATION_SCHEMA.COLUMNS 
WHERE table_name = 'activities';"""
engine.execute(schema_sql).fetchall()

[('achievement_count', 'text'),
 ('athlete_id', 'text'),
 ('athlete_resource_state', 'text'),
 ('athlete_count', 'text'),
 ('average_cadence', 'double precision'),
 ('average_heartrate', 'double precision'),
 ('average_speed', 'double precision'),
 ('average_temp', 'double precision'),
 ('comment_count', 'double precision'),
 ('commute', 'boolean'),
 ('display_hide_heartrate_option', 'boolean'),
 ('distance', 'double precision'),
 ('device_watts', 'text'),
 ('elapsed_time', 'double precision'),
 ('elev_high', 'double precision'),
 ('elev_low', 'double precision'),
 ('end_latlng', 'text'),
 ('external_id', 'text'),
 ('flagged', 'boolean'),
 ('from_accepted_tag', 'boolean'),
 ('gear_id', 'text'),
 ('has_heartrate', 'boolean'),
 ('has_kudoed', 'boolean'),
 ('heartrate_opt_out', 'boolean'),
 ('id', 'bigint'),
 ('kudos_count', 'text'),
 ('location_city', 'text'),
 ('location_country', 'text'),
 ('location_state', 'text'),
 ('manual', 'boolean'),
 ('map_id', 'text'),
 ('map_resource_state', 

In [6]:
# Earliest and latest activity start timestamps, i.e. the span the data covers.
date_range_sql = """
SELECT min(start_date), max(start_date)
FROM activities;
"""
engine.execute(date_range_sql).fetchall()

[(datetime.datetime(2016, 4, 15, 21, 52, 44), datetime.datetime(2019, 2, 2, 20, 5, 39))]

In [7]:
# Number of recorded activities for each activity type.
type_counts_sql = """
SELECT type, COUNT(*)
FROM activities
GROUP BY type;
"""
engine.execute(type_counts_sql).fetchall()

[('Swim', 6),
 ('StandUpPaddling', 1),
 ('AlpineSki', 2),
 ('Ride', 14),
 ('Hike', 5),
 ('Run', 300)]

Don't remember when I used Strava for SUP! OK, I'm primarily focused on the runs. Build a summary table that shows the number of runs and the total mileage per month.

Snippet from Strava --

distance: float	The activity's distance, in meters

moving_time :integer	The activity's moving time, in seconds

elapsed_time :integer	The activity's elapsed time, in seconds

Let's start by writing a SQL function that converts a distance column from meters to miles (the same approach would work for converting a column in seconds to a time interval)

In [8]:
# (Re)create the SQL helper meters_to_miles, which divides a distance in
# meters by 1609.344. The function is dropped first so the cell can be re-run.
create_fn_sql = """
DROP FUNCTION IF EXISTS meters_to_miles;

CREATE FUNCTION meters_to_miles(meters float) RETURNS float AS 
$$
SELECT meters / 1609.344;
$$
LANGUAGE SQL
RETURNS NULL ON NULL INPUT;"""
engine.execute(create_fn_sql)

<sqlalchemy.engine.result.ResultProxy at 0x118f84668>

In [28]:
# Monthly run totals. total_dist converts the summed meters once, while
# total_dist_v2 converts every run to miles before summing.
monthly_sql = """
SELECT 
    to_char(date_trunc('month', start_date_local), 'YYYY-MM') as year_month,
    COUNT(type) as num_runs,
    meters_to_miles(CAST(SUM(distance) AS float)) as total_dist,
    SUM(meters_to_miles(CAST(distance AS float))) as total_dist_v2,
    'Run' as type
FROM activities
WHERE type = 'Run'
GROUP BY to_char(date_trunc('month', start_date_local), 'YYYY-MM')
ORDER BY to_char(date_trunc('month', start_date_local), 'YYYY-MM');
"""
runs_by_month = pd.read_sql(monthly_sql, con=engine)

In [29]:
# Dual-axis line chart: number of runs on the left axis, total miles on the
# right axis, both by month.
data = [
    go.Scatter(
        x=runs_by_month['year_month'],
        y=runs_by_month['num_runs'],
        xaxis='x',
        name="Total Runs"
    ),

    go.Scatter(
        x=runs_by_month['year_month'],
        y=runs_by_month['total_dist'],
        yaxis='y2',  # drawn against the secondary (right-hand) axis
        xaxis='x',
        name="Total Miles"
    )
]

layout = go.Layout(
    title='Runs by Month',
    xaxis={"title":'Month'},
    yaxis={"title":'Number of Runs',
          "rangemode":"tozero"},
    # The secondary axis previously had no title, so the right-hand scale
    # could not be read without consulting the legend.
    yaxis2={"title":'Total Miles',
            "overlaying":"y",
            "side":"right",
           "rangemode":"tozero"},
    legend={"x":.35, "y":1.1,
           "orientation":"h"}
)

figure = go.Figure(data=data, layout=layout)

In [30]:
# Render the "Runs by Month" figure inline.
iplot(figure)

In [33]:
# Weekly version of the monthly summary: run count and total miles per week.
weekly_sql = """
SELECT 
    date_trunc('week', start_date_local)::date week,
    COUNT(type) total_runs,
    SUM(meters_to_miles(distance)) total_distance
FROM activities
WHERE type = 'Run'
GROUP BY date_trunc('week', start_date_local)
ORDER BY week
"""
runs_by_week = pd.read_sql(weekly_sql, con=engine)

In [None]:
# Weekly counterpart of the monthly chart: run count on the left axis and
# total distance on the right axis.
weeks = runs_by_week['week']

trace1 = go.Scatter(
    name="Total Runs",
    x=weeks,
    y=runs_by_week['total_runs'],
)
trace2 = go.Scatter(
    name="Total Distance",
    x=weeks,
    y=runs_by_week['total_distance'],
    yaxis='y2',
)

layout = go.Layout(
    title="Runs by Week",
    xaxis=dict(title="Weeks"),
    yaxis=dict(title="Number of Runs", rangemode="tozero"),
    yaxis2=dict(
        title="Total Mileage",
        rangemode="tozero",
        overlaying="y",
        side="right",
    ),
)
figure = go.Figure(data=[trace1, trace2], layout=layout)

In [37]:
iplot(figure) # a little too granular I guess, kinda jarring to look at

As expected, mileage trends with number of runs, however in 2018, it looks like I was more frequently doing shorter runs. 

It's also interesting to see the seasonal nature of my running. I get into it, then take some time off/get injured

For more SQL practice, let's see what it looks like when I break down my runs by day of week and hour. Am I a morning runner or an evening runner?

In [12]:
# Run count and total miles per day of week. The label combines the numeric
# day of week from EXTRACT(DOW ...) with the day name, so ordering by the label
# follows the numeric prefix rather than the alphabet.
day_of_week_sql = """
SELECT
    'Run' as type,
    COUNT(type) as total_runs,
    SUM(meters_to_miles(CAST(distance AS FLOAT))) as total_miles,
    EXTRACT(DOW FROM start_date_local) || ' ' || to_char(start_date_local, 'DAY') day_of_week_name
FROM activities
WHERE type = 'Run'
GROUP BY day_of_week_name
ORDER BY day_of_week_name;
"""
runs_by_day = pd.read_sql(day_of_week_sql, con=engine)

In [13]:
# Runs per weekday as bars (left axis) with total miles as a line (right axis).
weekday_labels = runs_by_day["day_of_week_name"]

runs_bar = go.Bar(
    name="Total Runs",
    x=weekday_labels,
    y=runs_by_day["total_runs"],
    xaxis="x",
    yaxis="y",
)
miles_line = go.Scatter(
    name="Total Miles",
    x=weekday_labels,
    y=runs_by_day["total_miles"],
    xaxis="x",
    yaxis="y2",
)
data = [runs_bar, miles_line]

layout = go.Layout(
    title="Runs by Day of Week",
    yaxis=dict(title="Total Runs"),
    yaxis2=dict(title="Total Mileage", overlaying="y", side="right"),
)

figure = go.Figure(data=data, layout=layout)

In [14]:
# Render the "Runs by Day of Week" figure inline.
iplot(figure)

Saturday morning church of the long run

Also, I love taking it easy on Tuesdays

Let's add some more data points here

In [15]:
# Per-weekday run statistics. Two pace figures are computed:
#   avg_pace      – total minutes divided by total miles for the weekday
#   pace_averaged – mean of each run's own minutes-per-mile
dow_stats_sql = """
SELECT
    COUNT(type) as total_runs,
    SUM(moving_time) / 60.0 as total_moving_time,
    SUM(meters_to_miles(distance)) as total_mileage,
    AVG(meters_to_miles(distance)) as avg_distance_per_run,
    (SUM(moving_time) / 60.0) / SUM(meters_to_miles(distance)) as avg_pace,
    AVG((moving_time / 60.0) / meters_to_miles(distance)) as pace_averaged,
    EXTRACT(DOW FROM start_date_local) || ' ' || TO_CHAR(start_date_local, 'Day') as dow
FROM activities
WHERE type = 'Run'
GROUP BY dow
ORDER BY dow;
"""
runs_dow = pd.read_sql(dow_stats_sql, con=engine)

Let's practice creating subplots with this data - we'll create two plots

In [16]:
# Two side-by-side panels by day of week:
#   left  – run count (bars) with total mileage (line) on a secondary axis
#   right – average distance per run (bars) with average pace (line) on a
#           secondary axis
# Axis ids are deliberately not set on the traces: append_trace assigns each
# trace to the axes of its subplot cell, and the two line traces are moved to
# the secondary axes (y3 / y4) explicitly below. The ids that used to be passed
# here contradicted that final mapping and were simply overwritten.
trace1 = go.Bar(
        x=runs_dow['dow'],
        y=runs_dow['total_runs'],
        name='Total Runs')
trace2 = go.Scatter(
        x=runs_dow['dow'],
        y=runs_dow['total_mileage'],
        name='Total Mileage')
trace3 = go.Bar(
        x=runs_dow['dow'],
        y=runs_dow['avg_distance_per_run'],
        name='Avg Mileage')
trace4 = go.Scatter(
        x=runs_dow['dow'],
        y=runs_dow['avg_pace'],
        name='Avg Pace')

fig = tools.make_subplots(rows=1, cols=2, horizontal_spacing = .15)

# Left panel is grid cell (1, 1) -> x1/y1; right panel is (1, 2) -> x2/y2.
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 1)
fig.append_trace(trace3, 1, 2)
fig.append_trace(trace4, 1, 2)

# Primary (left-hand) axes of the two panels.
fig['layout']['yaxis1'].update({'title':'Total Runs'})
fig['layout']['yaxis2'].update({'title':'Avg Mileage'})

# Secondary (right-hand) axes, one overlaid on each panel's primary axis.
fig['layout']["yaxis3"] = {"title":"Total Mileage",
                          "overlaying":"y",
                          "side":"right",
                          "anchor":"x"}

fig['layout']['yaxis4'] = {'title':'Avg Pace',
                          'overlaying':'y2',
                          'side':'right',
                          "anchor":"x2"}

# Move the line traces (second and fourth appended) onto the secondary axes.
fig['data'][1].update(yaxis='y3')
fig['data'][3].update(yaxis='y4')

fig['layout']['legend'] = {"orientation":"h",
                          "x":.25,
                          "y":-.24}

fig['layout'].update(title="Run Breakdown by Day of Week")

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



Layout({
    'legend': {'orientation': 'h', 'x': 0.25, 'y': -0.24},
    'title': {'text': 'Run Breakdown by Day of Week'},
    'xaxis': {'anchor': 'y', 'domain': [0.0, 0.425]},
    'xaxis2': {'anchor': 'y2', 'domain': [0.575, 1.0]},
    'yaxis': {'anchor': 'x', 'domain': [0.0, 1.0], 'title': {'text': 'Total Runs'}},
    'yaxis2': {'anchor': 'x2', 'domain': [0.0, 1.0], 'title': {'text': 'Avg Mileage'}},
    'yaxis3': {'anchor': 'x', 'overlaying': 'y', 'side': 'right', 'title': {'text': 'Total Mileage'}},
    'yaxis4': {'anchor': 'x2', 'overlaying': 'y2', 'side': 'right', 'title': {'text': 'Avg Pace'}}
})

In [17]:
# Render the "Run Breakdown by Day of Week" subplots inline.
iplot(fig)

Not surprising that I run the longest on the weekends, but a little surprising that my fastest runs are on Wednesday. But I'm still slow? I think that has to do with the fact that I'm running trails with lots of elevation

Before diving into that, let's take a look at a breakdown by start hour

Also would like to see box plots for avg mileage and pace for each day

In [20]:
# Run statistics grouped by the local hour at which the run started.
#   avg_pace      – total minutes divided by total miles for the hour
#   pace_averaged – mean of each run's own minutes-per-mile
hourly_sql = """
SELECT 
    EXTRACT(HOUR FROM start_date_local) as start_hour,
    COUNT(type) as total_runs,
    SUM(meters_to_miles(distance)) as total_mileage,
    SUM(moving_time / 60.0) as total_moving_time,
    AVG(meters_to_miles(distance)) as avg_mileage,
    AVG(moving_time / 60.0) as avg_moving_time,
    SUM(moving_time / 60.0) / SUM(meters_to_miles(distance)) as avg_pace,
    AVG( (moving_time / 60.0) / meters_to_miles(distance)) as pace_averaged
FROM activities
WHERE type = 'Run'
GROUP BY EXTRACT(HOUR FROM start_date_local)
ORDER BY EXTRACT(HOUR FROM start_date_local);
"""
runs_by_hour = pd.read_sql(hourly_sql, con=engine)

In [87]:
# Two side-by-side panels by start hour:
#   left  – number of runs (bars) with total mileage (line)
#   right – average mileage (bars) with average pace (line)
fig = tools.make_subplots(rows = 1, cols = 2, horizontal_spacing=.2)

trace1 = go.Bar(
    x=runs_by_hour['start_hour'],
    y=runs_by_hour['total_runs'],
    name='Number of Runs'
)

trace2 = go.Scatter(
    x=runs_by_hour['start_hour'],
    y=runs_by_hour['total_mileage'],
    name='Total Mileage'
)

trace3 = go.Bar(
    x=runs_by_hour['start_hour'],
    y=runs_by_hour['avg_mileage'],
    name='Average Mileage'
)

trace4 = go.Scatter(
    x=runs_by_hour['start_hour'],
    y=runs_by_hour['avg_pace'],
    name='Average Pace'
)

# The order of these calls fixes the indices used on fig['data'] below.
fig.append_trace(trace1, row = 1, col = 1)
fig.append_trace(trace2, row = 1, col = 1)
fig.append_trace(trace3, row = 1, col = 2)
fig.append_trace(trace4, row = 1, col = 2)

# Move the two line traces (trace2 and trace4) onto the secondary axes.
fig['data'][1].update(yaxis='y3')
fig['data'][3].update(yaxis='y4')

# create secondary axes for trace2 and trace4 (fig['data'][1] and fig['data'][3])
fig['layout']['yaxis3'] = {"title":"Total Mileage",
                          "overlaying":"y",
                          "side":"right",
                          "anchor":"x",
                          "rangemode":"tozero"} 
fig['layout']['yaxis4'] = {"title":"Average Pace",
                          "overlaying":"y2",
                          "side":"right",
                          "anchor":"x2"}

# Titles for the figure and for the primary axes of both panels.
fig['layout'].update(title="Running Summary by Hour of Day")
fig['layout'].update(yaxis={"title":"Number of Runs"})
fig['layout'].update(yaxis2={"title":"Average Mileage"})
fig['layout'].update(xaxis={"title":"Time of Day"})
fig['layout'].update(xaxis2={"title":"Time of Day"})
fig['layout']['legend'] = {"orientation":"h",
                          "y":1.1, "x":.2}

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [88]:
# Render the "Running Summary by Hour of Day" subplots inline.
iplot(fig)

I'm a bit surprised at how many runs occur in the middle of the day...

Looks like I enjoy the early morning run, as well as the lunchtime run. Not sure what I was doing running at 10pm...

On average I run the fastest when I run at 5pm, possibly because I'm blowing off steam after work? Likely going for shorter runs as well.

Now what will be interesting to look at is elevation - I remember that my Garmin was acting up throughout the year, and I would get elevation gains that were absurd.

I think the best way to handle this is to create a cleaned data column in the database for elevation and set to NULL if the elevation / mile is greater than some cutoff value. To determine that, let's look at a scatter plot of mileage vs elevation

In [113]:
# One row per run: start time, distance in miles and the raw elevation gain.
raw_elevation_sql = """
SELECT
    start_date_local,
    meters_to_miles(distance) miles,
    total_elevation_gain
FROM activities
WHERE type = 'Run'
ORDER BY start_date_local;
"""
mileage_elevation = pd.read_sql(raw_elevation_sql, con=engine)

In [118]:
# Elevation gain per mile for every run, plotted against the run date, to spot
# runs whose recorded elevation is implausibly steep.
gain_per_mile = mileage_elevation['total_elevation_gain'] / mileage_elevation['miles']

trace1 = go.Scatter(
    x=mileage_elevation['start_date_local'],
    y=gain_per_mile,
    name="Elevation Gain per Mile",
    mode='markers'
)

# The x-axis carries the run date and the y-axis a gain-per-mile ratio, so they
# are labelled as such (they were previously labelled "Mileage" and
# "Elevation", which did not match the plotted data).
layout = go.Layout(
    title="Elevation Gain per Mile over Time",
    xaxis={"title":"Date"},
    yaxis={"title":"Elevation Gain per Mile"}
)

figure = go.Figure(data = [trace1], layout=layout)
iplot(figure)

Looks like 300 is a good cutoff - that is damn steep

In [131]:
# Add a cleaned elevation column: keep total_elevation_gain unless the gain per
# mile exceeds the cutoff of 300, in which case store NULL. The column is
# dropped and re-added first so the cell can be re-run.
# NULLIF turns a zero distance into NULL so such a row cannot raise a
# division-by-zero error: its ratio is NULL, the comparison is not true, and
# the original elevation value is kept.
engine.execute("""
ALTER TABLE activities
    DROP IF EXISTS total_elevation_clean;

ALTER TABLE activities
    ADD COLUMN total_elevation_clean float;

UPDATE activities
SET total_elevation_clean = CASE WHEN
    (total_elevation_gain / NULLIF(meters_to_miles(distance), 0)) > 300 THEN NULL
    ELSE total_elevation_gain END;
""")

<sqlalchemy.engine.result.ResultProxy at 0x11b8f3400>

In [132]:
# Reload the per-run frame, this time with the cleaned elevation column.
clean_elevation_sql = """
SELECT
    start_date_local,
    meters_to_miles(distance) miles,
    total_elevation_clean
FROM activities
WHERE type = 'Run'
ORDER BY start_date_local;
"""
mileage_elevation = pd.read_sql(clean_elevation_sql, con=engine)

In [133]:
# Scatter of run distance against cleaned elevation gain, one marker per run.
trace1 = go.Scatter(
    mode='markers',
    name="Mileage vs Elevation",
    x=mileage_elevation['miles'],
    y=mileage_elevation['total_elevation_clean'],
)

layout = go.Layout(
    title="Mileage vs Elevation",
    xaxis=dict(title="Mileage"),
    yaxis=dict(title="Elevation"),
)

figure = go.Figure(data=[trace1], layout=layout)
iplot(figure)

In [135]:
# Monthly total mileage and total cleaned elevation gain. Runs whose cleaned
# elevation is NULL are excluded from both sums.
monthly_elevation_sql = """
SELECT 
    to_char(date_trunc('month', start_date_local), 'YYYY-MM') date_month,
    SUM(meters_to_miles(distance)) total_mileage,
    SUM(total_elevation_clean) total_elevation_gain
FROM activities
WHERE type = 'Run' AND total_elevation_clean IS NOT NULL
GROUP BY date_trunc('month', start_date_local)
ORDER BY date_month
"""
runs_detail = pd.read_sql(monthly_elevation_sql, con=engine)

In [136]:
# Monthly mileage (left axis) against monthly elevation gain (right axis).
months = runs_detail['date_month']

trace1 = go.Scatter(
    name="Total Mileage",
    x=months,
    y=runs_detail['total_mileage'],
)
trace2 = go.Scatter(
    name='Total Elevation Gain',
    x=months,
    y=runs_detail['total_elevation_gain'],
    yaxis='y2',
)

layout = go.Layout(
    title="Elevation and Mileage by Month",
    xaxis=dict(title="Month"),
    yaxis=dict(title="Total Miles", rangemode="tozero"),
    yaxis2=dict(
        title="Total Elevation (Meters)",
        overlaying="y",
        side="right",
        rangemode="tozero",
    ),
)

figure = go.Figure(data=[trace1, trace2], layout=layout)

In [137]:
# Render the "Elevation and Mileage by Month" figure inline.
iplot(figure)

My elevation-to-mileage ratio has really been dropping off this past year...

When was I running the fastest?

In [159]:
# Monthly pace summary for runs with a usable (non-NULL) cleaned elevation.
# Note: the total_miles column is AVG(total_elevation_clean / miles), i.e. the
# average elevation gain per mile, not a mileage figure, despite its alias.
# avg_pace is total minutes divided by total miles for the month.
run_speed_sql = """
SELECT 
    to_char(date_trunc('month', start_date_local), 'YYYY-MM') AS month,
    AVG(total_elevation_clean / meters_to_miles(distance)) total_miles,
    SUM(moving_time / 60.0) / SUM(meters_to_miles(distance)) AS avg_pace
FROM activities
WHERE type='Run' AND total_elevation_clean IS NOT NULL
GROUP BY to_char(date_trunc('month', start_date_local), 'YYYY-MM')
ORDER BY month
"""
run_speed = pd.read_sql(run_speed_sql, con=engine)

In [160]:
# Monthly average elevation gain per mile (left axis) against pace (right
# axis). run_speed['total_miles'] holds elevation gain per mile despite its
# name, so the labels describe that quantity; they previously described it as
# mileage / run length, which is not what the column contains.
trace1 = go.Scatter(
    x=run_speed['month'],
    y=run_speed['total_miles'],
    name="Average Elevation Gain per Mile"
)

trace2 = go.Scatter(
    x=run_speed['month'],
    y=run_speed['avg_pace'],
    name='Average Pace per Run',
    yaxis="y2"
)

layout = go.Layout(
    title="Pace and Elevation Gain per Mile over Time",
    xaxis={"title":"Month"},
    yaxis={"title":"Average Elevation Gain per Mile"},
    yaxis2={"title":"Average Pace",
           "overlaying":"y",
           "side":"right"}
)

figure = go.Figure(data = [trace1, trace2], layout = layout)

In [161]:
# Render the monthly pace figure inline.
iplot(figure)

Make a simple dashboard using Dash
- explore polymap
- explore stream data (heartrate, elevation, speed)
- use views
- use windowing functions
- joins
- multiple databases
