### Analysis of single activities

A record of preliminary experiments to plot autocorrelation functions and develop models to predict an expensive measurement (e.g., power) from common/cheap measurements (e.g., VAM and speed). 

In [None]:
import os
import re
import sys
import time
import datetime
import psycopg2
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot as plt

In [None]:
sys.path.insert(0, '../')
import cypy2

In [None]:
from cycler import cycler

# Name the nine colours of seaborn's 'Set1' palette so they can be referenced
# individually below and in later cells.
red, blue, green, purple, orange, yellow, brown, pink, gray  = sns.color_palette('Set1')

# Default line-colour cycle and a wide, short default figure size.
mpl.rcParams.update({
    'axes.prop_cycle': cycler(color=[blue, orange, green, red, brown, gray]),
    'figure.figsize': [12, 3],
})

In [None]:
# Enable matplotlib integration with no explicit backend argument
# (NOTE(review): this presumably selects the default GUI backend rather than
# inline rendering — confirm that is intended).
%matplotlib 
# Reload imported modules automatically before each cell executes, so edits to
# the local cypy2 package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Connection settings for the local cypy2 Postgres database.
user, host, dbname = 'keith', 'localhost', 'cypy2'

# No password is passed here; authentication is left to the local Postgres setup.
conn = psycopg2.connect(dbname=dbname, user=user, host=host)

In [None]:
# Load the full metadata table, most recent Strava timestamp first.
md = pd.read_sql(
    sql='select * from metadata order by strava_timestamp desc',
    con=conn,
)

In [None]:
# Activities whose Strava title contains the literal text '3x10'.
# The vectorised string accessor replaces the per-row lambda; regex=False keeps
# it a plain substring match, and na=False treats a missing title as
# "no match" instead of raising on a non-string value.
md.loc[md.strava_title.str.contains('3x10', regex=False, na=False)]

In [None]:
# 3x SPD rides from 2016 - 2019 (loaded newest first; each id is the
# activity's timestamp-style identifier in the database)
a_3x10_2019, a_3x10_2018, a_3x10_2017, a_3x10_2016 = (
    cypy2.Activity.from_db(conn, activity_id)
    for activity_id in (
        '20190301001401',
        '20181125232717',
        '20171202231443',
        '20161027001349',
    )
)

# 2018 horseshoe meadows road
a_hmr = cypy2.Activity.from_db(conn, '20180923163103')

In [None]:
# The four 3x10 rides in chronological order (oldest first).
activities = [
    a_3x10_2016,
    a_3x10_2017,
    a_3x10_2018,
    a_3x10_2019,
]

In [None]:
# Bind `a` explicitly: on a fresh kernel nothing has defined it yet at this
# point, so the plot call would raise NameError. The 2019 ride is used because
# that is what `a` is bound to wherever it is assigned in this notebook.
a = a_3x10_2019

# Power, VAM and altitude on separate axes against elapsed seconds.
# NOTE(review): xrange=[] presumably means "full range" and halflife=1 a light
# smoothing — confirm against cypy2.Activity.plot.
a.plot(['power', 'vam', 'altitude'], xmode='seconds', overlay=False, xrange=[], halflife=1)

In [None]:
# One overlaid power/VAM figure per 3x10 ride, titled with the activity id so
# the figures can be told apart.
for a in activities:
    a.plot(
        ['power', 'vam'],
        xmode='seconds',
        overlay=True,
        xrange=[],
        halflife=5,
    )
    plt.title(a.metadata.activity_id)

In [None]:
# Same overlaid power/VAM view for the Horseshoe Meadows Road ride.
a_hmr.plot(
    ['power', 'vam'],
    xmode='seconds',
    overlay=True,
    xrange=[],
    halflife=5,
)

In [None]:
# Altitude profile of the Horseshoe Meadows Road ride against elapsed seconds,
# used to sanity-check the altimeter readings behind the VAM values.
a_hmr.plot(['altitude'], xmode='seconds')

Figuring out why the model that estimates power from VAM and speed, trained on the SPD rides (see below), fails to predict power for the HMR ride. The answer is likely a combination of altimeter error and greater bike weight.

In [None]:
# Displays two multiplicative corrections to apply to HMR VAM before comparing
# it with power; the tuple is evaluated only so the notebook shows both values.
(
    # VAM correction multiplier for HMR: ratio of the expected climb to the
    # recorded climb (the altimeter's max elevation was ~9600 instead of ~10000).
    # NOTE(review): the expression uses 9930, not 10000, as the true maximum and
    # 3850 presumably as the starting elevation — confirm both values and units.
    (9930 - 3850)/(9600 - 3850),

    # VAM weight multiplier (using Lynskey + backpack): ratio 80/72,
    # presumably total system weight on this ride vs. the SPD rides — TODO confirm.
    80/72
)

In [None]:
# Measured power for the HMR ride, smoothed with an exponentially weighted
# mean (halflife of 10 rows).
plt.plot(a_hmr.records('proc').power.ewm(halflife=10).mean())
# Smoothed VAM rescaled into the power range for visual comparison:
#   270/1300 - presumably an empirical power-per-VAM ratio (TODO confirm source)
#   1.06     - approximately (9930 - 3850)/(9600 - 3850), the altimeter correction
#   80/72    - the weight multiplier (Lynskey + backpack)
plt.plot(a_hmr.records('proc').vam.ewm(halflife=10).mean() * (270/1300) * 1.06 * (80/72))

# Clip the y-axis to 0-300 so both traces are readable.
plt.gca().set_ylim([0, 300])

In [None]:
# Sample-to-sample altitude change over rows 500-800 of the 2018 and 2019
# rides, to compare how the altimeter signal steps in each recording.
altitude_2018 = a_3x10_2018.records('proc').altitude.values
altitude_2019 = a_3x10_2019.records('proc').altitude.values

plt.plot(np.diff(altitude_2018[500:800]))
plt.plot(np.diff(altitude_2019[500:800]))

In [None]:
# Raw (unsmoothed) VAM traces of the 2018 and 2019 rides on a shared axis.
vam_2018 = a_3x10_2018.records('proc').vam.values
vam_2019 = a_3x10_2019.records('proc').vam.values

plt.plot(vam_2018)
plt.plot(vam_2019)
plt.gca().set_ylim([0, 1800])

### Histograms

In [None]:
# Histogram bin edges per record column: np.arange(start, stop, step), so the
# stop value itself is excluded from the edges.
bins = dict(
    speed=np.arange(5, 15, .3),
    vam=np.arange(500, 1600, 50),
    cadence=np.arange(60, 90, 1),
    power=np.arange(200, 320, 5),
    heart_rate=np.arange(120, 180, 2),
)

In [None]:
# Joint distribution of two processed channels for a single activity.
a = a_3x10_2019
columns = ['power', 'vam']
x_vals, y_vals = (a.records('proc')[c].values for c in columns)
x_bins, y_bins = (bins[c] for c in columns)

# Keep only samples strictly inside the outermost bin edges on both axes.
# Comparisons against NaN are False, so missing samples are dropped as well.
x_in_range = (x_vals > x_bins.min()) & (x_vals < x_bins.max())
y_in_range = (y_vals > y_bins.min()) & (y_vals < y_bins.max())
mask = x_in_range & y_in_range
x_vals, y_vals = x_vals[mask], y_vals[mask]

h = plt.hist2d(x_vals, y_vals, bins=(x_bins, y_bins))

### Histograms of adjacent timepoints

In [None]:
# 2-D histogram of a channel against itself `offset` samples earlier.
offset = 1
column = 'power'

vals = a.records('proc')[column].values

# Pair each sample with the one `offset` rows before it FIRST, and only then
# discard pairs containing a NaN. Dropping NaNs before pairing would splice the
# series together across gaps and count samples on either side of a gap as
# adjacent timepoints.
pairs = np.vstack((vals[offset:], vals[:-offset]))
pairs = pairs[:, ~np.isnan(pairs).any(axis=0)]

# The same bin edges are used on both axes.
h = plt.hist2d(pairs[0], pairs[1], bins=bins[column])

### Autocorrelation functions

In [None]:
def autocorr(x, max_offset, stepsize=1, drop_zeros=True):
    """Estimate the autocorrelation of a 1-D series at a range of lags.

    For each lag the series is paired with a copy of itself shifted by that
    many samples, and the Pearson correlation of the surviving pairs is taken.

    Parameters
    ----------
    x : array-like
        The series; coerced to a float array. NaN marks missing samples.
    max_offset : int
        Exclusive upper bound on the lags: lags run over
        ``np.arange(1, max_offset, stepsize)``.
    stepsize : int, optional
        Spacing between consecutive lags.
    drop_zeros : bool, optional
        If True, pairs in which either sample is exactly zero are discarded.

    Returns
    -------
    offsets : numpy.ndarray of int
        The lags that were evaluated.
    corrs : list of float
        Correlation at each lag. NaN where fewer than two pairs survive the
        filtering or where either side has zero variance.
    """
    x = np.asarray(x, dtype=float)
    offsets = np.arange(1, max_offset, stepsize).astype(int)

    corrs = []
    for offset in offsets:
        # row 0: the series from `offset` onward; row 1: the series `offset` samples earlier
        pairs = np.vstack((x[offset:], x[:-offset]))

        # drop pairs in which either sample is missing
        pairs = pairs[:, ~np.isnan(pairs).any(axis=0)]

        if drop_zeros:
            pairs = pairs[:, pairs.all(axis=0)]

        # A covariance needs at least two pairs; report NaN explicitly rather
        # than letting np.cov emit degrees-of-freedom warnings.
        if pairs.shape[1] < 2:
            corrs.append(np.nan)
            continue

        cov = np.cov(pairs[0, :], pairs[1, :])
        denom = np.sqrt(cov[0, 0] * cov[1, 1])

        # Zero variance on either side makes the correlation undefined (0/0).
        corrs.append(cov[0, 1] / denom if denom > 0 else np.nan)

    return offsets, corrs

In [None]:
# `recsp` was never defined in the notebook, so this cell failed on a fresh
# kernel. Bind it to the processed records of the currently selected activity `a`.
recsp = a.records('proc')

# Autocorrelation of each channel for lags 1..119 samples.
for col in ['heart_rate', 'power', 'cadence', 'speed']:
    plt.plot(*autocorr(recsp[col].values, 120, 1, drop_zeros=True), marker='.', label=col)
plt.legend()
plt.gca().set_ylim([-.1, 1.1])

### Models to predict future values or other time series

In [None]:
from sklearn import ensemble
from sklearn import linear_model
from sklearn import model_selection

In [None]:
def make_windows(records, columns, window_size, step_size, halflife=None):
    """Cut several columns of `records` into aligned sliding windows.

    Each requested column is optionally smoothed with an exponentially weighted
    mean, then windowed with ``cypy2.utils.sliding_window``. A window position
    is kept only if it is NaN-free in every column, so the returned arrays stay
    row-aligned with each other.

    Parameters
    ----------
    records : pandas.DataFrame
        Source records; must contain every name in `columns`.
    columns : sequence of str
        Columns to window.
    window_size, step_size : int
        Passed straight through to ``cypy2.utils.sliding_window``.
    halflife : float, optional
        If truthy, each column is smoothed with ``ewm(halflife=halflife).mean()``
        before windowing.

    Returns
    -------
    list of numpy.ndarray
        One 2-D array per column, in the order of `columns`, with one window
        per row (assumes sliding_window returns an (n_windows, window_size)
        array — TODO confirm). Empty list if `columns` is empty.
    """
    windows = []
    mask = None
    for column in columns:
        vals = records[column]
        if halflife:
            vals = vals.ewm(halflife=halflife).mean()

        _windows = cypy2.utils.sliding_window(vals, window_size, step_size)
        _mask = np.isnan(_windows).any(axis=1)
        windows.append(_windows)

        # Accumulate "has a NaN in any column" with a non-mutating OR, so the
        # first column's own mask array is not modified in place.
        mask = _mask if mask is None else (mask | _mask)

    # No columns requested: there is nothing to mask.
    if mask is None:
        return windows

    return [w[~mask, :] for w in windows]

### Autoregressive model

In [None]:
# `recsp` was never defined in the notebook, so this cell failed on a fresh
# kernel. Bind it to the processed records of the currently selected activity `a`.
recsp = a.records('proc')

# Overlapping heart-rate windows of 10 samples, advancing one sample at a time.
# make_windows returns one array per column, hence the single-element unpacking.
windows, = make_windows(recsp, ['heart_rate'], 10, 1)

In [None]:
# Autoregressive design: the first 9 samples of each window are the features,
# the final sample is the target.
X = windows[:, :-1]
y = windows[:, -1]

# Fix the seed so the split (and the fitted coefficients) are the same on every
# run of the notebook.
# NOTE(review): the windows overlap heavily, so a shuffled split puts
# near-duplicate rows on both sides; treat the test score as optimistic.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=.8, random_state=0)

In [None]:
# Ordinary least-squares fit of the autoregressive model; fit() returns the
# estimator itself, so construction and fitting can be chained.
m = linear_model.LinearRegression().fit(X_train, y_train)

# One coefficient per lagged sample in the window.
m.coef_

In [None]:
# Predict over every window — training and held-out rows alike — so this plot
# is a visual check of the fit, not an out-of-sample evaluation.
yp = m.predict(X)
# Actual target values first, then the model's predictions on the same axes.
plt.plot(y)
plt.plot(yp)

### Predict power from VAM and speed

In [None]:
# Samples per window for the power model.
window_size = 10

# Hold out the first (oldest) ride for testing; train on all the others.
activity_test, *activities_train = activities

In [None]:
def windowed_xy(records):
    """Build the feature matrix and target vector from one activity's records.

    Features are the smoothed VAM window followed by the smoothed speed window;
    the target is the last smoothed power sample of the same window. Windows do
    not overlap (step size equals window size).
    """
    vam_wins, pwr_wins, speed_wins = make_windows(
        records, ['vam', 'power', 'speed'], window_size, window_size, halflife=10)
    return np.concatenate((vam_wins, speed_wins), axis=1), pwr_wins[:, -1]


# train
# Use only the training rides: building the training set from every ride would
# include the held-out test ride and leak it into the model. Each ride is
# smoothed and windowed on its own, so neither the EWM smoothing nor any window
# straddles the boundary between two unrelated rides.
train_xy = [windowed_xy(a_train.records('proc')) for a_train in activities_train]
X_train = np.concatenate([X_a for X_a, _ in train_xy], axis=0)
y_train = np.concatenate([y_a for _, y_a in train_xy], axis=0)

# test
X_test, y_test = windowed_xy(activity_test.records('proc'))

In [None]:
# Random forest mapping the (VAM window, speed window) features to power.
# The seed is fixed so that refitting on a kernel restart gives the same model
# and therefore the same prediction plot.
m = ensemble.RandomForestRegressor(random_state=0)
m.fit(X_train, y_train)

In [None]:
# Predicted power for the test windows.
yp = m.predict(X_test)
# Measured target (last smoothed power sample of each window) first, then the
# model's prediction, one point per window.
plt.plot(y_test)
plt.plot(yp)

In [None]:
# Close-up of power vs. VAM on the HMR ride.
# NOTE(review): xrange presumably limits the x-axis to 3333-5000 in the units
# selected by xmode (seconds) — confirm against cypy2.Activity.plot.
a_hmr.plot(
    ['power', 'vam'],
    xmode='seconds',
    overlay=True,
    xrange=[3333, 5000],
)