# Explore and Plot my running data
---
### Learning Objectives:
  * Load and Cleanse data
  * Explore data

### Dataset:
  * My Strava (http://www.strava.com) running history
    * Can be obtained through the API (https://developers.strava.com/) or by downloading it from your profile
---

## Load Data
---

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Read the Strava activity export, parsing 'Activity Date' into datetimes.
data = pd.read_csv(
    'data/activities.csv',
    parse_dates=['Activity Date'],
)

# Preview the first rows of the raw data.
data.head()

In [None]:
# List every column name available in the raw export.
data.columns

## Clean up Data:
---
1. Pick the relevant columns
1. Keep only activities that are runs (no workouts, swims, hikes, etc.)
1. Drop rows with missing values (NAs)
1. Add a calculated column for Average Speed

In [None]:
# Columns kept for the running analysis.
# NOTE(review): 'Distance.1' is presumably the second 'Distance' column of the
# export (pandas suffixes repeated headers) - confirm against the CSV.
run_cols = [
    'Activity Date',
    'Activity Name',
    'Activity Type',
    'Elapsed Time',
    'Distance.1',
    'Average Heart Rate',
    'Average Cadence',
    'Average Grade',
    'Relative Effort',
]

# Steps 1-3 in one pass: keep only the relevant columns of the rows whose
# activity type is exactly 'Run', then discard rows with any missing value.
runs = data.loc[data['Activity Type'] == 'Run', run_cols].dropna().copy()

# Step 4: recompute the average speed as distance over elapsed time
# (per the original author, the value in the source data is wrong).
runs['Average Speed'] = runs['Distance.1'].div(runs['Elapsed Time'])



## Explore the Data
---
1. Describe and Explore Data
1. Pick the relevant columns
1. Explore correlations

In [None]:
# Summary statistics for the cleaned runs.
runs.describe()

In [None]:
# Display the cleaned runs DataFrame.
runs

In [None]:
import seaborn as sns

# Distribution of 'Relative Effort' across runs (histogram plus KDE curve).
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with kde=True and stat='density' is the documented replacement
# for distplot's default histogram-with-density output.
sns.histplot(runs['Relative Effort'], kde=True, stat='density')

In [None]:
# Prepare a calendar DataFrame so that days without any run appear with a
# distance of 0 instead of being missing from the time series.

# Truncate every activity timestamp to midnight so it can be matched against
# whole calendar days. normalize() keeps the datetime dtype, avoiding a
# round-trip through Python date objects.
runs['Activity Date'] = runs['Activity Date'].dt.normalize()

# The dates are already normalized, so min/max are the first and last day.
first_date = runs['Activity Date'].min()
last_date = runs['Activity Date'].max()

# One row per calendar day between the first and the last run (inclusive).
calendar = pd.DataFrame({'days': pd.date_range(first_date, last_date, freq='D')})

# Left join keeps every calendar day; days without a run get NaN in the run
# columns, and their distance is then set to 0.
full_df = pd.merge(calendar, runs, left_on='days', right_on='Activity Date', how='left')
full_df['Distance.1'] = full_df['Distance.1'].fillna(0)
full_df.set_index('days', inplace=True)

# Plot the daily distance. plt.subplots already creates the figure, so no
# separate plt.figure() call is needed (it would leave an empty extra figure).
fig, axs = plt.subplots(1, 1, figsize=(12, 3))
# The label gives the legend an entry; without it legend() has nothing to show.
axs.plot(full_df.index.values, full_df['Distance.1'], label='Daily distance')
axs.set_title('Kms Run')
axs.set_xlabel('Date', fontsize=9)
axs.set_ylabel('Distance', fontsize=9)
axs.legend(loc='best')
axs.grid()
plt.show()

In [None]:
# Group the daily data by calendar month and sum each month.
# numeric_only=True restricts the sum to numeric columns: text and datetime
# columns cannot be summed, and pandas >= 2.0 raises instead of silently
# dropping them.
# NOTE: pandas >= 2.2 prefers the alias "ME" for month-end; "M" is kept here
# for compatibility with older versions.
df = full_df.groupby(pd.Grouper(freq="M")).sum(numeric_only=True)

# plt.subplots already creates the figure, so no separate plt.figure() call
# is needed (it would leave an empty extra figure).
fig, axs = plt.subplots(1, 1, figsize=(12, 3))
# The label gives the legend an entry; without it legend() has nothing to show.
axs.plot(df.index.values, df['Distance.1'], label='Monthly distance')

axs.set_title('Kms Run')
axs.set_xlabel('Date', fontsize=9)
axs.set_ylabel('Distance', fontsize=9)
# 'bottom' replaces the 'ymin' keyword, which matplotlib has removed.
axs.set_ylim(bottom=0)
axs.legend(loc='best')
axs.grid()

plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Correlation matrix, computed once and on numeric columns only: correlating
# text/datetime columns is meaningless and raises on pandas >= 2.0.
corr = runs.select_dtypes(include='number').corr()

# Plot the matrix as an annotated heatmap centred on 0.
plt.figure(figsize=(12, 10), dpi=80)
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    cmap='RdYlGn',
    center=0,
    annot=True,
)

# Decorations
plt.title('Correlogram of runs', fontsize=22)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Metrics to compare against each other in the pair plot.
corr_cols = [
    'Relative Effort',
    'Distance.1',
    'Elapsed Time',
    'Average Heart Rate',
]

# Restrict the runs to those metrics only.
runs_small = runs.loc[:, corr_cols]

# Grid of pairwise plots, one per pair of selected columns.
sns.pairplot(runs_small)