# Initial Wrangle 

#### Analysis by Jeremy Mann, 2020-6-13
Just a simple initial wrangle to get a sense of the formatting of the csv for running data.

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

In [51]:
class polar_h10_running_wrangler:
    '''
    Wrangles running data
    Note that it considers anything slower than 60 min/mi as still.
    '''
    def __init__(self, filepath):
        self.filepath = filepath
        self.meta_df = self.wrangle_meta_df()
        self.data_df = self.wrangle_data_df()
        
    def wrangle_meta_df(self):
        """
        Extracts and wrangle session metadata 
        """
        meta_df = pd.read_csv(self.filepath)[:1]
        
        meta_df.dropna(axis = 1, inplace = True)
        
        meta_df['Date'] = pd.to_datetime(meta_df['Date'], format='%d-%m-%Y')
        meta_df['Start time'] = pd.to_datetime(meta_df['Start time'], infer_datetime_format=True)
        meta_df['Duration' ]= pd.to_timedelta(meta_df['Duration'])
        
        meta_df.drop(columns = ['Date'], inplace = True)
        
        renaming_dict = {'Start time': 'Start Datetime'}
        meta_df.rename(columns = renaming_dict, inplace = True)
        
        meta_df.loc[0,'Sport'] = meta_df.loc[0, 'Sport'].title()
        meta_df.loc[0,'Name'] = meta_df.loc[0, 'Name'].title()        
        
        return meta_df
    
    def wrangle_data_df(self, pace_threshold = 75):
        '''
        Extracts and wrangles the session data
        '''
        data_df = pd.read_csv(filepath, header= 2)
        
        data_df.dropna(axis = 1, inplace = True)
        
        data_df['Pace (min/mi)'] = '00:' + data_df['Pace (min/mi)'] 
        data_df['Pace (min/mi)'] = pd.to_timedelta(
            data_df['Pace (min/mi)']
        ).dt.total_seconds()/60
        
        data_df['Pace (min/mi)'] = np.round(data_df['Pace (min/mi)'],  decimals = 1)
        
        data_df[data_df['Pace (min/mi)'] > pace_threshold] = 0
        
        data = np.full(shape = data_df.index.shape, fill_value = self.get_start_datetime())
        start_datetime_series = pd.Series(data = data, index = data_df.index)
        
        data_df['Time'] = pd.to_timedelta(data_df['Time']) + start_datetime_series

        data_df.set_index('Time', inplace = True)
        
        return data_df
        
    def get_activity(self):
        activity = self.meta_df.loc[0,'Sport'].lower()
        return activity
    
    def get_name(self):
        name = self.meta_df.loc[0,'Name'].replace(' ', '_').lower()
        return name
    
    def get_start_datetime(self):
        start_datetime = self.meta_df.loc[0, 'Start Datetime']
        return start_datetime
    
    def save_wrangled_data(self):
        '''
        Saves the session data. Format is:
        <date>_<start_time>_<activity>_<last_name>_<first_name>
        '''
        
        start_dt_str = self.get_start_datetime().strftime('%Y-%m-%d_%H:%M')
        activity = self.get_activity()
        name = self.get_name()
        save_filename = '{}_{}_{}.csv'.format(start_dt_str, activity, name)
        filepath = os.path.join('..', 'data', 'wrangled_data', save_filename) 
        self.data_df.to_csv(filepath)
    
filepath = '../data/raw_data/Jeremy_Mann_2020-06-10_15-25-36.CSV'
wrangler = polar_h10_running_wrangler(filepath)
wrangler.save_wrangled_data()

# Data Dateframe

In [52]:
wrangler.data_df.head(10)

Unnamed: 0_level_0,HR (bpm),Speed (mi/h),Pace (min/mi),Distances (ft)
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-14 15:25:36,98,0.0,0.0,0.0
2020-06-14 15:25:37,99,0.0,0.0,0.0
2020-06-14 15:25:38,98,2.7,22.1,0.0
2020-06-14 15:25:39,95,2.7,22.2,8.5
2020-06-14 15:25:40,93,2.7,22.0,17.0
2020-06-14 15:25:41,92,2.7,21.8,25.5
2020-06-14 15:25:42,91,2.8,21.8,34.0
2020-06-14 15:25:43,92,2.7,22.6,42.5
2020-06-14 15:25:44,93,2.7,21.9,51.0
2020-06-14 15:25:45,95,2.8,21.1,59.5


In [53]:
wrangler.data_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1323 entries, 2020-06-14 15:25:36 to 2020-06-14 15:47:38
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   HR (bpm)        1323 non-null   int64  
 1   Speed (mi/h)    1323 non-null   float64
 2   Pace (min/mi)   1323 non-null   float64
 3   Distances (ft)  1323 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 51.7 KB


In [54]:
wrangler.data_df.describe()

Unnamed: 0,HR (bpm),Speed (mi/h),Pace (min/mi),Distances (ft)
count,1323.0,1323.0,1323.0,1323.0
mean,139.837491,3.472714,17.511489,4421.08285
std,25.680011,1.579168,10.133484,2558.223454
min,0.0,0.0,0.0,0.0
25%,127.0,2.35,11.8,2176.65
50%,148.0,3.6,14.4,4647.0
75%,158.0,4.9,21.7,6778.135
max,166.0,5.8,69.4,8127.89


# Metadata Dataframe

In [55]:
wrangler.meta_df.head()

Unnamed: 0,Name,Sport,Start Datetime,Duration,Total distance (mi),Average heart rate (bpm),Average speed (mi/h),Max speed (mi/h),Average pace (min/mi),Max pace (min/mi),Calories,Fat percentage of calories(%),Running index,Height (ft in),Weight (lbs),HR max
0,Jeremy Mann,Running,2020-06-14 15:25:36,00:22:01,1.54,142,4.2,5.8,14:18,10:21,290.0,16.0,35.0,6 2,176.4,190.0


In [56]:
wrangler.meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype          
---  ------                         --------------  -----          
 0   Name                           1 non-null      object         
 1   Sport                          1 non-null      object         
 2   Start Datetime                 1 non-null      datetime64[ns] 
 3   Duration                       1 non-null      timedelta64[ns]
 4   Total distance (mi)            1 non-null      object         
 5   Average heart rate (bpm)       1 non-null      object         
 6   Average speed (mi/h)           1 non-null      object         
 7   Max speed (mi/h)               1 non-null      object         
 8   Average pace (min/mi)          1 non-null      object         
 9   Max pace (min/mi)              1 non-null      object         
 10  Calories                       1 non-null      float64        
 11  Fat percen

In [57]:
wrangler.meta_df.describe()

Unnamed: 0,Duration,Calories,Fat percentage of calories(%),Running index,Weight (lbs),HR max
count,1,1.0,1.0,1.0,1.0,1.0
mean,0 days 00:22:01,290.0,16.0,35.0,176.4,190.0
std,NaT,,,,,
min,0 days 00:22:01,290.0,16.0,35.0,176.4,190.0
25%,0 days 00:22:01,290.0,16.0,35.0,176.4,190.0
50%,0 days 00:22:01,290.0,16.0,35.0,176.4,190.0
75%,0 days 00:22:01,290.0,16.0,35.0,176.4,190.0
max,0 days 00:22:01,290.0,16.0,35.0,176.4,190.0
