In [55]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import datetime
import sys
import os 

sys.path.append('..')
from src import config

# Work from the base directory that config defines for the current platform.
path = config.LINUX_PATH if sys.platform == 'linux' else config.OS_PATH
os.chdir(path)

# Read the CSV at config.STRAVA_TRAIN_PATH, using its first column as the index,
# then show the available columns and a preview of the first rows.
data_path = config.STRAVA_TRAIN_PATH
data = pd.read_csv(data_path, index_col=0)
print(data.columns.tolist())
data.head()

['name', 'distance', 'moving_time', 'total_elevation_gain', 'workout_type', 'timezone', 'achievement_count', 'kudos_count', 'athlete_count', 'photo_count', 'manual', 'max_speed', 'average_heartrate', 'max_heartrate', 'pr_count', 'total_photo_count', 'suffer_score', 'GMT_date', 'GMT_time', 'local_date', 'local_time', 'run_area', 'latlng_cluster', 'city', 'average_speed_mpk']


Unnamed: 0,name,distance,moving_time,total_elevation_gain,workout_type,timezone,achievement_count,kudos_count,athlete_count,photo_count,...,total_photo_count,suffer_score,GMT_date,GMT_time,local_date,local_time,run_area,latlng_cluster,city,average_speed_mpk
260,Pre meet,7.196,28.383333,3.0,3.0,America/Boise,0.0,33.0,1.0,0.0,...,0.0,16.0,2021-01-29,21:51:05,2021-01-29,14:51:05,2.207397,1,Boise,3.944615
261,WU,4.8448,22.183333,44.0,0.0,America/Boise,0.0,11.0,1.0,0.0,...,0.0,20.0,2021-01-29,21:08:46,2021-01-29,14:08:46,0.16698,1,Boise,4.578571
262,Gain train 🚂😈🤪,8.1354,36.1,10.0,0.0,America/Boise,0.0,32.0,1.0,0.0,...,0.0,26.0,2021-01-29,16:01:32,2021-01-29,09:01:32,5.870196,1,Boise,4.437167
263,"Not 10, sorry Logan",12.9277,59.283333,49.0,0.0,America/Boise,2.0,44.0,1.0,0.0,...,0.0,52.0,2021-01-28,22:39:54,2021-01-28,15:39:54,37.574116,1,Boise,4.586131
264,😴,6.7834,30.283333,9.0,0.0,America/Boise,0.0,27.0,1.0,0.0,...,0.0,29.0,2021-01-28,14:13:57,2021-01-28,07:13:57,4.20775,1,Boise,4.464506


In [52]:
def uk_awake_feature(df, wake=datetime.time(6, 0, 0), sleep=datetime.time(23, 59, 59)):
    '''
    Add a binary ``is_uk_awake`` column flagging runs logged in U.K. awake hours.

    The check is made on the ``GMT_time`` column, whose values must be strings
    formatted as ``HH:MM:SS``. Both ends of the window are inclusive, so with
    the defaults a run at exactly 06:00:00 or 23:59:59 counts as awake.

    NOTE(review): GMT is used as a stand-in for U.K. local time, so British
    Summer Time is not accounted for -- confirm this is acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``GMT_time`` column. It is modified in place.
    wake : datetime.time, optional
        Earliest time of day counted as awake (inclusive). Defaults to 06:00:00.
    sleep : datetime.time, optional
        Latest time of day counted as awake (inclusive). Defaults to 23:59:59.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``is_uk_awake`` set to 1 or 0 per row.

    Raises
    ------
    ValueError
        If a ``GMT_time`` value does not match ``%H:%M:%S``.
    TypeError
        If a ``GMT_time`` value is not a string (for example a missing value).
    '''
    def is_awake(time_element):
        clock = datetime.datetime.strptime(time_element, '%H:%M:%S').time()
        # Inclusive bounds: the boundary instants themselves belong to the window.
        return int(wake <= clock <= sleep)

    df['is_uk_awake'] = df['GMT_time'].apply(is_awake)
    return df

# uk_awake_feature writes 'is_uk_awake' onto `data` itself and returns that same
# object, so `temp` is just another name for `data` here, not a copy.
temp = uk_awake_feature(data)

In [53]:
# Build a single timestamp column from the local date and local time strings.
data.loc[:, 'datetime'] = pd.to_datetime(data.local_date + ' ' + data.local_time)

# Calendar parts derived from that timestamp.
data.loc[:, 'year'] = data.datetime.dt.year
data.loc[:, 'weekofyear'] = data.datetime.dt.isocalendar()['week']
data.loc[:, 'month'] = data.datetime.dt.month
data.loc[:, 'dayofweek'] = data.datetime.dt.weekday
# Weekday numbers 5 and 6 (Saturday, Sunday) are flagged as weekend.
data.loc[:, 'weekend'] = data.datetime.dt.weekday.ge(5).astype(int)
data.loc[:, 'hour'] = data.datetime.dt.hour

# Add the is_uk_awake flag computed from the GMT_time column.
data = uk_awake_feature(data)
data.head()

Unnamed: 0,name,distance,moving_time,total_elevation_gain,workout_type,timezone,achievement_count,kudos_count,athlete_count,photo_count,...,city,average_speed_mpk,datetime,year,weekofyear,month,dayofweek,weekend,hour,is_uk_awake
260,Pre meet,7.196,28.383333,3.0,3.0,America/Boise,0.0,33.0,1.0,0.0,...,Boise,3.944615,2021-01-29 14:51:05,2021,4,1,4,0,14,1
261,WU,4.8448,22.183333,44.0,0.0,America/Boise,0.0,11.0,1.0,0.0,...,Boise,4.578571,2021-01-29 14:08:46,2021,4,1,4,0,14,1
262,Gain train 🚂😈🤪,8.1354,36.1,10.0,0.0,America/Boise,0.0,32.0,1.0,0.0,...,Boise,4.437167,2021-01-29 09:01:32,2021,4,1,4,0,9,1
263,"Not 10, sorry Logan",12.9277,59.283333,49.0,0.0,America/Boise,2.0,44.0,1.0,0.0,...,Boise,4.586131,2021-01-28 15:39:54,2021,4,1,3,0,15,1
264,😴,6.7834,30.283333,9.0,0.0,America/Boise,0.0,27.0,1.0,0.0,...,Boise,4.464506,2021-01-28 07:13:57,2021,4,1,3,0,7,1


In [56]:
def generate_time_features(df):
    '''
    Derive time-based feature columns from the local date and time columns.

    ``local_date`` and ``local_time`` are joined with a space and parsed into a
    ``datetime`` column. From it the columns ``year``, ``weekofyear`` (ISO
    week), ``month``, ``dayofweek``, ``weekend`` (1 when the weekday number is
    5 or 6, else 0) and ``hour`` are assigned. Finally ``uk_awake_feature``
    adds ``is_uk_awake``.

    The columns are assigned onto ``df`` itself; the frame returned is whatever
    ``uk_awake_feature`` hands back.
    '''
    df.loc[:, 'datetime'] = pd.to_datetime(df['local_date'] + ' ' + df['local_time'])

    # Fetch the datetime accessor once and read every calendar part from it.
    stamp = df['datetime'].dt
    calendar_parts = {
        'year': stamp.year,
        'weekofyear': stamp.isocalendar().week,
        'month': stamp.month,
        'dayofweek': stamp.dayofweek,
        'weekend': (stamp.weekday >= 5).astype(int),
        'hour': stamp.hour,
    }
    for column, values in calendar_parts.items():
        df.loc[:, column] = values

    return uk_awake_feature(df)

# generate_time_features assigns its columns onto the frame it is given and
# returns it, so `temp` refers to the same object as `data` (not a copy).
temp = generate_time_features(data)
temp.head()

Unnamed: 0,name,distance,moving_time,total_elevation_gain,workout_type,timezone,achievement_count,kudos_count,athlete_count,photo_count,...,city,average_speed_mpk,datetime,year,weekofyear,month,dayofweek,weekend,hour,is_uk_awake
260,Pre meet,7.196,28.383333,3.0,3.0,America/Boise,0.0,33.0,1.0,0.0,...,Boise,3.944615,2021-01-29 14:51:05,2021,4,1,4,0,14,1
261,WU,4.8448,22.183333,44.0,0.0,America/Boise,0.0,11.0,1.0,0.0,...,Boise,4.578571,2021-01-29 14:08:46,2021,4,1,4,0,14,1
262,Gain train 🚂😈🤪,8.1354,36.1,10.0,0.0,America/Boise,0.0,32.0,1.0,0.0,...,Boise,4.437167,2021-01-29 09:01:32,2021,4,1,4,0,9,1
263,"Not 10, sorry Logan",12.9277,59.283333,49.0,0.0,America/Boise,2.0,44.0,1.0,0.0,...,Boise,4.586131,2021-01-28 15:39:54,2021,4,1,3,0,15,1
264,😴,6.7834,30.283333,9.0,0.0,America/Boise,0.0,27.0,1.0,0.0,...,Boise,4.464506,2021-01-28 07:13:57,2021,4,1,3,0,7,1
