In [794]:
# Import required modules
import requests
import urllib3
import secrets
import pandas as pd
import polyline
import folium
from ast import literal_eval
from datetime import datetime
from meteostat import Stations, Daily

In [795]:
# Disable insecure request warnings from urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Strava API Request
    - https://developers.strava.com/docs/reference/
    - within this repo there is a .gitignore file which ignores secrets.py this is where the strava_payload is to handle Strava API request
        - values in secrets.py for Strava API request:
            - client_id
            - client_secret
            - refresh_token
            - grant_type
            - f 
                - which is a request for json files to the API


In [796]:
auth_url = 'https://www.strava.com/oauth/token'
activities_url = 'https://www.strava.com/api/v3/athlete/activities'

# Request Strava Token
print('Requesting Strava token... \n')
res = requests.post(auth_url, data=secrets.strava_payload, verify=False)
strava_access_token = res.json()['access_token']

# Set the authorization header using the obtained access token
header = {'Authorization': 'Bearer ' + strava_access_token}

strava_requests_page_num = 1
all_activities = []

while True:
    # Prepare the parameters for paginated request
    strava_param = {'per_page' : 15, 'page' : strava_requests_page_num}
    # Send GET request to retrieve Strava activity data
    strava_dataset = requests.get(activities_url, headers=header, params=strava_param).json()

    if len(strava_dataset) == 0:
        print('breaking out of Strava while loop because the response is zero, indicating no more activities.')
        break

    if all_activities:
        print('all activities is populated')
        all_activities.extend(strava_dataset)

    else:
        print('all activities is NOT populated')
        all_activities = strava_dataset

    strava_requests_page_num += 1

print('Total Activities: ', len(all_activities))

Requesting Strava token... 



all activities is NOT populated
all activities is populated
all activities is populated
all activities is populated
breaking out of Strava while loop because the response is zero, indicating no more activities.
Total Activities:  54


## Creating pandas dataframe for all activites from Strava api
    - Contains all activites
        - Running
        - Walking
        - Hiking
        - Biking

In [797]:
all_strava_activites = pd.DataFrame(data=all_activities)

# Inspecting and Cleaning Activities Data
    - Where can data be cleaned?
    - Many Columns are not filled with information because there is no use of a watch or heart monitor.
    - Not intersted in the social information of Strava
        - ie, photos, kudos
    - location_city, location_state actually contain no information
        - We can get location information from Google Polyline information found in the 'map' column


In [798]:
all_strava_activites.head()

Unnamed: 0,resource_state,athlete,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,...,upload_id_str,external_id,from_accepted_tag,pr_count,total_photo_count,has_kudoed,suffer_score,average_watts,kilojoules,device_watts
0,2,"{'id': 8586088, 'resource_state': 1}",PMRP : Sore Heel Loop and Drive By Loop,8693.2,2864,3001,317.7,Run,Run,0.0,...,10170403908,71247EE3-700D-4881-87D0-B8B996F29BBD-activity.fit,False,0,0,False,,,,
1,2,"{'id': 8586088, 'resource_state': 1}",RRG : Rush Trail to Gray’s Arch Loop,6312.2,2374,2448,173.8,Run,Run,0.0,...,10150965476,3A315073-851C-4478-8D23-F4D440BAA704-activity.fit,False,0,0,False,,,,
2,2,"{'id': 8586088, 'resource_state': 1}",PMRP : Lode Loop and Drive By Loop,5532.4,1884,1967,124.0,Run,Run,0.0,...,10143932040,7D43355C-FECB-473B-8263-E7C4BDB870DE-activity.fit,False,0,0,False,,,,
3,2,"{'id': 8586088, 'resource_state': 1}",PMRP : Lode Loop and Sore Heel loop,7940.5,2861,2886,153.7,Run,Run,0.0,...,10131336663,4256B6D8-14F3-4F18-8DF3-440342D551CA-activity.fit,False,0,0,False,,,,
4,2,"{'id': 8586088, 'resource_state': 1}",PMRP : Sore Heel Loop,7090.5,2341,2365,210.0,Run,Run,0.0,...,10124522361,0B1C68DC-4E71-4EBB-8217-F688A9582DCD-activity.fit,False,0,0,False,,,,


In [799]:
all_strava_activites_columns = all_strava_activites.columns.to_list()
all_strava_activites_columns

['resource_state',
 'athlete',
 'name',
 'distance',
 'moving_time',
 'elapsed_time',
 'total_elevation_gain',
 'type',
 'sport_type',
 'workout_type',
 'id',
 'start_date',
 'start_date_local',
 'timezone',
 'utc_offset',
 'location_city',
 'location_state',
 'location_country',
 'achievement_count',
 'kudos_count',
 'comment_count',
 'athlete_count',
 'photo_count',
 'map',
 'trainer',
 'commute',
 'manual',
 'private',
 'visibility',
 'flagged',
 'gear_id',
 'start_latlng',
 'end_latlng',
 'average_speed',
 'max_speed',
 'has_heartrate',
 'heartrate_opt_out',
 'display_hide_heartrate_option',
 'elev_high',
 'elev_low',
 'upload_id',
 'upload_id_str',
 'external_id',
 'from_accepted_tag',
 'pr_count',
 'total_photo_count',
 'has_kudoed',
 'suffer_score',
 'average_watts',
 'kilojoules',
 'device_watts']

In [800]:
# Define the columns to drop from the DataFrame
columns_to_drop = ['athlete',
                   'resource_state', 
                   'sport_type', 
                   'workout_type',
                   'location_city',
                   'location_state',
                   'location_country', 
                   'kudos_count', 
                   'comment_count', 
                   'athlete_count', 
                   'photo_count', 
                   'trainer', 
                   'commute', 
                   'manual', 
                   'private',
                   'visibility', 
                   'flagged', 
                   'gear_id', 
                   'has_heartrate', 
                   'heartrate_opt_out', 
                   'display_hide_heartrate_option', 
                   'from_accepted_tag', 
                   'total_photo_count', 
                   'has_kudoed', 
                   'average_watts', 
                   'kilojoules',
                   'achievement_count',
                   'device_watts',
                   'upload_id_str',
                   'upload_id',
                   'external_id', 
                   'suffer_score']

# Drop the specified columns from the DataFrame
all_strava_activites.drop(columns=columns_to_drop, inplace=True)

columns = all_strava_activites.columns.to_list()

In [801]:
columns

['name',
 'distance',
 'moving_time',
 'elapsed_time',
 'total_elevation_gain',
 'type',
 'id',
 'start_date',
 'start_date_local',
 'timezone',
 'utc_offset',
 'map',
 'start_latlng',
 'end_latlng',
 'average_speed',
 'max_speed',
 'elev_high',
 'elev_low',
 'pr_count']

# Creating a Pandas Dataframe for just the activity of Running
    - Filter data with the 'type' is equal to 'Run'

In [802]:
run_data = all_strava_activites.loc[all_strava_activites['type'] == 'Run']

# Reset the index of the DataFrame after filtering
run_data.reset_index(drop=True, inplace=True)

## Conversions for Metrics
    - Calculate miles, minutes, and hours
    - 'distance' is in meters
    - 'moving_time' is in seconds

In [803]:
# Calculate and add new columns, 'distance_miles', 'moving_time_minutes', and 'moving_time_hours, rounded to 2 decimal places
run_data['distance_miles'] = round(run_data['distance'] * 0.00062137119, 2)
run_data['moving_time_minutes'] = round(run_data['moving_time'] / 60, 2)
run_data['moving_time_hours'] = round(run_data['moving_time'] / 3600, 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [804]:
# Calculate averages for miles and time
average_distance_miles = round(run_data['distance_miles'].mean(), 2)
print("Average Distance (miles):", average_distance_miles)
average_time_minutes = round(run_data['moving_time_minutes'].mean(), 2)
print("Average Time Ran (minutes):", average_time_minutes)

# Calculate distance for longest run
max_distance_ran = round(run_data['distance_miles'].max(), 2)
print("Longest Run:", max_distance_ran, "miles")

# Calculate total time ran
max_duration_mintues = round(run_data['moving_time_minutes'].max(), 2)
max_duration_hours = round(run_data['moving_time_hours'].max(), 2)
print("Longest Duration:", max_duration_mintues,"minutes. Converted to hours:", max_duration_hours)

# Calculate total miles ran
total_distance_miles = round(run_data['distance'].sum() * 0.00062137119, 2)
print("Total Distance Covered to the date (miles):", total_distance_miles)

Average Distance (miles): 4.02
Average Time Ran (minutes): 40.74
Longest Run: 9.62 miles
Longest Duration: 117.57 minutes. Converted to hours: 1.96
Total Distance Covered to the date (miles): 152.58


# Retrieve and Decode Polyline for mapping
    - Google Polyline information:
        - https://developers.google.com/maps/documentation/utilities/polylineutility
    - When Polyline is decoded it outputs longitude and latitude listings for activity.
    - Use of polyline module
        - https://pypi.org/project/polyline/

In [805]:
# Create a new DataFrame 'all_run_map_data' form the 'map' column in 'run_data' for polyline data
all_run_map_data = pd.DataFrame(run_data['map'].to_list())

# Remove the first character 'a' from the 'id' column to match id's between two DataFrames
all_run_map_data['id'] = all_run_map_data['id'].str.slice(start=1)

# Drop the 'map' column from 'run_data'
run_data.drop(columns='map', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [806]:
# Create a new DataFrame 'decoded_df_all' with columns 'id' and 'decoded_polyline'
decoded_df_all = pd.DataFrame(columns=['id', 'decoded_polyline'])

# Iterate over each row in 'all_run_map_data'
for index, row in all_run_map_data.iterrows():
    polyline_str = row['summary_polyline']

    # Decode the polyline string using 'polyline.decode()'
    decoded_polyline = polyline.decode(polyline_str)

    # Append the decoded polyline and its corresponding ID to 'decoded_df_all'
    decoded_df_all = decoded_df_all.append({'id' : row['id'], 'decoded_polyline' : decoded_polyline}, ignore_index=True)



In [807]:
# Print the data type of the 'id' column in both DataFrames
print('ID column datatype in Run Data: ', run_data['id'].dtype)
print('ID column datatype in Decoded Data: ', decoded_df_all['id'].dtype)

# Convert the 'id' column in 'decoded_df_all' to integer data type
decoded_df_all['id'] = decoded_df_all['id'].astype(int)
print('ID column datatype of Decoded Data after convert: ', decoded_df_all['id'].dtype)

ID column datatype in Run Data:  int64
ID column datatype in Decoded Data:  object
ID column datatype of Decoded Data after convert:  int64


In [808]:
# Merge the 'run_data' with 'decoded_df_all' on the 'id' column
run_data = pd.merge(run_data, decoded_df_all, on='id')

# Prep Run Data
    - Merge run data with weather data
    - merge on 'start_date'
        - change format of date to match between the two dataframes

In [809]:
# Convert 'start_date' to datetime format to be index for later merge with weather data
run_data['start_date'] = pd.to_datetime(run_data['start_date'])
run_data['start_date'].dtype

datetime64[ns, UTC]

In [810]:
# Convert 'start_date' column to date only (removing time information)
run_data['start_date'] = run_data['start_date'].dt.date
print(run_data['start_date'].dtype)

# Convert 'start_date' column to datetime format
run_data['start_date'] = pd.to_datetime(run_data['start_date'])
print(run_data['start_date'].dtype)

object
datetime64[ns]


# Weather Data
    - Using Meteostat
        - https://dev.meteostat.net/

In [811]:
start = datetime(2022, 1, 1)
end = datetime(2023, 7, 20)
model = True

In [812]:
stations = Stations()
stations = stations.nearby(37.64471283182502, -83.71428340673447)
weather_station = stations.fetch(1)

weather_station

Unnamed: 0_level_0,name,country,region,wmo,icao,latitude,longitude,elevation,timezone,hourly_start,hourly_end,daily_start,daily_end,monthly_start,monthly_end,distance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
CZF98,Jackson Carroll Airport,US,KY,,KJKL,37.5914,-83.3144,421.0,America/New_York,2022-04-23,2023-07-20,2022-04-24,2022-04-26,NaT,NaT,35715.990269


In [813]:
data = Daily(weather_station.iloc[:], start, end, model)
data = data.normalize()
data = data.fetch()

weather_data = pd.DataFrame(data)

In [814]:
weather_data['tavg'] = weather_data.apply(lambda x : round((9/5)*x['tavg']+32,2), axis=1)
weather_data['tmin'] = weather_data.apply(lambda x : round((9/5)*x['tmin'] +32,2), axis=1)
weather_data['tmax'] = weather_data.apply(lambda x : round((9/5)*x['tmax']+32,2), axis=1)

In [817]:
run_data = run_data.merge(weather_data, left_on='start_date', right_index=True, how='inner')

# Going further into the data to only collect runs from RRGCC land
    - For this particular project I want to focus on RRGCC owned and operated land
    - Highlight the running opportunites on climber owned land
        - Showcase different loops and routes

In [818]:
# Filter the run data for RRGCC land and different running loops
pmrp_run_data = run_data[run_data['name'].str.contains('PMRP', case=False, na=False)]
rrg_run_data = run_data[run_data['name'].str.contains('RRG', case=False, na=False)]
sore_heel_data = run_data[run_data['name'].str.contains('Sore Heel', case=False, na=False)]
lode_loop_data = run_data[run_data['name'].str.contains('Lode Loop', case=False, na=False)]
drive_by_loop_data = run_data[run_data['name'].str.contains('Drive By Loop', case=False,na=False)]

In [819]:
# Set csv path and save csv files
all_csv_path = 'csv/run/run_data.csv'
pmrp_csv_path = 'csv/run/pmrp_run_data.csv'
rrg_csv_path = 'csv/run/rrg_run_data.csv'
sore_heel_csv_path = 'csv/run/sore_heel_data.csv'
lode_loop_csv_path = 'csv/run/lode_loop_data.csv'
drive_by_csv_path = 'csv/run/drive_by_data.csv'

weather_data_csv_path = 'csv/weather/only_weather_data.csv'

run_data.to_csv(all_csv_path, index=False)
pmrp_run_data.to_csv(pmrp_csv_path, index=False)
rrg_run_data.to_csv(rrg_csv_path, index=False)
sore_heel_data.to_csv(sore_heel_csv_path, index=False)
lode_loop_data.to_csv(lode_loop_csv_path, index=False)
drive_by_loop_data.to_csv(drive_by_csv_path, index=False)

weather_data.to_csv(weather_data_csv_path, index=True)

In [820]:
# lode_loop = run_data[run_data['name'].str.contains(r'\bLode Loop\b')]

In [821]:
# lode_loop

Unnamed: 0,name,distance,moving_time,elapsed_time,total_elevation_gain,type,id,start_date,start_date_local,timezone,...,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
2,PMRP : Lode Loop and Drive By Loop,5532.4,1884,1967,124.0,Run,9458318004,2023-07-15,2023-07-15T17:24:26Z,(GMT-05:00) America/New_York,...,75.2,71.06,82.4,4.4,,291.0,4.3,,1011.9,
3,PMRP : Lode Loop and Sore Heel loop,7940.5,2861,2886,153.7,Run,9446383475,2023-07-13,2023-07-13T17:28:17Z,(GMT-05:00) America/New_York,...,79.16,71.6,86.0,4.3,,265.0,5.3,,1012.4,
9,PMRP : Lode Loop and Lode Hill,2615.8,843,849,96.5,Run,9059541871,2023-05-12,2023-05-12T15:53:27Z,(GMT-05:00) America/New_York,...,69.8,64.94,77.0,11.2,,201.0,6.3,,1019.3,
10,PMRP : Lode Loop and Flat Holler Loop,9653.3,3349,3385,453.1,Run,9017048707,2023-05-05,2023-05-05T14:39:29Z,(GMT-05:00) America/New_York,...,61.34,46.94,75.02,0.0,,58.0,4.7,,1019.8,
11,PMRP : Drive By Loop and Lode Loop,5774.7,1769,1827,131.6,Run,8973585450,2023-04-28,2023-04-28T14:23:12Z,(GMT-05:00) America/New_York,...,60.26,55.4,69.8,23.0,,264.0,6.1,,1007.1,
29,PMRP : Lode Loop,2342.4,825,849,34.6,Run,8441005412,2023-01-23,2023-01-23T16:54:18Z,(GMT-05:00) America/New_York,...,33.8,30.2,37.04,1.2,,268.0,9.1,,1018.6,
