# STEP ONE: DATA ACQUISITION AND FORMATTING

In [589]:
# Importing the necessary libraries
import pandas as pd
import datetime as dt

In [590]:
# Load the exported activities CSV (first row is the header) and preview the top rows.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where this file exists.
ACTIVITIES_CSV_PATH = "C:/Users/17178/Desktop/GITHUB_PROJECTS/Strava-API-and-Sheets-Integration/python/data/GOONS_ACTIVITIES.csv"
activities_df = pd.read_csv(ACTIVITIES_CSV_PATH, header=0)
activities_df.head(5)

Unnamed: 0,ATHLETE,ACTIVITY ID,RUN,MOVING TIME,DISTANCE,PACE,FULL DATE,TIME,DAY,MONTH,DATE,YEAR,SPM AVG,HR AVG,WKT TYPE,DESCRIPTION
0,PAIGE O,8861534949,Highest mileage week ever,01:53:01,15.02,00:07:33,4/9/2023,10:40:53 AM,SUN,4,9,2023,188.0,155.8,0.0,
1,PATRICK L,8860938466,Now this is a recovery run...when your not rec...,01:02:02,6.35,00:09:48,4/9/2023,10:38:04 AM,SUN,4,9,2023,162.6,141.9,0.0,
2,MARK M,8862725101,4 weeks out,00:35:03,4.3,00:08:18,4/9/2023,8:34:22 AM,SUN,4,9,2023,,133.5,0.0,Lots of school and miles to get in before the ...
3,PATRICK L,8855632342,"8M WU, 8M @ MP, 5M CD",02:42:01,21.03,00:07:43,4/8/2023,10:52:34 AM,SAT,4,8,2023,178.0,163.6,2.0,Well... That felt good! Kept WarmUp 8 below 8:...
4,DAVID L,8854595749,Little Friend,00:22:03,2.48,00:09:05,4/8/2023,10:40:45 AM,SAT,4,8,2023,163.0,157.4,0.0,Got home with a pal in tow


In [591]:
# Converting WKT TYPE to an integer category.
# Missing workout types are treated as 0. fillna(0) keeps the column numeric, so the
# int cast never has to go through a string "0" written into a float column
# (that kind of incompatible-dtype assignment is deprecated in recent pandas).
activities_df["WKT TYPE"] = activities_df["WKT TYPE"].fillna(0).astype("int").astype("category")

# Converting full date (month/day/year text, e.g. "4/9/2023") to a sortable datetime
activities_df["FULL DATE"] = pd.to_datetime(
    activities_df["FULL DATE"].astype("string"), yearfirst=False, dayfirst=False
)

# Converting the 12-hour clock TIME (e.g. "8:34:22 AM") to a zero-padded 24-hour string
# in a new column, "SORT TIME", so that plain string ordering matches chronological ordering
activities_df["SORT TIME"] = pd.to_datetime(activities_df["TIME"], format='%I:%M:%S %p').dt.strftime('%H:%M:%S')

# Sorting the dataframe by full date and time, newest first.
# Extra precaution (things should be sorted properly as is)
activities_df = activities_df.sort_values(by=["FULL DATE", "SORT TIME"], ascending=[False, False])

In [592]:
# Viewing the data type of each column to confirm the conversions took effect
# (FULL DATE should now be datetime64[ns] and WKT TYPE should be category)
activities_df.dtypes

ATHLETE                object
ACTIVITY ID             int64
RUN                    object
MOVING TIME            object
DISTANCE              float64
PACE                   object
FULL DATE      datetime64[ns]
TIME                   object
DAY                    object
MONTH                   int64
DATE                    int64
YEAR                    int64
SPM AVG               float64
HR AVG                float64
WKT TYPE             category
DESCRIPTION            object
SORT TIME              object
dtype: object

In [593]:
# Previewing the top five rows of the formatted, sorted dataframe (head() defaults to 5 rows)
activities_df.head()

Unnamed: 0,ATHLETE,ACTIVITY ID,RUN,MOVING TIME,DISTANCE,PACE,FULL DATE,TIME,DAY,MONTH,DATE,YEAR,SPM AVG,HR AVG,WKT TYPE,DESCRIPTION,SORT TIME
0,PAIGE O,8861534949,Highest mileage week ever,01:53:01,15.02,00:07:33,2023-04-09,10:40:53 AM,SUN,4,9,2023,188.0,155.8,0,,10:40:53
1,PATRICK L,8860938466,Now this is a recovery run...when your not rec...,01:02:02,6.35,00:09:48,2023-04-09,10:38:04 AM,SUN,4,9,2023,162.6,141.9,0,,10:38:04
2,MARK M,8862725101,4 weeks out,00:35:03,4.3,00:08:18,2023-04-09,8:34:22 AM,SUN,4,9,2023,,133.5,0,Lots of school and miles to get in before the ...,08:34:22
3,PATRICK L,8855632342,"8M WU, 8M @ MP, 5M CD",02:42:01,21.03,00:07:43,2023-04-08,10:52:34 AM,SAT,4,8,2023,178.0,163.6,2,Well... That felt good! Kept WarmUp 8 below 8:...,10:52:34
4,DAVID L,8854595749,Little Friend,00:22:03,2.48,00:09:05,2023-04-08,10:40:45 AM,SAT,4,8,2023,163.0,157.4,0,Got home with a pal in tow,10:40:45


In [594]:
# TODO: Decide whether further data cleaning is needed (it might not be; have to think on this).
# Thought: Using the DESCRIPTION column just doesn't make sense for this machine learning.
# It would be too hard to extract meaning from it in a column-interpretable way.