# Project Running

In [1]:
# Make Google Drive available inside the Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [188]:
# Import libraries
import pandas as pd
import numpy as np

In [189]:
# Read the activities CSV from Google Drive into a DataFrame
file_path = '/content/drive/MyDrive/Colab Notebooks/Portfolio Projects/Dataset/Activities_new.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [190]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

## Data Preprocessing

In [191]:
# Preview the first two rows to see which columns are available
df.head(n=2)

Unnamed: 0,Activity Type,Date,Favorite,Title,Distance,Calories,Time,Avg HR,Max HR,Aerobic TE,Avg Run Cadence,Max Run Cadence,Avg Pace,Best Pace,Total Ascent,Total Descent,Avg Stride Length,Avg Vertical Ratio,Avg Vertical Oscillation,Avg Ground Contact Time,Training Stress Score®,Avg Power,Max Power,Grit,Flow,Avg. Swolf,Avg Stroke Rate,Total Reps,Dive Time,Min Temp,Surface Interval,Decompression,Best Lap Time,Number of Laps,Max Temp,Avg Resp,Min Resp,Max Resp,Moving Time,Elapsed Time,Min Elevation,Max Elevation
0,Running,2023-12-11 10:53:19,False,Eleftherio-Kordelio Running,10.6,882,1:01:09,141,155,3.5,170,194,5:46,3:54,97,89,1.02,0,0,0,0,0,0,0,0,0,0,0,0:00,17,0:00,No,03:01.92,11,27,--,--,--,1:00:44,1:01:09,40,77
1,Running,2023-12-09 10:21:58,False,Eleftherio-Kordelio Running,10.84,874,1:00:01,140,159,3.6,172,179,5:32,4:46,102,109,1.05,0,0,0,0,0,0,0,0,0,0,0,0:00,17,0:00,No,04:46.93,11,27,--,--,--,0:59:58,1:00:01,38,82


In [192]:
# Columns to remove from the DataFrame
columns_to_drop = [
    'Favorite',
    'Aerobic TE',
    'Max Run Cadence',
    'Best Pace',
    'Avg Vertical Ratio',
    'Avg Vertical Oscillation',
    'Avg Ground Contact Time',
    'Training Stress Score®',
    'Avg Power',
    'Max Power',
    'Grit',
    'Flow',
    'Avg. Swolf',
    'Avg Stroke Rate',
    'Total Reps',
    'Dive Time',
    'Surface Interval',
    'Decompression',
    'Best Lap Time',
    'Number of Laps',
    'Avg Resp',
    'Min Resp',
    'Max Resp',
    'Elapsed Time',
]

# Keep everything except the columns listed above
df = df.drop(labels=columns_to_drop, axis='columns')


In [193]:
# Overview of the DataFrame after dropping columns:
# prints the row count and, for every remaining column, its non-null count and dtype.
# Used here to spot missing values and columns that still need a type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Activity Type      656 non-null    object 
 1   Date               656 non-null    object 
 2   Title              656 non-null    object 
 3   Distance           656 non-null    float64
 4   Calories           656 non-null    object 
 5   Time               656 non-null    object 
 6   Avg HR             656 non-null    int64  
 7   Max HR             656 non-null    int64  
 8   Avg Run Cadence    656 non-null    object 
 9   Avg Pace           656 non-null    object 
 10  Total Ascent       656 non-null    object 
 11  Total Descent      656 non-null    object 
 12  Avg Stride Length  656 non-null    float64
 13  Min Temp           656 non-null    int64  
 14  Max Temp           656 non-null    int64  
 15  Moving Time        656 non-null    object 
 16  Min Elevation      656 non

### Type conversion

**Before converting the data type of certain columns:**

**We need to make sure the values contain no thousands separators (`,`), because a value such as `1,234` cannot be parsed as a number.**

In [195]:
# These columns may contain ',' as a thousands separator (e.g. '1,234'),
# which has to be removed before the values can be parsed as numbers.
columns_to_clean = ['Calories', 'Avg Run Cadence', 'Total Ascent', 'Total Descent', 'Min Elevation', 'Max Elevation']

for column in columns_to_clean:
    # The .str accessor only exists on text columns. Skipping columns that are
    # already numeric keeps this cell safe to re-run and tolerant of columns
    # that pandas parsed as numbers when the file was loaded.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    # regex=False: the comma is a literal character, not a pattern.
    df[column] = df[column].str.replace(',', '', regex=False)

In [196]:
# Columns to convert from text to numbers
columns_to_clean = ['Calories', 'Avg Run Cadence', 'Total Ascent', 'Total Descent', 'Min Elevation', 'Max Elevation']

# errors='coerce' turns every value that cannot be parsed as a number into NaN
# instead of raising, because some entries are non-numeric.
# A column that receives NaN values is stored as float rather than integer.
for numeric_column in columns_to_clean:
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

In [197]:
# Parse the 'Date' strings into datetime values
df['Date'] = pd.to_datetime(df['Date'])

# Parse both duration columns into timedelta values
for duration_column in ('Time', 'Moving Time'):
    df[duration_column] = pd.to_timedelta(df[duration_column])

In [198]:
# 'Avg Pace' is given as minutes:seconds (e.g. '5:46'); prefix an hours field
# so the string has the hours:minutes:seconds layout before parsing it as a timedelta.
pace_with_hours = '00:' + df['Avg Pace']
df['Avg Pace'] = pd.to_timedelta(pace_with_hours)

### Null handling

In [199]:
# Number of missing values in each column after the type conversions
df.isna().sum(axis=0)

Activity Type         0
Date                  0
Title                 0
Distance              0
Calories              0
Time                  0
Avg HR                0
Max HR                0
Avg Run Cadence      42
Avg Pace              0
Total Ascent         48
Total Descent        48
Avg Stride Length     0
Min Temp              0
Max Temp              0
Moving Time           0
Min Elevation        48
Max Elevation        48
dtype: int64

In [200]:
# Count the missing 'Avg Run Cadence' values separately for each activity type:
# flag the missing entries first, then add the flags up within each 'Activity Type'.
cadence_is_missing = df['Avg Run Cadence'].isna()
null_values_by_activity_type = cadence_is_missing.groupby(df['Activity Type']).sum()

# Show the counts
print(null_values_by_activity_type)


Activity Type
Running              42
Trail Running         0
Treadmill Running     0
Name: Avg Run Cadence, dtype: int64


In [201]:
# Mean cadence over the rows whose activity type is exactly 'Running'
# (missing cadence values are ignored by mean())
is_running = df['Activity Type'] == 'Running'
avg_running_cadence = df.loc[is_running, 'Avg Run Cadence'].mean()
print(avg_running_cadence)

171.80388692579504


In [202]:
# Replace the missing cadence values with the rounded mean cadence of the 'Running' activities.
# The filled column is assigned back to df instead of calling fillna(inplace=True) on the
# selection df['Avg Run Cadence']: the in-place form operates on an intermediate object,
# emits a FutureWarning on pandas 2.x and leaves df unchanged when Copy-on-Write is enabled.
# Note: only the fill value is a whole number; the column itself keeps its float dtype.
df['Avg Run Cadence'] = df['Avg Run Cadence'].fillna(round(avg_running_cadence))

**Same as before, I want to investigate the pattern of missing values in these columns based on 'Activity Type' to make informed decisions on handling them.**

In [204]:
# For each activity type, count how many values are missing in each of the
# ascent/descent/elevation columns
elevation_columns = ['Total Ascent', 'Total Descent', 'Min Elevation', 'Max Elevation']
df[elevation_columns].isna().groupby(df['Activity Type']).sum()

Unnamed: 0_level_0,Total Ascent,Total Descent,Min Elevation,Max Elevation
Activity Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Running,42,42,42,42
Trail Running,0,0,0,0
Treadmill Running,6,6,6,6


(!) Explanation:

*Normally, I would follow the same procedure as before to replace missing values for each column based on 'Activity Type'. However, for simplicity and considering that metrics like 'Total Ascent,' 'Total Descent,' 'Min Elevation,' and 'Max Elevation' are more relevant in 'Trail Running' activities, I chose a simplified approach by replacing missing values with zeros for these columns.*

In [205]:
# Fill the gaps in the ascent/descent/elevation columns with 0, one column at a time
columns_to_replace_with_zeros = ['Total Ascent', 'Total Descent', 'Min Elevation', 'Max Elevation']
for column_name in columns_to_replace_with_zeros:
    df[column_name] = df[column_name].fillna(0)

In [208]:
# Final check: every column should now report zero missing values
df.isna().sum(axis=0)

Activity Type        0
Date                 0
Title                0
Distance             0
Calories             0
Time                 0
Avg HR               0
Max HR               0
Avg Run Cadence      0
Avg Pace             0
Total Ascent         0
Total Descent        0
Avg Stride Length    0
Min Temp             0
Max Temp             0
Moving Time          0
Min Elevation        0
Max Elevation        0
dtype: int64