In [10]:
import pandas as pd

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run on another machine.
file_path = "/Users/kostaskes/Documents/code/Projects/Activities.csv"
df = pd.read_csv(file_path)

# Show every column when a frame is displayed. Row truncation stays at the
# pandas default; uncomment the next line to show all rows as well.
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Display basic info: data types and non-null counts per column.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1978 entries, 0 to 1977
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Activity Type      1978 non-null   object
 1   Date               1978 non-null   object
 2   Title              1978 non-null   object
 3   Distance           1978 non-null   object
 4   Calories           1978 non-null   object
 5   Time               1978 non-null   object
 6   Avg HR             1978 non-null   object
 7   Max HR             1978 non-null   object
 8   Aerobic TE         1978 non-null   object
 9   Avg Speed          1978 non-null   object
 10  Max Speed          1978 non-null   object
 11  Total Ascent       1978 non-null   object
 12  Total Descent      1978 non-null   object
 13  Avg Stride Length  1978 non-null   object
 14  Steps              1978 non-null   object
 15  Min Temp           1978 non-null   object
 16  Best Lap Time      1978 non-null   object


In [11]:
# Parse the 'Date' strings into datetimes; entries that cannot be parsed become NaT
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Derive the calendar parts and the clock time ('HH:MM') from the parsed dates
date_accessor = df['Date'].dt
df['Year'] = date_accessor.year
df['Month'] = date_accessor.month
df['Month_Name'] = date_accessor.month_name()
df['Time_of_the_day'] = date_accessor.strftime('%H:%M')

In [12]:
# Parse 'Distance' as a number (entries that cannot be parsed become NaN),
# then treat a recorded distance of 0 as missing as well
distance_numeric = pd.to_numeric(df['Distance'], errors='coerce')
df['Distance'] = distance_numeric.replace(0, float('nan'))

In [13]:
# Select the rows whose activity type is Open Water Swimming
is_open_water = df['Activity Type'] == 'Open Water Swimming'
open_water_swimming = df[is_open_water]

# Show the activity type and distance of the selected rows
print(open_water_swimming[['Activity Type', 'Distance']])


            Activity Type  Distance
1207  Open Water Swimming     843.0
1208  Open Water Swimming     714.0
1209  Open Water Swimming     650.0
1210  Open Water Swimming     602.0
1211  Open Water Swimming     416.0
1212  Open Water Swimming     626.0
1237  Open Water Swimming     554.0
1238  Open Water Swimming     586.0
1239  Open Water Swimming     632.0
1240  Open Water Swimming    1135.0
1241  Open Water Swimming     704.0
1242  Open Water Swimming     638.0
1243  Open Water Swimming     686.0
1244  Open Water Swimming     480.0
1245  Open Water Swimming     501.0
1246  Open Water Swimming    1001.0
1247  Open Water Swimming     708.0
1248  Open Water Swimming     502.0
1249  Open Water Swimming     658.0
1250  Open Water Swimming     340.0
1251  Open Water Swimming     538.0
1258  Open Water Swimming     411.0


In [14]:
# Convert 'Distance' to kilometers for Open Water Swimming activities by
# dividing by 1000 (presumably these rows are recorded in meters -- confirm
# against the export format).
#
# The division is not idempotent: running it a second time would shrink the
# values by another factor of 1000. A marker stored in df.attrs records that
# the conversion has been applied, so re-running this cell is a no-op.
# A freshly loaded DataFrame starts with empty attrs and is converted again.
# (pandas documents DataFrame.attrs as experimental.)
is_open_water = df['Activity Type'] == 'Open Water Swimming'
if not df.attrs.get('open_water_distance_in_km', False):
    # Divide only the selected rows instead of the whole column
    df.loc[is_open_water, 'Distance'] = df.loc[is_open_water, 'Distance'] / 1000
    df.attrs['open_water_distance_in_km'] = True


In [15]:
# Show 'Activity Type' and 'Distance' for every Open Water Swimming activity
swim_rows = df['Activity Type'] == 'Open Water Swimming'
print(df.loc[swim_rows, ['Activity Type', 'Distance']])


            Activity Type  Distance
1207  Open Water Swimming     0.843
1208  Open Water Swimming     0.714
1209  Open Water Swimming     0.650
1210  Open Water Swimming     0.602
1211  Open Water Swimming     0.416
1212  Open Water Swimming     0.626
1237  Open Water Swimming     0.554
1238  Open Water Swimming     0.586
1239  Open Water Swimming     0.632
1240  Open Water Swimming     1.135
1241  Open Water Swimming     0.704
1242  Open Water Swimming     0.638
1243  Open Water Swimming     0.686
1244  Open Water Swimming     0.480
1245  Open Water Swimming     0.501
1246  Open Water Swimming     1.001
1247  Open Water Swimming     0.708
1248  Open Water Swimming     0.502
1249  Open Water Swimming     0.658
1250  Open Water Swimming     0.340
1251  Open Water Swimming     0.538
1258  Open Water Swimming     0.411


In [17]:
# Preview the first 10 rows of the frame (rendered as the cell's output)
df.head(10)

Unnamed: 0,Activity Type,Date,Title,Distance,Calories,Time,Avg HR,Max HR,Aerobic TE,Avg Speed,Max Speed,Total Ascent,Total Descent,Avg Stride Length,Steps,Min Temp,Best Lap Time,Max Temp,Avg Resp,Min Resp,Max Resp,Moving Time,Elapsed Time,Min Elevation,Max Elevation,Year,Month,Month_Name,Time_of_the_day
0,Strength Training,2023-07-16 08:33:30,Strength,,99,00:39:16,69,100,0.1,--,--,--,--,--,8,32.0,00:39:16,33.0,--,--,--,00:39:16,00:39:16,--,--,2023,7,July,08:33
1,Strength Training,2023-06-30 15:06:33,Strength,,99,00:39:35,69,102,0.1,--,--,--,--,--,12,31.0,00:39:35,33.0,--,--,--,00:39:35,00:39:35,--,--,2023,6,June,15:06
2,Yoga,2024-03-22 08:12:44,Yoga,,99,00:20:51,90,105,0.2,--,--,--,--,--,--,30.0,00:20:51,31.0,15,11,17,00:20:51,00:20:51,--,--,2024,3,March,08:12
3,Yoga,2023-04-12 21:39:00,Yoga,,99,00:46:20,69,98,0.0,--,--,--,--,--,44,27.0,00:46:20,31.0,19,12,22,00:46:20,00:46:20,--,--,2023,4,April,21:39
4,Yoga,2022-11-16 21:49:35,Yoga,,99,00:40:25,70,92,0.1,--,--,--,--,--,--,29.0,00:40:25,31.0,16,12,20,00:40:25,00:40:25,--,--,2022,11,November,21:49
5,Strength Training,2023-12-29 09:59:09,Strength,,98,00:31:10,73,135,0.3,--,--,--,--,--,18,28.0,00:31:10,29.0,--,--,--,00:31:10,00:31:10,--,--,2023,12,December,09:59
6,Strength Training,2023-07-04 21:29:48,Strength,,98,00:37:51,69,95,0.1,--,--,--,--,--,26,32.0,00:37:51,34.0,--,--,--,00:37:51,00:37:51,--,--,2023,7,July,21:29
7,Strength Training,2023-06-14 18:25:34,Strength,,98,00:35:47,72,104,0.1,--,--,--,--,--,64,31.0,00:35:47,32.0,--,--,--,00:35:47,00:35:47,--,--,2023,6,June,18:25
8,Yoga,2024-04-09 08:20:41,Yoga,,97,00:20:44,91,107,0.2,--,--,--,--,--,--,30.0,00:20:44,33.0,16,12,18,00:20:44,00:20:44,--,--,2024,4,April,08:20
9,Strength Training,2024-02-07 15:46:54,Strength,,96,00:26:50,76,110,0.1,--,--,--,--,--,14,29.0,00:26:50,30.0,--,--,--,00:26:50,00:26:50,--,--,2024,2,February,15:46


In [19]:
# Columns that should hold numbers
columns_to_convert = [
    'Calories',
    'Avg HR',
    'Max HR',
    'Aerobic TE',
    'Total Ascent',
    'Total Descent',
    'Steps',
    'Avg Resp',
    'Min Resp',
    'Max Resp',
    'Min Elevation',
    'Max Elevation',
]

# Convert every listed column in one column-wise pass; entries that cannot be
# parsed as a number become NaN
df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

In [20]:
# Print the frame's column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1978 entries, 0 to 1977
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Activity Type      1978 non-null   object        
 1   Date               1978 non-null   datetime64[ns]
 2   Title              1978 non-null   object        
 3   Distance           1377 non-null   float64       
 4   Calories           1977 non-null   float64       
 5   Time               1978 non-null   object        
 6   Avg HR             1972 non-null   float64       
 7   Max HR             1935 non-null   float64       
 8   Aerobic TE         1445 non-null   float64       
 9   Avg Speed          1978 non-null   object        
 10  Max Speed          1978 non-null   object        
 11  Total Ascent       1288 non-null   float64       
 12  Total Descent      1299 non-null   float64       
 13  Avg Stride Length  1978 non-null   object        
 14  Steps   