In [5]:
%load_ext autoreload
%autoreload 2


In [6]:
# Perform_AI.src.data_processing.py

import streamlit as st
from datetime import datetime
from datetime import timedelta
import pandas as pd
import numpy as np
import csv
import os
import sys
import plotly.graph_objs as go
import plotly.io as pio

In [7]:
import sys
from pathlib import Path

# Set the root directory as the path, assuming Perform_AI is the project root
sys.path.append(str(Path().resolve().parent))

In [8]:
from src.calorie_calculations import calculate_total_calories

from src.calorie_estimation_models import estimate_calories_with_duration, estimate_calories_with_nixtla
from src.tss_calculations import calculate_total_tss_and_metrics_from_tss

from src.data_loader.files_extracting import FileLoader
from src.data_loader.files_saving import FileSaver
from params import CLOUD_ON, GIVEN_DATE, BEST_MODEL

In [9]:
dataframes_names = {
    'workouts': ['tp_workouts_2022-03-03_to_2023-03-03', 'tp_workouts_2023-03-03_to_2024-03-03', 'tp_workouts_2024-03-03_to_2025-03-03'],
    'activities': 'activities',
    'foods': [f"FOOD-DATA-GROUP{i}" for i in range(1,6)]
}

workouts_df = pd.concat([FileLoader()._load_csv('data/raw/csv', name) for name in dataframes_names['workouts']], ignore_index=True)
activities_df = FileLoader()._load_csv('data/raw/csv', dataframes_names['activities'])
foods = pd.concat([FileLoader()._load_csv('data/raw/csv', name, index=0) for name in dataframes_names['foods']], ignore_index=True)

2024-11-01 21:10:26,871 - INFO - Tp Workouts 2022-03-03 To 2023-03-03 dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform_AI/data/raw/csv/tp_workouts_2022-03-03_to_2023-03-03.csv
2024-11-01 21:10:26,879 - INFO - Tp Workouts 2023-03-03 To 2024-03-03 dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform_AI/data/raw/csv/tp_workouts_2023-03-03_to_2024-03-03.csv
2024-11-01 21:10:26,886 - INFO - Tp Workouts 2024-03-03 To 2025-03-03 dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform_AI/data/raw/csv/tp_workouts_2024-03-03_to_2025-03-03.csv
2024-11-01 21:10:26,897 - INFO - Activities dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform_AI/data/raw/csv/activities.csv
2024-11-01 21:10:26,907 - INFO - Food-Data-Group1 dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform

# NOT EFFICIENT

In [5]:
def clean_data_basic(dfs, date_cols):
    """
    Clean data for the given dataframes.

    Parameters:
        dfs (dict): Dictionary of DataFrames to clean.
        date_cols (dict): Dictionary mapping DataFrame names to their date column names.
    """
    for df_name, df in dfs.items():
        df.replace('--', np.nan, inplace=True)
        df.drop_duplicates(inplace=True)

        if df_name == 'sleep':
            continue
        convert_to_datetime(df, date_cols[df_name])


def convert_to_datetime(df, date_col):
    """
    Convert specified column to datetime and set as index.

    Parameters:
        df (pd.DataFrame): DataFrame to process.
        date_col (str): Name of the date column.
    """
    if date_col != 'Timestamp':
        df['Date'] = pd.to_datetime(df[date_col])
        df.sort_values('Date', inplace=True)
        df.set_index('Date', inplace=True)
    else:
        df.index.name = 'Date'
        df.sort_values('Date', inplace=True)
        df.set_index(pd.to_datetime(df.index), inplace=True)

    if date_col in df.columns:
        df.drop(columns=date_col, inplace=True)


def clean_activities(df):
    """
    Clean activity data to keep relevant columns and rename them.

    Parameters:
        df (pd.DataFrame): DataFrame containing activity data.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    columns_to_keep = ["Type d'activité", 'Distance', 'Calories', 'Durée', 'Fréquence cardiaque moyenne']
    df = df[columns_to_keep].copy().rename(columns={
        'Distance': 'DistanceInMeters',
        'Durée': 'TimeTotalInHours',
        'Fréquence cardiaque moyenne': 'HeartRateAverage',
        'Type d\'activité': 'WorkoutType'
    })

    df['HeartRateAverage'] = pd.to_numeric(df['HeartRateAverage'], errors='coerce')
    df = df[df['HeartRateAverage'].notna()]

    df = df[~df["WorkoutType"].isin(['HIIT', 'Exercice de respiration', 'Musculation'])].copy()

    sports_types = {
        'Nat. piscine': 'Swim',
        'Cyclisme': 'Bike',
        'Course à pied': 'Run',
        "Vélo d'intérieur": 'Bike',
        'Cyclisme virtuel': 'Bike',
        'Course à pied sur tapis roulant': 'Run',
        'Natation': 'Swim',
    }
    df["WorkoutType"] = df["WorkoutType"].apply(lambda x: sports_types[x])

    # Convert Durée from 'hh:mm:ss' to total minutes
    df['TimeTotalInHours'] = pd.to_timedelta(df['TimeTotalInHours']).dt.total_seconds() / 3600  # Convert to Hours

    # Convert relevant columns to numeric (remove commas, etc.)
    df['DistanceInMeters'] = pd.to_numeric(df['DistanceInMeters'].str.replace(',', '.'), errors='coerce')
    df['Calories'] = pd.to_numeric(df['Calories'], errors='coerce')

    # Drop rows with NaN values in critical columns
    df = df.dropna(subset=['DistanceInMeters', 'Calories', 'TimeTotalInHours', 'HeartRateAverage'])

    # df = df[df['DistanceInMeters']>0].copy() # NOTE: not needed since, i will be using only TotalDuration or TimeTotalInHours

    return df


def filter_workouts_and_remove_nans(df, given_date = GIVEN_DATE):
    columns_to_keep_workouts = ['WorkoutType', 'Title', 'WorkoutDescription', 'CoachComments', 'HeartRateAverage', 'TimeTotalInHours', 'DistanceInMeters', 'PlannedDuration', 'PlannedDistanceInMeters']
    df = df[columns_to_keep_workouts].copy()

    before_df = df[df.index < given_date].copy()
    after_df = df[df.index >= given_date].copy()
    # Remove rows, before the given date, where i didn't train, meaning, where HR and Total Time is nan.
    before_df_cleaned = before_df[~(before_df['HeartRateAverage'].isna() & before_df['TimeTotalInHours'].isna())].copy() # NOTE: HERE IS THE PART THAT CAUSES THE WEIRD BEHAVIOUR. Explanation below
    # TODO: (BTW, I DON'T NEED TO REMOVE THE HEARTRATEAVERAGE.ISNA, since what's important for me is timetotalinhours only)

    # Remove rows, after the given date, where Planned Duration is nan, which means there is no info on training, so no tss
    after_df = after_df[after_df['PlannedDuration'].notna()]

    # Concatenate before and after dataframes
    w_df = pd.concat([before_df_cleaned, after_df])
    # Keep dates where there was a Run Swim or Bike training Plan
    w_df = w_df[w_df['WorkoutType'].isin(['Run', 'Swim', 'Bike'])]

    # Fill NaN values in object columns with an empty string
    object_cols = w_df.select_dtypes(include=['object']).columns
    w_df[object_cols] = w_df[object_cols].fillna('')

    return w_df


In [6]:
dataframes = {
    'activities': activities_df,
    #'sleep': sleep_df,
    #'health_metrics': health_metrics_df,
    'workouts': workouts_df
}
date_columns = {
    'activities': 'Date', # as column
    #'sleep': 'Date', # as column
    #'health_metrics': 'Timestamp', # already as index
    'workouts': 'WorkoutDay' # as column
}

In [7]:
clean_data_basic(dataframes, date_columns)

w_df = filter_workouts_and_remove_nans(dataframes['workouts'])
activities_df = clean_activities(dataframes['activities'])

TypeError: Invalid comparison between dtype=datetime64[ns] and date

In [None]:
w_df.info()

In [None]:
activities_df.info()

In [None]:
activities_df.describe()

In [6]:
dataframes_names = {
    'workouts': ['tp_workouts_2022-03-03_to_2023-03-03', 'tp_workouts_2023-03-03_to_2024-03-03', 'tp_workouts_2024-03-03_to_2025-03-03'],
    'activities': 'activities',
    'foods': [f"FOOD-DATA-GROUP{i}" for i in range(1,6)]
}

workouts_df = pd.concat([FileLoader()._load_csv('data/raw/csv', name) for name in dataframes_names['workouts']], ignore_index=True)
activities_df = FileLoader()._load_csv('data/raw/csv', dataframes_names['activities'])
foods = pd.concat([FileLoader()._load_csv('data/raw/csv', name, index=0) for name in dataframes_names['foods']], ignore_index=True)

2024-11-01 16:52:39,257 - INFO - Tp Workouts 2022-03-03 To 2023-03-03 dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform_AI/data/raw/csv/tp_workouts_2022-03-03_to_2023-03-03.csv
2024-11-01 16:52:39,261 - INFO - Tp Workouts 2023-03-03 To 2024-03-03 dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform_AI/data/raw/csv/tp_workouts_2023-03-03_to_2024-03-03.csv
2024-11-01 16:52:39,265 - INFO - Tp Workouts 2024-03-03 To 2025-03-03 dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform_AI/data/raw/csv/tp_workouts_2024-03-03_to_2025-03-03.csv
2024-11-01 16:52:39,274 - INFO - Activities dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform_AI/data/raw/csv/activities.csv
2024-11-01 16:52:39,279 - INFO - Food-Data-Group1 dataframe loaded successfully from /Users/juanpabloangaritaafricano/code/juanpabloangarita/Perform

# EFFICIENT

In [10]:
def clean_data_basic(df):
    """
    Clean data for the given dataframes.

    Parameters:
        dfs (dict): Dictionary of DataFrames to clean.
        date_cols (dict): Dictionary mapping DataFrame names to their date column names.
    """
    df = df.replace('--', np.nan)
    df = df.drop_duplicates()

    return df

In [11]:
def convert_to_datetime(df, date_col):
    """
    Convert specified column to datetime and set as index with uniform date format.

    Parameters:
        df (pd.DataFrame): DataFrame to process.
        date_col (str): Name of the date column.
    
    Returns:
        pd.DataFrame: DataFrame with 'Date' as index in datetime format (YYYY-MM-DD).
    """
    # Explicitly check for known date columns
    if date_col in ['Date', 'WorkoutDay']:
        df['Date'] = pd.to_datetime(df[date_col])
        df = df.sort_values('Date')
        df = df.set_index('Date')
    elif date_col == 'Timestamp':
        df.index.name = 'Date'
        df.index = pd.to_datetime(df.index)
        df = df.sort_index()
    else:
        raise ValueError(f"Unrecognized date column: {date_col}")

    # Format the index to YYYY-MM-DD
    #df.index = df.index.date  # Keep only the date part
    df.index = df.index.normalize()  # Keep only the date part

    # Drop the original date column if it exists
    if date_col in df.columns:
        df = df.drop(columns=date_col)

    return df

In [12]:
dataframes = {
    'Date': activities_df,
    #'Date': sleep_df,
    #'Timestamp': health_metrics_df, # already as index
    'WorkoutDay': workouts_df
}

In [13]:
workouts_df = clean_data_basic(workouts_df).copy()
activities_df = clean_data_basic(activities_df).copy()

In [14]:
workouts_df = convert_to_datetime(workouts_df, 'WorkoutDay').copy()
activities_df = convert_to_datetime(activities_df, 'Date').copy()

In [15]:
workouts_df.head()

Unnamed: 0_level_0,Title,WorkoutType,WorkoutDescription,PlannedDuration,PlannedDistanceInMeters,CoachComments,DistanceInMeters,PowerAverage,PowerMax,Energy,...,PWRZone3Minutes,PWRZone4Minutes,PWRZone5Minutes,PWRZone6Minutes,PWRZone7Minutes,PWRZone8Minutes,PWRZone9Minutes,PWRZone10Minutes,Rpe,Feeling
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-03-12,Other,Other,,,,,0.0,,,,...,,,,,,,,,,
2022-03-12,Other,Other,,,,,,,,,...,,,,,,,,,,
2022-03-13,Other,Other,,,,,,,,,...,,,,,,,,,,
2022-03-13,Running,Run,,,,,2190.780029,,,,...,,,,,,,,,,
2022-03-14,Running,Run,,,,,2363.75,,,,...,,,,,,,,,,


In [16]:
activities_df.head()

Unnamed: 0_level_0,Type d'activité,Favori,Titre,Distance,Calories,Durée,Fréquence cardiaque moyenne,Fréquence cardiaque maximale,Cadence de vélo moyenne,Cadence de vélo maximale,...,Fréquence respiratoire maximale,Changement du niveau de stress,Début de l'épreuve d'effort,Fin de l'épreuve d'effort,Stress moyen,Stress maximal,Temps de déplacement,Temps écoulé,Altitude minimale,Altitude maximale
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-03-12,Exercice de respiration,False,Détente/Concent. (crt),,,00:04:15.2,65,73,,,...,17.0,-10.0,14.0,4.0,4.0,14.0,00:00:00,00:04:15.2,,
2022-03-12,HIIT,False,HIIT,0.0,55.0,00:06:26.2,115,153,,,...,,,,,,,00:03:28.6,00:06:34.1,,
2022-03-13,Course à pied,False,Saint-Mandé Corsa,2.19,200.0,00:13:40,147,171,153.0,163.0,...,,,,,,,00:13:39,00:13:40,48.0,57.0
2022-03-13,Exercice de respiration,False,Relax e con​centr. (Breve),,,00:05:34,67,74,,,...,4.0,0.0,14.0,14.0,14.0,22.0,00:00:00,00:05:34,,
2022-03-14,Course à pied,False,Vincennes Corsa,2.36,216.0,00:19:04,126,170,125.0,208.0,...,,,,,,,00:18:59,00:19:04,46.0,55.0


In [17]:
type(workouts_df.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [18]:
def filter_and_translate_columns(df, column_mapping, columns_to_keep):
    """
    Translates column names in a DataFrame based on a given mapping and filters to keep only specified columns.

    Parameters:
    - df (pd.DataFrame): The DataFrame with original column names.
    - column_mapping (dict): A dictionary mapping original column names to desired column names.
    - columns_to_keep (list, optional): List of columns to keep in the final DataFrame after renaming. Defaults to None.

    Returns:
    - pd.DataFrame: A DataFrame with renamed and filtered columns.
    """
    # Translate columns
    df_translated = df.rename(columns=column_mapping)

    # Determine columns to keep
    df_translated = df_translated[columns_to_keep]

    return df_translated


In [20]:
columns_to_keep_workouts = ['WorkoutType', 'Title', 'WorkoutDescription', 'CoachComments', 
                            'HeartRateAverage', 'TimeTotalInHours', 'DistanceInMeters', 'PlannedDuration', 'PlannedDistanceInMeters']
french_to_english = {
    'Type d\'activité': 'WorkoutType',
    'Titre': 'Title',
    'Fréquence cardiaque moyenne': 'HeartRateAverage',
    'Durée': 'TimeTotalInHours',
    'Distance': 'DistanceInMeters',
    'Calories': 'Calories'
}
columns_to_keep_activities = list(french_to_english.values())

In [21]:
workouts_df = filter_and_translate_columns(workouts_df, {}, columns_to_keep_workouts).copy()
activities_df = filter_and_translate_columns(activities_df, french_to_english, columns_to_keep_activities).copy()

In [22]:
workouts_df['WorkoutType'].unique()

array(['Other', 'Run', 'Strength', 'Swim', 'Bike', 'Brick', 'Day Off'],
      dtype=object)

In [23]:
activities_df['WorkoutType'].unique()

array(['Exercice de respiration', 'HIIT', 'Course à pied', 'Musculation',
       'Nat. piscine', "Vélo d'intérieur", 'Cyclisme',
       'Course à pied sur tapis roulant', 'Natation', 'Cyclisme virtuel'],
      dtype=object)

In [24]:
def filter_and_translate_workouts_column(df, workouts_to_remove, sports_mapping=None):
    """
    Filters and translates workout types in a DataFrame based on specified criteria.

    Parameters:
    - df (pd.DataFrame): The DataFrame with workout data.
    - workouts_to_remove (list): List of workout types to exclude from the DataFrame.
    - sports_mapping (dict, optional): A dictionary to translate workout types. Defaults to None.

    Returns:
    - pd.DataFrame: The DataFrame with filtered and translated workout types.
    """
    # Filter out unwanted workout types
    df_filtered = df[~df['WorkoutType'].isin(workouts_to_remove)].copy()

    # Apply mapping if provided
    if sports_mapping:
        df_filtered['WorkoutType'] = df_filtered['WorkoutType'].map(sports_mapping).fillna(df_filtered['WorkoutType'])

    return df_filtered


In [25]:
workouts_to_remove_both_dfs = ['Brick', 'Other', 'Strength', 'Day Off', 'HIIT', 'Exercice de respiration', 'Musculation']
sports_types = {
    'Nat. piscine': 'Swim',
    'Cyclisme': 'Bike',
    'Course à pied': 'Run',
    "Vélo d'intérieur": 'Bike',
    'Cyclisme virtuel': 'Bike',
    'Course à pied sur tapis roulant': 'Run',
    'Natation': 'Swim',
}

In [26]:
workouts_df = filter_and_translate_workouts_column(workouts_df, workouts_to_remove_both_dfs).copy()
activities_df = filter_and_translate_workouts_column(activities_df, workouts_to_remove_both_dfs, sports_types).copy()

In [27]:
workouts_df['WorkoutType'].unique()

array(['Run', 'Swim', 'Bike'], dtype=object)

In [28]:
activities_df['WorkoutType'].unique()

array(['Run', 'Swim', 'Bike'], dtype=object)

In [29]:
workouts_df.head()

Unnamed: 0_level_0,WorkoutType,Title,WorkoutDescription,CoachComments,HeartRateAverage,TimeTotalInHours,DistanceInMeters,PlannedDuration,PlannedDistanceInMeters
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-03-13,Run,Running,,,147.0,0.227822,2190.780029,,
2022-03-14,Run,Running,,,126.0,0.317827,2363.75,,
2022-03-15,Run,Running,,,162.0,0.366062,3018.330078,,
2022-03-16,Run,Running,,,151.0,0.500626,3558.080078,,
2022-03-19,Run,Running,,,138.0,0.093556,760.710022,,


In [30]:
activities_df

Unnamed: 0_level_0,WorkoutType,Title,HeartRateAverage,TimeTotalInHours,DistanceInMeters,Calories
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-03-13,Run,Saint-Mandé Corsa,147,00:13:40,2.19,200
2022-03-14,Run,Vincennes Corsa,126,00:19:04,2.36,216
2022-03-15,Run,Saint-Mandé Corsa,162,00:21:58,3.02,351
2022-03-16,Run,Saint-Mandé Corsa,151,00:30:02,3.56,439
2022-03-19,Run,Vincennes Corsa,138,00:05:36.8,0.76,66
...,...,...,...,...,...,...
2024-09-26,Swim,Nat. piscine,136,00:22:11,1000,255
2024-10-04,Run,Run Test,141,00:22:28,2.71,278
2024-10-07,Bike,Paris - Easy Effort spin,140,00:31:21,3.95,357
2024-10-07,Run,Zone 3 Intervals,157,00:13:38,1.73,187


In [31]:
workouts_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 784 entries, 2022-03-13 to 2025-03-03
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   WorkoutType              784 non-null    object 
 1   Title                    784 non-null    object 
 2   WorkoutDescription       488 non-null    object 
 3   CoachComments            335 non-null    object 
 4   HeartRateAverage         547 non-null    float64
 5   TimeTotalInHours         572 non-null    float64
 6   DistanceInMeters         572 non-null    float64
 7   PlannedDuration          485 non-null    float64
 8   PlannedDistanceInMeters  144 non-null    float64
dtypes: float64(5), object(4)
memory usage: 61.2+ KB


In [32]:
activities_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 574 entries, 2022-03-13 to 2024-10-16
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   WorkoutType       574 non-null    object
 1   Title             574 non-null    object
 2   HeartRateAverage  549 non-null    object
 3   TimeTotalInHours  574 non-null    object
 4   DistanceInMeters  574 non-null    object
 5   Calories          573 non-null    object
dtypes: object(6)
memory usage: 31.4+ KB


In [33]:
import pandas as pd
from typing import List

def convert_time_to_hours(time_str: str) -> float:
    """Convert a time string to hours."""
    try:
        return pd.to_timedelta(time_str).total_seconds() / 3600
    except Exception as e:
        print(f"Error converting time '{time_str}': {e}")
        return 0.0

def clean_calories(calories_str: str) -> float:
    """Remove thousands commas from calorie strings and convert to float."""
    try:
        return float(calories_str.replace(',', ''))
    except ValueError:
        print(f"Error cleaning calories '{calories_str}': cannot convert to float.")
        return 0.0

def convert_distance_to_meters(distance_str: str, workout_type: str) -> float:
    """Convert distance string to meters based on workout type."""
    try:
        distance_value = float(distance_str.replace(',', ''))  # Remove commas
        return distance_value if workout_type == 'Swim' else distance_value * 1000
    except ValueError:
        print(f"Error converting distance '{distance_str}' for workout '{workout_type}': cannot convert to float.")
        return 0.0

def convert_data_types_for_activities(df: pd.DataFrame, columns_to_modify: List[str]) -> pd.DataFrame:
    """Convert specified columns in a DataFrame to appropriate data types."""
    for col in columns_to_modify:
        if col == 'DistanceInMeters':
            df[col] = df.apply(lambda row: convert_distance_to_meters(row['DistanceInMeters'], row['WorkoutType']), axis=1)
        elif col in ['HeartRateAverage', 'Calories', 'TimeTotalInHours']:
            conversion_func = clean_calories if col == 'Calories' else (convert_time_to_hours if col == 'TimeTotalInHours' else float)
            df[col] = df[col].apply(conversion_func)

        df[col] = df[col].astype('float64')  # Ensure the column is in float64 format

    return df


In [34]:

# Usage
activities_df = activities_df.dropna()
columns_to_float = ['HeartRateAverage', 'Calories', 'DistanceInMeters', 'TimeTotalInHours']
activities_df = convert_data_types_for_activities(activities_df, columns_to_float).copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(conversion_func)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('float64')  # Ensure the column is in float64 format
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(conversion_func)
A value is trying to be set on a copy of a sl

In [35]:
activities_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 548 entries, 2022-03-13 to 2024-10-16
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   WorkoutType       548 non-null    object 
 1   Title             548 non-null    object 
 2   HeartRateAverage  548 non-null    float64
 3   TimeTotalInHours  548 non-null    float64
 4   DistanceInMeters  548 non-null    float64
 5   Calories          548 non-null    float64
dtypes: float64(4), object(2)
memory usage: 30.0+ KB


In [36]:
activities_df.describe()

Unnamed: 0,HeartRateAverage,TimeTotalInHours,DistanceInMeters,Calories
count,548.0,548.0,548.0,548.0
mean,136.89781,0.945837,13611.532847,548.167883
std,13.353599,0.937863,23683.509137,457.256389
min,80.0,0.004,0.0,1.0
25%,130.75,0.473056,1600.0,292.0
50%,139.0,0.741111,5640.0,453.0
75%,145.0,1.073333,14102.5,659.75
max,165.0,7.666944,178390.0,3297.0


In [40]:
workouts_df.describe()

Unnamed: 0,HeartRateAverage,TimeTotalInHours,DistanceInMeters,PlannedDuration,PlannedDistanceInMeters
count,547.0,572.0,572.0,485.0,144.0
mean,136.826325,0.947046,13329.542514,1.253266,2709.227544
std,13.533198,0.941658,23108.359453,1.015936,1859.480497
min,69.0,0.003466,0.0,0.333333,822.960022
25%,130.0,0.465532,1523.797485,0.75,1737.360107
50%,139.0,0.732646,5440.939941,1.0,2103.120117
75%,145.0,1.074003,13241.160156,1.25,2743.200195
max,165.0,7.666993,178391.015625,6.0,12066.0


In [41]:
today_date = datetime.today().date()

# Convert today_date to a datetime at midnight for comparison
GIVEN_DATE = pd.to_datetime(today_date)

In [42]:
def filter_workouts_and_remove_nans(df, given_date = GIVEN_DATE):

    before_df = df[df.index < given_date].copy()
    after_df = df[df.index >= given_date].copy()
    # Remove rows, before the given date, where i didn't train, meaning, where HR and Total Time is nan.
    before_df_cleaned = before_df[~(before_df['HeartRateAverage'].isna() & before_df['TimeTotalInHours'].isna())].copy() # NOTE: HERE IS THE PART THAT CAUSES THE WEIRD BEHAVIOUR. Explanation below
    # TODO: (BTW, I DON'T NEED TO REMOVE THE HEARTRATEAVERAGE.ISNA, since what's important for me is timetotalinhours only)

    # Remove rows, after the given date, where Planned Duration is nan, which means there is no info on training, so no tss
    after_df = after_df[after_df['PlannedDuration'].notna()]

    # Concatenate before and after dataframes
    w_df = pd.concat([before_df_cleaned, after_df])

    object_cols = w_df.select_dtypes(include=['object']).columns
    w_df[object_cols] = w_df[object_cols].fillna('')

    return w_df

In [43]:
workouts_df = filter_workouts_and_remove_nans(workouts_df).copy()

In [37]:
type(workouts_df.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [38]:
type(activities_df.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [44]:
workouts_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 735 entries, 2022-03-13 to 2025-03-03
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   WorkoutType              735 non-null    object 
 1   Title                    735 non-null    object 
 2   WorkoutDescription       735 non-null    object 
 3   CoachComments            735 non-null    object 
 4   HeartRateAverage         547 non-null    float64
 5   TimeTotalInHours         572 non-null    float64
 6   DistanceInMeters         572 non-null    float64
 7   PlannedDuration          439 non-null    float64
 8   PlannedDistanceInMeters  124 non-null    float64
dtypes: float64(5), object(4)
memory usage: 57.4+ KB


In [46]:
workouts_df.index < GIVEN_DATE

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [47]:
workouts_df

Unnamed: 0_level_0,WorkoutType,Title,WorkoutDescription,CoachComments,HeartRateAverage,TimeTotalInHours,DistanceInMeters,PlannedDuration,PlannedDistanceInMeters
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-03-13,Run,Running,,,147.0,0.227822,2190.780029,,
2022-03-14,Run,Running,,,126.0,0.317827,2363.750000,,
2022-03-15,Run,Running,,,162.0,0.366062,3018.330078,,
2022-03-16,Run,Running,,,151.0,0.500626,3558.080078,,
2022-03-19,Run,Running,,,138.0,0.093556,760.710022,,
...,...,...,...,...,...,...,...,...,...
2025-02-28,Run,Zone 2 Run,"Warm up for 5 minutes in zone 1, 50 minutes in...",,,,,1.000000,
2025-03-01,Swim,Endurance Swim #3,Warm Up: 200 Zone 1 easy 4X1000 Zone 2b 100 ea...,,,,,1.537334,3931.920166
2025-03-02,Run,Brick Run Mile Repeats,Warm up for 10 minutes in zone 1-2 then do: 2X...,,,,,0.750556,5845.000000
2025-03-02,Bike,Race Pace Intervals,Warm Up: -20 minutes Zone 1 Main Set: -20 minu...,This is a race simulated workout. Simulate rac...,,,,2.000000,


In [48]:
w_df, tss_df, atl_df, ctl_df, tsb_df = calculate_total_tss_and_metrics_from_tss(workouts_df, 'data_processing')


DatetimeIndex(['2022-03-13', '2022-03-14', '2022-03-15', '2022-03-16',
               '2022-03-19', '2022-03-19', '2022-03-20', '2022-03-21',
               '2022-03-22', '2022-03-23',
               ...
               '2025-02-25', '2025-02-25', '2025-02-26', '2025-02-27',
               '2025-02-28', '2025-02-28', '2025-03-01', '2025-03-02',
               '2025-03-02', '2025-03-03'],
              dtype='datetime64[ns]', name='Date', length=735, freq=None)





<class 'pandas.core.indexes.datetimes.DatetimeIndex'>





DatetimeIndex(['2022-03-13', '2022-03-14', '2022-03-15', '2022-03-16',
               '2022-03-19', '2022-03-19', '2022-03-20', '2022-03-21',
               '2022-03-22', '2022-03-23',
               ...
               '2025-02-25', '2025-02-25', '2025-02-26', '2025-02-27',
               '2025-02-28', '2025-02-28', '2025-03-01', '2025-03-02',
               '2025-03-02', '2025-03-03'],
              dtype='datetime64[ns]', name='Date', length=735, freq=None)


In [49]:
w_df

Unnamed: 0_level_0,WorkoutType,Title,WorkoutDescription,CoachComments,HeartRateAverage,TimeTotalInHours,DistanceInMeters,PlannedDuration,PlannedDistanceInMeters,Run_TSS Calculated,Bike_TSS Calculated,Swim_TSS Calculated,TOTAL TSS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2022-03-13,Run,Running,,,147.0,0.227822,2190.780029,,,10.293289,0.000000,0.000000,10.293289
2022-03-14,Run,Running,,,126.0,0.317827,2363.750000,,,9.190303,0.000000,0.000000,9.190303
2022-03-15,Run,Running,,,162.0,0.366062,3018.330078,,,21.602197,0.000000,0.000000,21.602197
2022-03-16,Run,Running,,,151.0,0.500626,3558.080078,,,24.375106,0.000000,0.000000,24.375106
2022-03-19,Run,Running,,,138.0,0.093556,760.710022,,,3.533398,0.000000,0.000000,3.533398
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-28,Run,Zone 2 Run,"Warm up for 5 minutes in zone 1, 50 minutes in...",,,,,1.000000,,39.357967,0.000000,0.000000,39.357967
2025-03-01,Swim,Endurance Swim #3,Warm Up: 200 Zone 1 easy 4X1000 Zone 2b 100 ea...,,,,,1.537334,3931.920166,0.000000,0.000000,49.412848,49.412848
2025-03-02,Run,Brick Run Mile Repeats,Warm up for 10 minutes in zone 1-2 then do: 2X...,,,,,0.750556,5845.000000,29.540341,0.000000,0.000000,29.540341
2025-03-02,Bike,Race Pace Intervals,Warm Up: -20 minutes Zone 1 Main Set: -20 minu...,This is a race simulated workout. Simulate rac...,,,,2.000000,,0.000000,80.175425,0.000000,80.175425


In [52]:
w_df_calories_calculated = calculate_total_calories(USER_DATA_FILE, df=w_df)

NameError: name 'USER_DATA_FILE' is not defined