In [14]:
#%config IPCompleter.use_jedi=False
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import sys 
import pandas as pd
import numpy as np

from pandas._libs.tslibs.timestamps import Timestamp
from pandas.core.frame import DataFrame
from pandas.core.series import Series


sys.path.append('..')
from cyclingEfficiency import Reader, Paths, Constants

## ¿Soy más eficiente en el uso de mi energía pedaleando con alta o baja cadencia? ¿Es diferente si estoy en terreno llano o en montaña?

## Limpieza de datos
Primero se debe leer la información. En este caso, se debe unir la información de cada actividad con el peso más cercano a la fecha del entrenamiento, ya que esta información se encuentra en archivos distintos.

In [16]:
def search_weight(
    date_to_search: Timestamp, df_weight: DataFrame, before: bool = False
) -> float:
    """Return the weight measured closest to a date, looking on one side of it.

    Parameters
    ----------
    date_to_search : Timestamp
        Reference date.
    df_weight : DataFrame
        Measurements with a ``'date'`` column. The returned value is read
        from the column at position 1. Rows may be in any order.
    before : bool, default False
        If True, only measurements dated on or before ``date_to_search`` are
        considered and the latest of them is used. Otherwise only
        measurements dated on or after it are considered and the earliest of
        them is used.

    Returns
    -------
    float
        The selected measurement, or ``np.nan`` when no measurement exists on
        the requested side. If several rows share the selected date, the
        first of them in frame order is used.
    """
    if before:
        candidates: DataFrame = df_weight[df_weight['date'] <= date_to_search]
    else:
        candidates = df_weight[df_weight['date'] >= date_to_search]

    if candidates.shape[0] == 0:
        return np.nan

    # Choose the nearest row by its date instead of by its position, so the
    # answer does not depend on how df_weight happens to be sorted.
    candidate_dates = candidates['date']
    position: int = (
        candidate_dates.argmax() if before else candidate_dates.argmin()
    )
    return candidates.iloc[position, 1]

# Load the project data; the code below reads its 'activities' and 'weight'
# entries.
data = Reader().data

# First and last dates present in the recorded activities.
date_min: Timestamp = data['activities']['date'].min()
date_max: Timestamp = data['activities']['date'].max()

# One row per calendar day between the first and last activity.
# 'D' is the canonical daily frequency alias; the lowercase spelling is
# deprecated in recent pandas releases.
weight: DataFrame = pd.DataFrame(
    {'date': pd.date_range(start=date_min, end=date_max, freq='D')}
)

# For each day, look up a measurement dated on or before that day
# (NaN when there is none).
weight['weight'] = weight['date'].apply(
    search_weight, df_weight=data['weight'], before=True
)

In [17]:
# Left join on 'date': every activity row is kept and receives the weight
# assigned to its day (NaN when the day has no weight).
df = data['activities'].merge(weight, how='left', on='date')

In [18]:
# Show the columns and dtypes of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3513970 entries, 0 to 3513969
Data columns (total 26 columns):
 #   Column     Dtype         
---  ------     -----         
 0   secs       int64         
 1   cad        float64       
 2   hr         float64       
 3   km         float64       
 4   kph        float64       
 5   nm         int64         
 6   watts      float64       
 7   alt        float64       
 8   lon        float64       
 9   lat        float64       
 10  headwind   int64         
 11  slope      float64       
 12  temp       float64       
 13  interval   int64         
 14  lrbalance  int64         
 15  lte        int64         
 16  rte        int64         
 17  lps        int64         
 18  rps        int64         
 19  smo2       int64         
 20  thb        int64         
 21  o2hb       int64         
 22  hhb        int64         
 23  datetime   datetime64[us]
 24  date       datetime64[us]
 25  weight     float64       
dtypes: datetime64[

Ahora se debe crear la variable 'zonas cardiacas' para tener un indicador más sencillo. Estas zonas se basan en los niveles de esfuerzo que puede tener el cuerpo, donde a mayor desgaste, mayor será la zona. En este caso, el rango va de uno a cinco.

Posteriormente, vamos a descartar datos que estén fuera de valores lógicos, además de los descensos.

In [19]:
def search_zone(hr_to_search: float, zones: list[float]) -> int:
    """Map a heart-rate value to a 1-based zone number.

    The thresholds in ``zones`` are checked from the last one to the first.
    The result is the 1-based position of the first threshold found that is
    less than or equal to ``hr_to_search``.

    Returns 0 when no threshold qualifies (including an empty ``zones`` list
    or a NaN heart rate, for which every comparison is False).
    """
    for position in range(len(zones) - 1, -1, -1):
        if zones[position] <= hr_to_search:
            return position + 1
    return 0

# Maximum heart rate used to scale the zone thresholds.
hr_max: int = 190

# Each entry of Constants.HEART_ZONES is multiplied by hr_max and truncated
# to an integer threshold.
# NOTE(review): presumably HEART_ZONES holds ascending fractions of the
# maximum heart rate - confirm against Constants.
zones: list[int] = [
    int(fraction * hr_max) for fraction in Constants.HEART_ZONES
]

# Assign every sample its zone number (0 when below every threshold).
df['zones'] = df['hr'].apply(search_zone, zones=zones)

In [20]:
# Keep only samples with positive speed, cadence and heart rate, a
# non-negative slope (this removes descents) and a known weight.
# NOTE(review): search_zone never returns a negative number, so the
# `zones >= 0` condition keeps every row - confirm whether `> 0` was meant.
# NOTE(review): reset_index() without drop=True keeps the previous index as
# an 'index' column - confirm that this is wanted.
df_filtered = df.loc[
    df['kph'].gt(0)
    & df['cad'].gt(0)
    & df['hr'].gt(0)
    & df['slope'].ge(0)
    & df['zones'].ge(0)
    & df['weight'].notna()
].reset_index()

In [21]:
# Number of rows and columns left after filtering.
df_filtered.shape

(667441, 28)

In [22]:
# Show the columns, non-null counts and dtypes of the filtered frame.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667441 entries, 0 to 667440
Data columns (total 28 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   index      667441 non-null  int64         
 1   secs       667441 non-null  int64         
 2   cad        667441 non-null  float64       
 3   hr         667441 non-null  float64       
 4   km         667441 non-null  float64       
 5   kph        667441 non-null  float64       
 6   nm         667441 non-null  int64         
 7   watts      667441 non-null  float64       
 8   alt        667441 non-null  float64       
 9   lon        667441 non-null  float64       
 10  lat        667441 non-null  float64       
 11  headwind   667441 non-null  int64         
 12  slope      667441 non-null  float64       
 13  temp       667441 non-null  float64       
 14  interval   667441 non-null  int64         
 15  lrbalance  667441 non-null  int64         
 16  lte        667441 no

Después de realizar los cambios, se puede observar que los datos contienen muchas columnas cuyos valores son todos cero o constantes. Por esta razón, ahora se eliminarán esas columnas para hacer la base de datos menos pesada.

In [23]:
# Print the mean and standard deviation of every column except the two
# datetime columns, to spot columns that are all zeros or constant.
for column in df_filtered.columns:
    if column in ('datetime', 'date'):
        continue
    print(
        f'Variable: {column}, '
        f'Mean: {df_filtered[column].mean():,.2f}\t'
        f'Std: {df_filtered[column].std():,.2f}'
    )

Variable: index, Mean: 1,640,778.33	Std: 1,064,534.33
Variable: secs, Mean: 8,598.69	Std: 6,488.56
Variable: cad, Mean: 76.34	Std: 10.20
Variable: hr, Mean: 140.63	Std: 29.09
Variable: km, Mean: 36.85	Std: 26.56
Variable: kph, Mean: 16.04	Std: 6.98
Variable: nm, Mean: 0.00	Std: 0.00
Variable: watts, Mean: 0.04	Std: 2.12
Variable: alt, Mean: 2,134.35	Std: 736.44
Variable: lon, Mean: -73.18	Std: 8.93
Variable: lat, Mean: 4.87	Std: 0.63
Variable: headwind, Mean: 0.00	Std: 0.00
Variable: slope, Mean: 4.52	Std: 3.25
Variable: temp, Mean: 17.20	Std: 5.46
Variable: interval, Mean: 0.00	Std: 0.00
Variable: lrbalance, Mean: -255.00	Std: 0.00
Variable: lte, Mean: 0.00	Std: 0.00
Variable: rte, Mean: 0.00	Std: 0.00
Variable: lps, Mean: 0.00	Std: 0.00
Variable: rps, Mean: 0.00	Std: 0.00
Variable: smo2, Mean: 0.00	Std: 0.00
Variable: thb, Mean: 0.00	Std: 0.00
Variable: o2hb, Mean: 0.00	Std: 0.00
Variable: hhb, Mean: 0.00	Std: 0.00
Variable: weight, Mean: 64.03	Std: 1.84
Variable: zones, Mean: 2.54	S

In [24]:
# Collect the columns whose mean is exactly 0 or whose standard deviation is
# exactly 0, then remove them from the filtered frame.
# NOTE(review): a column with mean 0 but non-zero spread is neither all-zero
# nor constant, yet it is dropped too - confirm this is intended.
columns_to_delete: list[str] = [
    column
    for column in df_filtered.columns
    if df_filtered[column].mean() == 0 or df_filtered[column].std() == 0
]
df_filtered = df_filtered.drop(columns=columns_to_delete)

Se agrega una nueva variable para diferenciar cuando el terreno es llano o tiene elevación.

In [25]:
# Flag flat terrain: 1 when the slope is below 1, otherwise 0.
# The comparison is vectorized instead of calling a Python lambda per row;
# NaN compares as False and therefore maps to 0, as before.
df_filtered['is_plain'] = df_filtered['slope'].lt(1).astype('int64')

In [26]:
# Show the remaining columns after dropping constant ones and adding 'is_plain'.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667441 entries, 0 to 667440
Data columns (total 17 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   index     667441 non-null  int64         
 1   secs      667441 non-null  int64         
 2   cad       667441 non-null  float64       
 3   hr        667441 non-null  float64       
 4   km        667441 non-null  float64       
 5   kph       667441 non-null  float64       
 6   watts     667441 non-null  float64       
 7   alt       667441 non-null  float64       
 8   lon       667441 non-null  float64       
 9   lat       667441 non-null  float64       
 10  slope     667441 non-null  float64       
 11  temp      667441 non-null  float64       
 12  datetime  667441 non-null  datetime64[us]
 13  date      667441 non-null  datetime64[us]
 14  weight    667441 non-null  float64       
 15  zones     667441 non-null  int64         
 16  is_plain  667441 non-null  int64      