# Cuaderno que realiza el preprocesamiento del conjunto original

In [1]:
import pandas as pd
import re

En este cuaderno vamos a convertir los tipos de los atributos del conjunto de datos y vamos a cambiar el nombre de las variables utilizando el tipo de notación seleccionado y dejando un nombre más amigable para poder realizar operaciones con dataframes.

In [2]:
# Location of the raw export, relative to this notebook.
raw_csv_path = '../../datasets/original-data/Exported Data_202304231610.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Seconds from Start,Date Time,Speed (km/h),HR (bpm),Core Temperature °C,X accelerometer (Sideways),Y accelerometer (Forward),Z accelerometer (Vertical),Raw RR (ms),Filtered RR (ms),Difference,HR RMSSD,ActivityLoad 2D,ActivityLoad 3D
0,1003,[2023-04-23 18:10:38 770],0.0,,,,,,,,,,,
1,1101,[2023-04-23 18:10:38 868],,,,-113313.0,-35219.0,-238975.0,,,,,,
2,1104,[2023-04-23 18:10:38 871],0.0,,,,,,,,,,,
3,1110,[2023-04-23 18:10:38 877],,,,-140875.0,-15312.0,-246631.0,,,,,,
4,1120,[2023-04-23 18:10:38 887],,,,-120969.0,-7656.0,-255819.0,,,,,,


Primeramente, se sustituyen los nombres de las columnas por otros más cortos que siguen la notación indicada.

In [3]:
# Maps each header of the original export to its short snake_case name.
new_columns = {
    'Seconds from Start': 'seconds',
    'Date Time': 'date',
    'Speed (km/h)': 'speed',
    'HR (bpm)': 'hr',
    'Core Temperature °C': 'core_temperature',
    # Accelerometer axes
    'X accelerometer (Sideways)': 'x',
    'Y accelerometer (Forward)': 'y',
    'Z accelerometer (Vertical)': 'z',
    # RR-related columns
    'Raw RR (ms)': 'raw_rr',
    'Filtered RR (ms)': 'filtered_rr',
    'Difference': 'difference_rr',
    'HR RMSSD': 'hr_rmssd',
    # Activity load
    'ActivityLoad 2D': 'activity_load_2d',
    'ActivityLoad 3D': 'activity_load_3d',
}

In [4]:
# Apply the header mapping to the column labels only.
df = df.rename(columns=new_columns)

In [5]:
# Show the current column labels.
df.columns

Index(['seconds', 'date', 'speed', 'hr', 'core_temperature', 'x', 'y', 'z',
       'raw_rr', 'filtered_rr', 'difference_rr', 'hr_rmssd',
       'activity_load_2d', 'activity_load_3d'],
      dtype='object')

Ahora cambiamos los tipos de datos a su tipo correspondiente.

In [6]:
# Show the dtype pandas assigned to each column.
df.dtypes

seconds              object
date                 object
speed                object
hr                  float64
core_temperature     object
x                    object
y                    object
z                    object
raw_rr              float64
filtered_rr         float64
difference_rr       float64
hr_rmssd             object
activity_load_2d     object
activity_load_3d     object
dtype: object

Primeramente, se va a modificar el atributo date. Para ello, habrá que eliminar los corchetes de su valor y separar los milisegundos con un punto.

In [8]:
def delete_brackets(value):
    """Strip square brackets from a timestamp string and dot-separate its milliseconds.

    A string such as ``'[2023-04-23 18:10:38 770]'`` becomes
    ``'2023-04-23 18:10:38.770'``.

    Args:
        value: The raw cell value. Only ``str`` instances are transformed.

    Returns:
        The cleaned string when ``value`` is a ``str``; otherwise ``value``
        itself, unchanged.
    """
    # Non-string values are passed through untouched.
    if not isinstance(value, str):
        return value

    # Drop every '[' and ']' character.
    without_brackets = re.sub(r'\[|\]', '', value)
    # Replace the space before the trailing three-digit group with a dot.
    return re.sub(
        r'(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) (\d{3})',
        r'\1 \2.\3',
        without_brackets,
    )

In [9]:
# Clean every value of the date column; the function is passed directly.
df['date'] = df['date'].apply(delete_brackets)

In [10]:
# Display the date column.
df['date']

0          2023-04-23 18:10:38.770
1          2023-04-23 18:10:38.868
2          2023-04-23 18:10:38.871
3          2023-04-23 18:10:38.877
4          2023-04-23 18:10:38.887
                    ...           
1241159    2023-04-23 21:13:29.967
1241160    2023-04-23 21:13:29.977
1241161    2023-04-23 21:13:29.987
1241162    2023-04-23 21:13:29.996
1241163    2023-04-23 21:13:30.006
Name: date, Length: 1241164, dtype: object

In [11]:
# Convert the whole column in a single vectorised call. Passing
# pd.to_datetime to Series.apply invokes the parser once per row, which is
# needlessly slow on a column of this size; missing values still become NaT.
df['date'] = pd.to_datetime(df['date'])

Se procede a obtener los nuevos subconjuntos de datos

In [None]:
# Output dataset name -> measurement columns that dataset contains.
datasets_to_create = {
    'speed': ['speed'],
    'hr': ['hr'],
    'core_temperature': ['core_temperature'],
    'hr_rmssd': ['hr_rmssd'],
    'activity_load': [
        'activity_load_2d',
        'activity_load_3d',
    ],
    'accelerometer': ['x', 'y', 'z'],
}

Se fijan dos columnas para todos los subconjuntos

In [None]:
# Columns that are prepended to each subset's own columns when exporting.
fix_column = ['seconds', 'date']

In [None]:
for dataset, columns in datasets_to_create.items():
    # Take the shared columns plus this subset's measurements, then keep only
    # the rows in which every measurement column has a value.
    subset_df = df[fix_column + columns].dropna(subset=columns)
    print(f"{dataset}: {columns} ({len(subset_df.index)})")
    subset_df.to_csv(f'../../datasets/{dataset}.csv', index=False)

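El subconjunto rr se genera de forma distinta: se conservan las filas en las que al menos uno de los atributos raw_rr, filtered_rr o difference_rr tiene valor.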

In [None]:
# Keep every row in which at least one of the three RR columns is present.
rr_columns = ['raw_rr', 'filtered_rr', 'difference_rr']
has_any_rr = df[rr_columns].notnull().any(axis=1)
rr_df = df[has_any_rr]
rr_df

In [None]:
# Write the RR subset without the positional row index, matching the other
# exported datasets; otherwise the file gains an unnamed first column.
rr_df.to_csv('../../datasets/rr.csv', index=False)