In [2]:
import pandas as pd
import numpy as np
import warnings
from scipy.stats import moment, skew, kurtosis

from src.settings import ROOT_DIR
from src.preprocessing.get_anon_id import get_anon_id
from src.utils.data_split import save_and_split

In [62]:
# Load the cleaned IMU time-series pickle from the processed-data directory.
data: pd.DataFrame = pd.read_pickle(
    ROOT_DIR / 'data' / 'processed' / 'anon_imu_data_time_series_cleaned.pkl'
)

In [63]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,anon_id,date_measure,time_stamp,imu_gyroX_right,imu_gyroY_right,imu_gyroZ_right,imu_accX_right,imu_accY_right,imu_accZ_right,imu_gyroX_left,...,imu_angularX_left,imu_angularY_left,imu_angularZ_left,imu_angularX_right,imu_angularY_right,imu_angularZ_right,imu_angularX_spine,imu_angularY_spine,imu_angularZ_spine,PD
0,9,2023-9-29-11-53-21,0.0,-721.682708,15.593227,-388.127748,9.181793,2.01005,-2.132478,-786.340071,...,-0.272368,0.018669,-0.1152,-0.873084,0.076169,-0.284259,-0.598579,-0.068011,-0.241989,1
1,9,2023-9-29-11-53-21,20.0,-670.260115,106.966612,-531.556509,9.330485,2.006033,-2.221657,-836.277343,...,-0.925128,-0.000353,-0.203227,1.15918,0.022186,-0.128051,0.950772,0.073927,0.38569,1
2,9,2023-9-29-11-53-21,39.0,-1256.346678,154.354034,-583.745247,9.430883,2.003482,-2.231009,-784.334538,...,-1.895445,-0.033696,-0.329384,3.761404,-0.046249,0.08444,0.04003,-0.036652,0.075855,1
3,9,2023-9-29-11-53-21,64.0,-2645.955791,139.340391,-516.938939,9.452909,2.061261,-2.123366,-575.420332,...,-2.67541,-0.07121,-0.420512,4.947832,-0.077489,0.211777,0.153714,-0.063593,0.04717,1
4,9,2023-9-29-11-53-21,83.0,-4370.713652,81.916044,-388.833685,9.392902,2.283368,-1.919931,-228.216868,...,-2.836775,-0.099201,-0.418456,3.459541,-0.044035,0.151183,0.325708,-0.031565,0.1072,1


In [64]:
# List every column name available in the loaded frame.
data.columns

Index(['anon_id', 'date_measure', 'time_stamp', 'imu_gyroX_right',
       'imu_gyroY_right', 'imu_gyroZ_right', 'imu_accX_right',
       'imu_accY_right', 'imu_accZ_right', 'imu_gyroX_left', 'imu_gyroY_left',
       'imu_gyroZ_left', 'imu_accX_left', 'imu_accY_left', 'imu_accZ_left',
       'imu_gyroX_spine', 'imu_gyroY_spine', 'imu_gyroZ_spine',
       'imu_accX_spine', 'imu_accY_spine', 'imu_accZ_spine',
       'imu_angleX_right', 'imu_angleY_right', 'imu_angleZ_right',
       'imu_angleX_left', 'imu_angleY_left', 'imu_angleZ_left',
       'imu_angleX_spine', 'imu_angleY_spine', 'imu_angleZ_spine',
       'imu_angularX_left', 'imu_angularY_left', 'imu_angularZ_left',
       'imu_angularX_right', 'imu_angularY_right', 'imu_angularZ_right',
       'imu_angularX_spine', 'imu_angularY_spine', 'imu_angularZ_spine', 'PD'],
      dtype='object')

In [65]:
# Collect the distinct values of 'date_measure', cast to strings.
# The result is a NumPy array (not a Python list), with values in order of first appearance.
fechas_unicas = pd.unique(data['date_measure'].astype(str))
print(fechas_unicas)

['2023-9-29-11-53-21' '2023-6-22-10-28-8' '2023-6-16-14-45-43'
 '2023-9-19-10-12-28' '2023-3-23-13-14-7' '2023-7-6-12-7-50'
 '2023-11-7-15-14-42' '2023-2-23-15-58-22' '2023-9-30-8-42-24'
 '2023-9-30-8-58-32' '2024-1-16-16-1-25' '2023-6-16-9-48-30'
 '2023-6-17-8-26-3' '2023-3-17-11-12-26' '2023-3-18-13-33-51'
 '2023-6-17-8-39-41' '2023-3-17-11-1-26' '2023-6-16-14-38-22'
 '2023-10-5-10-2-44' '2023-3-18-10-50-25' '2023-7-6-9-33-42'
 '2023-9-5-12-31-47' '2024-2-2-9-34-54' '2023-2-23-15-37-18'
 '2023-6-29-10-40-43' '2023-6-22-9-54-43' '2023-9-30-9-41-22'
 '2023-9-30-12-3-43' '2023-3-23-10-28-48' '2024-2-2-11-11-47'
 '2024-1-30-9-6-49' '2024-1-30-10-17-23' '2024-2-15-8-52-21'
 '2024-2-15-10-1-41' '2024-2-15-11-18-44' '2023-6-17-9-42-57'
 '2023-8-31-9-14-29' '2023-2-23-14-11-53' '2024-2-13-15-13-47'
 '2023-9-30-12-28-15' '2023-9-7-12-2-5' '2023-2-21-15-8-52'
 '2023-11-21-9-17-56' '2023-9-22-14-56-19' '2023-4-11-8-49-25'
 '2023-3-30-8-35-11' '2023-11-3-10-52-41' '2023-6-16-11-54-33'
 '2023-9-5

# Feature Engineering Stage 1: Statistical measures

In stage 1 of the feature engineering, we build 15 simple statistical features for each sensor channel in every window:
1. mean
2. standard deviation
3. average absolute deviation
4. minimum value
5. maximum value
6. difference of maximum and minimum values
7. median
8. median absolute deviation
9. interquartile range
10. negative values count
11. positive values count
12. number of values above mean
13. number of peaks
14. skewness
15. kurtosis

Definir la función para calcular las medidas estadísticas de una ventana

In [66]:
def calcular_medidas_estadisticas(columna):
    """Compute the 15 summary statistics of one sensor channel inside a window.

    Parameters
    ----------
    columna : pandas.Series or array-like
        Samples of a single channel. Anything that is not already a Series is
        wrapped in one, because the peak detection relies on ``Series.shift``.

    Returns
    -------
    dict
        Statistic name -> value, with the keys (in this order): 'mean', 'std',
        'mean_abs_dev', 'min', 'max', 'range', 'median', 'median_abs_dev',
        'interquartile_range', 'negative_count', 'positive_count',
        'above_mean_count', 'local_maxima_count', 'skewness', 'kurtosis'.
        'std' is whatever ``np.std`` returns for the input; 'skewness' and
        'kurtosis' use the ``scipy.stats`` defaults.
    """
    serie = columna if isinstance(columna, pd.Series) else pd.Series(columna)

    # Values reused by several statistics are computed once.
    media = np.mean(serie)
    mediana = np.median(serie)
    minimo = np.min(serie)
    maximo = np.max(serie)
    percentil_25, percentil_75 = np.percentile(serie, [25, 75])

    # A sample is a local maximum (peak) when it is strictly greater than both
    # of its neighbours. shift() leaves NaN at the borders, so the first and
    # last samples can never qualify. The count is the number of peaks itself,
    # not the number of remaining samples.
    es_maximo_local = (serie.shift(-1) < serie) & (serie.shift(1) < serie)

    medidas = {
        'mean': media,
        'std': np.std(serie),
        'mean_abs_dev': np.mean(np.abs(serie - media)),
        'min': minimo,
        'max': maximo,
        'range': maximo - minimo,
        'median': mediana,
        'median_abs_dev': np.median(np.abs(serie - mediana)),
        'interquartile_range': percentil_75 - percentil_25,
        'negative_count': np.sum(serie < 0),
        'positive_count': np.sum(serie > 0),
        'above_mean_count': np.sum(serie > media),
        'local_maxima_count': np.sum(es_maximo_local),
        'skewness': skew(serie),
        'kurtosis': kurtosis(serie)
    }
    return medidas

Crear un nuevo DataFrame para almacenar los datos de las ventanas

In [67]:
# Raw sensor channels, in the same order as they appear in the source frame.
columnas_sensores = [
    # Gyroscope / accelerometer, right
    'imu_gyroX_right', 'imu_gyroY_right', 'imu_gyroZ_right',
    'imu_accX_right', 'imu_accY_right', 'imu_accZ_right',
    # Gyroscope / accelerometer, left
    'imu_gyroX_left', 'imu_gyroY_left', 'imu_gyroZ_left',
    'imu_accX_left', 'imu_accY_left', 'imu_accZ_left',
    # Gyroscope / accelerometer, spine
    'imu_gyroX_spine', 'imu_gyroY_spine', 'imu_gyroZ_spine',
    'imu_accX_spine', 'imu_accY_spine', 'imu_accZ_spine',
    # Angle channels
    'imu_angleX_right', 'imu_angleY_right', 'imu_angleZ_right',
    'imu_angleX_left', 'imu_angleY_left', 'imu_angleZ_left',
    'imu_angleX_spine', 'imu_angleY_spine', 'imu_angleZ_spine',
    # Angular channels (left comes before right here)
    'imu_angularX_left', 'imu_angularY_left', 'imu_angularZ_left',
    'imu_angularX_right', 'imu_angularY_right', 'imu_angularZ_right',
    'imu_angularX_spine', 'imu_angularY_spine', 'imu_angularZ_spine',
]

# Names of the statistics computed for every channel; used as column suffixes.
medidas_estadisticas = [
    'mean',
    'std',
    'mean_abs_dev',
    'min',
    'max',
    'range',
    'median',
    'median_abs_dev',
    'interquartile_range',
    'negative_count',
    'positive_count',
    'above_mean_count',
    'local_maxima_count',
    'skewness',
    'kurtosis',
]
def generar_nombres_columnas(columnas_sensores, medidas_estadisticas):
    """Build the ordered list of column names for the windowed feature table.

    The list starts with the four metadata columns, continues with one
    ``"<sensor>_<statistic>"`` name per sensor/statistic pair (sensors in the
    outer position, statistics in the inner one) and ends with the ``"PD"``
    column.

    Parameters
    ----------
    columnas_sensores : iterable of str
        Sensor channel names.
    medidas_estadisticas : iterable of str
        Statistic names used as suffixes.

    Returns
    -------
    list of str
        A new list on every call.
    """
    encabezado = ['anon_id', 'date_measure', 'window_number', 'first_timestamp']
    caracteristicas = [
        f"{sensor}_{estadistico}"
        for sensor in columnas_sensores
        for estadistico in medidas_estadisticas
    ]
    return encabezado + caracteristicas + ["PD"]

# Full, ordered column list of the feature table.
columnas_df_final = generar_nombres_columnas(
    columnas_sensores=columnas_sensores,
    medidas_estadisticas=medidas_estadisticas,
)
print(columnas_df_final)

['anon_id', 'date_measure', 'window_number', 'first_timestamp', 'imu_gyroX_right_mean', 'imu_gyroX_right_std', 'imu_gyroX_right_mean_abs_dev', 'imu_gyroX_right_min', 'imu_gyroX_right_max', 'imu_gyroX_right_range', 'imu_gyroX_right_median', 'imu_gyroX_right_median_abs_dev', 'imu_gyroX_right_interquartile_range', 'imu_gyroX_right_negative_count', 'imu_gyroX_right_positive_count', 'imu_gyroX_right_above_mean_count', 'imu_gyroX_right_local_maxima_count', 'imu_gyroX_right_skewness', 'imu_gyroX_right_kurtosis', 'imu_gyroY_right_mean', 'imu_gyroY_right_std', 'imu_gyroY_right_mean_abs_dev', 'imu_gyroY_right_min', 'imu_gyroY_right_max', 'imu_gyroY_right_range', 'imu_gyroY_right_median', 'imu_gyroY_right_median_abs_dev', 'imu_gyroY_right_interquartile_range', 'imu_gyroY_right_negative_count', 'imu_gyroY_right_positive_count', 'imu_gyroY_right_above_mean_count', 'imu_gyroY_right_local_maxima_count', 'imu_gyroY_right_skewness', 'imu_gyroY_right_kurtosis', 'imu_gyroZ_right_mean', 'imu_gyroZ_right_s

In [68]:
# Start from an empty frame that carries only the expected feature-table columns.
df_ventanas = pd.DataFrame(columns=columnas_df_final)

Definir los parámetros para las ventanas

In [69]:
# Windowing parameters: sampling frequency, window length and fractional overlap.
frecuencia_muestreo = 50  # Hz
longitud_ventana_s = 1  # seconds
overlap = 0.5  # 50%

Calcular la cantidad de filas por ventana y el paso de tiempo entre ventanas

In [70]:
# Rows per window, and step (in rows) between the starts of consecutive windows.
# round() is used before int() because plain truncation loses a row on float
# artefacts (e.g. 50 * (1 - 0.8) evaluates to 9.999999999999998, which int()
# turns into 9). max(1, ...) keeps both values usable as a window size and as
# a range() step even when the overlap approaches 1.
filas_por_ventana = max(1, int(round(longitud_ventana_s * frecuencia_muestreo)))
paso_tiempo = max(1, int(round(filas_por_ventana * (1 - overlap))))

In [71]:
# Suppress every warning from this point on.
warnings.filterwarnings(action="ignore")

In [72]:
# Feature-only column names: the full column list without metadata and label.
# A new list is built instead of aliasing `columnas_df_final` and calling
# .remove() on it: the alias silently stripped those names from
# `columnas_df_final` as well, and re-running the cell raised ValueError.
columnas_no_sensor = {'anon_id', 'date_measure', 'window_number', 'first_timestamp', 'PD'}
columnas_df_final_sensores = [
    nombre for nombre in columnas_df_final if nombre not in columnas_no_sensor
]
print(columnas_df_final_sensores)

['imu_gyroX_right_mean', 'imu_gyroX_right_std', 'imu_gyroX_right_mean_abs_dev', 'imu_gyroX_right_min', 'imu_gyroX_right_max', 'imu_gyroX_right_range', 'imu_gyroX_right_median', 'imu_gyroX_right_median_abs_dev', 'imu_gyroX_right_interquartile_range', 'imu_gyroX_right_negative_count', 'imu_gyroX_right_positive_count', 'imu_gyroX_right_above_mean_count', 'imu_gyroX_right_local_maxima_count', 'imu_gyroX_right_skewness', 'imu_gyroX_right_kurtosis', 'imu_gyroY_right_mean', 'imu_gyroY_right_std', 'imu_gyroY_right_mean_abs_dev', 'imu_gyroY_right_min', 'imu_gyroY_right_max', 'imu_gyroY_right_range', 'imu_gyroY_right_median', 'imu_gyroY_right_median_abs_dev', 'imu_gyroY_right_interquartile_range', 'imu_gyroY_right_negative_count', 'imu_gyroY_right_positive_count', 'imu_gyroY_right_above_mean_count', 'imu_gyroY_right_local_maxima_count', 'imu_gyroY_right_skewness', 'imu_gyroY_right_kurtosis', 'imu_gyroZ_right_mean', 'imu_gyroZ_right_std', 'imu_gyroZ_right_mean_abs_dev', 'imu_gyroZ_right_min', 'im

In [73]:
# Slide a fixed-size window over the rows of `data` and compute one feature row
# per window.
#
# Rows are collected in a list and turned into a DataFrame once at the end.
# Growing `df_ventanas` with pd.concat inside the loop copies every previous
# row on each iteration (quadratic cost), forces object dtype on the columns,
# and appends duplicate rows if the cell is executed twice.
columnas_grupo = ['anon_id', 'date_measure', 'PD']
columnas_excluidas = ['anon_id', 'date_measure', 'time_stamp', 'PD']

registros_ventanas = []
window_number = 1
for i in range(0, len(data) - filas_por_ventana + 1, paso_tiempo):
    # Candidate window: metadata and sensor values together.
    ventana = data.iloc[i:i + filas_por_ventana]

    # Metadata of the window is taken from its first row.
    primera_fila_ventana = ventana.iloc[0]
    anon_id = primera_fila_ventana['anon_id']
    date_measure = primera_fila_ventana['date_measure']
    PD = primera_fila_ventana['PD']
    first_timestamp = primera_fila_ventana['time_stamp']

    # Drop trailing rows until the last row has the same anon_id,
    # date_measure and PD as the first one. The first row never changes while
    # trimming, so its keys are read once.
    claves_inicio = primera_fila_ventana[columnas_grupo]
    while not claves_inicio.equals(ventana.iloc[-1][columnas_grupo]):
        ventana = ventana.iloc[:-1]

    # Keep only the sensor channels for the statistics.
    ventana_valores_sensor = ventana.drop(columnas_excluidas, axis=1)

    datos_ventana = {
        'anon_id': anon_id,
        'date_measure': date_measure,
        'window_number': window_number,
        'first_timestamp': first_timestamp,
        'PD': PD
    }
    for sensor_col in ventana_valores_sensor.columns:
        medidas = calcular_medidas_estadisticas(ventana_valores_sensor[sensor_col])
        for medida, medida_valor in medidas.items():
            datos_ventana[f"{sensor_col}_{medida}"] = medida_valor

    registros_ventanas.append(datos_ventana)
    window_number += 1

# Build the frame in one step. Column order matches what concatenating onto the
# empty template produced: the template's columns first, then any column that
# only exists in the computed rows.
filas_calculadas = pd.DataFrame(registros_ventanas)
columnas_ordenadas = list(df_ventanas.columns) + [
    nombre for nombre in filas_calculadas.columns if nombre not in df_ventanas.columns
]
df_ventanas = filas_calculadas.reindex(columns=columnas_ordenadas)


In [74]:
# Make warnings visible again.
# NOTE(review): this adds a catch-all "default" action filter in front of the
# earlier "ignore" filter; it does not restore whichever filters were active
# before warnings were silenced.
warnings.filterwarnings("default")

In [75]:
# Show the first rows of the windowed feature frame.
print(df_ventanas.head())

  anon_id        date_measure window_number  first_timestamp  \
0       9  2023-9-29-11-53-21             1              0.0   
1       9  2023-9-29-11-53-21             2            532.0   
2       9  2023-9-29-11-53-21             3           1029.0   
3       9  2023-9-29-11-53-21             4           1587.0   
4       9  2023-9-29-11-53-21             5           2121.0   

   imu_gyroX_right_mean  imu_gyroX_right_std  imu_gyroX_right_mean_abs_dev  \
0            344.527504          2066.933206                   1375.765062   
1            -19.660417          1231.834050                    996.842663   
2            312.616200          1460.410867                   1236.003103   
3            504.363670          1551.660978                   1195.522472   
4           -775.488526          1733.126911                   1391.358979   

   imu_gyroX_right_min  imu_gyroX_right_max  imu_gyroX_right_range  ...  \
0         -5573.486673          4888.726779           10462.213452  ...

In [12]:
# Persist the feature table twice: as a pickle and as a CSV, side by side in
# the processed-data directory.
directorio_salida = ROOT_DIR / 'data' / 'processed'
df_ventanas.to_pickle(directorio_salida / 'anon_imu_data_features.pkl')
df_ventanas.to_csv(directorio_salida / 'anon_imu_data_features.csv')

In [3]:
# To reuse the previously saved features instead of recomputing them, uncomment:
#df_ventanas: pd.DataFrame = pd.read_pickle(ROOT_DIR / 'data' / 'processed' / 'anon_imu_data_features.pkl')

In [13]:
# Print the start timestamp of every window as a plain Python list.
print(df_ventanas['first_timestamp'].tolist())

[0.0, 532.0, 1029.0, 1587.0, 2121.0, 2692.0, 3221.0, 3723.0, 4262.0, 4763.0, 5260.0, 5762.0, 6287.0, 6788.0, 7315.0, 7888.0, 8460.0, 9032.0, 9533.0, 10054.0, 10581.0, 11179.0, 11764.0, 12261.0, 12806.0, 13386.0, 13924.0, 14420.0, 14967.0, 15467.0, 16015.0, 16553.0, 17050.0, 17551.0, 18052.0, 18553.0, 19054.0, 19551.0, 20052.0, 20553.0, 21073.0, 21574.0, 22107.0, 22608.0, 23106.0, 23607.0, 24128.0, 24629.0, 25129.0, 25651.0, 26152.0, 26653.0, 27193.0, 27728.0, 28261.0, 28762.0, 29263.0, 29837.0, 30338.0, 30838.0, 31353.0, 31855.0, 32352.0, 32852.0, 33353.0, 33899.0, 34399.0, 34897.0, 35429.0, 35930.0, 36426.0, 36984.0, 37498.0, 37999.0, 38514.0, 39042.0, 39543.0, 40044.0, 40560.0, 41073.0, 41588.0, 42161.0, 42693.0, 43194.0, 43760.0, 44258.0, 44758.0, 45259.0, 45759.0, 46256.0, 46757.0, 47258.0, 47759.0, 48260.0, 48783.0, 49283.0, 49805.0, 50305.0, 50801.0, 51302.0, 51803.0, 52304.0, 52805.0, 53302.0, 53840.0, 54343.0, 54844.0, 55345.0, 55842.0, 56343.0, 56844.0, 57370.0, 57866.0, 58367