# ASA preprocessing module

This preprocessing module aims to label the data from the patients through a peak analysis.

## Required Imports

In [26]:
import re

import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder

from src.settings import ROOT_DIR
from src.utils import data_split
from src.preprocessing.asa_functions import get_asa_laterality

## Load Data

In [27]:
# Load the cleaned IMU time-series frame from the project's processed-data folder.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# produced by a trusted pipeline.
data: pd.DataFrame = pd.read_pickle(ROOT_DIR / 'data' / 'processed' / 'anon_imu_data_time_series_cleaned.pkl')

In [28]:
# The labeling step only needs the grouping key (`date_measure`) and the
# Y-axis angle signal of each side.
columns_for_labeling = ['date_measure', 'imu_angleY_left', 'imu_angleY_right']
labeling_data = data.loc[:, columns_for_labeling]
labeling_data.head()

Unnamed: 0,date_measure,imu_angleY_left,imu_angleY_right
0,2023-9-29-11-53-21,-41.945678,346.797958
1,2023-9-29-11-53-21,-16.777883,435.826445
2,2023-9-29-11-53-21,-17.254295,461.75842
3,2023-9-29-11-53-21,-62.680428,407.701072
4,2023-9-29-11-53-21,-158.679678,317.129831


## Labeling Data
Each measurement (identified by its `date_measure`) is labeled with its laterality, which is determined through a peak analysis.

In [29]:
# Label every recording (one per distinct `date_measure`) with its laterality.
#
# A single groupby pass replaces one full boolean scan of the frame per date and
# mirrors the groupby used later when the sequences are built.
# sort=False keeps the groups in order of first appearance, which is the order
# `Series.unique()` produced before. `dates` and `laterality` are filled in the
# same iteration, so they are aligned by construction.
# Rows with a missing `date_measure` are skipped by groupby instead of yielding
# an empty sample (a NaN key never equals itself in a boolean filter).
#
# NOTE(review): the saved output of this cell shows NumPy warnings coming from a
# mean computation, presumably raised inside get_asa_laterality -- confirm that
# no recording ends up with an undefined label.
dates = []
laterality = []

for date, sample in labeling_data.groupby('date_measure', sort=False):
    dates.append(date)
    laterality.append(
        get_asa_laterality(
            sample['imu_angleY_left'].to_numpy(),
            sample['imu_angleY_right'].to_numpy(),
        )
    )

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [30]:
# One row per recording: its `date_measure` and the laterality computed above.
Y = pd.DataFrame({'date_measure': dates, 'laterality': laterality})
Y.head()

Unnamed: 0,date_measure,laterality
0,2023-9-29-11-53-21,der
1,2023-6-22-10-28-8,der
2,2023-6-16-14-45-43,der
3,2023-9-19-10-12-28,der
4,2023-3-23-13-14-7,der


In [31]:
# Learn the set of laterality classes first, then map each string to its integer code.
label_encoder = LabelEncoder().fit(Y['laterality'])
encoded_labels = label_encoder.transform(Y['laterality'])

In [32]:
# Peek at the first four integer codes produced by the encoder.
encoded_labels[:4]

array([0, 0, 0, 0])

In [33]:
# Classes learned by the encoder; a class's position in this array is its integer code.
label_encoder.classes_

array(['der', 'izq'], dtype=object)

In [34]:
# Store the integer code next to each recording's laterality string.
Y['label'] = encoded_labels

In [35]:
# Check that the new `label` column lines up with `laterality`.
Y.head()

Unnamed: 0,date_measure,laterality,label
0,2023-9-29-11-53-21,der,0
1,2023-6-22-10-28-8,der,0
2,2023-6-16-14-45-43,der,0
3,2023-9-19-10-12-28,der,0
4,2023-3-23-13-14-7,der,0


## Features
The final laterality model focuses on the left and right limbs, not on the spine, because the spine does not affect laterality. For that reason, the spine_base columns are removed in this module.

In [36]:
# Candidate features are taken positionally: the first three columns and the
# last one are skipped.
# NOTE(review): which columns those positions hold is not visible in this
# notebook -- confirm against the frame's schema.
# Any candidate whose name matches the spine pattern is then left out.
spine_pattern = re.compile("^imu_.*_spine")
feature_columns = [
    col
    for col in data.columns[3:-1]
    if spine_pattern.match(col) is None
]
feature_columns

['imu_gyroX_right',
 'imu_gyroY_right',
 'imu_gyroZ_right',
 'imu_accX_right',
 'imu_accY_right',
 'imu_accZ_right',
 'imu_gyroX_left',
 'imu_gyroY_left',
 'imu_gyroZ_left',
 'imu_accX_left',
 'imu_accY_left',
 'imu_accZ_left',
 'imu_angleX_right',
 'imu_angleY_right',
 'imu_angleZ_right',
 'imu_angleX_left',
 'imu_angleY_left',
 'imu_angleZ_left',
 'imu_angularX_left',
 'imu_angularY_left',
 'imu_angularZ_left',
 'imu_angularX_right',
 'imu_angularY_right',
 'imu_angularZ_right']

In [37]:
# Build one (feature matrix, label) pair per recording.
# groupby iterates the recordings in sorted `date_measure` order, so each label
# is looked up in Y by date rather than by position.
sequences = [
    (
        group[feature_columns].values,
        Y.loc[Y['date_measure'] == date, 'label'].iloc[0],
    )
    for date, group in data.groupby('date_measure')
]

In [38]:
# Inspect the first (feature matrix, label) pair.
sequences[0]

(array([[ 4.85520635e+02,  2.45096017e+02, -8.56655114e+01, ...,
          3.97876878e+00, -3.15354652e-01,  6.18834463e-01],
        [-2.89996008e+02,  1.16847725e+02,  4.03042105e+02, ...,
          1.04482421e+00, -1.54729017e-01,  1.51040283e-02],
        [-6.52201860e+02,  1.92986548e+02,  5.73127655e+02, ...,
         -2.93821537e+00,  7.00484272e-02, -7.64249504e-01],
        ...,
        [-1.81756000e+02,  8.17445367e+02, -3.33828648e+02, ...,
          1.91789020e-01, -1.60111278e-01,  1.38514533e-02],
        [-2.36325631e+02,  8.97089628e+02, -1.58256160e+02, ...,
         -4.68360527e-01, -1.69687784e-01,  3.01703872e-02],
        [-3.42243476e+01,  9.74031477e+02,  3.15596017e+01, ...,
         -6.90768917e-01, -1.80833909e-01,  9.00073273e-02]]),
 0)

In [39]:
# Number of sequences built, one per `date_measure` group.
len(sequences)

166

## Padding
Because the models do not accept an input shape of (n, 24) where n varies from one sequence to another, we pad all the sequences.
Every sequence will have the length of the longest sequence.

In [40]:
# Keep only the feature matrices and find the longest one (in time steps).
temp = [pair[0] for pair in sequences]
max_length = max(map(len, temp))

In [41]:
# Length of the longest sequence; every sequence is padded up to this length.
max_length

16723

In [42]:
# Pad every feature matrix at the end ('post') up to `max_length` time steps.
# dtype is set explicitly because the features are floating-point values.
# NOTE(review): the result is one dense float32 array holding all sequences at
# full length; with the sizes shown in this run (166 x 16723 x 24) that is
# roughly 266 MB -- confirm this fits the target environment.
sequences_padded = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=temp,
    maxlen=max_length,
    dtype='float32',
    padding='post',
)

In [43]:
# Padding must not change the number of sequences.
len(sequences_padded)

166

In [44]:
# Pair every padded array with the label of the sequence it was built from;
# both collections are in the same order.
sequences = [
    (padded, label)
    for padded, (_, label) in zip(sequences_padded, sequences)
]

## Save Data
Finally, we save the sequences, where each record is a 2-tuple. The first element of the tuple is the array containing all the time-series information of the record, and the second one is the label
(0 or 1), which encodes the laterality.

In [45]:
# Hand the padded (features, label) pairs to the project helper under the name 'asa'.
# NOTE(review): presumably this splits the sequences and writes them to disk --
# verify the output location and split parameters in src.utils.data_split.
data_split.save_and_split_sequences(sequences, 'asa')