In [None]:
import pandas as pd
import math

# Global configuration
SAMPLE_RATE = 250  # Sampling rate used throughout the notebook, in Hz
CHUNK_TIME = 30  # Duration of one chunk, in seconds
# Number of samples in one chunk, rounded down to a whole sample
CHUNK_SIZE = math.floor(SAMPLE_RATE * CHUNK_TIME)

# Leer tabla RAW

In [None]:
# Show every column when a DataFrame is displayed (the RAW table is wide)
pd.set_option("display.max_columns", None)

# Location of the RAW recording
directory = "../Raws/Segunda_Temporada_Grabaciones/"
recording_file = directory + "RAW_COMPLETA_17-04-2024.csv"

# Load the RAW table
RAW_DATA = pd.read_csv(recording_file)

# The first nine columns are kept alongside every device-specific table below.
# NOTE(review): later cells expect 'Subject ID', 'Take' and 'time' to be among
# them -- confirm against the CSV header.
METADATA_COLUMNS = RAW_DATA.columns[:9]

display(RAW_DATA)

# Preprocesar BrainAccess

In [None]:
from scipy import signal
from sklearn.decomposition import FastICA

# Number of EEG channels on the headset
EEG_CHANNELS = 16

# Columns of the RAW table that belong to the EEG device
EEG_COLUMNS = RAW_DATA.columns[9:28]
# EEG columns without the accelerometer; only these are filtered and fed to ICA
EEG_COLUMNS_FOR_ICA = RAW_DATA.columns[9:25]

RAW_EEG = RAW_DATA[METADATA_COLUMNS].join(RAW_DATA[EEG_COLUMNS])
preprocessed_eeg = RAW_EEG.dropna(subset=EEG_COLUMNS)

# Low-pass Butterworth filter.
# The takes are asserted below to be sampled at SAMPLE_RATE, so the cutoff is
# expressed against that rate. The previous constants (FS = 2000, FC = 300)
# produced the same normalised cutoff of 0.3, i.e. 37.5 Hz at 250 Hz, so the
# filter coefficients are unchanged; only the constants now state what is
# actually applied to the data.
ORDER = 6
FS = SAMPLE_RATE  # Sampling frequency of the signal being filtered (Hz)
FC = 37.5  # Cutoff frequency (Hz)
b, a = signal.butter(ORDER, FC / (FS / 2), 'low')

# ICA configuration (fixed seed so the decomposition is reproducible).
# NOTE(review): the model is re-fitted independently on every take, so the
# order/sign of the components may not be comparable across takes -- confirm
# that this is intended.
ica = FastICA(n_components=EEG_CHANNELS, random_state=42)

# Number of trailing samples used for the rolling mean (one chunk)
N = int(CHUNK_TIME * SAMPLE_RATE)

# Split the RAW table by take
eeg_endtimes = {}
eeg_takes = {}

for (subjectID, take), eeg_take in preprocessed_eeg.groupby(['Subject ID', 'Take']):
    take_name = f"Subject {subjectID} Take {take}"

    # Work on an explicit copy: the group is a slice of preprocessed_eeg and
    # assigning columns to it directly triggers SettingWithCopyWarning.
    eeg_take = eeg_take.copy()

    any_nan_rows = eeg_take.isnull().any(axis=1).sum()
    assert any_nan_rows == 0, "No se han eliminado todos los NaN"

    # Keep the final timestamp; it is used later to rescale the EmotiBit time axis
    eeg_endtimes[take_name] = eeg_take["time"].iloc[-1]

    # One row is expected per sample period between t = 0 and the end time
    expectedTimeEntries = int(eeg_endtimes[take_name] * SAMPLE_RATE + 1)
    assert eeg_take.shape[0] == expectedTimeEntries, "Se esperan " + str(expectedTimeEntries) + " entradas de tiempo del EEG. Hay " + str(eeg_take.shape[0])

    # Zero-phase low-pass filter, applied independently to each channel (axis 0 = time)
    eeg_take[EEG_COLUMNS_FOR_ICA] = signal.filtfilt(
        b, a, eeg_take[EEG_COLUMNS_FOR_ICA].to_numpy(), axis=0
    )

    # Replace the filtered channels with their FastICA components
    eeg_take[EEG_COLUMNS_FOR_ICA] = ica.fit_transform(eeg_take[EEG_COLUMNS_FOR_ICA])

    # Rolling mean over the last N samples of every channel, stored as
    # "<channel>_media_activacion". Columns are selected by label: the previous
    # positional lookup (iloc[:, electrode + 8]) was off by one, because the
    # nine metadata columns occupy positions 0-8, so each mean was computed
    # from the column to the left of the one it was named after.
    rolling_means = (
        eeg_take[EEG_COLUMNS_FOR_ICA]
        .rolling(window=N)
        .mean()
        .add_suffix('_media_activacion')
    )
    eeg_take = pd.concat([eeg_take, rolling_means], axis=1)

    eeg_takes[take_name] = eeg_take

# Rebuild the full table
preprocessed_eeg = pd.concat(eeg_takes.values(), ignore_index=True)

display(preprocessed_eeg)

# Preprocesar EmotiBit

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns of the RAW table that belong to EmotiBit,
# dropping the non-biometric tag
EMOTIBIT_COLUMNS = RAW_DATA.columns[30:51].drop(["ER"])

RAW_EMOTIBIT = RAW_DATA[METADATA_COLUMNS].join(RAW_DATA[EMOTIBIT_COLUMNS])
# Keep every row that carries at least one EmotiBit value
preprocessed_emotibit = RAW_EMOTIBIT.dropna(subset=EMOTIBIT_COLUMNS, how='all')

# StandardScaler configuration (re-fitted on every take inside the loop)
scaler = StandardScaler()

# Split the RAW table by take
emotibit_takes = {}
valid_start_times = {}

for (subjectID, take), emotibit_take in preprocessed_emotibit.groupby(['Subject ID', 'Take']):
    take_name = f"Subject {subjectID} Take {take}"
    eeg_times = eeg_takes[take_name]["time"]

    # Work on an explicit copy: the group is a slice of preprocessed_emotibit
    # and assigning to it directly triggers SettingWithCopyWarning.
    emotibit_take = emotibit_take.copy()

    # Rescale the time axis so that the last EmotiBit sample lands on the EEG end time
    emotibit_endtime = emotibit_take["time"].iloc[-1]
    emotibit_take["time"] = emotibit_take["time"] * eeg_endtimes[take_name] / emotibit_endtime

    # Add the regular (EEG) timestamps, order everything by time and hold each
    # value until a new one arrives
    emotibit_take = (
        emotibit_take
        .merge(eeg_takes[take_name][["time"]], on="time", how="outer")
        .sort_values(by="time")
        .ffill()
    )

    # Drop the irregular timestamps, keeping only those present in the EEG take.
    # The copy makes the scaled assignment below write to an independent frame.
    emotibit_take = emotibit_take[emotibit_take["time"].isin(eeg_times)].copy()

    # The HR and BI columns take several seconds to arrive, so everything
    # before they are available has to be cut. The cut is never earlier than
    # one chunk.
    valid_start_times[take_name] = CHUNK_TIME

    # Rows that still contain a NaN, converted to seconds.
    # NOTE(review): this assumes all NaN rows sit at the start of the take.
    any_nan_rows = emotibit_take[EMOTIBIT_COLUMNS].isnull().any(axis=1).sum()
    time_with_nan = any_nan_rows / SAMPLE_RATE
    if time_with_nan > valid_start_times[take_name]:
        valid_start_times[take_name] = time_with_nan
        print("Los datos del sujeto " + str(take_name) + " no se completan hasta el segundo " + str(time_with_nan))

    # Standardise the EmotiBit columns within this take
    emotibit_take[EMOTIBIT_COLUMNS] = scaler.fit_transform(emotibit_take[EMOTIBIT_COLUMNS])

    # After resampling there must be exactly one row per EEG timestamp
    expectedTimeEntries = int(eeg_endtimes[take_name] * SAMPLE_RATE + 1)
    assert emotibit_take.shape[0] == expectedTimeEntries, "Se esperan " + str(expectedTimeEntries) + " entradas de tiempo del EEG. Hay " + str(emotibit_take.shape[0])

    emotibit_takes[take_name] = emotibit_take

# Rebuild the full table
preprocessed_emotibit = pd.concat(emotibit_takes.values(), ignore_index=True)

display(preprocessed_emotibit)

# TableApp

In [None]:
# Columns of the RAW table that belong to TableApp
TABLEAPP_COLUMNS = RAW_DATA.columns[69:]

RAW_TABLEAPP = RAW_DATA[METADATA_COLUMNS].join(RAW_DATA[TABLEAPP_COLUMNS])
# Explicit copy so the rescaling below writes to an independent frame instead
# of a slice of RAW_TABLEAPP (avoids SettingWithCopyWarning).
preprocessed_tableapp = RAW_TABLEAPP.dropna(subset=TABLEAPP_COLUMNS).copy()

# Rescale the emotions: 3 maps to 0 and 5 maps to 1 (so 1 maps to -1).
# NOTE(review): presumably the ratings are on a 1-5 scale centred on 3 -- confirm.
preprocessed_tableapp.loc[:, TABLEAPP_COLUMNS] = (preprocessed_tableapp[TABLEAPP_COLUMNS] - 3) / (5 - 3)

# Split the RAW table by take
tableapp_takes = {}

for (subjectID, take), tableapp_take in preprocessed_tableapp.groupby(['Subject ID', 'Take']):
    take_name = f"Subject {subjectID} Take {take}"
    eeg_times = eeg_takes[take_name]["time"]

    # Add the regular (EEG) timestamps, order everything by time and repeat the
    # last entry until a new one is found
    tableapp_take = (
        tableapp_take
        .merge(eeg_takes[take_name][["time"]], on="time", how="outer")
        .sort_values(by="time")
        .ffill()
    )

    # Drop the irregular timestamps, keeping only those present in the EEG take
    tableapp_take = tableapp_take[tableapp_take["time"].isin(eeg_times)]

    any_nan_rows = tableapp_take.isnull().any(axis=1).sum()
    assert any_nan_rows == 0, "No se han eliminado todos los NaN"

    # After resampling there must be exactly one row per EEG timestamp
    expectedTimeEntries = int(eeg_endtimes[take_name] * SAMPLE_RATE + 1)
    assert tableapp_take.shape[0] == expectedTimeEntries, "Se esperan " + str(expectedTimeEntries) + " entradas de tiempo del EEG. Hay " + str(tableapp_take.shape[0])

    tableapp_takes[take_name] = tableapp_take

# Concatenate all the takes again
preprocessed_tableapp = pd.concat(tableapp_takes.values(), ignore_index=True)

display(preprocessed_tableapp)

# Generar tabla preprocesada

In [None]:
import numpy as np
import math

# Put the three per-device tables side by side. They are aligned by row
# position: each one was rebuilt take by take on the EEG time grid.
preprocessed_data = pd.concat([preprocessed_eeg, preprocessed_emotibit[EMOTIBIT_COLUMNS], preprocessed_tableapp[TABLEAPP_COLUMNS]], axis=1)

preprocessed_takes = {}

for (subjectID, take), preprocessed_take in preprocessed_data.groupby(['Subject ID', 'Take']):
    take_name = f"Subject {subjectID} Take {take}"

    # Trim the beginning (before every signal is available) and the end
    # (the last, incomplete second)
    in_window = (
        (preprocessed_take['time'] >= valid_start_times[take_name])
        & (preprocessed_take['time'] < math.floor(eeg_endtimes[take_name]))
    )
    preprocessed_take = preprocessed_take[in_window]

    # Keep only whole chunks of this take. Chunks are cut further down from the
    # concatenation of all takes, so without this a chunk could start in one
    # take and finish in the next one (possibly a different subject).
    whole_chunk_rows = (len(preprocessed_take) // CHUNK_SIZE) * CHUNK_SIZE
    preprocessed_take = preprocessed_take.iloc[:whole_chunk_rows]

    # Check the take being built (this used to inspect `tableapp_take`, a
    # variable left over from the TableApp cell's loop)
    any_nan_rows = preprocessed_take.isnull().any(axis=1).sum()
    assert any_nan_rows == 0, "No se han eliminado todos los NaN"

    preprocessed_takes[take_name] = preprocessed_take

# Concatenate all the takes again
preprocessed_data = pd.concat(preprocessed_takes.values(), ignore_index=True)

training_input = preprocessed_data.drop(columns=METADATA_COLUMNS).drop(columns=TABLEAPP_COLUMNS)

# Restructure into one row per chunk. Each input column expands into CHUNK_SIZE
# consecutive columns named "<column>_t<offset in seconds>".
expanded_columns = [
    f'{col}_t{(i / SAMPLE_RATE):.3f}'
    for col in training_input.columns
    for i in range(CHUNK_SIZE)
]

rows, cols = training_input.shape
n_chunks = rows // CHUNK_SIZE

# (rows, cols) -> (chunk, sample, col) -> (chunk, col, sample) -> one flat row
# per chunk, i.e. all the samples of the first column, then all the samples of
# the second one, and so on. This matches the order of expanded_columns.
compressed_data = (
    training_input.to_numpy(dtype=float)[:n_chunks * CHUNK_SIZE]
    .reshape(n_chunks, CHUNK_SIZE, cols)
    .transpose(0, 2, 1)
    .reshape(n_chunks, cols * CHUNK_SIZE)
)

training_input = pd.DataFrame(compressed_data, columns=expanded_columns)

display(training_input)

any_nan_rows = training_input.isnull().any(axis=1).sum()
assert any_nan_rows == 0, "No se han eliminado todos los NaN"

# Labels: take the TableApp values of the first row of every chunk. Slicing to
# n_chunks (instead of always dropping the last row) also covers the case where
# the number of rows is an exact multiple of CHUNK_SIZE.
compressed_tableapp = (
    preprocessed_data[TABLEAPP_COLUMNS]
    .iloc[::CHUNK_SIZE]
    .iloc[:n_chunks]
    .reset_index(drop=True)
)
display(compressed_tableapp)

# Put the emotions back next to the chunked inputs
merged_df = pd.concat([training_input, compressed_tableapp], axis=1)
display(merged_df)

any_nan_rows = merged_df.isnull().any(axis=1).sum()
assert any_nan_rows == 0, "No se han eliminado todos los NaN" + str(any_nan_rows)



# Exportar tabla preprocesada

In [None]:
# Write the chunked table to disk. The chunk length in the file name is taken
# from CHUNK_TIME so that the name always matches the configuration used.
merged_df.to_csv(f"../Preprocesadas/Preprocessed_Data_Chunk_{CHUNK_TIME}_seg_17-04-2024.csv", index=False)