# FeatureExtractor

- Script Python que faz a extração das features da mesma forma descrita no artigo. Basicamente, adiciona o janelamento com sobreposição (overlap) de 75% ao script `Main.ipynb`.

In [None]:
import soundfile as sf
import numpy as np
from tg_utils import ExtractFeatures
import pandas as pd

In [None]:
## Constants
BUFFER_LEN = 2048                     # Number of samples per analysis buffer (size used for the FFT)
OVERLAP_STEP_SIZE = BUFFER_LEN // 4   # Hop of 25% of the buffer, i.e. consecutive windows overlap by 75%
INPUT_AUDIO_CHANNEL = 0               # Microphone channel to pick from the audio files (0 or 1)

# Integer labels identifying each machine condition, numbered consecutively from 0.
(
    FAULT_ID_OFF,
    FAULT_ID_HEALTH,
    FAULT_ID_BEARING,
    FAULT_ID_FAN,
    FAULT_ID_GEAR,
) = range(5)


In [None]:
## Audio file locations
# An earlier version of this cell read the original FLAC recordings directly with
# sf.read from ../data/audio_files/original/ (M1_OFF_S1, M1_H_S1, M1_F1_S1, M1_F2_S1,
# M1_F3_S1). The WAV files listed below are used instead.

# Each mapping goes from a condition key (off, health, f1 = bearing fault,
# f2 = fan fault, f3 = gear fault) to the path of the corresponding recording.
_TRAINING_FILES = {
    "off": "../data/audio_files/wav/training/off_80.wav",
    "health": "../data/audio_files/wav/training/health_80.wav",
    "f1": "../data/audio_files/wav/training/f1_80.wav",
    "f2": "../data/audio_files/wav/training/f2_80.wav",
    "f3": "../data/audio_files/wav/training/f3_80.wav",
}

_VALIDATION_FILES = {
    "off": "../data/audio_files/wav/validation/off_20.wav",
    "health": "../data/audio_files/wav/validation/health_20.wav",
    "f1": "../data/audio_files/wav/validation/f1_20.wav",
    "f2": "../data/audio_files/wav/validation/f2_20.wav",
    "f3": "../data/audio_files/wav/validation/f3_20.wav",
}

_RAW_FILES = {
    "off": "../data/audio_files/wav/full/off.wav",
    "health": "../data/audio_files/wav/full/health.wav",
    "f1": "../data/audio_files/wav/full/f1.wav",
    "f2": "../data/audio_files/wav/full/f2.wav",
    "f3": "../data/audio_files/wav/full/f3.wav",
}

# Data sets available for feature extraction, keyed by split name.
AudioFiles = {
    "TrainingData": _TRAINING_FILES,
    "ValidationData": _VALIDATION_FILES,
    "RawData": _RAW_FILES,
}

# Sample dtype requested when reading the audio files ('float32' or 'int16').
input_data_type = 'float32'

In [None]:
def extract_features_with_overlap(audio_data, fs, fault_id, buffer_len, step_size, feature_columns):
    """
    Extract features from an audio signal using a sliding window with overlap.

    Windows start at sample 0 and advance by ``step_size`` samples. Only windows
    that fit entirely inside ``audio_data`` are processed, so trailing samples
    that cannot fill a whole window are ignored.

    Args:
        audio_data (np.array): 1D array with the audio samples.
        fs (int): Sampling rate of the audio.
        fault_id (int): Fault ID stored alongside the features of every window.
        buffer_len (int): Length of the analysis window (buffer), in samples.
        step_size (int): Number of samples the window advances on each step.
        feature_columns (list): Names used as dictionary keys; they are paired,
            in order, with the extracted feature values followed by the fault ID.

    Returns:
        list: One dictionary per window, holding that window's features.
    """
    # Start offsets of every window that fits completely in the signal.
    window_starts = range(0, len(audio_data) - buffer_len + 1, step_size)

    def _window_record(start):
        # Cut the current window and run the feature extractor on it.
        window = audio_data[start : start + buffer_len]
        values = ExtractFeatures(DataBuff=window, BufferLen=buffer_len, Fs=fs)
        # The fault ID goes last so it pairs with the final column name.
        return dict(zip(feature_columns, [*values, fault_id]))

    return [_window_record(start) for start in window_starts]

In [None]:
def _load_condition_audio(path, data_type):
    """Read one recording, reduce it to a single channel and drop its initial samples."""
    data, fs = sf.read(path, dtype=data_type)

    # sf.read returns a 2D (frames, channels) array for multi-channel files, while the
    # windowing code is documented to take 1D audio. Keep only the configured channel;
    # mono files already come back 1D and are left untouched.
    if data.ndim > 1:
        data = data[:, INPUT_AUDIO_CHANNEL]

    # Discard the beginning of the signal (the first 20 buffers plus one sample).
    # NOTE(review): the extra "+1" looks like a carry-over from 1-based indexing;
    # it is kept so the extracted windows stay identical to previous runs.
    return data[(20 * BUFFER_LEN) + 1:], fs


def process_and_extract_features(file_dict, data_type='float32'):
    """
    Load a set of audio files, extract their features and return them as a DataFrame.

    Every recording is read, reduced to the channel selected by
    ``INPUT_AUDIO_CHANNEL`` when the file has more than one channel, stripped of
    its initial samples and then split into overlapping windows of ``BUFFER_LEN``
    samples advancing by ``OVERLAP_STEP_SIZE``.

    Args:
        file_dict (dict): Paths to the audio files, under the keys 'off', 'health',
            'f1' (bearing fault), 'f2' (fan fault) and 'f3' (gear fault).
        data_type (str): Sample dtype requested when reading the audio files.

    Returns:
        pd.DataFrame: One row per analysis window with the feature columns plus
        'FaultID'. Rows are ordered off, healthy, bearing, fan, gear.

    Raises:
        KeyError: If ``file_dict`` lacks one of the expected keys.
    """
    # Columns of the resulting DataFrame; 'FaultID' must stay last because the
    # label is appended after the extracted feature values.
    columns = [
        'RMS', 'Mean', 'Median', 'Variance', 'Skewness', 'Kurtosis',
        'CrestFactor', 'ShapeFactor', 'ImpulseFactor', 'MarginFactor',
        'Peak1', 'Peak2', 'Peak3', 'PeakLocs1', 'PeakLocs2', 'PeakLocs3',
        'FaultID',
    ]

    # (file_dict key, label) pairs, in the order their rows appear in the output.
    conditions = (
        ("off", FAULT_ID_OFF),         # Off condition with noise
        ("health", FAULT_ID_HEALTH),   # Healthy condition
        ("f1", FAULT_ID_BEARING),      # Bearing fault
        ("f2", FAULT_ID_FAN),          # Fan fault
        ("f3", FAULT_ID_GEAR),         # Gear fault
    )

    # Load every file before extracting anything, so a missing key or unreadable
    # file is reported before the (slow) feature extraction starts.
    recordings = [
        (_load_condition_audio(file_dict[key], data_type), fault_id)
        for key, fault_id in conditions
    ]

    # Extract the windowed features of each condition and gather all rows.
    full_list = []
    for (data, fs), fault_id in recordings:
        full_list.extend(
            extract_features_with_overlap(data, fs, fault_id, BUFFER_LEN, OVERLAP_STEP_SIZE, columns)
        )

    return pd.DataFrame(full_list, columns=columns)

## Extração das Features

In [None]:
# Extract the features of the training and validation sets and save each one as CSV.
from pathlib import Path  # imported here because it is only needed to prepare the output folders

output_path_train = '../data/extracted_features/extracted_features_training.csv'
output_path_validation = '../data/extracted_features/extracted_features_validation.csv'

# Create the output folders up front: to_csv does not create missing directories,
# and failing only after the long extraction would waste the whole run.
for _csv_path in (output_path_train, output_path_validation):
    Path(_csv_path).parent.mkdir(parents=True, exist_ok=True)

# --- Training data ---
print("Iniciando a extração de features dos dados de TREINAMENTO...")
training_df = process_and_extract_features(AudioFiles["TrainingData"], data_type=input_data_type)
print(f"Extração de treinamento concluída. Total de janelas: {len(training_df)}")

# Save the training features to a CSV file
training_df.to_csv(output_path_train, index=False)
print(f"Features de treinamento salvas em: {output_path_train}")


# --- Validation data ---
print("\nIniciando a extração de features dos dados de VALIDAÇÃO...")
validation_df = process_and_extract_features(AudioFiles["ValidationData"], data_type=input_data_type)
print(f"Extração de validação concluída. Total de janelas: {len(validation_df)}")

# Save the validation features to a CSV file
validation_df.to_csv(output_path_validation, index=False)
print(f"Features de validação salvas em: {output_path_validation}")