# Imports

In [1]:
# # Processamento do Novo Dataset – Geração de Views e Balanceamento
#
# Neste notebook, realizamos os seguintes passos:
#
# 1. Leitura dos dados brutos do novo dataset.
# 2. Aplicação dos pipelines "raw" e "standardized".
# 3. Remoção de atividades com valor -1.
# 4. Balanceamento do dataset por atividade.
# 5. (Opcional) Balanceamento por usuário e atividade.
# 6. Geração das views do dataset e salvamento dos arquivos resultantes.
#
# **Observação:** Ajuste os caminhos, funções e chaves do pipeline conforme sua implementação.

# %% [code]
import os
import pandas as pd
import numpy as np
import random
import traceback
from pathlib import Path

# Importando funções necessárias
# Supondo que o leitor do novo dataset esteja em readers_newdataset
from readers_newdataset import read_newdataset, sanity_function
# Pipelines e mapeamento de colunas (aqui consideramos que o novo dataset utiliza a chave "HIAAC")
from pipelines import pipelines, match_columns
# Funções de balanceamento e split
from steps import (
    SplitGuaranteeingAllClassesPerSplit,
    BalanceToMinimumClass,
    BalanceToMinimumClassAndUser,
    FilterByCommonRows,
)



# Seed the stdlib `random` module and NumPy's global generator so reruns are reproducible.
random.seed(42)
np.random.seed(42)

## Paths

In [3]:

# Base directory of the Dataset_V1 tree. It can be overridden through the
# HIAAC_DATASET_DIR environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original location is used.
dataset_base_dir = os.environ.get(
    "HIAAC_DATASET_DIR",
    "/home/maria/git/knowledge-representation/experiments/dataset_hiaac/har/Dataset_V1",
)

# Sub-folder shared by the input and output trees for this notebook.
sensor_position = "Bolso_direito"

# Root folder of the raw recordings of the new dataset.
root_path = os.path.join(dataset_base_dir, "Dados", sensor_position)

# Folder where the generated views are written.
output_path = os.path.join(dataset_base_dir, "Views_v3", sensor_position)

# Processing

In [4]:
# --- Read the raw recordings ---
print("Lendo os dados raw do novo dataset...")
df_root = read_newdataset(root_path)

# The reader reports missing sensor files with a message and keeps going (see the
# cell output), so guard against ending up with no data at all before running the
# expensive pipelines.
if df_root.empty:
    raise ValueError(f"Nenhum dado foi lido de '{root_path}'; verifique o caminho do dataset.")

print("Formato do raw dataset:", df_root.shape)
display(df_root.head())

Lendo os dados raw do novo dataset...
Arquivo de acelerômetro não encontrado para o usuário 3
Formato do raw dataset: (672900, 11)


Unnamed: 0,accel-x,accel-y,accel-z,gyro-x,gyro-y,gyro-z,user,index,csv,trial,activity code
0,0.531513,-9.809343,-0.40462,0.011988,0.003207,0.010156,1,0,1,0,0
1,0.531513,-9.838074,-0.414197,0.010767,0.000764,0.012599,1,1,1,0,0
2,0.529119,-9.838074,-0.423774,0.01321,-0.005345,0.015653,1,2,1,0,0
3,0.577003,-9.828497,-0.35913,0.015043,-0.008399,0.015043,1,3,1,0,0
4,0.588974,-9.826103,-0.318429,0.024816,-0.00901,0.016264,1,4,1,0,0


In [5]:

# %% [code]
# List the dataset keys registered in the imported `pipelines` mapping.
print("Pipelines disponíveis:")
available_keys = [key for key in pipelines.keys()]
print(available_keys)

Pipelines disponíveis:
['KuHar', 'MotionSense', 'WISDM', 'UCI', 'RealWorld', 'HIAAC']


In [6]:
# Run both processing pipelines of the "HIAAC" entry on the same raw frame.
# NOTE(review): assumes the "HIAAC" entry exposes "raw_dataset" and "standardized"
# callables — confirm against the `pipelines` module.
hiaac_pipelines = pipelines["HIAAC"]
df_raw = hiaac_pipelines["raw_dataset"](df_root)
df_standardized = hiaac_pipelines["standardized"](df_root)

# Report the shape and the first rows of each processed frame, raw first.
for caption, processed_frame in (
    ("Formato do dataset processado (raw):", df_raw),
    ("Formato do dataset processado (standardized):", df_standardized),
):
    print(caption, processed_frame.shape)
    display(processed_frame.head())

Executing Windowize


Creating windows: 100%|██████████| 2243/2243 [02:00<00:00, 18.64it/s]


Executing AddStandardActivityCode
Executing ButterworthFilter
Executing ResamplerPoly


Resampling: 100%|██████████| 2243/2243 [00:04<00:00, 490.31it/s]


Executing Windowize


Creating windows: 100%|██████████| 2243/2243 [00:27<00:00, 80.41it/s]


Executing AddStandardActivityCode
Formato do dataset processado (raw): (2243, 1807)


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-297,gyro-z-298,gyro-z-299,user,csv,activity code,index,trial,window,standard activity code
0,0.531513,0.531513,0.529119,0.577003,0.588974,0.557849,0.52433,0.483629,0.47884,0.591368,...,0.028482,0.068799,0.092622,1.0,1,0,0,0,0,
1,0.002394,0.349553,0.507571,0.584185,0.531513,0.349553,0.181959,-0.189142,-0.040701,0.167594,...,0.009545,0.00649,0.002825,1.0,1,0,300,1,0,
2,0.543484,0.562637,0.550666,0.488417,0.471658,0.519542,0.538695,0.545878,0.500388,0.521936,...,-0.002062,-0.005116,-0.00817,1.0,1,0,600,2,0,
3,0.610521,0.54109,0.4956,0.462081,0.433351,0.486023,0.493206,0.509965,0.531513,0.562637,...,0.02054,0.024206,0.02726,1.0,1,0,900,3,0,
4,0.304064,0.332794,0.411803,0.550666,0.73502,0.866701,0.919373,0.88346,0.742202,0.579397,...,-0.000229,-0.001451,-0.002673,1.0,1,0,1200,4,0,


Formato do dataset processado (standardized): (2243, 368)


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-58,gyro-z-59,user,csv,level_0,activity code,index,trial,window,standard activity code
0,0.043939,0.058694,0.099691,0.10559,0.108888,0.096158,0.05973,0.056849,-0.09334,-0.068499,...,-0.228732,-0.034904,1.0,1,0,0,0,0,0,
1,-0.180496,-0.366092,-0.566072,-0.530784,-0.536695,-0.325834,0.496649,-0.347638,1.147642,1.126622,...,0.006335,0.007245,1.0,1,300,0,300,1,0,
2,0.009302,-0.015479,0.026727,0.015229,0.000781,0.004393,0.012528,0.075865,-0.005746,0.070646,...,0.000123,-0.002204,1.0,1,600,0,600,2,0,
3,-0.00595,-0.066012,0.023321,-0.055179,0.013233,-0.020849,-0.000389,0.001325,-0.040271,0.003105,...,-0.011131,0.008433,1.0,1,900,0,900,3,0,
4,-0.137434,0.174563,0.061518,-0.164496,-0.030202,-0.220042,-0.196659,0.097026,0.266902,-0.074766,...,0.004611,0.003628,1.0,1,1200,0,1200,4,0,


In [7]:
# %% [code]


# TODO: incluir o timestamp local, o timestamp do servidor e a diferença entre eles

def apply_activity_mapping(df, mapping, source_column="activity code", target_column="standard activity code"):
    """
    Translate dataset-specific activity codes into standard activity codes.

    The input DataFrame is left untouched: the mapped column is written to a copy,
    so re-running a cell never changes a frame that another cell already displayed.

    Parameters:
    - df: DataFrame holding the data.
    - mapping: dict from original activity code (int) to standard activity code (int).
    - source_column: name of the column with the original codes.
    - target_column: name of the column that receives the mapped codes
      (overwritten if it already exists).

    Returns:
    - A new DataFrame with the same content as `df` plus `target_column`.
      Null source values and codes that are absent from `mapping` produce a
      missing value in `target_column`.

    Raises:
    - ValueError: if `source_column` is not a column of `df`.
    """
    if source_column not in df.columns:
        raise ValueError(f"A coluna '{source_column}' não existe no DataFrame.")

    def _translate(code):
        # Null codes stay missing; everything else is cast to int before the lookup
        # so that float-typed codes (e.g. 3.0) still hit the integer keys.
        if pd.isnull(code):
            return None
        return mapping.get(int(code))

    mapped = df.copy()
    mapped[target_column] = mapped[source_column].apply(_translate)
    return mapped

# Standard activity codes that the "HIAAC" codes are translated to.
_SIT, _STAND, _WALK, _STAIR_UP, _STAIR_DOWN, _RUN = 0, 1, 2, 3, 4, 5
_REMOVED = -1  # rows mapped to this code are filtered out later in the notebook

# "HIAAC" activity code -> standard activity code (original label in the trailing note).
activity_mapping = {
    0: _STAND,        # STANDING
    1: _SIT,          # SITTING
    2: _WALK,         # W_SPONT / WALKING_SPONTANEOUS
    3: _STAIR_UP,     # UPSTAIRS
    4: _STAIR_DOWN,   # DOWNSTAIRS
    5: _REMOVED,      # W_FAST
    6: _RUN,          # RUN
    7: _REMOVED,      # ELEV_UP
    8: _REMOVED,      # ELEV_DOWN
    9: _WALK,         # W_IN_DOOR
    10: _WALK,        # W_DISTRACTED
    -1: _REMOVED,     # REMOVE
}

# Rewrite "standard activity code" in both processed frames using the mapping above.
df_raw = apply_activity_mapping(
    df_raw,
    activity_mapping,
    source_column="activity code",
    target_column="standard activity code",
)
df_standardized = apply_activity_mapping(
    df_standardized,
    activity_mapping,
    source_column="activity code",
    target_column="standard activity code",
)

# Show the first rows of each frame to confirm the column was updated.
display(df_raw.head(), df_standardized.head())


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-297,gyro-z-298,gyro-z-299,user,csv,activity code,index,trial,window,standard activity code
0,0.531513,0.531513,0.529119,0.577003,0.588974,0.557849,0.52433,0.483629,0.47884,0.591368,...,0.028482,0.068799,0.092622,1.0,1,0,0,0,0,1
1,0.002394,0.349553,0.507571,0.584185,0.531513,0.349553,0.181959,-0.189142,-0.040701,0.167594,...,0.009545,0.00649,0.002825,1.0,1,0,300,1,0,1
2,0.543484,0.562637,0.550666,0.488417,0.471658,0.519542,0.538695,0.545878,0.500388,0.521936,...,-0.002062,-0.005116,-0.00817,1.0,1,0,600,2,0,1
3,0.610521,0.54109,0.4956,0.462081,0.433351,0.486023,0.493206,0.509965,0.531513,0.562637,...,0.02054,0.024206,0.02726,1.0,1,0,900,3,0,1
4,0.304064,0.332794,0.411803,0.550666,0.73502,0.866701,0.919373,0.88346,0.742202,0.579397,...,-0.000229,-0.001451,-0.002673,1.0,1,0,1200,4,0,1


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-58,gyro-z-59,user,csv,level_0,activity code,index,trial,window,standard activity code
0,0.043939,0.058694,0.099691,0.10559,0.108888,0.096158,0.05973,0.056849,-0.09334,-0.068499,...,-0.228732,-0.034904,1.0,1,0,0,0,0,0,1
1,-0.180496,-0.366092,-0.566072,-0.530784,-0.536695,-0.325834,0.496649,-0.347638,1.147642,1.126622,...,0.006335,0.007245,1.0,1,300,0,300,1,0,1
2,0.009302,-0.015479,0.026727,0.015229,0.000781,0.004393,0.012528,0.075865,-0.005746,0.070646,...,0.000123,-0.002204,1.0,1,600,0,600,2,0,1
3,-0.00595,-0.066012,0.023321,-0.055179,0.013233,-0.020849,-0.000389,0.001325,-0.040271,0.003105,...,-0.011131,0.008433,1.0,1,900,0,900,3,0,1
4,-0.137434,0.174563,0.061518,-0.164496,-0.030202,-0.220042,-0.196659,0.097026,0.266902,-0.074766,...,0.004611,0.003628,1.0,1,1200,0,1200,4,0,1


In [8]:
# %% [code]
def balance_per_activity(dataset: str, dataframe: pd.DataFrame, output_path: str):
    """
    Split a dataset into train/validation/test and save each split as CSV.

    NOTE: despite the name and the log message, no class balancing is applied here;
    the `BalanceToMinimumClass` step is not called, so every split keeps the class
    distribution produced by the splitter.

    Parameters:
    - dataset: name of the sub-directory created under `output_path`.
    - dataframe: data to split. The splitters are configured with "user" as the
      column to split on and "standard activity code" as the class column.
    - output_path: base directory where the `dataset` sub-directory is written.

    Returns:
    - Tuple `(train_df, val_df, test_df)`; the same frames are also written to
      `train.csv`, `validation.csv` and `test.csv` (without the index).
    """
    # Two cuts with a fixed random_state: the first separates the test split
    # (train_size=0.8), the second separates validation from the remaining
    # training data (train_size=0.9).
    split_train_test = SplitGuaranteeingAllClassesPerSplit(
        column_to_split="user",
        class_column="standard activity code",
        train_size=0.8,
        random_state=42,
    )
    split_train_val = SplitGuaranteeingAllClassesPerSplit(
        column_to_split="user",
        class_column="standard activity code",
        train_size=0.9,
        random_state=42,
    )

    train_df, test_df = split_train_test(dataframe)
    train_df, val_df = split_train_val(train_df)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    output_dir = os.path.join(output_path, dataset)
    os.makedirs(output_dir, exist_ok=True)

    for file_name, split_df in (
        ("train.csv", train_df),
        ("validation.csv", val_df),
        ("test.csv", test_df),
    ):
        split_df.to_csv(os.path.join(output_dir, file_name), index=False)

    print(f"Dados balanceados por atividade salvos em: {output_dir}")
    return train_df, val_df, test_df

def balance_per_user_and_activity(dataset: str, dataframe: pd.DataFrame, output_path: str):
    """
    Balance a dataset per user and activity, split it and save each split as CSV.

    Rows whose "standard activity code" is -1 are dropped before the
    `BalanceToMinimumClassAndUser` step runs; the balanced frame is then split into
    train/validation/test.

    Parameters:
    - dataset: name of the sub-directory created under `output_path`.
    - dataframe: data to balance and split. The balancer and the splitters are
      configured with the "standard activity code" and "user" columns.
    - output_path: base directory where the `dataset` sub-directory is written.

    Returns:
    - Tuple `(train_df, val_df, test_df)`; the same frames are also written to
      `train.csv`, `validation.csv` and `test.csv` (without the index).
    """
    balancer_activity_and_user = BalanceToMinimumClassAndUser(
        class_column="standard activity code", filter_column="user"
    )

    # Activities mapped to -1 are excluded before balancing.
    labelled_rows = dataframe[dataframe["standard activity code"] != -1]
    new_df_balanced = balancer_activity_and_user(labelled_rows)

    # Two cuts with a fixed random_state: the first separates the test split
    # (train_size=0.8), the second separates validation from the remaining
    # training data (train_size=0.9).
    split_train_test = SplitGuaranteeingAllClassesPerSplit(
        column_to_split="user",
        class_column="standard activity code",
        train_size=0.8,
        random_state=42,
    )
    split_train_val = SplitGuaranteeingAllClassesPerSplit(
        column_to_split="user",
        class_column="standard activity code",
        train_size=0.9,
        random_state=42,
    )

    train_df, test_df = split_train_test(new_df_balanced)
    train_df, val_df = split_train_val(train_df)

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    output_dir = os.path.join(output_path, dataset)
    os.makedirs(output_dir, exist_ok=True)

    for file_name, split_df in (
        ("train.csv", train_df),
        ("validation.csv", val_df),
        ("test.csv", test_df),
    ):
        split_df.to_csv(os.path.join(output_dir, file_name), index=False)

    print(f"Dados balanceados por usuário e atividade salvos em: {output_dir}")
    return train_df, val_df, test_df

def generate_views(new_df, new_df_standardized, dataset, path_balanced, path_balanced_standardized, match_key="HIAAC"):
    """
    Generate and save the raw and standardized views of a processed dataset.

    Both frames are first passed through `FilterByCommonRows`, then each one is
    split and saved by `balance_per_activity` and the resulting splits are handed
    to `sanity_function`.

    Parameters:
    - new_df: processed raw frame.
    - new_df_standardized: processed standardized frame.
    - dataset: name of the sub-directory created under each output path.
    - path_balanced: base output directory for the raw view.
    - path_balanced_standardized: base output directory for the standardized view.
    - match_key: key of `match_columns` that selects the columns used to match rows
      between the two frames. Defaults to "HIAAC", the value previously hard-coded.

    Returns:
    - None. The splits are written to disk by `balance_per_activity`.
    """
    # Filter both frames together using the columns registered for `match_key`.
    filter_common = FilterByCommonRows(match_columns=match_columns[match_key])
    new_df, new_df_standardized = filter_common(new_df, new_df_standardized)

    views = (
        ("---- RAW (balanceado por atividade)", new_df, path_balanced),
        ("---- STANDARDIZED (balanceado por atividade)", new_df_standardized, path_balanced_standardized),
    )
    for banner, view_df, view_path in views:
        print(banner)
        train_df, val_df, test_df = balance_per_activity(dataset, view_df, view_path)
        sanity_function(train_df, val_df, test_df)


In [9]:
# %% [code]
# Drop windows whose activity was mapped to -1. Windows left without a label are
# dropped as well: a missing value compares as "!= -1", so it would otherwise slip
# through this filter and reach the splits as an unlabelled row.
raw_labels = df_raw["standard activity code"]
df_raw_filtered = df_raw[raw_labels.notna() & (raw_labels != -1)]

standardized_labels = df_standardized["standard activity code"]
df_standardized_filtered = df_standardized[
    standardized_labels.notna() & (standardized_labels != -1)
]

display(df_raw_filtered.head())
display(df_standardized_filtered.head())

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-297,gyro-z-298,gyro-z-299,user,csv,activity code,index,trial,window,standard activity code
0,0.531513,0.531513,0.529119,0.577003,0.588974,0.557849,0.52433,0.483629,0.47884,0.591368,...,0.028482,0.068799,0.092622,1.0,1,0,0,0,0,1
1,0.002394,0.349553,0.507571,0.584185,0.531513,0.349553,0.181959,-0.189142,-0.040701,0.167594,...,0.009545,0.00649,0.002825,1.0,1,0,300,1,0,1
2,0.543484,0.562637,0.550666,0.488417,0.471658,0.519542,0.538695,0.545878,0.500388,0.521936,...,-0.002062,-0.005116,-0.00817,1.0,1,0,600,2,0,1
3,0.610521,0.54109,0.4956,0.462081,0.433351,0.486023,0.493206,0.509965,0.531513,0.562637,...,0.02054,0.024206,0.02726,1.0,1,0,900,3,0,1
4,0.304064,0.332794,0.411803,0.550666,0.73502,0.866701,0.919373,0.88346,0.742202,0.579397,...,-0.000229,-0.001451,-0.002673,1.0,1,0,1200,4,0,1


Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-58,gyro-z-59,user,csv,level_0,activity code,index,trial,window,standard activity code
0,0.043939,0.058694,0.099691,0.10559,0.108888,0.096158,0.05973,0.056849,-0.09334,-0.068499,...,-0.228732,-0.034904,1.0,1,0,0,0,0,0,1
1,-0.180496,-0.366092,-0.566072,-0.530784,-0.536695,-0.325834,0.496649,-0.347638,1.147642,1.126622,...,0.006335,0.007245,1.0,1,300,0,300,1,0,1
2,0.009302,-0.015479,0.026727,0.015229,0.000781,0.004393,0.012528,0.075865,-0.005746,0.070646,...,0.000123,-0.002204,1.0,1,600,0,600,2,0,1
3,-0.00595,-0.066012,0.023321,-0.055179,0.013233,-0.020849,-0.000389,0.001325,-0.040271,0.003105,...,-0.011131,0.008433,1.0,1,900,0,900,3,0,1
4,-0.137434,0.174563,0.061518,-0.164496,-0.030202,-0.220042,-0.196659,0.097026,0.266902,-0.074766,...,0.004611,0.003628,1.0,1,1200,0,1200,4,0,1


In [10]:


# Output folders for the raw and the standardized views.
raw_balanced_path = os.path.join(output_path, "raw_balanced")
standardized_balanced_path = os.path.join(output_path, "standardized_balanced")

# Build and save the views of the new dataset.
generate_views(
    new_df=df_raw_filtered,
    new_df_standardized=df_standardized_filtered,
    dataset="HIAAC_Bolso_direito",
    path_balanced=raw_balanced_path,
    path_balanced_standardized=standardized_balanced_path,
)


---- RAW (balanceado por atividade)
Dados balanceados por atividade salvos em: /home/maria/git/knowledge-representation/experiments/dataset_hiaac/har/Dataset_V1/Views_v3/Bolso_direito/raw_balanced/HIAAC_Bolso_direito
Train size: 975 (60.60%)
Validation size: 184 (11.44%)
Test size: 450 (27.97%)
Train activities: [1 0 2 3 4 5]
Validation activities: [1 0 2 3 4 5]
Test activities: [1 0 2 3 4 5]
Users in train: [4.0 5.0 6.0 7.0 8.0]
Users in validation: [9.0]
Users in test: [1.0 2.0]

---- STANDARDIZED (balanceado por atividade)
Dados balanceados por atividade salvos em: /home/maria/git/knowledge-representation/experiments/dataset_hiaac/har/Dataset_V1/Views_v3/Bolso_direito/standardized_balanced/HIAAC_Bolso_direito
Train size: 975 (60.60%)
Validation size: 184 (11.44%)
Test size: 450 (27.97%)
Train activities: [1 0 2 3 4 5]
Validation activities: [1 0 2 3 4 5]
Test activities: [1 0 2 3 4 5]
Users in train: [4.0 5.0 6.0 7.0 8.0]
Users in validation: [9.0]
Users in test: [1.0 2.0]

