In [39]:
import os
import random
import numpy as np
import pandas as pd
import os, json, joblib
import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR

import polars as pl
from pathlib import Path
import warnings 
warnings.filterwarnings("ignore")


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from sklearn.model_selection import StratifiedGroupKFold
from scipy.spatial.transform import Rotation as R

In [40]:
# (Competition metric will only be imported when TRAINing)
TRAIN = True                     # ← set to True when you want to train

class config:
    AMP = False
    BATCH_SIZE_TRAIN = 8 #32
    BATCH_SIZE_VALID = 8 #32
    DEBUG = False
    EPOCHS = 2  #30
    FOLDS = 5
    GRADIENT_ACCUMULATION_STEPS = 1
    LEARNING_RATE = 1e-3
    MAX_GRAD_NORM = 1e7
    NUM_WORKERS = 0 # multiprocessing.cpu_count()
    PRINT_FREQ = 20
    SEED = 20
    WEIGHT_DECAY = 0.01
    PAD_PERCENTILE = 95

class paths:
    BASE_DIR = Path("C:/Users/konno/SynologyDrive/datasciense/projects_foler/1_kaggle/CMI/cmi-detect-behavior-with-sensor-data")
    
    OUTPUT_DIR = BASE_DIR / "output-02-wavenet"
    TEST_CSV = BASE_DIR / "test.csv"
    TEST_DEMOGRAPHICS = BASE_DIR / "test_demographics.csv"
    TRAIN_CSV = BASE_DIR / "train.csv"
    TRAIN_DEMOGRAPHICS = BASE_DIR / "train_demographics.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


print("▶ imports ready · torch", torch.__version__, "device :", device)

▶ imports ready · torch 2.7.1+cu128 device : cuda


In [41]:
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [42]:
def preprocess_sequence(df_seq: pd.DataFrame, feature_cols: list[str], scaler: StandardScaler):
    mat = df_seq[feature_cols].ffill().bfill().fillna(0).values
    return scaler.transform(mat).astype('float32')

# MixUp the data argumentation in order to regularize the neural network. 

class CustomDataset(Dataset):
    def __init__(
        self, config, df: pd.DataFrame, X: np.ndarray, y: np.ndarray
    ): 
        
        self.config = config
        self.df = df
        self.X = X
        self.y = y
        self.indexes = self.df.sequence_id.unique()
        
    def __len__(self):
        """
        Length of dataset.
        """
        return len(self.indexes)
        
    def __getitem__(self, index):
        """
        Get one item.
        """
        # print(f"[CustomDataset] Dataset size: {len(self.X)}, Index requested: {index}")
        
        sequence_id = self.indexes[index]
        X = self.X[index]
        y = self.y[index]
        output = {
            "X": torch.tensor(X, dtype=torch.float32),
            "y": torch.tensor(y, dtype=torch.long),
            "sequence_id": sequence_id
        }
        return output     
    
class MixupDataset(CustomDataset):
    def __init__(self, *args, alpha=0.4, mixup_prob=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.mixup_prob = mixup_prob

    def __getitem__(self, index):
        # print(f"[CustomDataset] Dataset size: {len(self.X)}, Index requested: {index}")
        if np.random.rand() < self.mixup_prob:
            # Mixup
            idx1 = index
            idx2 = np.random.randint(0, len(self))

            X1 = self.X[idx1]
            y1 = self.y[idx1]

            X2 = self.X[idx2]
            y2 = self.y[idx2]

            lam = np.random.beta(self.alpha, self.alpha)

            X_mix = lam * X1 + (1 - lam) * X2
            y_mix = lam * y1 + (1 - lam) * y2

            return {
                "X": torch.tensor(X_mix, dtype=torch.float32),
                "y": torch.tensor(y_mix, dtype=torch.long),
                "sequence_id": self.indexes[idx1],  # you can decide what ID to keep
            }

        # No mixup
        return super().__getitem__(index)
    
# train_dataset = MixupDataset(config, df_train, X_tr, y_tr, y_soft_tr)
# train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE_TRAIN, shuffle=True)
# val_dataset = CustomDataset(config, df_train, X_val, y_val, y_soft_val)
# val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE_VALID, shuffle=True)

def pad_or_truncate(seq, max_len, mode=TRAIN, pad_value=0.0, dtype=np.float32) -> np.ndarray:
    """
    Pads or truncates a sequence to a fixed length.

    Parameters:
    - seq: np.ndarray of shape (L, D)
    - max_len: int, desired sequence length
    - mode: bool, True = random pad, False = regular pad
    - pad_value: float or int, value to use for padding
    - dtype: np.dtype, dtype for the output array

    Returns:
    - np.ndarray of shape (max_len, D)
    """
    # print("sequence shape", seq.shape)
    L, D = seq.shape
    # print("mode = ", mode)

    if L > max_len:
        return seq[:max_len] # truncate if too long

    elif L < max_len:
        total_padding = max_len - L
        
        if mode:
            pad_start = np.random.randint(0, total_padding + 1)
            pad_end = total_padding - pad_start
            
        else:
            pad_start = 0
            pad_end = total_padding

        start_padding = np.full((pad_start, D), pad_value, dtype=dtype)
        end_padding = np.full((pad_end, D), pad_value, dtype=dtype)
        padded = np.vstack((start_padding, seq, end_padding))
        # print("padded shape", padded.shape)
        return padded

    else:
        return seq.astype(dtype)

In [43]:
def remove_gravity_from_acc(acc_data, rot_data):

    if isinstance(acc_data, pd.DataFrame):
        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values
    else:
        acc_values = acc_data

    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = acc_values.shape[0]
    linear_accel = np.zeros_like(acc_values)
    
    gravity_world = np.array([0, 0, 9.81])

    for i in range(num_samples):
        if np.all(np.isnan(quat_values[i])) or np.all(np.isclose(quat_values[i], 0)):
            linear_accel[i, :] = acc_values[i, :] 
            continue

        try:
            rotation = R.from_quat(quat_values[i])
            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)
            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame
        except ValueError:
             linear_accel[i, :] = acc_values[i, :]
             
    return linear_accel

def calculate_angular_velocity_from_quat(rot_data, time_delta=1/200): # Assuming 200Hz sampling rate
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = quat_values.shape[0]
    angular_vel = np.zeros((num_samples, 3))

    for i in range(num_samples - 1):
        q_t = quat_values[i]
        q_t_plus_dt = quat_values[i+1]

        if np.all(np.isnan(q_t)) or np.all(np.isclose(q_t, 0)) or \
           np.all(np.isnan(q_t_plus_dt)) or np.all(np.isclose(q_t_plus_dt, 0)):
            continue

        try:
            rot_t = R.from_quat(q_t)
            rot_t_plus_dt = R.from_quat(q_t_plus_dt)

            # Calculate the relative rotation
            delta_rot = rot_t.inv() * rot_t_plus_dt
            
            # Convert delta rotation to angular velocity vector
            # The rotation vector (Euler axis * angle) scaled by 1/dt
            # is a good approximation for small delta_rot
            angular_vel[i, :] = delta_rot.as_rotvec() / time_delta
        except ValueError:
            # If quaternion is invalid, angular velocity remains zero
            pass
            
    return angular_vel

def calculate_angular_distance(rot_data):
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = quat_values.shape[0]
    angular_dist = np.zeros(num_samples)

    for i in range(num_samples - 1):
        q1 = quat_values[i]
        q2 = quat_values[i+1]

        if np.all(np.isnan(q1)) or np.all(np.isclose(q1, 0)) or \
           np.all(np.isnan(q2)) or np.all(np.isclose(q2, 0)):
            angular_dist[i] = 0 # Или np.nan, в зависимости от желаемого поведения
            continue
        try:
            # Преобразование кватернионов в объекты Rotation
            r1 = R.from_quat(q1)
            r2 = R.from_quat(q2)

            # Вычисление углового расстояния: 2 * arccos(|real(p * q*)|)
            # где p* - сопряженный кватернион q
            # В scipy.spatial.transform.Rotation, r1.inv() * r2 дает относительное вращение.
            # Угол этого относительного вращения - это и есть угловое расстояние.
            relative_rotation = r1.inv() * r2
            
            # Угол rotation vector соответствует угловому расстоянию
            # Норма rotation vector - это угол в радианах
            angle = np.linalg.norm(relative_rotation.as_rotvec())
            angular_dist[i] = angle
        except ValueError:
            angular_dist[i] = 0 # В случае недействительных кватернионов
            pass
            
    return angular_dist

In [44]:
class Wave_Block(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, dilation_rates: int, kernel_size: int = 3):
        """
        WaveNet building block.
        :param in_channels: number of input channels.
        :param out_channels: number of output channels.
        :param dilation_rates: how many levels of dilations are used.
        :param kernel_size: size of the convolving kernel.
        """
        super(Wave_Block, self).__init__()
        self.num_rates = dilation_rates
        self.convs = nn.ModuleList()
        self.filter_convs = nn.ModuleList()
        self.gate_convs = nn.ModuleList()
        # First conv: (B, in_channels, L) -> (B, out_channels, L)
        self.convs.append(
            nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=True)
        )
        
        dilation_rates = [2 ** i for i in range(dilation_rates)]
        for dilation_rate in dilation_rates:
            # Filter conv: (B, out_channels, L) -> (B, out_channels, L)
            self.filter_convs.append(
                nn.Conv1d(
                    out_channels, out_channels, kernel_size=kernel_size,
                    padding=int((dilation_rate*(kernel_size-1))/2), dilation=dilation_rate)
            )
            # Gate conv: (B, out_channels, L) -> (B, out_channels, L)
            self.gate_convs.append(
                nn.Conv1d(
                    out_channels, out_channels, kernel_size=kernel_size,
                    padding=int((dilation_rate*(kernel_size-1))/2), dilation=dilation_rate)
            )
            # Residual conv: (B, out_channels, L) -> (B, out_channels, L)
            self.convs.append(nn.Conv1d(out_channels, out_channels, kernel_size=1, bias=True))
        
        for i in range(len(self.convs)):
            nn.init.xavier_uniform_(self.convs[i].weight, gain=nn.init.calculate_gain('relu'))
            nn.init.zeros_(self.convs[i].bias)

        for i in range(len(self.filter_convs)):
            nn.init.xavier_uniform_(self.filter_convs[i].weight, gain=nn.init.calculate_gain('relu'))
            nn.init.zeros_(self.filter_convs[i].bias)

        for i in range(len(self.gate_convs)):
            nn.init.xavier_uniform_(self.gate_convs[i].weight, gain=nn.init.calculate_gain('relu'))
            nn.init.zeros_(self.gate_convs[i].bias)

    def forward(self, x: torch.Tensor):
        # x: (B, in_channels, L)
        x = self.convs[0](x)  # (B, in_channels, L) -> (B, out_channels, L)
        res = x  # res: (B, out_channels, L)
        for i in range(self.num_rates):
            tanh_out = torch.tanh(self.filter_convs[i](x))  # (B, out_channels, L) -> (B, out_channels, L)
            sigmoid_out = torch.sigmoid(self.gate_convs[i](x)) # (B, out_channels, L) -> (B, out_channels, L)
            x = tanh_out * sigmoid_out  # (B, out_channels, L) * (B, out_channels, L) -> (B, out_channels, L)
            x = self.convs[i + 1](x) # (B, out_channels, L) -> (B, out_channels, L)
            res = res + x  # (B, out_channels, L) + (B, out_channels, L) -> (B, out_channels, L)
        return res  # (B, out_channels, L)
    
class WaveNet(nn.Module):
    def __init__(self, input_channels: int = 1, kernel_size: int = 3):
        super(WaveNet, self).__init__()
        self.model = nn.Sequential(
                Wave_Block(input_channels, 32, 12, kernel_size),   # (B, input_channels, L) -> (B, 8, L)
                Wave_Block(32, 64, 8, kernel_size),                # (B, 8, L) -> (B, 16, L)
                Wave_Block(64, 128, 4, kernel_size),               # (B, 16, L) -> (B, 32, L)
                Wave_Block(128, 256, 1, kernel_size),                # (B, 32, L) -> (B, 64, L)
        )
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, L, input_channels) - typical input format
        x = x.permute(0, 2, 1)  # (B, L, input_channels) -> (B, input_channels, L)
        output = self.model(x)  # (B, input_channels, L) -> (B, 64, L)
        return output  # (B, 64, L)


class TemporalAttentionPooling(nn.Module):
    def __init__(self, input_channels, hidden_dim=64):
        super(TemporalAttentionPooling, self).__init__()
        self.attn = nn.Sequential(
            nn.Conv1d(input_channels, hidden_dim, kernel_size=1),
            nn.Tanh(),
            nn.Conv1d(hidden_dim, 1, kernel_size=1)
        )

    def forward(self, x):
        """
        x: (B, C, L)
        returns: (B, C)
        """
        # Compute attention scores
        attn_scores = self.attn(x)  # (B, 1, L)
        attn_weights = F.softmax(attn_scores, dim=-1)  # (B, 1, L)

        # Weighted sum over time
        weighted = x * attn_weights  # (B, C, L)
        pooled = weighted.sum(dim=-1)  # (B, C)

        return pooled

class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.block = nn.Sequential(
            nn.LayerNorm(dim),
            nn.GELU(),
            nn.Linear(dim, dim),
            nn.LayerNorm(dim),
            nn.GELU(),
            nn.Dropout(0.2)
        )

    def forward(self, x):
        return x + self.block(x)

class CustomModel(nn.Module):
    def __init__(self, num_classes):
        super(CustomModel, self).__init__()
        self.wavenet1 = WaveNet(input_channels=17)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
        self.wavenet2 = WaveNet(input_channels=25)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
        self.config = config
        self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)  # (B, 64, L) -> (B, 64, 1)
        self.dropout = 0.2
        self.head_1 = nn.Sequential(
            nn.Flatten(), 
            nn.Linear(512, 512*4), # (B, 512) -> (B, 512*4)  #add
            ResidualBlock(512*4),  #add
            nn.Linear(512*4, 512*4), # (B, 512*4) -> (B, 512*4)
            nn.LayerNorm(512*4),  # (B, 512*4) -> (B, 512*4)  BatchNorm1d
            nn.GELU(),  # (B, 512*4) -> (B, 512*4)  ReLu
            nn.Dropout(self.dropout),  # (B, 512*4) -> (B, 512*4)
            nn.Linear(512*4, num_classes)  # (B, 512*4) -> (B, num_classes)
        )

        # self.attn_pool = TemporalAttentionPooling(input_channels=256)

    def forward(self, x: torch.Tensor):
        """
        Forward pass.
        """
        # x: (B, L, input_channels) - typical input format
        x1 = self.wavenet1(x[:, :, :17])  # (B, L, input_channels) -> (B, 256, L)
        # print("shape of x1 :", x1.shape)        
        x2 = self.wavenet2(x[:, :, 17:])  # (B, L, input_channels) -> (B, 256, L)
        # print("shape of x2 :", x2.shape) 
        x1 = self.global_avg_pooling(x1)  # (B, 256, L) -> (B, 256, 1)
        x2 = self.global_avg_pooling(x2)  # (B, 256, L) -> (B, 256, 1)
        # x1 = self.attn_pool(x1)  # (B, 256, L) -> (B, 256, 1)
        # x2 = self.attn_pool(x2)  # (B, 256, L) -> (B, 256, 1)
        x = torch.concatenate([x1, x2], axis=1) # (B, 512, 1)
        # print("shape of x :", x.shape)
        x = x.squeeze(-1) # (B, 512)
        z1 = self.head_1(x)  # (B, 512) -> (B, num_classes)
        return z1  # (B, num_classes)


# model = CustomModel(num_classes=9)
# total_params = sum(p.numel() for p in model.parameters())
# print(f"Total number of parameters: {total_params}")

In [45]:
# class CustomModel(nn.Module):
#     def __init__(self, num_classes):
#         super(CustomModel, self).__init__()
#         self.wavenet1 = WaveNet(input_channels=3)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
#         self.wavenet2 = WaveNet(input_channels=4)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
#         self.config = config
#         self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)  # (B, 64, L) -> (B, 64, 1)
#         self.dropout = 0.2
#         self.head_1 = nn.Sequential(
#             nn.Flatten(), 
#             nn.Linear(512, 256), # (B, 64) -> (B, 64)
#             nn.BatchNorm1d(256),  # (B, 64) -> (B, 64)
#             nn.ReLU(),  # (B, 64) -> (B, 64)
#             nn.Dropout(self.dropout),  # (B, 64) -> (B, 64)
#             nn.Linear(256, num_classes)  # (B, 64) -> (B, num_classes)
#         )
#         self.head_2 = nn.Sequential(
#             nn.Flatten(), 
#             nn.Linear(512, 256), # (B, 64) -> (B, 64)
#             nn.BatchNorm1d(256),  # (B, 64) -> (B, 64)
#             nn.ReLU(),  # (B, 64) -> (B, 64)
#             nn.Dropout(self.dropout),  # (B, 64) -> (B, 64)
#             nn.Linear(256, 1)  # (B, 64) -> (B, num_classes)
#         )
        
#     def forward(self, x: torch.Tensor):
#         """
#         Forward pass.
#         """
#         # x: (B, L, input_channels) - typical input format
#         x1 = self.wavenet1(x[:, :, 0:3])  # (B, L, input_channels) -> (B, 64, L)
#         x1 = self.global_avg_pooling(x1)  # (B, 64, L) -> (B, 64, 1)
#         x2 = self.wavenet2(x[:, :, 3:])  # (B, L, input_channels) -> (B, 64, L)
#         x2 = self.global_avg_pooling(x2)  # (B, 64, L) -> (B, 64, 1)
#         y = torch.concatenate([x1, x2], axis=1) # (B, 128)
#         z1 = self.head_1(y)  # (B, 64) -> (B, num_classes)
#         z2 = self.head_2(y)  # (B, 64) -> (B, num_classes)
#         return z1, z2  # (B, num_classes)

# model = CustomModel(num_classes=9)
# total_params = sum(p.numel() for p in model.parameters())
# print(f"Total number of parameters: {total_params}")

# ######################################
# ######## TWO WAVENETS ################
# ######################################
# class CustomModel(nn.Module):
#     def __init__(self, num_classes):
#         super(CustomModel, self).__init__()
#         self.wavenet1 = WaveNet(input_channels=21)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
#         self.wavenet2 = WaveNet(input_channels=21)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
#         self.config = config
#         self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)  # (B, 64, L) -> (B, 64, 1)
#         self.dropout = 0.2
#         self.head_1 = nn.Sequential(
#             nn.Flatten(), 
#             nn.Linear(128, 256), # (B, 64) -> (B, 64)
#             nn.BatchNorm1d(256),  # (B, 64) -> (B, 64)
#             nn.ReLU(),  # (B, 64) -> (B, 64)
#             nn.Dropout(self.dropout),  # (B, 64) -> (B, 64)
#             nn.Linear(256, num_classes)  # (B, 64) -> (B, num_classes)
#         )
#         self.head_2 = nn.Sequential(
#             nn.Flatten(), 
#             nn.Linear(128, 256), # (B, 64) -> (B, 64)
#             nn.BatchNorm1d(256),  # (B, 64) -> (B, 64)
#             nn.ReLU(),  # (B, 64) -> (B, 64)
#             nn.Dropout(self.dropout),  # (B, 64) -> (B, 64)
#             nn.Linear(256, 1)  # (B, 64) -> (B, num_classes)
#         )
        
#     def forward(self, x: torch.Tensor):
#         """
#         Forward pass.
#         """
#         # x: (B, L, input_channels) - typical input format
#         x1 = self.wavenet1(x[:, :, :21])  # (B, L, input_channels) -> (B, 64, L)
#         x2 = self.wavenet2(x[:, :, 21:])  # (B, L, input_channels) -> (B, 64, L)
#         x1 = self.global_avg_pooling(x1)  # (B, 64, L) -> (B, 64, 1)
#         x2 = self.global_avg_pooling(x2)  # (B, 64, L) -> (B, 64, 1)
#         x = torch.concatenate([x1, x2], axis=1) # (B, 128)
#         x = x.squeeze(-1) # (B, 64)
#         z1 = self.head_1(x)  # (B, 64) -> (B, num_classes)
#         z2 = self.head_2(x)  # (B, 64) -> (B, num_classes)
#         return z1, z2  # (B, num_classes)
# ######################################
# ######## TWO WAVENETS ################
# ######################################


# ######################################
# ######## THREE WAVENETS ##############
# ######################################

# class CustomModel(nn.Module):
#     def __init__(self, num_classes):
#         super(CustomModel, self).__init__()
#         self.wavenet_imu  = WaveNet(input_channels=21)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
#         self.wavenet_thm  = WaveNet(input_channels=16)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
#         self.wavenet_tof  = WaveNet(input_channels=5)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
#         self.config = config
#         self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)  # (B, 64, L) -> (B, 64, 1)
#         self.dropout = 0.2
#         self.head_1 = nn.Sequential(
#             nn.Flatten(), 
#             nn.Linear(192, 256), # (B, 64) -> (B, 64)
#             nn.BatchNorm1d(256),  # (B, 64) -> (B, 64)
#             nn.ReLU(),  # (B, 64) -> (B, 64)
#             nn.Dropout(self.dropout),  # (B, 64) -> (B, 64)
#             nn.Linear(256, num_classes)  # (B, 64) -> (B, num_classes)
#         )
#         self.head_2 = nn.Sequential(
#             nn.Flatten(), 
#             nn.Linear(192, 256), # (B, 64) -> (B, 64)
#             nn.BatchNorm1d(256),  # (B, 64) -> (B, 64)
#             nn.ReLU(),  # (B, 64) -> (B, 64)
#             nn.Dropout(self.dropout),  # (B, 64) -> (B, 64)
#             nn.Linear(256, 1)  # (B, 64) -> (B, num_classes)
#         )
        
#     def forward(self, x: torch.Tensor):
#         """
#         Forward pass.
#         """
#         # x: (B, L, input_channels) - typical input format
#         x_imu = self.wavenet_imu(x[:, :, :21])
#         x_thm = self.wavenet_thm(x[:, :, 21:37])
#         x_tof = self.wavenet_tof(x[:, :, 37:])  # if ToF has 5 channels
#         x_imu = self.global_avg_pooling(x_imu)  # (B, 64, L) -> (B, 64, 1)
#         x_thm = self.global_avg_pooling(x_thm)  # (B, 64, L) -> (B, 64, 1)
#         x_tof = self.global_avg_pooling(x_tof)  # (B, 64, L) -> (B, 64, 1)
#         x = torch.cat([x_imu, x_thm, x_tof], dim=1)  #192
#         x = x.squeeze(-1) # (B, 64)
#         z1 = self.head_1(x)  # (B, 64) -> (B, num_classes)
#         z2 = self.head_2(x)  # (B, 64) -> (B, num_classes)
#         return z1, z2  # (B, num_classes)

# ######################################
# ######## THREE WAVENETS ##############
# ######################################

# # ######################################
# # ######## SINGLE WAVENETS #############
# # ######################################
# class CustomModel(nn.Module):
#     def __init__(self, num_classes):
#         super(CustomModel, self).__init__()
#         self.wavenet = WaveNet(input_channels=42)  # WaveNet: (B, input_channels, L) -> (B, 64, L)
#         self.config = config
#         self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)  # (B, 64, L) -> (B, 64, 1)
#         self.dropout = 0.2
#         self.head_1 = nn.Sequential(
#             nn.Flatten(), 
#             nn.Linear(256, 256*4), # (B, 256) -> (B, 256*4)
#             nn.BatchNorm1d(256*4),  # (B, 256*4) -> (B, 256*4)
#             nn.ReLU(),  # (B, 256*4) -> (B, 256*4)
#             nn.Dropout(self.dropout),  # (B, 256*4) -> (B, 256*4)
#             nn.Linear(256*4, num_classes)  # (B, 256*4) -> (B, num_classes)
#         )
#         # self.head_2 = nn.Sequential(
#         #     nn.Flatten(), 
#         #     nn.Linear(64, 256), # (B, 64) -> (B, 64)
#         #     nn.BatchNorm1d(256),  # (B, 64) -> (B, 64)
#         #     nn.ReLU(),  # (B, 64) -> (B, 64)
#         #     nn.Dropout(self.dropout),  # (B, 64) -> (B, 64)
#         #     nn.Linear(256, 1)  # (B, 64) -> (B, num_classes)
#         # )
        
#     def forward(self, x: torch.Tensor):
#         """
#         Forward pass.
#         """
#         # x: (B, L, input_channels) - typical input format
#         # print("[CustomModel] - forward x shape 101", x.shape)
#         x = self.wavenet(x)  # (B, L, input_channels) -> (B, 64, L)
#         # print("[CustomModel] - forward x shape 102", x.shape)
#         x = self.global_avg_pooling(x)  # (B, 64, L) -> (B, 64, 1)
#         # print("[CustomModel] - forward x shape 103", x.shape)
#         x = x.squeeze(-1) # (B, 256)
#         # print("[CustomModel] - forward x shape 104", x.shape)
#         z1 = self.head_1(x)  # (B, 64) -> (B, num_classes)
#         # z2 = self.head_2(x)  # (B, 64) -> (B, num_classes)
#         return z1  #, z2  # (B, num_classes)

# ######################################
# ######## SINGLE WAVENETS #############
# ######################################

In [46]:
### DATA CREATION
print("▶ TRAIN MODE – loading dataset …")
df = pd.read_csv(paths.TRAIN_CSV)

train_dem_df = pd.read_csv(paths.TRAIN_DEMOGRAPHICS)
df_for_groups = pd.merge(df.copy(), train_dem_df, on='subject', how='left')
print("df for group shape :", df_for_groups.shape)

le = LabelEncoder()
df['gesture_int'] = le.fit_transform(df['gesture'])
np.save(paths.OUTPUT_DIR / "gesture_classes.npy", le.classes_)
gesture_classes = le.classes_


print(" 0/8 Calculating base engineered IMU features (magnitude, angle) ...")
df['acc_mag'] = np.sqrt(df['acc_x']**2 + df['acc_y']**2 + df['acc_z']**2)
df['rot_angle'] = 2* np.arccos(df['rot_w'].clip(-1, 1))


print(" 1/8 Calculating engineered IMU derivatives (jerk, angular velocity) for original acc_mag ...")
df['acc_mag_jerk'] = df.groupby('sequence_id')['acc_mag'].diff().fillna(0)
df['rot_angle_vel'] = df.groupby('sequence_id')['rot_angle'].diff().fillna(0)


print(" 2/8 Removing gravity and calculating linear acceleration features...")

linear_accel_list = []
for _, group in df.groupby('sequence_id'):
    acc_data_group = group[['acc_x', 'acc_y', 'acc_z']]
    rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
    linear_accel_group = remove_gravity_from_acc(acc_data_group, rot_data_group)
    linear_accel_list.append(pd.DataFrame(linear_accel_group, columns=['linear_acc_x', 'linear_acc_y', 'linear_acc_z'], index=group.index))

df_linear_accel = pd.concat(linear_accel_list)
df = pd.concat([df, df_linear_accel], axis=1)

df['linear_acc_mag'] = np.sqrt(df['linear_acc_x']**2 + df['linear_acc_y']**2 + df['linear_acc_z']**2)
df['linear_acc_mag_jerk'] = df.groupby('sequence_id')['linear_acc_mag'].diff().fillna(0)


print(" 3/8 Calculating angular velocity from quaternion derivatives...")
angular_vel_list = []
for _, group in df.groupby('sequence_id'):
    rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
    angular_vel_group = calculate_angular_velocity_from_quat(rot_data_group)
    angular_vel_list.append(pd.DataFrame(angular_vel_group, columns=['angular_vel_x', 'angular_vel_y', 'angular_vel_z'], index=group.index))

df_angular_vel = pd.concat(angular_vel_list)
df = pd.concat([df, df_angular_vel], axis=1)


print(" 4/8 Calculating angular distance between successive quaternions...")
angular_distance_list = []
for _, group in df.groupby('sequence_id'):
    rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
    angular_dist_group = calculate_angular_distance(rot_data_group)
    angular_distance_list.append(pd.DataFrame(angular_dist_group, columns=['angular_distance'], index=group.index))

df_angular_distance = pd.concat(angular_distance_list)
df = pd.concat([df, df_angular_distance], axis=1)

meta_cols = { } # This was an empty dict in your provided code, keeping it as is.

print(" 5/8 Calculating imu_cols_base ...")

imu_cols_base = ['linear_acc_x', 'linear_acc_y', 'linear_acc_z']
imu_cols_base.extend([c for c in df.columns if c.startswith('rot_') and c not in ['rot_angle', 'rot_angle_vel']])

imu_engineered_features = [
    'acc_mag', 'rot_angle',
    'acc_mag_jerk', 'rot_angle_vel',
    'linear_acc_mag', 'linear_acc_mag_jerk',
    'angular_vel_x', 'angular_vel_y', 'angular_vel_z', # Existing new features
    'angular_distance' # Added new feature
]
imu_cols = imu_cols_base + imu_engineered_features
imu_cols = list(dict.fromkeys(imu_cols)) # Для удаления дубликатов

thm_cols_original = [c for c in df.columns if c.startswith('thm_')]

print(" 6/8 Calculating tof_aggregated_cols_template...")

## tof data
tof_aggregated_cols_template = []
for i in range(1, 6):
    tof_aggregated_cols_template.extend([f'tof_{i}_mean', f'tof_{i}_std', f'tof_{i}_min', f'tof_{i}_max'])

final_feature_cols = imu_cols + thm_cols_original + tof_aggregated_cols_template
imu_dim_final = len(imu_cols)
tof_thm_aggregated_dim_final = len(thm_cols_original) + len(tof_aggregated_cols_template)

print(f" IMU (incl. engineered & derivatives) {imu_dim_final} | THM ({len(thm_cols_original)}) + Aggregated TOF {tof_thm_aggregated_dim_final} | total {len(final_feature_cols)} features")
np.save(paths.OUTPUT_DIR / "feature_cols.npy", np.array(final_feature_cols))

print(" 7/8 calculating tof tof_i_mean/std/min/max...")

seq_gp = df.groupby('sequence_id') 

all_steps_for_scaler_list = []
X_list_unscaled, y_list_int_for_stratify, lens = [], [], [] 

for seq_id, seq_df_orig in seq_gp:
    seq_df = seq_df_orig.copy()

    for i in range(1, 6):
        pixel_cols_tof = [f"tof_{i}_v{p}" for p in range(64)]
        tof_sensor_data = seq_df[pixel_cols_tof].replace(-1, np.nan)
        seq_df[f'tof_{i}_mean'] = tof_sensor_data.mean(axis=1)
        seq_df[f'tof_{i}_std']  = tof_sensor_data.std(axis=1)
        seq_df[f'tof_{i}_min']  = tof_sensor_data.min(axis=1)
        seq_df[f'tof_{i}_max']  = tof_sensor_data.max(axis=1)
    
    mat_unscaled = seq_df[final_feature_cols].ffill().bfill().fillna(0).values.astype('float32')
    
    all_steps_for_scaler_list.append(mat_unscaled)
    X_list_unscaled.append(mat_unscaled)
    y_list_int_for_stratify.append(seq_df['gesture_int'].iloc[0])
    lens.append(len(mat_unscaled))


# fit scaler
print(" 8/8 ToF completed and Fitting StandardScaler...")
all_steps_concatenated = np.concatenate(all_steps_for_scaler_list, axis=0)
scaler = StandardScaler().fit(all_steps_concatenated)
joblib.dump(scaler, paths.OUTPUT_DIR / "scaler.pkl")
del all_steps_for_scaler_list, all_steps_concatenated

# scale individual sequences
print(" Last/8 Scaling and padding sequences...")
X_scaled_list = [scaler.transform(x_seq) for x_seq in X_list_unscaled]
del X_list_unscaled

# calculate pad length
pad_len = int(np.percentile(lens, config.PAD_PERCENTILE))
np.save(paths.OUTPUT_DIR / "sequence_maxlen.npy", pad_len)

# padding section
X_padded_np = np.stack([pad_or_truncate(seq, pad_len) for seq in X_scaled_list])
X = torch.tensor(X_padded_np, dtype=torch.float32)  # shape: (N, T, D)
del X_scaled_list

y_int_for_stratify = np.array(y_list_int_for_stratify)
y = F.one_hot(torch.tensor(y_int_for_stratify), num_classes=len(le.classes_)).float()

# Suppose df_seq has exactly one row per sequence (same order as X)
df_seq = df_for_groups.drop_duplicates('sequence_id').reset_index(drop=True)




▶ TRAIN MODE – loading dataset …
df for group shape : (574945, 348)
 0/8 Calculating base engineered IMU features (magnitude, angle) ...
 1/8 Calculating engineered IMU derivatives (jerk, angular velocity) for original acc_mag ...
 2/8 Removing gravity and calculating linear acceleration features...
 3/8 Calculating angular velocity from quaternion derivatives...
 4/8 Calculating angular distance between successive quaternions...
 5/8 Calculating imu_cols_base ...
 6/8 Calculating tof_aggregated_cols_template...
 IMU (incl. engineered & derivatives) 17 | THM (5) + Aggregated TOF 25 | total 42 features
 7/8 calculating tof tof_i_mean/std/min/max...
 8/8 ToF completed and Fitting StandardScaler...
 Last/8 Scaling and padding sequences...


In [47]:
# Check the shape 
print("X shape:", X.shape)              # (N_sequences, T, D)
print("y shape:", y.shape)              # (N_sequences, num_classes)
print("df_seq shape:", df_seq.shape)    # (N_unique_sequences, n_meta_columns)

print("  Splitting data and preparing for training...")
# X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=82, stratify=y_int_for_stratify)
X_tr, X_val, y_tr, y_val, df_train, df_val = train_test_split(
    X, y, df_seq, test_size=0.2, random_state=82, stratify=y_int_for_stratify
)

y_train_int = y_tr.argmax(dim=1).cpu().numpy()  # Convert one-hot back to class indices
cw_vals = compute_class_weight('balanced', classes=np.arange(len(le.classes_)), y=y_train_int)  #y=y_int_for_stratify

# Data Loader
print("  ...Data Loader")
train_dataset = MixupDataset(config, df_train, X_tr, y_tr)
train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE_TRAIN, shuffle=True, drop_last=True)
val_dataset = CustomDataset(config, df_val, X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE_VALID, shuffle=True, drop_last=True)
print("  ...Data Loader completed")

X shape: torch.Size([8151, 127, 42])
y shape: torch.Size([8151, 18])
df_seq shape: (8151, 348)
  Splitting data and preparing for training...
  ...Data Loader
  ...Data Loader completed


In [48]:
model = CustomModel(num_classes=len(le.classes_))
model.to(device)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")
print(f"Total number of classes: {len(le.classes_)}")

Total number of parameters: 12056594
Total number of classes: 18


In [49]:

optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE)
weights_tensor = torch.tensor(cw_vals, dtype=torch.float32).to(device)
loss_fn = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=0.1)

steps = []
lrs = []
best_val_acc = 0
patience, patience_counter = 10, 0
EPOCHS = config.EPOCHS
scheduler = OneCycleLR(
    optimizer,
    max_lr=1e-3,
    epochs=config.EPOCHS,
    steps_per_epoch=len(train_loader),
    pct_start=0.0,
    anneal_strategy="cos",
    final_div_factor=100,
)

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch_idx, batch in tqdm.tqdm(enumerate(train_loader)):
        xb, yb = batch["X"].to(device), batch["y"].to(device)
        if batch_idx == 0:
            print("shape xb", xb.shape)
        optimizer.zero_grad()        
        preds_cls = model(xb)
        yb_indices = yb.argmax(dim=1)
        loss = loss_fn(preds_cls, yb_indices)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # optional
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        lrs.append(scheduler.get_last_lr()[0])
        steps.append(epoch * config.BATCH_SIZE_TRAIN + batch_idx)

    print(f"Epoch {epoch} | Train Loss: {total_loss / len(train_loader):.4f}")
    
    # Validation
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            xb, yb = batch["X"].to(device), batch["y"].to(device)
            preds_cls = model(xb)
            pred_labels = preds_cls.argmax(1)
            true_labels = yb.argmax(1) if yb.ndim > 1 else yb  #.argmax(1)  val_loader comes from a standard dataset with "y" as class index (long), you don’t need argmax.
            correct += (pred_labels == true_labels).sum().item()
            total += yb.size(0)
    val_acc = correct / total
    print(f"Epoch {epoch} | Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        torch.save(model.state_dict(), paths.OUTPUT_DIR / "best_model.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break


0it [00:00, ?it/s]

shape xb torch.Size([8, 127, 42])


815it [00:39, 20.87it/s]


Epoch 0 | Train Loss: 2.5883
Epoch 0 | Val Acc: 0.0788


3it [00:00, 20.63it/s]

shape xb torch.Size([8, 127, 42])


815it [00:38, 21.11it/s]


Epoch 1 | Train Loss: 2.2553
Epoch 1 | Val Acc: 0.1312


0it [00:00, ?it/s]
shape xb torch.Size([8, 127, 42])
815it [00:33, 24.28it/s]
Epoch 0 | Train Loss: 2.7902
Epoch 0 | Val Acc: 0.1749
3it [00:00, 21.27it/s]
shape xb torch.Size([8, 127, 42])
815it [00:31, 25.57it/s]
Epoch 1 | Train Loss: 2.2463
Epoch 1 | Val Acc: 0.2506

In [50]:
# model.load_state_dict(torch.load(paths.OUTPUT_DIR / "best_model.pt"))
# model.eval()
# preds_val = []

# with torch.no_grad():
#     for xb, _ in val_loader:
#         xb = xb.to(device)
#         logits = model(xb)
#         preds_val.append(logits.argmax(1).cpu())

# preds_val = torch.cat(preds_val).numpy()
# true_val_int = y_val.argmax(1).numpy()

# # Evaluation
# from cmi_2025_metric_copy_for_import import CompetitionMetric

# h_f1 = CompetitionMetric().calculate_hierarchical_f1(
#     pd.DataFrame({'gesture': le.classes_[true_val_int]}),
#     pd.DataFrame({'gesture': le.classes_[preds_val]})
# )
# print("Hold-out H-F1 =", round(h_f1, 4))

 |- Model Type	|Strength|	Easy to Try?|
 |------|-------------|---------|
 |- TCN	Fast,| interpretable	|✅✅✅
 |- Transformer |Encoder	Global temporal modeling	|✅✅
 |- CNN + Transformer Hybrid	|Local + global	|✅✅
 |- ResNet1D / InceptionTime	|Robust 1D feature extraction	|✅✅✅
 |- BiLSTM + Attention	|Sequence modeling (non-parallel)	|✅✅
 |- ST-GCN	|Spatial-Temporal & structured	|❌ (if no graph)