In [1]:
import os
import random
import numpy as np
import pandas as pd
import os, json, joblib
import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR

import polars as pl
from pathlib import Path
import warnings 
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

import gc  # garbage collection
import psutil
from scipy.spatial.transform import Rotation as R

In [2]:
# (Competition metric will only be imported when TRAINing)
TRAIN = True                     # ← set to True when you want to train

class config:
    AMP = False
    BATCH_SIZE_TRAIN = 8 #32
    BATCH_SIZE_VALID = 8 #32
    DEBUG = False
    EPOCHS = 2  #30
    FOLDS = 5
    GRADIENT_ACCUMULATION_STEPS = 1
    LEARNING_RATE = 1e-3
    MAX_GRAD_NORM = 1e7
    NUM_WORKERS = 0 # multiprocessing.cpu_count()
    PRINT_FREQ = 20
    SEED = 20
    WEIGHT_DECAY = 0.01
    PAD_PERCENTILE = 95
    SEQUENCE_LENGTH = 150

class paths:
    BASE_DIR = Path("C:/Users/konno/SynologyDrive/datasciense/projects_foler/1_kaggle/CMI/cmi-detect-behavior-with-sensor-data")
    
    OUTPUT_DIR = BASE_DIR / "output-02-wavenet"
    TEST_CSV = BASE_DIR / "test.csv"
    TEST_DEMOGRAPHICS = BASE_DIR / "test_demographics.csv"
    TRAIN_CSV = BASE_DIR / "train.csv"
    TRAIN_DEMOGRAPHICS = BASE_DIR / "train_demographics.csv"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("▶ imports ready · torch", torch.__version__, "device :", device)

▶ imports ready · torch 2.7.1+cpu device : cpu


In [3]:
def seed_everything(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
# MixUp the data argumentation in order to regularize the neural network. 

class MotionDataset(torch.utils.data.Dataset):
    def __init__(self, df, imu_cols, tof_columns, sequence_ids, max_len=103, mode=TRAIN):
        self.df = df
        self.imu_cols = imu_cols
        self.tof_columns = tof_columns
        self.sequence_ids = sequence_ids
        self.max_len = max_len
        self.mode = mode
        self.grouped = df.groupby('sequence_id')
        self.label_map = {s: i for i, s in enumerate(sorted(df['gesture'].unique()))}

    def __len__(self):
        return len(self.sequence_ids)

    def __getitem__(self, idx):
        seq_id = self.sequence_ids[idx]
        group = self.grouped.get_group(seq_id)

        tof_values = group[self.tof_columns].values.astype(np.float32)
        imu_values = group[self.imu_cols].values.astype(np.float32)

        # Pad or truncate
        tof_padded = pad_or_truncate(tof_values, self.max_len, self.mode)
        imu_padded = pad_or_truncate(imu_values, self.max_len, self.mode)

        imu_expanded = imu_padded[:, np.newaxis, :]
        fused = np.concatenate([
            tof_padded[:, :, np.newaxis],
            np.broadcast_to(imu_expanded, (self.max_len, tof_padded.shape[1], imu_expanded.shape[2]))
        ], axis=2)
        fused = fused.transpose(1, 2, 0)  # (320, 26, time)

        label = self.label_map[group['gesture'].iloc[0]]

        return {
            "X": torch.tensor(fused, dtype=torch.float32),
            "y": torch.tensor(label, dtype=torch.long),
            "sequence_id": seq_id
        }
    
# train_dataset = MixupDataset(config, df_train, X_tr, y_tr, y_soft_tr)
# train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE_TRAIN, shuffle=True)
# val_dataset = CustomDataset(config, df_train, X_val, y_val, y_soft_val)
# val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE_VALID, shuffle=True)

def pad_or_truncate(seq, max_len, mode=TRAIN, pad_value=0.0, dtype=np.float32) -> np.ndarray:
    """
    Pads or truncates a sequence to a fixed length.

    Parameters:
    - seq: np.ndarray of shape (L, D)
    - max_len: int, desired sequence length
    - mode: bool, True = random pad, False = regular pad
    - pad_value: float or int, value to use for padding
    - dtype: np.dtype, dtype for the output array

    Returns:
    - np.ndarray of shape (max_len, D)
    """
    # print("sequence shape", seq.shape)
    L, D = seq.shape
    # print("mode = ", mode)

    if L > max_len:
        return seq[:max_len] # truncate if too long

    elif L < max_len:
        total_padding = max_len - L
        
        if mode:
            pad_start = np.random.randint(0, total_padding + 1)
            pad_end = total_padding - pad_start
            
        else:
            pad_start = 0
            pad_end = total_padding

        start_padding = np.full((pad_start, D), pad_value, dtype=dtype)
        end_padding = np.full((pad_end, D), pad_value, dtype=dtype)
        padded = np.vstack((start_padding, seq, end_padding))
        # print("padded shape", padded.shape)
        return padded

    else:
        return seq.astype(dtype)

In [5]:
def remove_gravity_from_acc(acc_data, rot_data):

    if isinstance(acc_data, pd.DataFrame):
        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values
    else:
        acc_values = acc_data

    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = acc_values.shape[0]
    linear_accel = np.zeros_like(acc_values)
    
    gravity_world = np.array([0, 0, 9.81])

    for i in range(num_samples):
        if np.all(np.isnan(quat_values[i])) or np.all(np.isclose(quat_values[i], 0)):
            linear_accel[i, :] = acc_values[i, :] 
            continue

        try:
            rotation = R.from_quat(quat_values[i])
            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)
            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame
        except ValueError:
             linear_accel[i, :] = acc_values[i, :]
             
    return linear_accel

def calculate_angular_velocity_from_quat(rot_data, time_delta=1/200): # Assuming 200Hz sampling rate
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = quat_values.shape[0]
    angular_vel = np.zeros((num_samples, 3))

    for i in range(num_samples - 1):
        q_t = quat_values[i]
        q_t_plus_dt = quat_values[i+1]

        if np.all(np.isnan(q_t)) or np.all(np.isclose(q_t, 0)) or \
           np.all(np.isnan(q_t_plus_dt)) or np.all(np.isclose(q_t_plus_dt, 0)):
            continue

        try:
            rot_t = R.from_quat(q_t)
            rot_t_plus_dt = R.from_quat(q_t_plus_dt)

            # Calculate the relative rotation
            delta_rot = rot_t.inv() * rot_t_plus_dt
            
            # Convert delta rotation to angular velocity vector
            # The rotation vector (Euler axis * angle) scaled by 1/dt
            # is a good approximation for small delta_rot
            angular_vel[i, :] = delta_rot.as_rotvec() / time_delta
        except ValueError:
            # If quaternion is invalid, angular velocity remains zero
            pass
            
    return angular_vel

def calculate_angular_distance(rot_data):
    if isinstance(rot_data, pd.DataFrame):
        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values
    else:
        quat_values = rot_data

    num_samples = quat_values.shape[0]
    angular_dist = np.zeros(num_samples)

    for i in range(num_samples - 1):
        q1 = quat_values[i]
        q2 = quat_values[i+1]

        if np.all(np.isnan(q1)) or np.all(np.isclose(q1, 0)) or \
           np.all(np.isnan(q2)) or np.all(np.isclose(q2, 0)):
            angular_dist[i] = 0 # Или np.nan, в зависимости от желаемого поведения
            continue
        try:
            # Преобразование кватернионов в объекты Rotation
            r1 = R.from_quat(q1)
            r2 = R.from_quat(q2)

            # Вычисление углового расстояния: 2 * arccos(|real(p * q*)|)
            # где p* - сопряженный кватернион q
            # В scipy.spatial.transform.Rotation, r1.inv() * r2 дает относительное вращение.
            # Угол этого относительного вращения - это и есть угловое расстояние.
            relative_rotation = r1.inv() * r2
            
            # Угол rotation vector соответствует угловому расстоянию
            # Норма rotation vector - это угол в радианах
            angle = np.linalg.norm(relative_rotation.as_rotvec())
            angular_dist[i] = angle
        except ValueError:
            angular_dist[i] = 0 # В случае недействительных кватернионов
            pass
            
    return angular_dist

# Create Spatial Adjacency Matrix
def create_8x8_grid_adjacency():
    adj = np.zeros((64, 64), dtype=int)
    for r in range(8):
        for c in range(8):
            idx = r * 8 + c
            if r > 0: adj[idx][(r - 1) * 8 + c] = 1
            if r < 7: adj[idx][(r + 1) * 8 + c] = 1
            if c > 0: adj[idx][r * 8 + (c - 1)] = 1
            if c < 7: adj[idx][r * 8 + (c + 1)] = 1
    return adj

def print_memory():
    process = psutil.Process()
    print(f"Memory Usage: {process.memory_info().rss / 1024**2:.2f} MB")

In [31]:

class STGCNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, A, stride=1, residual=True):
        super().__init__()
        # Temporal convolution: shape = (kernel_size, 1)
        self.A = A  # Adjacency matrix: (V, V)
        self.num_nodes = A.shape[0]
        # self.gcn = GraphConv(in_channels, out_channels)  # Spatial convolution (GCN)
        self.gcn = nn.Conv2d(in_channels, out_channels, kernel_size=(1, 1))
        self.tcn = nn.Sequential(
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=(9,1),
                      padding=(4,0), stride=(stride,1)),
            nn.BatchNorm2d(out_channels),
            nn.Dropout(0.3),
        )
        if not residual:
            self.residual = lambda x: 0
        elif (in_channels == out_channels) and (stride == 1):
            self.residual = lambda x: x
        else:
            self.residual = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=(stride,1)),
                nn.BatchNorm2d(out_channels),
            )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):  # x: [B, C, T, V], A: adjacency matrix
        # x shape: (N, C, T, V)
        N, C, T, V = x.size()
        A = self.A.to(x.device)  # ensure A is on the same device

        # Graph convolution: multiply input by adjacency matrix
        x = torch.einsum('nctv,vw->nctw', (x, A))  # shape: (N, C, T, V)

        x = self.gcn(x)  # (N, out_channels, T, V)
        x = self.tcn(x)  # temporal conv
        x = x + self.residual(x)  # add residual
        return self.relu(x)

class STGCN(nn.Module):
    def __init__(self, in_channels, num_classes, A, num_nodes):
        super().__init__()
        self.data_bn = nn.BatchNorm1d(in_channels * num_nodes)

        self.layers = nn.ModuleList([
            STGCNBlock(in_channels, 64, A, residual=False),
            STGCNBlock(64, 64, A),
            STGCNBlock(64, 64, A),
            STGCNBlock(64, 128, A, stride=2),
            STGCNBlock(128, 128, A),
            STGCNBlock(128, 256, A, stride=2),
            STGCNBlock(256, 256, A)
        ])

        self.pool = nn.AdaptiveAvgPool2d((1, 1))  # output shape: (N, C, 1, 1)
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        # x: (N, C, T, V)
        print(f"in_channels: {self.data_bn.num_features}")
        print(f"input x shape before BN: {x.shape}")
        N, C, T, V = x.size()
        x = x.permute(0, 3, 1, 2).contiguous()  # (N, V, C, T)
        x = x.view(N, V * C, T)
        x = self.data_bn(x)
        x = x.view(N, V, C, T).permute(0, 2, 3, 1).contiguous()  # (N, C, T, V)

        for gcn in self.layers:
            x = gcn(x)

        x = self.pool(x)  # (N, C, 1, 1)
        x = x.view(N, -1)  # flatten
        return self.fc(x)  # logits: (N, num_classes)

In [7]:
### DATA CREATION and PRE PROCESSING

print("▶ TRAIN MODE – loading dataset …")


df = pd.read_csv(paths.TRAIN_CSV)
df = df.fillna(0)

print_memory()

print(" 1/6 Calculating base engineered IMU features (magnitude, angle) ...")

df['acc_mag'] = np.sqrt(df['acc_x']**2 + df['acc_y']**2 + df['acc_z']**2)
df['rot_angle'] = 2* np.arccos(df['rot_w'].clip(-1, 1))

print(" 2/6 Calculating engineered IMU derivatives (jerk, angular velocity) for original acc_mag ...")

df['acc_mag_jerk'] = df.groupby('sequence_id')['acc_mag'].diff().fillna(0)
df['rot_angle_vel'] = df.groupby('sequence_id')['rot_angle'].diff().fillna(0)

print(" 3/6 Removing gravity and calculating linear acceleration features...")

linear_accel_list = []
for _, group in df.groupby('sequence_id'):
    acc_data_group = group[['acc_x', 'acc_y', 'acc_z']]
    rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
    linear_accel_group = remove_gravity_from_acc(acc_data_group, rot_data_group)
    linear_accel_list.append(pd.DataFrame(linear_accel_group, columns=['linear_acc_x', 'linear_acc_y', 'linear_acc_z'], index=group.index))

df_linear_accel = pd.concat(linear_accel_list)
df = pd.concat([df, df_linear_accel], axis=1)
del df_linear_accel, linear_accel_list  # Memory Management
gc.collect()  # Memory Management

df['linear_acc_mag'] = np.sqrt(df['linear_acc_x']**2 + df['linear_acc_y']**2 + df['linear_acc_z']**2)
df['linear_acc_mag_jerk'] = df.groupby('sequence_id')['linear_acc_mag'].diff().fillna(0)

print(" 4/6 Calculating angular velocity from quaternion derivatives...")
angular_vel_list = []
for _, group in df.groupby('sequence_id'):
    rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
    angular_vel_group = calculate_angular_velocity_from_quat(rot_data_group)
    angular_vel_list.append(pd.DataFrame(angular_vel_group, columns=['angular_vel_x', 'angular_vel_y', 'angular_vel_z'], index=group.index))

df_angular_vel = pd.concat(angular_vel_list)
df = pd.concat([df, df_angular_vel], axis=1)
del angular_vel_list, df_angular_vel # Memory Management
gc.collect() # Memory Management


print(" 5/6 Calculating angular distance between successive quaternions...")
angular_distance_list = []
for _, group in df.groupby('sequence_id'):
    rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]
    angular_dist_group = calculate_angular_distance(rot_data_group)
    angular_distance_list.append(pd.DataFrame(angular_dist_group, columns=['angular_distance'], index=group.index))

df_angular_distance = pd.concat(angular_distance_list)
df = pd.concat([df, df_angular_distance], axis=1)
del angular_distance_list, df_angular_distance # Memory Management
gc.collect() # Memory Management

print_memory()

meta_cols = { } # This was an empty dict in your provided code, keeping it as is.

print(" 6/6 Calculating imu_cols_base ...")
imu_cols_orig = ['acc_x', 'acc_y', 'acc_z',
            'rot_w', 'rot_x', 'rot_y', 'rot_z',
            'thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5']

imu_cols_base = ['linear_acc_x', 'linear_acc_y', 'linear_acc_z']
imu_cols_base.extend([c for c in df.columns if c.startswith('rot_') and c not in ['rot_angle', 'rot_angle_vel']])

imu_engineered_features = [
    'acc_mag', 'rot_angle',
    'acc_mag_jerk', 'rot_angle_vel',
    'linear_acc_mag', 'linear_acc_mag_jerk',
    'angular_vel_x', 'angular_vel_y', 'angular_vel_z', # Existing new features
    'angular_distance' # Added new feature
]
imu_cols = list(dict.fromkeys(imu_cols_orig + imu_cols_base + imu_engineered_features))  # Remove dups

print("length of imu_cols :", len(imu_cols))

print("✅ Preprocessing done.")
print_memory()

# thm_cols_original = [c for c in df.columns if c.startswith('thm_')

▶ TRAIN MODE – loading dataset …
Memory Usage: 1886.02 MB
 1/6 Calculating base engineered IMU features (magnitude, angle) ...
 2/6 Calculating engineered IMU derivatives (jerk, angular velocity) for original acc_mag ...
 3/6 Removing gravity and calculating linear acceleration features...
 4/6 Calculating angular velocity from quaternion derivatives...
 5/6 Calculating angular distance between successive quaternions...
Memory Usage: 3516.06 MB
 6/6 Calculating imu_cols_base ...
length of imu_cols : 25
✅ Preprocessing done.
Memory Usage: 3516.06 MB


In [8]:
# ### TOF not used
######################

# print(" 6/8 Calculating tof_aggregated_cols_template...")

# ## tof data
# tof_aggregated_cols_template = []
# for i in range(1, 6):
#     tof_aggregated_cols_template.extend([f'tof_{i}_mean', f'tof_{i}_std', f'tof_{i}_min', f'tof_{i}_max'])

# final_feature_cols = imu_cols + thm_cols_original + tof_aggregated_cols_template
# imu_dim_final = len(imu_cols)
# tof_thm_aggregated_dim_final = len(thm_cols_original) + len(tof_aggregated_cols_template)

# print(f" IMU (incl. engineered & derivatives) {imu_dim_final} | THM ({len(thm_cols_original)}) + Aggregated TOF {tof_thm_aggregated_dim_final} | total {len(final_feature_cols)} features")
# np.save(paths.OUTPUT_DIR / "feature_cols.npy", np.array(final_feature_cols))

# print(" 7/8 calculating tof tof_i_mean/std/min/max...")

# seq_gp = df.groupby('sequence_id') 

# all_steps_for_scaler_list = []
# X_list_unscaled, y_list_int_for_stratify, lens = [], [], [] 

# for seq_id, seq_df_orig in seq_gp:
#     seq_df = seq_df_orig.copy()

#     for i in range(1, 6):
#         pixel_cols_tof = [f"tof_{i}_v{p}" for p in range(64)]
#         tof_sensor_data = seq_df[pixel_cols_tof].replace(-1, np.nan)
#         seq_df[f'tof_{i}_mean'] = tof_sensor_data.mean(axis=1)
#         seq_df[f'tof_{i}_std']  = tof_sensor_data.std(axis=1)
#         seq_df[f'tof_{i}_min']  = tof_sensor_data.min(axis=1)
#         seq_df[f'tof_{i}_max']  = tof_sensor_data.max(axis=1)
    
#     mat_unscaled = seq_df[final_feature_cols].ffill().bfill().fillna(0).values.astype('float32')
    
#     all_steps_for_scaler_list.append(mat_unscaled)
#     X_list_unscaled.append(mat_unscaled)
#     y_list_int_for_stratify.append(seq_df['gesture_int'].iloc[0])
#     lens.append(len(mat_unscaled))


In [9]:
### DATA CONFIGURATION
print("▶ TRAIN MODE – configuring dataset …")

train_dem_df = pd.read_csv(paths.TRAIN_DEMOGRAPHICS)
df_for_groups = pd.merge(df.copy(), train_dem_df, on='subject', how='left')
print("df for group shape :", df_for_groups.shape)


# Extract and Sort TOF Columns
tof_columns = [col for col in df.columns if col.startswith("tof_")]
tof_columns = sorted(tof_columns, key=lambda x: (
    int(x.split('_')[1]), int(x.split('_v')[-1])  # sort by sensor number, then pixel index
))
sequence_ids = df["sequence_id"].unique()

# Group by Sequence and Reshape
grouped = df.groupby('sequence_id')

# Estimate the max length
sequence_lengths = grouped.size().values  # length of each sequence
SEQUENCE_LENGTH = int(np.percentile(sequence_lengths, 90))


train_ids, val_ids = train_test_split(
    sequence_ids,
    test_size=0.2,  # 20% validation
    random_state=42,
    stratify=df.groupby("sequence_id")["gesture"].first()  # keeps gesture label distribution balanced
)


train_dataset = MotionDataset(df, imu_cols, tof_columns, train_ids, max_len=SEQUENCE_LENGTH, mode=TRAIN)
val_dataset   = MotionDataset(df, imu_cols, tof_columns, val_ids, max_len=SEQUENCE_LENGTH, mode=TRAIN)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0, pin_memory=True)
val_loader   = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0, pin_memory=True)


# Combine adjacency matrices for all 5 sensors (block-diagonal)
from scipy.linalg import block_diag
sensor_adj = create_8x8_grid_adjacency()
full_adj = block_diag(*[sensor_adj] * 5)  # shape: (320, 320)
print("full adjusted shape :", full_adj.shape)

labels = df["gesture"].unique()
print("number of classes :", len(labels))

print_memory()

▶ TRAIN MODE – configuring dataset …
df for group shape : (574945, 361)
full adjusted shape : (320, 320)
number of classes : 18
Memory Usage: 5105.27 MB


In [32]:

# Prepare the adjacency Matrix
A = torch.tensor(full_adj, dtype=torch.float32, device=device)  # (320, 320)

# Model Instantiation  [B=32, T=320, V=26, C=103]

model = STGCN(
    in_channels=320,         # channels per node (ToF + IMU)
    num_classes=len(df["gesture"].unique()),  # e.g., 20
    A=A,
    num_nodes=103
).to(device)

In [27]:
print("Checking train_loader batches...")
# print(next(iter(train_dataset)))

print(f"Dataset length: {len(train_loader.dataset)}")

for i, batch in enumerate(train_loader):
    print(f"Batch {i} keys: {batch.keys()}")
    break

Checking train_loader batches...
Dataset length: 6520
Batch 0 keys: dict_keys(['X', 'y', 'sequence_id'])


In [33]:
print("⏩ training started .....")

cw_vals = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE)
weights_tensor = torch.tensor(cw_vals, dtype=torch.float32).to(device)
loss_fn = nn.CrossEntropyLoss(weight=weights_tensor, label_smoothing=0.1)

print("▶️ Setting scheduler  .....")
steps = []
lrs = []
best_val_acc = 0
patience, patience_counter = 10, 0
EPOCHS = config.EPOCHS
scheduler = OneCycleLR(
    optimizer,
    max_lr=1e-3,
    epochs=config.EPOCHS,
    steps_per_epoch=len(train_loader),
    pct_start=0.0,
    anneal_strategy="cos",
    final_div_factor=100,
)

print("✅ Epoch starts .....")

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    correct = 0         # <-- reset here
    total = 0           # <-- reset here
    for batch_idx, batch in tqdm.tqdm(enumerate(train_loader)):
        xb, yb = batch["X"].to(device), batch["y"].to(device)
        # if batch_idx == 0:
        print("shape xb ->: ", xb.shape, "batch_idx ->: ", batch_idx)
        optimizer.zero_grad()        
        preds_cls = model(xb)
        # yb_indices = yb.argmax(dim=1)
        loss = loss_fn(preds_cls, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # optional
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        lrs.append(scheduler.get_last_lr()[0])
        steps.append(epoch * config.BATCH_SIZE_TRAIN + batch_idx)
        correct += (pred_labels == yb).sum().item()
        total += yb.size(0)

    train_acc = correct / total
    print(f"Epoch {epoch} | Train Loss: {total_loss / len(train_loader):.4f} | Train Acc: {train_acc:.4f}")
    
    # Validation
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            xb, yb = batch["X"].to(device), batch["y"].to(device)
            xb = xb.permute(0, 3, 2, 1)  # from (N, 320, 26, 103) to (N, 103, 26, 320)
            preds_cls = model(xb)
            pred_labels = preds_cls.argmax(1)
            true_labels = yb.argmax(1) if yb.ndim > 1 else yb  #.argmax(1)  val_loader comes from a standard dataset with "y" as class index (long), you don’t need argmax.
            correct += (pred_labels == true_labels).sum().item()
            total += yb.size(0)
    val_acc = correct / total
    print(f"Epoch {epoch} | Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        torch.save(model.state_dict(), paths.OUTPUT_DIR / "best_model.pt")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break


⏩ training started .....
▶️ Setting scheduler  .....
✅ Epoch starts .....


0it [00:00, ?it/s]

shape xb ->:  torch.Size([32, 320, 26, 103]) batch_idx ->:  0
in_channels: 32960
input x shape before BN: torch.Size([32, 320, 26, 103])





RuntimeError: einsum(): subscript v has size 320 for operand 1 which does not broadcast with previously seen size 103

0it [00:00, ?it/s]
shape xb torch.Size([8, 127, 42])
815it [00:33, 24.28it/s]
Epoch 0 | Train Loss: 2.7902
Epoch 0 | Val Acc: 0.1749
3it [00:00, 21.27it/s]
shape xb torch.Size([8, 127, 42])
815it [00:31, 25.57it/s]
Epoch 1 | Train Loss: 2.2463
Epoch 1 | Val Acc: 0.2506

In [None]:
# model.load_state_dict(torch.load(paths.OUTPUT_DIR / "best_model.pt"))
# model.eval()
# preds_val = []

# with torch.no_grad():
#     for xb, _ in val_loader:
#         xb = xb.to(device)
#         logits = model(xb)
#         preds_val.append(logits.argmax(1).cpu())

# preds_val = torch.cat(preds_val).numpy()
# true_val_int = y_val.argmax(1).numpy()

# # Evaluation
# from cmi_2025_metric_copy_for_import import CompetitionMetric

# h_f1 = CompetitionMetric().calculate_hierarchical_f1(
#     pd.DataFrame({'gesture': le.classes_[true_val_int]}),
#     pd.DataFrame({'gesture': le.classes_[preds_val]})
# )
# print("Hold-out H-F1 =", round(h_f1, 4))

 |- Model Type	|Strength|	Easy to Try?|
 |------|-------------|---------|
 |- TCN	Fast,| interpretable	|✅✅✅
 |- Transformer |Encoder	Global temporal modeling	|✅✅
 |- CNN + Transformer Hybrid	|Local + global	|✅✅
 |- ResNet1D / InceptionTime	|Robust 1D feature extraction	|✅✅✅
 |- BiLSTM + Attention	|Sequence modeling (non-parallel)	|✅✅
 |- ST-GCN	|Spatial-Temporal & structured	|❌ (if no graph)