In [None]:
import os
import pickle

# Root directories scanned by load_all_pkl_files. Each root is walked as
# <root>/<user_id>/<gesture_id>/*.pkl.
base_directories = [
    # Add the Pantomime dataset roots here, e.g. the "office" and "open"
    # environments: pantomime/primary_exp/<office|open>/1/0/normal
]

def load_all_pkl_files(base_dirs):
    """Load every ``.pkl`` file found under ``<base_dir>/<user_id>/<gesture_id>/``.

    Directory entries are visited in sorted order so that the resulting dict
    (and therefore any label derived from its iteration order) is identical
    across machines and runs; ``os.listdir`` alone returns entries in an
    arbitrary, filesystem-dependent order.

    Args:
        base_dirs: iterable of root directory paths. Roots that do not exist
            are reported with a warning and skipped.

    Returns:
        dict mapping ``user_id -> gesture_id -> list`` of unpickled objects.
        Samples of the same user/gesture found under several roots are merged
        into the same list.

    Notes:
        Files that cannot be unpickled (corrupt, truncated or empty) are
        reported and skipped instead of aborting the whole scan.
        SECURITY: ``pickle.load`` executes arbitrary code embedded in the
        file; only point this function at trusted dataset files.
    """
    pkl_files = {}
    for base_dir in base_dirs:
        if not os.path.exists(base_dir):
            print(f"Warning: {base_dir} does not exist.")
            continue

        for user_id in sorted(os.listdir(base_dir)):
            user_path = os.path.join(base_dir, user_id)
            if not os.path.isdir(user_path):
                continue
            user_gestures = pkl_files.setdefault(user_id, {})

            for gesture_id in sorted(os.listdir(user_path)):
                gesture_path = os.path.join(user_path, gesture_id)
                if not os.path.isdir(gesture_path):
                    continue
                samples = user_gestures.setdefault(gesture_id, [])

                for filename in sorted(os.listdir(gesture_path)):
                    if not filename.endswith(".pkl"):
                        continue
                    file_path = os.path.join(gesture_path, filename)
                    try:
                        with open(file_path, "rb") as f:
                            # latin1 lets Python 3 read pickles written by Python 2.
                            samples.append(pickle.load(f, encoding="latin1"))
                    except (pickle.UnpicklingError, UnicodeDecodeError, EOFError) as e:
                        # EOFError: empty or truncated file.
                        print(f"Error loading {file_path}: {e}")
    return pkl_files

# Load every sample; loaded_data is {user_id: {gesture_id: [unpickled sample, ...]}}.
loaded_data = load_all_pkl_files(base_directories)

# Report how many files were loaded per user and gesture.
for user, gestures in loaded_data.items():
    print(f"User {user}:")
    for gesture, files in gestures.items():
        print(f"  Gesture {gesture}: {len(files)} pkl files loaded")

# labeling
def label_data(data):
    """Flatten the nested ``{user_id: {gesture_id: [samples]}}`` dict into labeled records.

    Args:
        data: mapping ``user_id -> gesture_id -> list of samples``.

    Returns:
        ``(labeled_data, user_label_map)`` where ``labeled_data`` is a list of
        dicts with keys ``"user_label"`` (position of the user in ``data``),
        ``"gesture_label"`` (``int(gesture_id) - 1``) and ``"data"``, and
        ``user_label_map`` maps each integer user label back to its user_id.
    """
    user_label_map = {}  # user_label -> user_id
    labeled_data = []
    for user_idx, (user_id, gestures) in enumerate(data.items()):
        user_label_map[user_idx] = user_id
        for gesture_id, files in gestures.items():
            # int() is evaluated per sample, so a gesture without samples is
            # never parsed.
            labeled_data.extend(
                {
                    "user_label": user_idx,
                    "gesture_label": int(gesture_id) - 1,
                    "data": file_data,
                }
                for file_data in files
            )
    return labeled_data, user_label_map

# Flatten the loaded samples into labeled records; later cells read both globals.
labeled_data, user_label_map = label_data(loaded_data)

print(f"Total labeled samples: {len(labeled_data)}")
# Count distinct user labels that actually have at least one sample.
num_unique_users = len(set(sample['user_label'] for sample in labeled_data))
print(f"num_unique_useres: {num_unique_users}") 



In [None]:
from collections import Counter, defaultdict

# Fail fast when the loading/labeling cell has not been run in this kernel.
assert "labeled_data" in globals(), "labeled_dataÍ∞Ä ÏóÜÏäµÎãàÎã§. Î®ºÏ†Ä pantomime Î°úÎìú/label_data ÏÖÄÏùÑ Ïã§ÌñâÌïòÏÑ∏Ïöî."
assert isinstance(labeled_data, list) and len(labeled_data) > 0, "labeled_dataÍ∞Ä ÎπÑÏñ¥ÏûàÏäµÎãàÎã§."

# Number of samples per gesture label.
gesture_counts = Counter(int(s["gesture_label"]) for s in labeled_data)

print("=== Gesture (action) counts ===")
for g in sorted(gesture_counts):
    print(f"gesture {g}: {gesture_counts[g]}")
print("TOTAL:", sum(gesture_counts.values()), "\n")

# Number of samples per user label.
user_counts = Counter(int(s["user_label"]) for s in labeled_data)

print("=== User counts ===")
# Show the original user id next to the integer label when the map is available.
has_map = ("user_label_map" in globals()) and isinstance(user_label_map, dict)
for u in sorted(user_counts):
    if has_map and u in user_label_map:
        print(f"user {u} ({user_label_map[u]}): {user_counts[u]}")
    else:
        print(f"user {u}: {user_counts[u]}")
# Author's note: 41 users 210~420, 21 gestures 410~440 samples
# (presumably the observed per-class sample counts -- confirm against the output).
print("TOTAL:", sum(user_counts.values()), "\n")

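Baseline preprocessing: resample every sample to a fixed number of frames and points per frame (KMeans downsampling / AHC upsampling), then save the result to HDF5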

In [None]:
import numpy as np
import torch
import time
from sklearn.cluster import KMeans, AgglomerativeClustering

def ahc_upsample(data, target_size, noise_std=1e-6, seed=42):
    """Grow a point set to ``target_size`` rows by appending cluster centroids.

    The current pool of points is repeatedly clustered with agglomerative
    hierarchical clustering (AHC) and the mean of every cluster is appended
    to the pool until it holds at least ``target_size`` rows.

    Args:
        data: array-like of shape (N, D).
        target_size: desired number of rows.
        noise_std: standard deviation of the Gaussian jitter used to create a
            second point when fewer than two points are available.
        seed: seed of the private ``RandomState``.

    Returns:
        float32 array. When ``N >= target_size`` the input is returned
        unchanged (converted to float32, not truncated); otherwise an array
        of exactly ``target_size`` rows whose first rows are the input points.
    """
    rng = np.random.RandomState(seed)
    points = np.asarray(data, dtype=np.float32)
    count, width = points.shape

    if count >= target_size:
        return points

    def _jitter(shape):
        # Small Gaussian perturbation so duplicated points are not identical.
        return rng.normal(scale=noise_std, size=shape).astype(np.float32)

    # Clustering needs at least two points: synthesize a second one if needed.
    if count == 0:
        origin = np.zeros((1, width), dtype=np.float32)
        pool = np.vstack([origin, origin + _jitter((1, width))])
    elif count == 1:
        pool = np.vstack([points, points + _jitter((1, width))])
    else:
        pool = points.copy()

    while pool.shape[0] < target_size:
        missing = target_size - pool.shape[0]
        # Never add more rows than are still missing.
        n_clusters = min(missing, max(2, pool.shape[0] // 2))

        if pool.shape[0] < 2:
            pool = np.vstack([pool, pool + _jitter(pool.shape)])

        assignments = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(pool)

        fresh = []
        for cluster_id in range(n_clusters):
            members = pool[assignments == cluster_id]
            if len(members) > 0:
                fresh.append(members.mean(axis=0))
            else:
                # Fallback for an empty cluster: reuse a random existing point.
                fresh.append(pool[rng.randint(len(pool))])

        pool = np.vstack((pool, np.array(fresh, dtype=np.float32)))

    return pool[:target_size]

def _as_numpy(x):
    if torch.is_tensor(x):
        return x.detach().cpu().numpy()
    return np.asarray(x)

def extract_frame_list(sample_data):
    """Normalize one sample into a list of per-frame float32 point arrays.

    Accepted inputs:
      - list/tuple of frames -> one array per element (an empty sequence
        yields an empty list)
      - ndarray/tensor of shape (T, P, D) -> T frames of shape (P, D)
      - ndarray/tensor of shape (N, D) -> a single frame
      - dict -> the value under the first key present among
        "frames", "data", "points", "pc", "pos", "xyz", interpreted as above

    Returns:
        list of ndarrays, one per frame.

    Raises:
        ValueError: if the array form is neither 2-D nor 3-D.
    """
    if isinstance(sample_data, dict):
        for key in ("frames", "data", "points", "pc", "pos", "xyz"):
            if key in sample_data:
                sample_data = sample_data[key]
                break

    if isinstance(sample_data, (list, tuple)):
        if len(sample_data) == 0:
            # A sample without frames is valid: preprocess_frames_panto maps
            # "no points" to an all-zero tensor, so report "no frames" here
            # instead of failing on the 1-D empty array below.
            return []
        return [np.asarray(_as_numpy(fr), dtype=np.float32) for fr in sample_data]

    arr = np.asarray(_as_numpy(sample_data), dtype=np.float32)
    if arr.ndim == 3:  # (T,P,D)
        return [arr[t] for t in range(arr.shape[0])]
    if arr.ndim == 2:  # (N,D)
        return [arr]
    raise ValueError(f"Unsupported sample_data: type={type(sample_data)}, shape={getattr(arr,'shape',None)}")


def preprocess_frames_panto(sample_data, num_frames=32, points_per_frame=32, seed=42):
    """Resample one sample to a fixed ``(num_frames, points_per_frame, 3)`` tensor.

    All points of all frames are concatenated in frame order, reduced to
    their first three columns and cut into ``num_frames`` contiguous bins of
    near-equal size (the first ``total % num_frames`` bins get one extra
    point), so the original frame boundaries are not preserved. Each bin is
    then brought to ``points_per_frame`` points: KMeans cluster centres when
    it has too many points, ``ahc_upsample`` when it has too few.

    Args:
        sample_data: anything accepted by ``extract_frame_list``.
        num_frames: number of temporal bins in the output.
        points_per_frame: number of points per output bin.
        seed: seed for the empty-bin fallback, KMeans and ``ahc_upsample``.

    Returns:
        float32 tensor of shape ``(num_frames, points_per_frame, 3)``; all
        zeros when the sample contains no points.

    Raises:
        ValueError: if the points have fewer than 3 columns.
    """
    rng = np.random.RandomState(seed)

    frames = extract_frame_list(sample_data)
    if len(frames) > 0:
        cloud = np.vstack(frames)
    else:
        cloud = np.zeros((0, 3), dtype=np.float32)

    if cloud.shape[1] < 3:
        raise ValueError(f"points dim < 3: {cloud.shape}")
    cloud = cloud[:, :3]  # keep only the first three columns

    total_points = cloud.shape[0]
    if total_points == 0:
        return torch.zeros((num_frames, points_per_frame, 3), dtype=torch.float32)

    base_size, leftover = divmod(total_points, num_frames)

    processed_frames = []
    cursor = 0
    for bin_idx in range(num_frames):
        stop = cursor + base_size + (1 if bin_idx < leftover else 0)
        chunk = cloud[cursor:stop]
        cursor = stop

        if chunk.shape[0] == 0:
            # Fewer points than bins: seed the empty bin with two random points.
            chunk = cloud[rng.randint(0, total_points, size=2)]
        n_points = chunk.shape[0]

        if n_points > points_per_frame:
            kmeans = KMeans(n_clusters=points_per_frame, random_state=seed, n_init=10)
            kmeans.fit(chunk)
            fixed = kmeans.cluster_centers_.astype(np.float32)
        elif n_points < points_per_frame:
            fixed = ahc_upsample(chunk, points_per_frame, seed=seed)
        else:
            fixed = chunk.astype(np.float32)

        processed_frames.append(fixed)

    return torch.tensor(np.stack(processed_frames, axis=0), dtype=torch.float32)


def preprocess_dataset_panto(labeled_data, num_frames=32, points_per_frame=32, seed=42):
    """Run ``preprocess_frames_panto`` over every labeled sample and time it.

    Args:
        labeled_data: list of dicts with keys ``"user_label"``,
            ``"gesture_label"`` and ``"data"``.
        num_frames, points_per_frame, seed: forwarded to
            ``preprocess_frames_panto``.

    Returns:
        list of ``(tensor, (gesture_label, user_label))`` tuples, in input order.
        The average and total wall-clock preprocessing time are printed.
    """
    processed = []
    total_time = 0.0

    for sample in labeled_data:
        started = time.time()
        raw = sample["data"]
        labels = (int(sample["gesture_label"]), int(sample["user_label"]))

        tensor = preprocess_frames_panto(
            raw, num_frames=num_frames, points_per_frame=points_per_frame, seed=seed
        )
        processed.append((tensor, labels))

        total_time += (time.time() - started)

    # max(..., 1) avoids a division by zero for an empty dataset.
    print(f"üìå preprocessing time per sample: {total_time / max(len(labeled_data),1):.4f} sec")
    print(f"‚è± total: {total_time:.2f} sec")
    return processed


# -------------------------
# save the preprocessed dataset
# -------------------------
import h5py

def save_to_hdf5(processed, filename="Panto_fixed.h5"):
    """Write the preprocessed samples to an HDF5 file (overwriting it).

    Two datasets are created: ``"data"`` (float32, one row per sample, shaped
    like the first sample's tensor) and ``"labels"`` (int32, shape ``(n, 2)``,
    holding the first two entries of each sample's label tuple).

    Args:
        processed: non-empty list of ``(tensor, labels)`` tuples as returned
            by ``preprocess_dataset_panto``.
        filename: output path.
    """
    n = len(processed)
    sample_shape = processed[0][0].shape  # every sample is assumed to share this shape
    data_shape = (n, *sample_shape)
    labels_shape = (n, 2)

    with h5py.File(filename, "w") as h5:
        data_dset = h5.create_dataset("data", shape=data_shape, dtype="float32")
        labels_dset = h5.create_dataset("labels", shape=labels_shape, dtype="int32")

        for row, (x, y) in enumerate(processed):
            data_dset[row] = x.numpy()
            labels_dset[row] = np.array([y[0], y[1]], dtype=np.int32)

    print(f"‚úÖ saved: {filename}")
    print("data:", data_shape, "labels:", labels_shape)

# Preprocess the whole labeled dataset, show the first sample's shape and
# labels as a sanity check, then persist everything for the training cells.
panto_processed = preprocess_dataset_panto(labeled_data, num_frames=32, points_per_frame=32, seed=42)
print("Ï≤´ ÏÉòÌîå:", panto_processed[0][0].shape, panto_processed[0][1])
save_to_hdf5(panto_processed, filename="Panto_fixed.h5") 


PointNet style frame encoder + GRU

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PointNetPPEncoder(nn.Module):
    """Per-frame point encoder: two shared point-wise MLP stages and a max-pool.

    Each stage is a kernel-size-1 ``Conv1d`` followed by ``BatchNorm1d`` and
    ``ReLU``, applied independently to every point; the max over the point
    axis then yields one feature vector per frame.
    """

    def __init__(self, out_dim=64):
        super().__init__()
        self.mlp1 = self._shared_mlp(3, 32)
        self.mlp2 = self._shared_mlp(32, out_dim)

    @staticmethod
    def _shared_mlp(in_channels, out_channels):
        """Build one Conv1d(k=1) -> BatchNorm1d -> ReLU stage."""
        return nn.Sequential(
            nn.Conv1d(in_channels, out_channels, 1),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
        )

    def forward(self, x):
        """Encode ``x`` of shape (batch, num_points, 3) into (batch, out_dim)."""
        _batch, _points, _coords = x.shape  # input must be 3-D
        per_point = self.mlp2(self.mlp1(x.transpose(1, 2)))  # (batch, out_dim, num_points)
        return per_point.max(dim=2).values


class PointNetPPGRU(nn.Module):
    """Sequence classifier: per-frame point encoder -> GRU -> linear head.

    Args:
        num_classes: number of output classes.
        feature_dim: size of the per-frame feature vector.
        hidden_dim: GRU hidden size.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, num_classes, feature_dim=64, hidden_dim=64, num_layers=2):
        super().__init__()
        self.pointnet = PointNetPPEncoder(out_dim=feature_dim)
        self.gru = nn.GRU(
            input_size=feature_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch, num_frames, num_points, 3) to raw logits (batch, num_classes)."""
        _, num_frames, _, _ = x.shape

        # Frames are encoded one at a time, so each encoder call sees the
        # points of a single time step.
        per_frame = [self.pointnet(x[:, t]) for t in range(num_frames)]
        sequence = torch.stack(per_frame, dim=1)  # (batch, num_frames, feature_dim)

        gru_out, _ = self.gru(sequence)  # (batch, num_frames, hidden_dim)
        # Classify from the GRU output at the last time step.
        return self.fc(gru_out[:, -1])


In [None]:
########### 5FOLD CV  ###########
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import h5py
import random
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader, Subset
from torch.optim.lr_scheduler import OneCycleLR


# ---- Configuration for the PointNet-style encoder + GRU cross-validation run ----
# HDF5 file read by load_h5_dataset (datasets "data" and "labels").
FILENAME = "Panto_fixed.h5"   

# Which classification tasks to run.
RUN_ACTION = True   
RUN_USER   = True

# Expected class counts; run_5fold_cv only prints a warning on mismatch.
EXPECTED_ACTION_CLASSES = 21  #
EXPECTED_USER_CLASSES   = 41  #
K = 5            # number of cross-validation folds
VAL_RATIO = 0.1  # per-class fraction of the non-test data held out for validation

# DataLoader settings.
BATCH_SIZE  = 128
NUM_WORKERS = 4
PIN_MEMORY  = True

# Optimization: Adam learning rate and OneCycleLR peak learning rate.
EPOCHS  = 100
LR      = 1e-3
MAX_LR  = 3e-3

# Early stopping: stop after PATIENCE epochs without a validation-accuracy gain
# larger than MIN_DELTA; epochs up to WARMUP_EPOCHS are not counted.
PATIENCE      = 10
MIN_DELTA     = 1e-4
WARMUP_EPOCHS = 5


# False -> the model outputs raw logits and CrossEntropyLoss is used;
# True  -> the model outputs log-probabilities and NLLLoss is used.
MODEL_RETURNS_LOGPROB = False
MAX_GPUS = 4  # upper bound on the GPUs handed to DataParallel
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Directory receiving one checkpoint per task and fold.
CKPT_DIR = "./ckpt_5fold_panto"
os.makedirs(CKPT_DIR, exist_ok=True)

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Also turns off both the cuDNN deterministic mode and the cuDNN benchmark
    autotuner.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = False

def load_h5_dataset(filename):
    """Read the preprocessed dataset from an HDF5 file.

    Returns:
        ``(X, y_action, y_user)``: ``X`` is the float32 ``"data"`` dataset,
        ``y_action`` and ``y_user`` are columns 0 and 1 of the ``"labels"``
        dataset as int64 arrays.

    Raises:
        AssertionError: if ``"labels"`` is not 2-D with at least two columns.
    """
    with h5py.File(filename, "r") as h5:
        X = np.array(h5["data"], dtype=np.float32)         # (N, T, P, 3)
        labels = np.array(h5["labels"], dtype=np.int64)    # (N, 2) -> [gesture/action, user]
    assert labels.ndim == 2 and labels.shape[1] >= 2, "labels must be (N,2) with [gesture/action, user]"
    return X, labels[:, 0].astype(np.int64), labels[:, 1].astype(np.int64)


def remap_labels(y):
    """Re-index arbitrary integer labels to the dense range ``0..C-1``.

    Labels are ranked by sorted value, so the smallest label becomes 0.

    Returns:
        ``(dense_labels, num_classes)``: an int64 array and a Python int.
    """
    labels = np.asarray(y, dtype=np.int64)
    classes, dense = np.unique(labels, return_inverse=True)
    return dense.astype(np.int64), int(len(classes))

class GestureDataset(Dataset):
    """In-memory dataset pairing each sample array with one integer label."""

    def __init__(self, data_np, labels_np):
        # Convert once so __getitem__ can build tensors without extra casts.
        self.data = np.asarray(data_np, dtype=np.float32)
        self.labels = np.asarray(labels_np, dtype=np.int64)
        assert len(self.data) == len(self.labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # from_numpy shares memory with self.data; the label is a plain int.
        sample = torch.from_numpy(self.data[idx])  # (T,P,3)
        return sample, int(self.labels[idx])

def stratified_kfold_indices(y, k=5, seed=42):
    """Split sample indices into ``k`` class-stratified folds.

    The indices of every class are shuffled and then dealt round-robin to the
    folds, always starting with fold 0.

    Args:
        y: 1-D array-like of class labels.
        k: number of folds.
        seed: seed of the NumPy ``Generator`` used for shuffling.

    Returns:
        list of ``k`` int64 index arrays that together cover every sample once.
    """
    rng = np.random.default_rng(seed)
    labels = np.asarray(y)
    buckets = [[] for _ in range(k)]
    for cls in np.unique(labels):
        members = np.flatnonzero(labels == cls)
        rng.shuffle(members)
        for position, sample_idx in enumerate(members.tolist()):
            buckets[position % k].append(sample_idx)
    return [np.array(bucket, dtype=np.int64) for bucket in buckets]


def stratified_train_val_split(indices, y, val_ratio=0.1, seed=42):
    """Split ``indices`` into train and validation parts, class by class.

    For every class present in the pool, ``max(1, int(n * val_ratio))``
    shuffled samples go to validation and the rest to training, so a class
    with a single sample ends up in validation only.

    Args:
        indices: sample indices to split.
        y: NumPy array of labels indexed by those sample indices.
        val_ratio: per-class validation fraction.
        seed: seed of the NumPy ``Generator``.

    Returns:
        ``(train_idx, val_idx)`` as shuffled int64 arrays.
    """
    rng = np.random.default_rng(seed)
    pool = np.asarray(indices)
    pool_labels = y[pool]

    train_idx = []
    val_idx = []
    for cls in np.unique(pool_labels):
        members = pool[pool_labels == cls]  # boolean indexing already returns a copy
        rng.shuffle(members)
        cut = max(1, int(len(members) * val_ratio))
        val_idx += members[:cut].tolist()
        train_idx += members[cut:].tolist()

    # Mix the classes so neither list is grouped by class.
    rng.shuffle(train_idx)
    rng.shuffle(val_idx)
    return np.array(train_idx, dtype=np.int64), np.array(val_idx, dtype=np.int64)

def wrap_model_for_multi_gpu(model: nn.Module) -> nn.Module:
    """Wrap ``model`` in ``nn.DataParallel`` when more than one GPU is visible.

    At most ``MAX_GPUS`` devices are used. The (possibly wrapped) model is
    moved to the module-level ``device`` before being returned.
    """
    n_avail = torch.cuda.device_count() if torch.cuda.is_available() else 0
    if n_avail > 1:
        n_use = min(MAX_GPUS, n_avail)
        device_ids = list(range(n_use))
        print(f"Using {n_use} GPUs via DataParallel: {device_ids}")
        model = nn.DataParallel(model, device_ids=device_ids)
    return model.to(device)

@torch.no_grad()
def evaluate(model, loader):
    """Return the classification accuracy of ``model`` over ``loader``.

    The model is put in eval mode and gradients are disabled. An empty loader
    yields 0.0.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device, non_blocking=True)
        batch_y = batch_y.to(device, non_blocking=True)
        predictions = model(batch_x).argmax(dim=1)
        n_correct += (predictions == batch_y).sum().item()
        n_seen += batch_y.size(0)
    return n_correct / max(n_seen, 1)


def train_one_fold(
    model,
    train_loader,
    val_loader,
    epochs=100,
    lr=1e-3,
    max_lr=3e-3,
    patience=12,
    min_delta=1e-4,
    warmup_epochs=5,
    model_returns_logprob=False
):
    """Train ``model`` on one fold with early stopping on validation accuracy.

    Args:
        model: network to train; moved to the module-level ``device``.
        train_loader: loader of training batches ``(data, labels)``.
        val_loader: loader used to compute validation accuracy after each epoch.
        epochs: maximum number of epochs.
        lr: learning rate passed to the Adam optimizer.
        max_lr: peak learning rate passed to ``OneCycleLR``.
        patience: number of non-improving epochs (counted only after the
            warm-up) that triggers early stopping.
        min_delta: minimum validation-accuracy gain that counts as an improvement.
        warmup_epochs: number of initial epochs during which non-improving
            epochs are not counted.
        model_returns_logprob: when True the model output is treated as
            log-probabilities (``NLLLoss``), otherwise as logits
            (``CrossEntropyLoss``).

    Returns:
        The best validation accuracy. On return the model holds the weights
        of that best epoch, if any epoch improved on the initial 0.0.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # The scheduler is stepped once per batch (see the inner loop), hence steps_per_epoch.
    # NOTE(review): OneCycleLR presumably overrides the optimizer's initial lr
    # with a value derived from max_lr -- confirm whether `lr` has any effect.
    scheduler = OneCycleLR(optimizer, max_lr=max_lr, steps_per_epoch=len(train_loader), epochs=epochs)

    criterion = nn.NLLLoss() if model_returns_logprob else nn.CrossEntropyLoss()

    best_val_acc = 0.0
    best_state = None   # copy of the state_dict from the best epoch so far
    no_improve = 0      # consecutive non-improving epochs after the warm-up

    for epoch in range(epochs):
        model.train()
        train_loss, correct, total = 0.0, 0, 0

        for data, labels in train_loader:
            data = data.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)
            outputs = model(data)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # train_loss accumulates the per-batch losses (a sum, not a mean).
            train_loss += float(loss.item())
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        train_acc = correct / max(total, 1)
        val_acc = evaluate(model, val_loader)

        print(f"Epoch {epoch+1:03d}/{epochs} | Train Loss {train_loss:.4f} | Train Acc {train_acc:.4f} | Val Acc {val_acc:.4f}")

        if (val_acc - best_val_acc) > min_delta:
            best_val_acc = val_acc
            # deepcopy: state_dict() returns references to the live parameters.
            best_state = deepcopy(model.state_dict())
            no_improve = 0
            print("‚úÖ Best model updated!")
        else:
            # Non-improving epochs are only counted once the warm-up is over.
            if (epoch + 1) > warmup_epochs:
                no_improve += 1

        if (epoch + 1) > warmup_epochs and no_improve >= patience:
            print(f"‚èπÔ∏è Early stopping triggered (no improvement for {patience} epochs).")
            break

    print(f"üéØ Best Val Acc: {best_val_acc:.4f}")
    if best_state is not None:
        model.load_state_dict(best_state)   # restore the best weights
    return best_val_acc

def run_5fold_cv(X, y_raw, task_name, expected_classes=None):
    """Run K-fold stratified cross-validation of ``PointNetPPGRU`` on one task.

    For every fold: the fold is the test set, the remaining data is split
    (stratified) into train/validation, a fresh model is trained with early
    stopping, the best weights are saved to ``CKPT_DIR`` and the test
    accuracy is measured.

    Args:
        X: sample array of shape (N, T, P, 3).
        y_raw: integer labels of the task; re-indexed to ``0..C-1`` internally.
        task_name: tag used in log messages and checkpoint file names.
        expected_classes: optional class count; a mismatch only prints a warning.

    Returns:
        dict with keys ``task``, ``num_classes``, ``val_mean``, ``val_std``,
        ``test_mean`` and ``test_std`` (sample standard deviation over folds).
    """
    y, num_classes = remap_labels(y_raw)

    if expected_classes is not None and num_classes != expected_classes:
        print(f"[WARN] {task_name}: num_classes={num_classes} (expected {expected_classes}).")

    dataset = GestureDataset(X, y)
    folds = stratified_kfold_indices(y, k=K, seed=42)

    def _make_loader(indices, shuffle):
        # One place for the DataLoader settings shared by train/val/test.
        return DataLoader(
            Subset(dataset, indices),
            batch_size=BATCH_SIZE,
            shuffle=shuffle,
            num_workers=NUM_WORKERS,
            pin_memory=PIN_MEMORY,
            persistent_workers=(NUM_WORKERS > 0)
        )

    fold_test_accs, fold_val_accs = [], []

    for fold in range(K):
        print(f"\n==================== {task_name.upper()} | Fold {fold+1}/{K} ====================")

        test_idx = folds[fold]
        trainval_idx = np.concatenate([folds[i] for i in range(K) if i != fold])
        train_idx, val_idx = stratified_train_val_split(trainval_idx, y, val_ratio=VAL_RATIO, seed=42 + fold)

        train_loader = _make_loader(train_idx, shuffle=True)
        val_loader = _make_loader(val_idx, shuffle=False)
        test_loader = _make_loader(test_idx, shuffle=False)

        model = PointNetPPGRU(num_classes)
        model = wrap_model_for_multi_gpu(model)

        best_val_acc = train_one_fold(
            model, train_loader, val_loader,
            epochs=EPOCHS, lr=LR, max_lr=MAX_LR,
            patience=PATIENCE, min_delta=MIN_DELTA, warmup_epochs=WARMUP_EPOCHS,
            model_returns_logprob=MODEL_RETURNS_LOGPROB
        )

        ckpt_path = os.path.join(CKPT_DIR, f"best_model_{task_name}_fold{fold+1}.pth")
        # Save the weights of the underlying network: a DataParallel wrapper
        # prefixes every key with "module.", which would make the checkpoint
        # unloadable into a plain PointNetPPGRU.
        bare_model = model.module if isinstance(model, nn.DataParallel) else model
        torch.save(bare_model.state_dict(), ckpt_path)
        print(f"üíæ Saved checkpoint: {ckpt_path}")

        test_acc = evaluate(model, test_loader)
        fold_val_accs.append(best_val_acc)
        fold_test_accs.append(test_acc)
        print(f"üî• {task_name} | Fold {fold+1} Test Acc: {test_acc:.4f}")

    # ddof=1 (sample standard deviation) is undefined for a single fold.
    val_mean = float(np.mean(fold_val_accs))
    val_std  = float(np.std(fold_val_accs, ddof=1)) if K > 1 else 0.0
    test_mean = float(np.mean(fold_test_accs))
    test_std  = float(np.std(fold_test_accs, ddof=1)) if K > 1 else 0.0

    print(f"\n==================== {task_name.upper()} | 5-Fold CV Summary ====================")
    print(f"#Classes: {num_classes}")
    print(f"Val  Acc: {val_mean:.4f} ¬± {val_std:.4f}")
    print(f"Test Acc: {test_mean:.4f} ¬± {test_std:.4f}")

    return {
        "task": task_name,
        "num_classes": num_classes,
        "val_mean": val_mean, "val_std": val_std,
        "test_mean": test_mean, "test_std": test_std
    }

# Seed all RNGs once before any model or DataLoader is created.
set_seed(42)

X, y_action, y_user = load_h5_dataset(FILENAME)

print("[INFO] Loaded:", FILENAME)
print("[INFO] X:", X.shape, "action unique:", len(np.unique(y_action)), "user unique:", len(np.unique(y_user)))

# Run cross-validation for each enabled task and collect the summaries.
results = []
if RUN_ACTION:
    results.append(run_5fold_cv(X, y_action, "action", expected_classes=EXPECTED_ACTION_CLASSES))
if RUN_USER:
    results.append(run_5fold_cv(X, y_user, "user", expected_classes=EXPECTED_USER_CLASSES))

print("\n==================== Overall Summary ====================")
for r in results:
    # The trailing "9552/9144" is the author's note, presumably recorded
    # accuracies of a previous run -- confirm.
    print(f"{r['task']:>6s} | C={r['num_classes']:>3d} | Val {r['val_mean']:.4f}¬±{r['val_std']:.4f} | Test {r['test_mean']:.4f}¬±{r['test_std']:.4f}") #9552/9144


DGCNN + GRU

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import DynamicEdgeConv, global_max_pool


def MLP(channels):
    """Build a ``Linear -> ReLU -> BatchNorm1d`` stack.

    Args:
        channels: layer widths ``[in, hidden..., out]``; one
            Linear/ReLU/BatchNorm1d triple is created per consecutive pair.

    Returns:
        ``nn.Sequential`` (empty when fewer than two widths are given).
    """
    blocks = []
    for fan_in, fan_out in zip(channels[:-1], channels[1:]):
        blocks += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.BatchNorm1d(fan_out)]
    return nn.Sequential(*blocks)

class DGCNNEncoder(nn.Module):
    """Point encoder made of two ``DynamicEdgeConv`` layers and a global max-pool.

    The outputs of both edge-convolution layers are concatenated, projected
    to ``out_dim`` and max-pooled per graph as defined by ``batch``.

    Args:
        out_dim: size of the returned per-graph feature vector.
        k: forwarded to both ``DynamicEdgeConv`` layers.
    """

    def __init__(self, out_dim=64, k=16):
        super().__init__()
        in_dim, width1, width2 = 3, 64, 128
        # Each edge MLP receives a 2 * C wide input for C-dimensional node features.
        self.conv1 = DynamicEdgeConv(MLP([2 * in_dim, width1, width1, width1]), k, aggr='max')
        self.conv2 = DynamicEdgeConv(MLP([2 * width1, width2]), k, aggr='max')
        self.lin1 = MLP([width2 + width1, out_dim])

    def forward(self, x, batch):
        """Encode points ``x`` (one row per point) grouped by ``batch`` into one vector per group."""
        shallow = self.conv1(x, batch)
        deep = self.conv2(shallow, batch)
        fused = self.lin1(torch.cat([shallow, deep], dim=1))
        return global_max_pool(fused, batch)

class DGCNNGRU(nn.Module):
    """Sequence classifier: per-frame DGCNN encoder -> GRU -> linear head.

    Args:
        num_classes: number of output classes.
        feature_dim: size of the per-frame feature vector.
        hidden_dim: GRU hidden size.
        num_layers: number of stacked GRU layers.
        k: forwarded to ``DGCNNEncoder``.
    """

    def __init__(self, num_classes, feature_dim=64, hidden_dim=64, num_layers=2, k=16):
        super(DGCNNGRU, self).__init__()
        self.dgcnn = DGCNNEncoder(out_dim=feature_dim, k=k)
        self.gru = nn.GRU(input_size=feature_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, batch=None):
        """Classify a batch of point-cloud sequences.

        Args:
            x: tensor of shape (batch, num_frames, num_points, 3).
            batch: accepted for interface compatibility and NOT used; the
                per-point sample index is always derived from ``x``.

        Returns:
            Log-probabilities of shape (batch, num_classes).
        """
        batch_size, num_frames, num_points, _ = x.shape

        # Sample index of every point of a flattened frame. It is the same for
        # all frames, so it is built once instead of once per time step.
        frame_batch = torch.arange(batch_size, device=x.device).repeat_interleave(num_points)

        encoded_features = []
        for t in range(num_frames):
            frame_x = x[:, t, :, :].reshape(-1, 3)  # (batch * num_points, 3)
            encoded_features.append(self.dgcnn(frame_x, frame_batch))
        encoded_features = torch.stack(encoded_features, dim=1)  # (batch, num_frames, feature_dim)

        gru_out, _ = self.gru(encoded_features)  # (batch, num_frames, hidden_dim)
        last_output = gru_out[:, -1, :]  # GRU output at the last time step

        logits = self.fc(last_output)  # (batch, num_classes)
        return F.log_softmax(logits, dim=-1)
    
class DGCNNWithAutoBatchIdx(nn.Module):
    """Adapter giving a ``forward(x, batch)`` model a single-argument ``forward(x)``.

    The batch-index vector is built from the shape of ``x``, so the wrapped
    model can be used by training code that only passes the data tensor.
    """

    def __init__(self, base_model: nn.Module):
        super().__init__()
        self.base = base_model

    def forward(self, x):
        """Call the wrapped model with ``x`` and one sample index per point of a frame."""
        num_samples, _, points_per_frame, _ = x.shape
        sample_ids = torch.arange(num_samples, device=x.device)
        return self.base(x, sample_ids.repeat_interleave(points_per_frame))


In [None]:
import os, random
import numpy as np
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader, Subset
from torch.optim.lr_scheduler import OneCycleLR

# ---- Configuration for the DGCNN + GRU cross-validation run ----
H5_PATH   = "Panto_fixed.h5"            # preprocessed dataset read by load_h5_panto
CKPT_DIR  = "./ckpt_5fold_dgcnn_panto"  # one checkpoint per task and fold is written here
MODEL_TAG = "dgcnn"                
os.makedirs(CKPT_DIR, exist_ok=True)

# Which classification tasks to run.
RUN_ACTION = True
RUN_USER   = True

K = 5            # number of cross-validation folds
VAL_RATIO = 0.1  # per-class fraction of the non-test data held out for validation

# DataLoader settings.
BATCH_SIZE  = 16
NUM_WORKERS = 4
PIN_MEMORY  = True

# Optimization: Adam learning rate and OneCycleLR peak learning rate.
EPOCHS  = 100
LR      = 1e-3
MAX_LR  = 3e-3
# Early stopping: stop after PATIENCE epochs without a validation-accuracy gain
# larger than MIN_DELTA; epochs up to WARMUP_EPOCHS are not counted.
PATIENCE      = 12
MIN_DELTA     = 1e-4
WARMUP_EPOCHS = 5

# True -> the model outputs log-probabilities, so train_one_fold uses NLLLoss.
MODEL_RETURNS_LOGPROB = True

MAX_GPUS = 4  # upper bound on the GPUs handed to DataParallel

# Expected class counts; asserted after loading (set to None to skip the check).
EXPECTED_GESTURE_CLASSES = 21
EXPECTED_USER_CLASSES    = 41

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Enable the cuDNN autotuner when running on GPU.
    torch.backends.cudnn.benchmark = True

def make_model(num_classes: int) -> nn.Module:
    """Build a ``DGCNNGRU`` classifier wrapped so it can be called as ``model(x)``."""
    backbone = DGCNNGRU(num_classes)
    return DGCNNWithAutoBatchIdx(backbone)

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

def load_h5_panto(path):
    """Read the preprocessed dataset from an HDF5 file.

    Returns:
        ``(X, y_gesture, y_user)``: the ``"data"`` dataset as stored, and
        columns 0 and 1 of the ``"labels"`` dataset as int64 arrays.
    """
    with h5py.File(path, "r") as h5:
        X, labels = np.array(h5["data"]), np.array(h5["labels"])  # (N,T,P,3), (N,2)
    return X, labels[:, 0].astype(np.int64), labels[:, 1].astype(np.int64)

def remap_labels(y_raw):
    """Re-index arbitrary integer labels to the dense range ``0..C-1``.

    Labels are ranked by sorted value, so the smallest label becomes 0.

    Returns:
        ``(dense_labels, num_classes)``: an int64 array and a Python int.
    """
    labels = np.asarray(y_raw, dtype=np.int64)
    classes = np.unique(labels)  # sorted distinct labels
    # The position of each label inside the sorted class list is its dense index.
    dense = np.searchsorted(classes, labels).astype(np.int64)
    return dense, int(len(classes))

def stratified_kfold_indices(y, k=5, seed=42):
    """Split sample indices into ``k`` class-stratified folds.

    The indices of every class are shuffled and then dealt round-robin to the
    folds, always starting with fold 0.

    Returns:
        list of ``k`` int64 index arrays that together cover every sample once.
    """
    rng = np.random.default_rng(seed)
    labels = np.asarray(y)
    per_fold = [[] for _ in range(k)]
    for cls in np.unique(labels):
        class_members = np.flatnonzero(labels == cls)
        rng.shuffle(class_members)
        for rank, sample_idx in enumerate(class_members.tolist()):
            per_fold[rank % k].append(sample_idx)
    return [np.array(chunk, dtype=np.int64) for chunk in per_fold]

def stratified_train_val_split(indices, y, val_ratio=0.1, seed=42):
    """Split ``indices`` into train and validation parts, class by class.

    For every class present in the pool, ``max(1, int(n * val_ratio))``
    shuffled samples go to validation and the rest to training.

    Returns:
        ``(train_idx, val_idx)`` as shuffled int64 arrays.
    """
    rng = np.random.default_rng(seed)
    pool = np.asarray(indices)
    pool_labels = y[pool]

    train_idx = []
    val_idx = []
    for cls in np.unique(pool_labels):
        members = pool[pool_labels == cls]  # boolean indexing already returns a copy
        rng.shuffle(members)
        n_val = max(1, int(len(members) * val_ratio))
        val_idx += members[:n_val].tolist()
        train_idx += members[n_val:].tolist()

    # Mix the classes so neither list is grouped by class.
    rng.shuffle(train_idx)
    rng.shuffle(val_idx)
    return np.array(train_idx, dtype=np.int64), np.array(val_idx, dtype=np.int64)

class H5TensorDataset(Dataset):
    """Dataset over in-memory arrays: yields ``(float32 tensor, int label)`` pairs."""

    def __init__(self, X_np, y_np):
        # The arrays are stored as given, without copying or converting.
        self.X, self.y = X_np, y_np

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        sample = torch.from_numpy(self.X[i]).float()
        label = int(self.y[i])
        return sample, label

def wrap_dp(model: nn.Module) -> nn.Module:
    """Wrap ``model`` in ``nn.DataParallel`` when several GPUs may be used.

    Wrapping happens only when CUDA is available, more than one device is
    visible and ``MAX_GPUS > 1``. The (possibly wrapped) model is moved to
    the module-level ``device`` before being returned.
    """
    multi_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 1 and MAX_GPUS > 1
    if multi_gpu:
        n_use = min(MAX_GPUS, torch.cuda.device_count())
        gpu_ids = list(range(n_use))
        model = nn.DataParallel(model, device_ids=gpu_ids)
        print(f"[DP] Using {n_use} GPUs")
    return model.to(device)

def get_base_state_dict(model: nn.Module) -> dict:
    """Return the state_dict of the underlying network, unwrapping ``nn.DataParallel`` if present."""
    target = model.module if isinstance(model, nn.DataParallel) else model
    return target.state_dict()

@torch.no_grad()
def evaluate_acc(model, loader):
    """Return the classification accuracy of ``model`` over ``loader``.

    The model is put in eval mode and gradients are disabled. An empty loader
    yields 0.0.
    """
    model.eval()
    hits = 0
    seen = 0
    for inputs, targets in loader:
        inputs = inputs.to(device, non_blocking=True)
        targets = torch.as_tensor(targets, dtype=torch.long, device=device)
        predicted = model(inputs).argmax(dim=1)
        hits += (predicted == targets).sum().item()
        seen += targets.numel()
    return hits / max(seen, 1)

def train_one_fold(model, train_loader, val_loader):
    """Train ``model`` on one fold with early stopping on validation accuracy.

    All hyper-parameters come from the module-level constants of this cell
    (LR, MAX_LR, EPOCHS, PATIENCE, MIN_DELTA, WARMUP_EPOCHS,
    MODEL_RETURNS_LOGPROB).

    Args:
        model: network to train; moved to the module-level ``device``.
        train_loader: loader of training batches ``(x, y)``.
        val_loader: loader used to compute validation accuracy after each epoch.

    Returns:
        The best validation accuracy. On return the model holds the weights
        of that best epoch, if any epoch improved on the initial 0.0.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=LR)
    # The scheduler is stepped once per batch (see the inner loop), hence steps_per_epoch.
    scheduler = OneCycleLR(optimizer, max_lr=MAX_LR, steps_per_epoch=len(train_loader), epochs=EPOCHS)
    criterion = nn.NLLLoss() if MODEL_RETURNS_LOGPROB else nn.CrossEntropyLoss()

    best_val = 0.0
    best_state = None   # copy of the state_dict from the best epoch so far
    no_improve = 0      # consecutive non-improving epochs after the warm-up

    for epoch in range(EPOCHS):
        model.train()
        tr_loss = 0.0   # sum (not mean) of the per-batch losses
        for x, y in train_loader:
            x = x.to(device, non_blocking=True)
            y = torch.as_tensor(y, dtype=torch.long, device=device)

            optimizer.zero_grad(set_to_none=True)
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()
            scheduler.step()
            tr_loss += float(loss.item())

        va = evaluate_acc(model, val_loader)
        print(f"Epoch {epoch+1:03d}/{EPOCHS} | TrainLoss {tr_loss:.4f} | ValAcc {va:.4f}")

        if (va - best_val) > MIN_DELTA:
            best_val = va
            # deepcopy: state_dict() returns references to the live parameters.
            best_state = deepcopy(model.state_dict())
            no_improve = 0
            print("‚úÖ Best updated")
        else:
            # Non-improving epochs are only counted once the warm-up is over.
            if (epoch + 1) > WARMUP_EPOCHS:
                no_improve += 1

        if (epoch + 1) > WARMUP_EPOCHS and no_improve >= PATIENCE:
            print(f"‚èπÔ∏è Early stop ({PATIENCE} epochs no improve)")
            break

    # Restore the best weights before returning.
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val

def run_5fold_cv(X, y_raw, task_name: str):
    """Run K-fold stratified cross-validation of the DGCNN model on one task.

    For every fold: the fold is the test set, the remaining data is split
    (stratified) into train/validation, a fresh model is trained with early
    stopping, the best weights are saved to ``CKPT_DIR`` and the test
    accuracy is measured.

    Args:
        X: sample array of shape (N, T, P, 3).
        y_raw: integer labels of the task; re-indexed to ``0..C-1`` internally.
        task_name: tag used in log messages and checkpoint file names.

    Returns:
        dict with keys ``task``, ``num_classes``, ``val_mean``, ``val_std``,
        ``test_mean`` and ``test_std`` (sample standard deviation over folds).
    """
    y, num_classes = remap_labels(y_raw)
    ds = H5TensorDataset(X, y)
    folds = stratified_kfold_indices(y, k=K, seed=42)

    def _loader(indices, shuffle):
        # Shared DataLoader settings for the train/val/test subsets.
        return DataLoader(Subset(ds, indices), batch_size=BATCH_SIZE, shuffle=shuffle,
                          num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY,
                          persistent_workers=(NUM_WORKERS > 0))

    def _mean_std(values):
        # ddof=1 (sample standard deviation) is undefined for a single fold.
        spread = float(np.std(values, ddof=1)) if K > 1 else 0.0
        return float(np.mean(values)), spread

    fold_val, fold_test = [], []

    for fold in range(K):
        print(f"\n==================== {MODEL_TAG.upper()} | {task_name.upper()} | Fold {fold+1}/{K} ====================")
        test_idx = folds[fold]
        trainval_idx = np.concatenate([part for i, part in enumerate(folds) if i != fold])
        train_idx, val_idx = stratified_train_val_split(trainval_idx, y, val_ratio=VAL_RATIO, seed=42 + fold)

        train_loader = _loader(train_idx, True)
        val_loader = _loader(val_idx, False)
        test_loader = _loader(test_idx, False)

        model = wrap_dp(make_model(num_classes))

        best_val = train_one_fold(model, train_loader, val_loader)

        ckpt_path = os.path.join(CKPT_DIR, f"best_model_{MODEL_TAG}_{task_name}_fold{fold+1}.pth")
        # Save the unwrapped weights so the checkpoint loads without DataParallel.
        torch.save(get_base_state_dict(model), ckpt_path)
        print(f"üíæ Saved: {ckpt_path}")

        te = evaluate_acc(model, test_loader)
        fold_val.append(best_val)
        fold_test.append(te)
        print(f"üî• Fold{fold+1} TestAcc: {te:.4f}")

    val_mean, val_std = _mean_std(fold_val)
    test_mean, test_std = _mean_std(fold_test)

    print(f"\n==================== {MODEL_TAG.upper()} | {task_name.upper()} | Summary ====================")
    print(f"#Classes: {num_classes}")
    print(f"Val  Acc: {val_mean:.4f} ¬± {val_std:.4f}")
    print(f"Test Acc: {test_mean:.4f} ¬± {test_std:.4f}")

    return {"task": task_name, "num_classes": num_classes,
            "val_mean": val_mean, "val_std": val_std, "test_mean": test_mean, "test_std": test_std}

# Seed all RNGs once before any model or DataLoader is created.
set_seed(42)

X, y_gesture_raw, y_user_raw = load_h5_panto(H5_PATH)
print(f"[INFO] X={X.shape}, gesture uniq={len(np.unique(y_gesture_raw))}, user uniq={len(np.unique(y_user_raw))}")

# Abort early when the file does not contain the expected number of classes.
if EXPECTED_GESTURE_CLASSES is not None:
    assert len(np.unique(y_gesture_raw)) == EXPECTED_GESTURE_CLASSES
if EXPECTED_USER_CLASSES is not None:
    assert len(np.unique(y_user_raw)) == EXPECTED_USER_CLASSES

# Run cross-validation for each enabled task and collect the summaries.
results = []
if RUN_ACTION:
    results.append(run_5fold_cv(X, y_gesture_raw, "action"))
if RUN_USER:
    results.append(run_5fold_cv(X, y_user_raw, "user"))

print("\n==================== Overall Summary ====================")
for r in results:
    print(f"{r['task']:>6s} | C={r['num_classes']:>3d} | "
          f"Val {r['val_mean']:.4f}¬±{r['val_std']:.4f} | "
          f"Test {r['test_mean']:.4f}¬±{r['test_std']:.4f}")


PointTransformerV3

In [None]:
import os, time, random, math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from copy import deepcopy
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader

# Allow TF32 math: the env var is only set if not already defined (presumably read by spconv —
# TODO confirm); the torch flags enable TF32 for CUDA matmul and cuDNN.
os.environ.setdefault("SPCONV_ALLOW_TF32", "1")
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision("high")

from model import PointTransformerV3

# This cell relies on `labeled_data` (a non-empty list) created by an earlier cell.
assert "labeled_data" in globals() and isinstance(labeled_data, list) and len(labeled_data) > 0, \
    "load pantomime"

# Which classification tasks to run (see the run_kfold calls at the end of the cell).
RUN_ACTION = True
RUN_USER   = True

# Cross-validation layout.
K_FOLDS   = 5      # number of stratified folds
VAL_RATIO = 0.1    # per-class share of the non-test data held out for validation
FOLD_SEED = 42     # RNG seed for fold assignment
VAL_SEED  = 123    # base RNG seed for the train/val split (run_kfold adds the fold index)

# DataLoader settings.
BATCH_SIZE  = 4
NUM_WORKERS = 4
PIN_MEMORY  = True

EPOCHS = 100

# Learning-rate schedule: linear warmup BASE_LR -> MAX_LR, then cosine decay MAX_LR -> MIN_LR.
BASE_LR = 3e-4
MAX_LR  = 1e-3
MIN_LR  = 1e-5
WARMUP_EPOCHS = 5

# Early stopping and optimisation.
PATIENCE  = 12        # epochs without improvement (counted after warmup) before stopping
MIN_DELTA = 1e-4      # minimum validation-accuracy gain that counts as an improvement
GRAD_CLIP_NORM = 1.0  # gradient-norm clip threshold; falsy or <= 0 disables clipping
WEIGHT_DECAY   = 1e-4

GRID_SIZE = 0.08      # passed to the backbone as "grid_size" in every batch dict
USE_T_SINCOS = True   # append [sin(t), cos(t)] of the frame index to each point's features

USE_AMP = True            # autocast on CUDA (bf16 when supported, otherwise fp16 + GradScaler)
USE_DATAPARALLEL = False  # wrap the model in nn.DataParallel when more than one GPU is present
MAX_GPUS = 4              # upper bound on GPUs used by DataParallel
MAX_POINTS = 8192         # per-sample point cap; larger clouds are randomly subsampled
PREFETCH = 4              # DataLoader prefetch_factor (only applied when NUM_WORKERS > 0)

# When True, the subsample permutation is seeded from SUBSAMPLE_SEED + sample index.
DETERMINISTIC_SUBSAMPLE = True
SUBSAMPLE_SEED = 2025

# Optional class-balanced pool: at most PER_GESTURE samples per gesture label.
USE_BALANCED_SUBSET = False
PER_GESTURE = 450
BALANCE_SEED = 12

# Checkpoint location and the tag embedded in checkpoint file names.
CKPT_DIR  = "./ckpt_ptv3_panto_5fold_stable"
MODEL_TAG = "ptv3"
os.makedirs(CKPT_DIR, exist_ok=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Let cuDNN auto-tune its algorithms.
    torch.backends.cudnn.benchmark = True

def set_seed(seed=42):
    """
    Seed the Python, NumPy and PyTorch (CPU and, if present, all CUDA) RNGs.

    Note that cuDNN is deliberately left in its fast mode afterwards:
    `deterministic` is switched off and `benchmark` is switched on, so GPU runs
    are seeded but not bit-for-bit reproducible.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = False
    cudnn.benchmark = True

def remap_labels(y_raw):
    """
    Map arbitrary integer labels onto the contiguous range 0..C-1.

    The distinct raw values are sorted (np.unique) and each is replaced by its
    rank. Returns (remapped int64 array, number of classes C).
    """
    raw = np.asarray(y_raw, dtype=np.int64)
    classes = np.unique(raw)
    rank_of = dict(zip(classes, range(len(classes))))
    remapped = np.array([rank_of[label] for label in raw], dtype=np.int64)
    return remapped, int(len(classes))

def stratified_train_val_split(pos_indices, y, val_ratio=0.1, seed=42):
    """
    Split `pos_indices` into train and validation index arrays, class by class.

    Args:
        pos_indices: positions (into `y`) that form the pool to split.
        y: NumPy array of class labels, indexable by `pos_indices`.
        val_ratio: per-class fraction sent to validation.
        seed: seed for the NumPy Generator used for all shuffles.

    Returns:
        (train, val): two shuffled int64 arrays that partition `pos_indices`.

    Every class with at least two samples contributes at least one validation
    sample. A class with a single sample is kept entirely in the training split,
    so the model is never validated on a class it could not train on.
    """
    rng = np.random.default_rng(seed)
    pos_indices = np.asarray(pos_indices, dtype=np.int64)
    y_pool = y[pos_indices]
    train, val = [], []
    for c in np.unique(y_pool):
        idx_c = pos_indices[y_pool == c].copy()
        rng.shuffle(idx_c)
        n_val = max(1, int(len(idx_c) * val_ratio))
        # Always leave at least one sample of the class for training.
        n_val = min(n_val, len(idx_c) - 1)
        val.extend(idx_c[:n_val].tolist())
        train.extend(idx_c[n_val:].tolist())
    rng.shuffle(train)
    rng.shuffle(val)
    return np.array(train, dtype=np.int64), np.array(val, dtype=np.int64)

def make_stratified_folds(pos_indices, y, k=5, seed=42):
    """
    Partition pool positions into k class-stratified test folds.

    pos_indices: 0..(pool-1) positions
    y: labels on pool positions (already remapped)
    return: list of k int64 arrays; element f holds the test positions of fold f

    Each class is shuffled and cut into k nearly equal parts (np.array_split),
    part f going to fold f; every fold is shuffled once more at the end.
    """
    gen = np.random.default_rng(seed)
    positions = np.asarray(pos_indices, dtype=np.int64)
    labels = y[positions]

    buckets = [[] for _ in range(k)]
    for cls in np.unique(labels):
        members = positions[labels == cls].copy()
        gen.shuffle(members)
        for bucket, chunk in zip(buckets, np.array_split(members, k)):
            bucket.extend(chunk.tolist())

    folds = []
    for bucket in buckets:
        fold = np.array(bucket, dtype=np.int64)
        gen.shuffle(fold)
        folds.append(fold)
    return folds


def _get_x_from_item(item):
    x = item.get("data", item)
    if isinstance(x, dict):
        for k in ["pos", "points", "pc", "xyz", "coords", "data", "frames"]:
            if k in x:
                x = x[k]
                break
    return x

class PantoIndexLabelDataset(Dataset):
    """
    View over a shared list of records, restricted to selected indices and
    carrying externally supplied (already remapped) labels.

    Item i is the triple (payload, label, base_index), where base_index is the
    record's position in the underlying list.
    """

    def __init__(self, base_list, idxs, labels_mapped):
        idx_array = np.asarray(idxs, dtype=np.int64)
        label_array = np.asarray(labels_mapped, dtype=np.int64)
        self.base = base_list
        self.idxs = idx_array
        self.labels = label_array
        # One label per selected index.
        assert len(self.idxs) == len(self.labels)

    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, i):
        base_idx = int(self.idxs[i])
        payload = _get_x_from_item(self.base[base_idx])
        return payload, int(self.labels[i]), base_idx

def _cap_points(coord, feat, max_points, seed_int=None):
    if (max_points is None) or (coord.size(0) <= max_points):
        return coord, feat
    n = coord.size(0)

    if DETERMINISTIC_SUBSAMPLE and (seed_int is not None):
        g = torch.Generator(device="cpu")
        g.manual_seed(int(SUBSAMPLE_SEED + int(seed_int)) % (2**31 - 1))
        idx = torch.randperm(n, generator=g)[:max_points]
    else:
        idx = torch.randperm(n)[:max_points]

    return coord[idx], feat[idx]

def _to_points_flat(x, use_t_sincos=True, max_points=MAX_POINTS, sample_id=None):
    """
    Flatten one sample into a pair of per-point float32 tensors (coord, feat).

    Accepted inputs (checked in this order):
      * dict: unwrapped via the first matching key among the known payload names;
      * non-empty list/tuple of frames: each frame is converted to a 2-D tensor,
        frames that are None, not 2-D, have fewer than 3 columns or have no
        points are skipped;
      * array-like of rank 3 (T, P, C) with C >= 3;
      * array-like of rank 2 (N, C) with C >= 3.

    coord is the first three columns. feat equals coord, or coord plus
    [sin(t), cos(t)] when `use_t_sincos` is set, where t is the raw frame index
    (for rank-2 input the pair is the constant [0, 1]).

    The result is passed through _cap_points(max_points, seed_int=sample_id).
    A frame list with no usable frame yields empty (0, 3) / (0, 5 or 3) tensors.

    Raises:
        ValueError: for any other shape/type (this includes an empty list or
            tuple, which becomes a rank-1 tensor).
    """
    if isinstance(x, dict):
        for k in ["pos","points","pc","xyz","coords","data","frames"]:
            if k in x:
                x = x[k]
                break

    # list of frames
    if isinstance(x, (list, tuple)) and len(x) > 0 and (not torch.is_tensor(x)) and (not isinstance(x, np.ndarray)):
        coords, feats = [], []
        for t, fr in enumerate(x):
            if fr is None:
                continue
            # A frame may itself be a dict wrapping the point array.
            if isinstance(fr, dict):
                for k in ["pos","points","pc","xyz","coords","data"]:
                    if k in fr:
                        fr = fr[k]
                        break
            fr = torch.as_tensor(fr, dtype=torch.float32)
            # Skip frames that are not (points, >=3 columns).
            if fr.ndim != 2 or fr.size(-1) < 3:
                continue
            coord = fr[:, :3].contiguous()
            if coord.size(0) == 0:
                continue
            if use_t_sincos:
                # t is the frame's position in the original list (skipped frames still count).
                s = math.sin(float(t))
                c = math.cos(float(t))
                pe = coord.new_empty((coord.size(0), 2))
                pe[:, 0] = s
                pe[:, 1] = c
                feat = torch.cat([coord, pe], dim=1)
            else:
                feat = coord
            coords.append(coord); feats.append(feat)

        # No usable frame: return empty tensors with the expected column counts.
        if len(coords) == 0:
            return torch.zeros((0,3), dtype=torch.float32), torch.zeros((0,5 if use_t_sincos else 3), dtype=torch.float32)

        coord_i = torch.cat(coords, dim=0)
        feat_i  = torch.cat(feats,  dim=0)
        coord_i, feat_i = _cap_points(coord_i, feat_i, max_points, seed_int=sample_id)
        return coord_i, feat_i

    x = torch.as_tensor(x, dtype=torch.float32)

    # (T,P,C)
    if x.ndim == 3 and x.size(-1) >= 3:
        T, P, _ = x.shape
        coord = x[..., :3].reshape(-1, 3).contiguous()
        if use_t_sincos:
            t = torch.arange(T, dtype=torch.float32)
            pe_t = torch.stack([torch.sin(t), torch.cos(t)], dim=1)
            # Repeat each frame's encoding P times so it lines up with the frame-major flattening.
            pe = pe_t.repeat_interleave(P, dim=0)
            feat = torch.cat([coord, pe.to(coord.device)], dim=1)
        else:
            feat = coord
        coord, feat = _cap_points(coord, feat, max_points, seed_int=sample_id)
        return coord, feat

    # (N,C)
    if x.ndim == 2 and x.size(-1) >= 3:
        coord = x[:, :3].contiguous()
        if use_t_sincos:
            # Single-frame input: constant encoding [sin(0), cos(0)] = [0, 1].
            pe = coord.new_empty((coord.size(0), 2))
            pe[:, 0] = 0.0
            pe[:, 1] = 1.0
            feat = torch.cat([coord, pe], dim=1)
        else:
            feat = coord
        coord, feat = _cap_points(coord, feat, max_points, seed_int=sample_id)
        return coord, feat

    raise ValueError(f"Unsupported sample shape/type: type={type(x)} shape={getattr(x, 'shape', None)}")

def ptv3_collate_cls(batch):
    """
    Collate (x, y, sample_id) triples into a zero-padded batch for the classifier.

    Every sample is flattened with _to_points_flat (sample_id seeds the optional
    deterministic subsampling), then right-padded with zeros to the longest cloud
    in the batch.

    Returns:
        (batch_dict, labels) where batch_dict holds "coord" (B, Nmax, 3),
        "feat" (B, Nmax, Cin), "lengths" (B,) true point counts and "grid_size";
        labels is a (B,) long tensor.

    Raises:
        ValueError: if any sample flattens to an empty point cloud.
    """
    clouds, targets = [], []
    for sample, target, sample_id in batch:
        xyz, attrs = _to_points_flat(sample, use_t_sincos=USE_T_SINCOS, max_points=MAX_POINTS, sample_id=int(sample_id))
        if xyz.numel() == 0:
            raise ValueError("Empty point cloud (Ni=0). Ìï¥Îãπ ÏÉòÌîå Ï†úÍ±∞/Ï≤òÎ¶¨ ÌïÑÏöî.")
        clouds.append((xyz, attrs))
        targets.append(int(target))

    sizes = [xyz.size(0) for xyz, _ in clouds]
    n_batch = len(batch)
    longest = int(max(sizes))
    width = int(clouds[0][1].size(1))

    coord_pad = torch.zeros((n_batch, longest, 3), dtype=torch.float32)
    feat_pad = torch.zeros((n_batch, longest, width), dtype=torch.float32)
    for row, (xyz, attrs) in enumerate(clouds):
        count = xyz.size(0)
        coord_pad[row, :count] = xyz
        feat_pad[row, :count] = attrs

    batch_dict = {
        "coord": coord_pad,
        "feat": feat_pad,
        "lengths": torch.tensor(sizes, dtype=torch.long),
        "grid_size": float(GRID_SIZE),
    }
    return batch_dict, torch.tensor(targets, dtype=torch.long)

class PTv3Classifier(nn.Module):
    """
    PointTransformerV3 backbone followed by per-sample mean pooling and a linear head.

    The head is an nn.LazyLinear, so its input width is inferred on the first
    forward pass.
    """

    def __init__(self, num_classes, in_channels):
        super().__init__()
        self.backbone = PointTransformerV3(
            in_channels=in_channels,
            cls_mode=True,
            enable_flash=False,
        )
        self.head = nn.LazyLinear(num_classes)

    def forward(self, batch_dict):
        """
        Classify a padded batch produced by the collate function.

        Reads "coord" (B, Nmax, 3), "feat" (B, Nmax, C), "lengths" (B,) and
        "grid_size" from `batch_dict`; returns the head's output for the B
        mean-pooled feature vectors.
        """
        coord = batch_dict["coord"]
        feat = batch_dict["feat"]
        lengths = batch_dict["lengths"]
        grid_size = batch_dict["grid_size"]

        B, Nmax, _ = coord.shape
        dev = coord.device

        # Boolean (B, Nmax) mask selecting the real (non-padding) points of each sample.
        slot = torch.arange(Nmax, device=dev).unsqueeze(0)
        valid = slot < lengths.unsqueeze(1)

        # Drop the padding and tag every remaining point with its sample index.
        sample_of_point = torch.repeat_interleave(torch.arange(B, device=dev), lengths)
        point = self.backbone({
            "coord": coord[valid],
            "feat": feat[valid],
            "batch": sample_of_point,
            "grid_size": float(grid_size),
        })
        out_feat = point.feat
        out_batch = point.batch

        # Mean-pool the backbone's output points per sample (scatter-add sums and counts).
        n_out, width = out_feat.size(0), out_feat.size(1)
        sums = out_feat.new_zeros((B, width))
        counts = out_feat.new_zeros((B, 1))
        sums.index_add_(0, out_batch, out_feat)
        counts.index_add_(0, out_batch, torch.ones((n_out, 1), device=out_feat.device, dtype=out_feat.dtype))
        pooled = sums / counts.clamp_min(1.0)

        return self.head(pooled)

def wrap_model(model: nn.Module) -> nn.Module:
    """
    Move `model` to the configured device, wrapping it in nn.DataParallel first
    when USE_DATAPARALLEL is on and more than one CUDA device is visible
    (at most MAX_GPUS devices are used).
    """
    multi_gpu = USE_DATAPARALLEL and torch.cuda.is_available() and torch.cuda.device_count() > 1
    if not multi_gpu:
        return model.to(device)

    n_use = min(MAX_GPUS, torch.cuda.device_count())
    parallel = nn.DataParallel(model, device_ids=list(range(n_use)))
    print(f"[DP] Using {n_use} GPUs")
    return parallel.to(device)

@torch.no_grad()
def evaluate(model, loader):
    """
    Return the top-1 accuracy of `model` over `loader` (0.0 for an empty loader).

    The model is put in eval mode and left there. Tensor entries of every batch
    dict are moved to the configured device in place.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    for batch_dict, target in loader:
        on_device = {
            name: value.to(device, non_blocking=True)
            for name, value in batch_dict.items()
            if torch.is_tensor(value)
        }
        batch_dict.update(on_device)
        target = target.to(device, non_blocking=True)
        predicted = model(batch_dict).argmax(1)
        n_correct += (predicted == target).sum().item()
        n_seen += target.numel()
    return n_correct / max(n_seen, 1)

def set_lr(optimizer, lr: float):
    """Overwrite the learning rate of every parameter group of `optimizer`."""
    for group in optimizer.param_groups:
        group.update(lr=lr)

def warmup_cosine_lr(step, total_steps, warmup_steps, base_lr, max_lr, min_lr):
    """
    Learning rate for optimisation step `step` (0-based).

    During the first `warmup_steps` steps the rate rises linearly from `base_lr`
    towards `max_lr`; afterwards it follows a half cosine from `max_lr` down to
    `min_lr`, reaching `min_lr` at `total_steps` and staying there.
    """
    if step >= warmup_steps:
        decay_span = max(1, total_steps - warmup_steps)
        progress = (step - warmup_steps) / decay_span
        progress = min(max(progress, 0.0), 1.0)
        return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

    ramp = step / max(1, warmup_steps)
    return base_lr + (max_lr - base_lr) * ramp

def train_one_run(model, train_loader, val_loader):
    """
    Train `model` for up to EPOCHS epochs with early stopping on validation accuracy.

    Uses Adam with cross-entropy loss, a per-step warmup + cosine learning-rate
    schedule, optional CUDA autocast (bf16 when supported, otherwise fp16 with a
    GradScaler) and optional gradient-norm clipping.

    On return the model holds the weights of the best validation epoch (if any
    epoch improved on the initial 0.0).

    Returns:
        The best validation accuracy observed.
    """
    optimizer = optim.Adam(model.parameters(), lr=BASE_LR, weight_decay=WEIGHT_DECAY)
    criterion = nn.CrossEntropyLoss()

    # AMP only on CUDA; loss scaling is only needed for fp16, not bf16.
    use_amp = bool(USE_AMP and device.type == "cuda")
    use_bf16 = bool(use_amp and torch.cuda.is_bf16_supported())
    amp_dtype = torch.bfloat16 if use_bf16 else torch.float16
    scaler = torch.cuda.amp.GradScaler(enabled=(use_amp and (not use_bf16)))

    # The schedule is defined in optimisation steps, not epochs.
    total_steps  = EPOCHS * len(train_loader)
    warmup_steps = WARMUP_EPOCHS * len(train_loader)
    global_step = 0

    best_val = 0.0
    best_state = None
    no_improve = 0
    first_step_printed = False

    for epoch in range(EPOCHS):
        model.train()
        tr_loss, tr_ok, tr_tot = 0.0, 0, 0

        for bd, y in train_loader:
            # Set the learning rate for this step before the update.
            lr = warmup_cosine_lr(global_step, total_steps, warmup_steps, BASE_LR, MAX_LR, MIN_LR)
            set_lr(optimizer, lr)
            global_step += 1

            # Move tensor entries of the batch dict (and the labels) to the device.
            for k, v in bd.items():
                if torch.is_tensor(v):
                    bd[k] = v.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            # One-off diagnostic confirming where the batch lives and which AMP dtype is in use.
            if (not first_step_printed) and device.type == "cuda":
                first_step_printed = True
                print(f"[GPU CHECK] coord={bd['coord'].device}, feat={bd['feat'].device}, y={y.device} | AMP={use_amp} dtype={amp_dtype}")

            optimizer.zero_grad(set_to_none=True)

            with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=use_amp):
                logits = model(bd)
                loss = criterion(logits, y)

            # Skip the update for NaN/inf losses; the step still counts towards the schedule
            # but not towards the epoch's loss/accuracy totals.
            if not torch.isfinite(loss):
                print(f"[WARN] non-finite loss detected. skip step. loss={float(loss)}")
                optimizer.zero_grad(set_to_none=True)
                continue

            if scaler.is_enabled():
                scaler.scale(loss).backward()
                # Unscale first so clipping acts on the true gradient norm.
                scaler.unscale_(optimizer)
                if GRAD_CLIP_NORM and GRAD_CLIP_NORM > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_NORM)
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                if GRAD_CLIP_NORM and GRAD_CLIP_NORM > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_NORM)
                optimizer.step()

            tr_loss += float(loss.item())
            tr_ok += (logits.argmax(1) == y).sum().item()
            tr_tot += y.numel()

        tr_acc = tr_ok / max(tr_tot, 1)
        va_acc = evaluate(model, val_loader)
        cur_lr = optimizer.param_groups[0]["lr"]

        # Note: TrainLoss is the sum of the per-batch losses over the epoch, not their mean.
        print(f"Epoch {epoch+1:03d}/{EPOCHS} | LR {cur_lr:.2e} | TrainLoss {tr_loss:.4f} | TrainAcc {tr_acc:.4f} | ValAcc {va_acc:.4f}")

        # An improvement must exceed MIN_DELTA; the best weights are snapshotted by deep copy.
        if (va_acc - best_val) > MIN_DELTA:
            best_val = va_acc
            best_state = deepcopy(model.state_dict())
            no_improve = 0
            print("‚úÖ Best updated")
        else:
            # Epochs inside the warmup window do not count against patience.
            if (epoch + 1) > WARMUP_EPOCHS:
                no_improve += 1

        if (epoch + 1) > WARMUP_EPOCHS and no_improve >= PATIENCE:
            print(f"‚èπÔ∏è Early stop ({PATIENCE} epochs no improve)")
            break

    # Restore the best snapshot so callers evaluate/save the best model, not the last one.
    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val

def speed_check(train_loader, model, steps=6):
    """
    Print a rough timing of data loading versus forward+backward on the last
    fetched batch.

    Up to `steps` batches are pulled from `train_loader` to time loading and
    collation; the last one is then reused for `steps` SGD updates (lr=1e-6) to
    time the model. Note that these updates do modify `model` slightly.

    If the loader yields fewer than `steps` batches the load time is averaged
    over the batches actually fetched; if it yields none (or `steps` is 0) the
    check is skipped with a message instead of raising.
    """
    from itertools import islice

    it = iter(train_loader)
    t0 = time.time()
    # islice stops quietly when the loader is exhausted, unlike repeated next().
    batches = list(islice(it, steps))
    t1 = time.time()

    n_loaded = len(batches)
    if n_loaded == 0:
        print("[SPEED] skipped: no batches available for timing")
        return

    bd, y = batches[-1]
    for k, v in bd.items():
        if torch.is_tensor(v):
            bd[k] = v.to(device, non_blocking=True)
    y = y.to(device, non_blocking=True)

    model.train()
    opt = optim.SGD(model.parameters(), lr=1e-6)

    # Synchronise around the timed region so queued CUDA work is included.
    if device.type == "cuda":
        torch.cuda.synchronize()
    t2 = time.time()
    for _ in range(steps):
        opt.zero_grad(set_to_none=True)
        out = model(bd)
        loss = out.mean()
        loss.backward()
        opt.step()
    if device.type == "cuda":
        torch.cuda.synchronize()
    t3 = time.time()

    print(f"[SPEED] dataload+collate: {(t1-t0)/n_loaded*1000:.2f} ms/step | fwd+bwd: {(t3-t2)/steps*1000:.2f} ms/step")

# ---- Select the pool of sample indices (into labeled_data) used for cross-validation ----
all_indices = np.arange(len(labeled_data), dtype=np.int64)

if USE_BALANCED_SUBSET:
    # Keep at most PER_GESTURE randomly chosen samples per gesture label.
    rng = np.random.RandomState(BALANCE_SEED)
    by_g = defaultdict(list)
    for i, s in enumerate(labeled_data):
        by_g[int(s["gesture_label"])].append(i)
    pool = []
    # Iterate labels in sorted order so the RNG is consumed in a fixed sequence.
    for g in sorted(by_g.keys()):
        idxs = by_g[g]
        rng.shuffle(idxs)
        pool.extend(idxs[:min(PER_GESTURE, len(idxs))])
    pool_indices = np.array(sorted(pool), dtype=np.int64)
    print(f"[BALANCED] pool={len(pool_indices)} (PER_GESTURE={PER_GESTURE})")
else:
    # Default: every sample takes part.
    pool_indices = all_indices
    print(f"[POOL] pool={len(pool_indices)} (all)")

def run_kfold(task_name: str):
    """
    Run K_FOLDS-fold stratified cross-validation of the PTv3 classifier.

    Args:
        task_name: "action" (uses each record's "gesture_label") or "user"
            (uses "user_label").

    For every fold the pool is split into test (the fold), validation (a
    stratified VAL_RATIO share of the rest) and train; a fresh model is trained
    with early stopping, its best weights are saved to CKPT_DIR and it is
    evaluated on the test split.

    Returns:
        A list with one dict per fold: task, fold, num_classes, best_val,
        test_acc and split sizes.

    Raises:
        ValueError: if `task_name` is not "action" or "user".

    The printed MEAN/STD line uses the sample standard deviation (ddof=1, 0.0
    for a single fold), matching the summary of the other model cell in this
    notebook so the reported spreads are comparable.
    """
    if task_name not in ["action", "user"]:
        raise ValueError(task_name)

    y_raw = np.array(
        [int(labeled_data[i]["gesture_label"] if task_name == "action" else labeled_data[i]["user_label"])
         for i in pool_indices],
        dtype=np.int64
    )
    y, num_classes = remap_labels(y_raw)

    print(f"\n[{task_name.upper()}] #classes={num_classes} | pool={len(pool_indices)} | K={K_FOLDS}")

    # Folds are built over positions 0..len(pool)-1; pool_indices maps them back to records.
    all_pos = np.arange(len(pool_indices), dtype=np.int64)
    folds = make_stratified_folds(all_pos, y, k=K_FOLDS, seed=FOLD_SEED)

    fold_results = []

    for fold in range(K_FOLDS):
        test_pos = folds[fold]
        trainval_pos = np.setdiff1d(all_pos, test_pos, assume_unique=False)
        # A different seed per fold gives each fold its own train/val split.
        train_pos, val_pos = stratified_train_val_split(trainval_pos, y, val_ratio=VAL_RATIO, seed=VAL_SEED + fold)

        train_idx = pool_indices[train_pos]
        val_idx   = pool_indices[val_pos]
        test_idx  = pool_indices[test_pos]

        ds_train = PantoIndexLabelDataset(labeled_data, train_idx, y[train_pos])
        ds_val   = PantoIndexLabelDataset(labeled_data, val_idx,   y[val_pos])
        ds_test  = PantoIndexLabelDataset(labeled_data, test_idx,  y[test_pos])

        loader_kw = dict(
            num_workers=NUM_WORKERS,
            pin_memory=PIN_MEMORY,
            persistent_workers=(NUM_WORKERS > 0),
            prefetch_factor=PREFETCH if NUM_WORKERS > 0 else None,
            collate_fn=ptv3_collate_cls,
        )

        train_loader = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True,  **loader_kw)
        val_loader   = DataLoader(ds_val,   batch_size=BATCH_SIZE, shuffle=False, **loader_kw)
        test_loader  = DataLoader(ds_test,  batch_size=BATCH_SIZE, shuffle=False, **loader_kw)

        # xyz alone, or xyz plus the two time-encoding channels.
        in_ch = 5 if USE_T_SINCOS else 3
        model = wrap_model(PTv3Classifier(num_classes=num_classes, in_channels=in_ch))

        print(f"\n----- {task_name.upper()} | FOLD {fold}/{K_FOLDS-1} -----")
        print(f"[SPLIT] train/val/test = {len(ds_train)}/{len(ds_val)}/{len(ds_test)}")
        print(f"[STABLE CFG] AMP={USE_AMP} | bf16={torch.cuda.is_available() and torch.cuda.is_bf16_supported()} | "
              f"MAX_POINTS={MAX_POINTS} (deterministic={DETERMINISTIC_SUBSAMPLE}) | GRID_SIZE={GRID_SIZE} | "
              f"LR warmup+cosine (base={BASE_LR}, max={MAX_LR}, min={MIN_LR}) | CLIP={GRAD_CLIP_NORM} | wd={WEIGHT_DECAY}")

        # Timing probe, first fold on GPU only.
        if device.type == "cuda" and fold == 0:
            speed_check(train_loader, model, steps=6)

        best_val = train_one_run(model, train_loader, val_loader)

        ckpt_path = os.path.join(CKPT_DIR, f"best_model_{MODEL_TAG}_{task_name}_fold{fold}.pth")
        torch.save(model.state_dict(), ckpt_path)
        print(f"üíæ Saved: {ckpt_path}")

        te = evaluate(model, test_loader)
        print(f"üî• {task_name} | Fold {fold} | BestVal {best_val:.4f} | TestAcc {te:.4f}")

        fold_results.append({
            "task": task_name,
            "fold": int(fold),
            "num_classes": int(num_classes),
            "best_val": float(best_val),
            "test_acc": float(te),
            "sizes": {"train": len(ds_train), "val": len(ds_val), "test": len(ds_test)}
        })

        # Release the fold's model and loaders before building the next ones.
        del model, train_loader, val_loader, test_loader, ds_train, ds_val, ds_test
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    test_accs = [r["test_acc"] for r in fold_results]
    mean_acc = float(np.mean(test_accs)) if len(test_accs) else 0.0
    # Sample standard deviation across folds; undefined for fewer than two folds, so report 0.0.
    std_acc  = float(np.std(test_accs, ddof=1)) if len(test_accs) > 1 else 0.0

    print(f"\n==================== {task_name.upper()} | {K_FOLDS}-FOLD SUMMARY ====================")
    for r in fold_results:
        sz = r["sizes"]
        print(f"fold{r['fold']} | Val(best) {r['best_val']:.4f} | Test {r['test_acc']:.4f} | "
              f"train/val/test={sz['train']}/{sz['val']}/{sz['test']}")
    print(f"MEAN¬±STD TestAcc = {mean_acc:.4f} ¬± {std_acc:.4f}")

    return fold_results

# ---- Driver: seed once, then run cross-validation for each enabled task ----
set_seed(42)

# Flat list of per-fold result dicts from all tasks that were run.
all_results = []
if RUN_ACTION:
    all_results.extend(run_kfold("action"))
if RUN_USER:
    all_results.extend(run_kfold("user"))

FASTHAR

In [None]:
# FastHAR needs to be downloaded before running this cell (the pc_encoder and utils imports below presumably come from it).
# -*- coding: utf-8 -*-  
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from pc_encoder.pointnet import pointnet_encoder
from pc_encoder.pn2 import PointNet2Encoder
from pc_encoder.pointmlp import PointMLPEncoder

from pc_encoder.conv2d_dgcnn import DGCNN_encoder as conv2d_dgcnn_encoder
from pc_encoder.conv2d_sa_dgcnn import conv2d_sa_dgcnn_encoder
from pc_encoder.conv3d_dgcnn import conv3d_dgcnn_encoder
from pc_encoder.conv3d_sa_dgcnn import conv3d_sa_dgcnn_encoder

# Transformer
from torch.nn import TransformerEncoder,TransformerEncoderLayer
import math
from timm.models.layers import trunc_normal_
# mort sort
from utils.mort_sort import simplied_morton_sorting,morton_sorting
# rnn sort
from utils.pointcloudRnn import PointCloudSortingRNN


class pcseq_classifier(nn.Module):
    """
    Point-cloud sequence classifier: per-frame encoder -> Transformer encoder
    with a learned CLS token -> MLP head.

    The per-frame encoder is chosen by `args.encoder`; points are reordered
    first, either by Morton sorting or by a sorting RNN (`args.sort == 'rnn'`).
    The final representation concatenates the Transformer output at the CLS
    position with the output at the last valid frame.
    """

    def __init__(self, args):
        """
        Build the encoder, Transformer and classification head from `args`.

        Reads args.emb_dims, args.hidden_dims, args.encoder, args.cuda,
        args.dropout, args.num_classes and args.sort.
        """
        super(pcseq_classifier, self).__init__()
        self.args = args
        self.emb_dims = args.emb_dims
        self.hidden_dims = args.hidden_dims
        self.what_encoder = args.encoder
        device = torch.device("cuda" if args.cuda else "cpu")
        # NOTE(review): there is no else branch, so an unrecognised encoder name
        # leaves self.encoder undefined.
        if self.what_encoder == 'conv2d_dgcnn':
            self.encoder = conv2d_dgcnn_encoder(args).to(device) 
        elif self.what_encoder == 'conv2d_sa_dgcnn':
            self.encoder = conv2d_sa_dgcnn_encoder(args).to(device)
        elif self.what_encoder == 'conv3d_dgcnn':
            self.encoder = conv3d_dgcnn_encoder(args).to(device)
        elif self.what_encoder == 'conv3d_sa_dgcnn':
            self.encoder = conv3d_sa_dgcnn_encoder(args).to(device)
        elif self.what_encoder == 'pointnet':
            self.encoder = pointnet_encoder(args).to(device)
        elif self.what_encoder == 'pointnet2':
            self.encoder = PointNet2Encoder(args).to(device)
        elif self.what_encoder == 'pointmlp':
            self.encoder = PointMLPEncoder(args,k_neighbors=[6],dim_expansion=[4],pre_blocks=[1],pos_blocks=[1],reducers=[2],normalize="center",res_expansion=0.5).to(device)
        
        # Two-layer Transformer encoder over the frame embeddings (marked by the author as still to be tuned).
        transformer_encoder_layer = TransformerEncoderLayer(d_model=args.emb_dims, nhead=4, dim_feedforward=256, dropout=args.dropout, activation='relu')
        self.transformer = TransformerEncoder(transformer_encoder_layer, num_layers=2)
        # Learned CLS token and its learned positional embedding.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.emb_dims))
        self.cls_pos = nn.Parameter(torch.randn(1, 1, self.emb_dims))
        trunc_normal_(self.cls_token, std=.02)
        trunc_normal_(self.cls_pos, std=.02)

        # MLP head used with the Transformer. fc1 takes hidden_dims*4 inputs while forward()
        # feeds it 2*emb_dims features, so this presumably assumes emb_dims == 2*hidden_dims — TODO confirm.
        self.fc1 = nn.Linear(args.hidden_dims*2 *2, args.hidden_dims*2) 
        self.fc1_5 = nn.Linear(args.hidden_dims*2, args.hidden_dims)
        self.fc2 = nn.Linear(args.hidden_dims, int(args.hidden_dims/2))
        self.bn1 = nn.BatchNorm1d(args.hidden_dims*2)
        self.bn1_5 = nn.BatchNorm1d(args.hidden_dims)
        self.bn2 = nn.BatchNorm1d(int(args.hidden_dims/2))
        # self.bn1 = nn.BatchNorm1d(args.hidden_dims*2, affine=True)
        # self.bn1_5 = nn.BatchNorm1d(args.hidden_dims, affine=True)
        # self.bn2 = nn.BatchNorm1d(int(args.hidden_dims/2), affine=True)
        self.dropout = nn.Dropout(args.dropout)
        self.fc = nn.Linear(int(args.hidden_dims/2), args.num_classes)

        # Point ordering method: sorting RNN when requested, otherwise Morton sorting.
        self.sort = PointCloudSortingRNN(128, 256) if args.sort == 'rnn' else None
        self.batch_sort = self.batch_pcseq_rnn if args.sort == 'rnn' else self.batch_pcseq_mort
    
    def generate_positional_encoding(self, seq_len, d_model):
        """Return a (seq_len, d_model) sinusoidal positional-encoding table (sin on even columns, cos on odd)."""
        PE = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        PE[:, 0::2] = torch.sin(position * div_term)
        PE[:, 1::2] = torch.cos(position * div_term)
        return PE
        
    def forward(self, x,frame_length):
        """
        Classify a batch of point-cloud sequences.

        Args:
            x: tensor of shape (batch, frames, points, channels), as unpacked by the sort helpers.
            frame_length: per-sample number of valid frames (indexed and summed as a tensor below).

        Returns:
            Output of the final linear layer, one row of args.num_classes values per sample.
        """
        x = self.batch_sort(x,frame_length)
        # Swap the last two axes: (b, f, n, c) -> (b, f, c, n).
        x=x.permute(0,1,3,2)
        batch_sz = x.size(0)

        if '3d' not in self.what_encoder:
            # Frame-wise encoders: go frame-major and order the batch by ascending length so that,
            # at frame m, the sequences that are still running form the tail slice [head:].
            x = x.permute(1,0,2,3)
            index_seq = np.argsort(frame_length.cpu().numpy()).tolist()
            x = x[:,index_seq]
            ascending_frame_length = sorted(frame_length)
            reverse_to_original_index = np.argsort(index_seq)
            y = torch.zeros((max(frame_length), batch_sz, self.emb_dims), device=x.device)
            if self.what_encoder in ['conv2d_sa_dgcnn','conv2d_dgcnn','pointnet2','pointmlp','pointnet']:
                head = 0
                for m in range(max(frame_length)):
                    # Skip sequences that have already ended (length <= m).
                    while(m>=ascending_frame_length[head]):
                        head += 1
                    # NOTE(review): .type(torch.FloatTensor) yields a CPU tensor that is then moved back to x.device.
                    y[m][head:]= self.encoder(x[m][head:].type(torch.FloatTensor).to(x.device))
                # Undo the length sort so rows line up with the original batch order.
                y = y[:,reverse_to_original_index]
            y = y.permute(1,0,2)
        else:
            y = self.encoder(x) #y: (batch_size,frame_num,emb_dims)

        # Transformer part
        seq_len = y.size(1)  # Assuming y is of shape (batch_size, seq_len, d_model)
        cls_tokens = self.cls_token.expand(batch_sz, -1, -1)
        cls_pos = self.cls_pos
        pos_encoding = self.generate_positional_encoding(seq_len, self.emb_dims).unsqueeze(0).to(y.device)
        # Prepend the CLS token (and its positional embedding) to every sequence.
        y = torch.cat((cls_tokens, y), dim=1)
        pos_encoding = torch.cat((cls_pos, pos_encoding), dim=1)
        y = y + pos_encoding
        # The Transformer is built without batch_first, so feed (seq, batch, dim).
        y = y.permute(1,0,2)
        # +1 accounts for the CLS token; mask is True at padded positions.
        lengths = frame_length+1
        src_key_padding_mask = (torch.arange(seq_len+1).to(x.device) >= lengths.unsqueeze(1)).to(x.device) #src_key_padding_mask: (batch_size,seq_len')
        y = self.transformer(y,src_key_padding_mask=src_key_padding_mask) 
        y = y.permute(1,0,2) 
        # Take the output at the cls_token position as one part of the whole-sequence feature.
        z1 = y[:,0,:].to(x.device)
        # Take the output at the last valid frame as the other part of the whole-sequence feature
        # (original note also mentions "/ the maximum over valid frames").
        z2 = torch.zeros(batch_sz,self.emb_dims).to(x.device)
        for i in range(batch_sz):
            # z2[i] = y[i,frame_length[i],:]
            # Index frame_length[i] is the last valid frame because CLS occupies index 0; clamp to stay in range.
            valid_idx = min(frame_length[i].item(), y.shape[1] - 1)
            z2[i] = y[i, valid_idx, :]

        z = torch.cat((z1,z2),dim=1)

        z = self.fc1(z) 
        y1 = self.bn1(z)  
        y1 = F.leaky_relu(y1,negative_slope=0.2)
        y1 = self.dropout(y1) 

        # Extra hidden layer added for the Transformer variant.
        y1 = self.fc1_5(y1)
        y1 = self.bn1_5(y1)
        y1 = F.leaky_relu(y1,negative_slope=0.2)
        y1 = self.dropout(y1)

        y1 = self.fc2(y1)  
        y1 = self.bn2(y1)   
        y1 = F.leaky_relu(y1,negative_slope=0.2)
        y1 = self.dropout(y1)

        y1 = self.fc(y1)
        return y1
    

    def batch_pcseq_mort(self,pcseq,frame_length):
        """
        Reorder the points of each sample's valid frames using morton_sorting.

        Writes the reordered points back into `pcseq` (the input tensor is
        modified in place) and returns it.
        """
        b, f, n, c = pcseq.size()
        for i in range(b):
            #sorted_indices = simplied_morton_sorting(pcseq[i,:frame_length[i],:,:])
            sorted_indices = morton_sorting(pcseq[i,:frame_length[i],:,:])
            # NOTE(review): view(-1,3) hard-codes three channels, so this assumes c == 3.
            pcseq[i,:frame_length[i],:,:] = pcseq[i,:frame_length[i],:,:].view(-1,3)[sorted_indices,:].view(frame_length[i],n,c)
        return pcseq
    
    def batch_pcseq_rnn(self,pcseq,frame_length):
        """
        Reorder the points of every valid frame using the sorting RNN (self.sort).

        Valid frames of all samples are gathered into one flat batch, sorted,
        and copied back into their original slots; returns a (b, f, n, c) tensor.
        """
        b, f, n, c = pcseq.size()
        pcseq = pcseq.view(-1,n,c)
        # Offset of each sample's first frame in the flattened (b*f, n, c) view.
        id_base = torch.arange(0, b, device=pcseq.device) * f

        # Flat indices of the valid frames only.
        indices = torch.cat([id_base[i] + torch.arange(frame_length[i], device=pcseq.device) for i in range(b)])

        assert indices.size(0) == torch.sum(frame_length).item()

        pcseq2 = torch.index_select(pcseq, 0, indices) # (indices.size(0),n,c)

        sorted_indices = self.sort(pcseq2)  # (indices.size(0),n)

        # Apply each frame's point permutation across all channels.
        pcseq2 = pcseq2.gather(1, sorted_indices.unsqueeze(2).expand(-1, -1, pcseq2.size(2)))

        pcseq.index_copy_(0, indices, pcseq2.view(-1, n, c))
        pcseq = pcseq.view(b,f,n,c)
        return pcseq

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class Args:
    """
    Plain attribute container standing in for parsed command-line arguments.

    pcseq_classifier reads encoder, emb_dims, hidden_dims, dropout, num_classes,
    sort and cuda from it; the whole object is also handed to the encoder
    constructors, which presumably read further fields (e.g. k, num_points) —
    TODO confirm against the FastHAR encoders.
    """
    exp_name = 'test_exp'
    encoder = 'conv3d_sa_dgcnn'   # selects the per-frame encoder branch in pcseq_classifier
    dataset = 'pointcloud'
    batch_size = 32
    test_batch_size = 32
    epochs = 500
    use_sgd = False
    lr = 0.001
    momentum = 0.9
    scheduler = 'cos'
    no_cuda = False
    seed = 1
    eval = False
    num_points = 256
    dropout = 0.5                 # used by the Transformer layers and the MLP head
    emb_dims = 256                # frame-embedding / Transformer model width
    hidden_dims = 128             # base width of the MLP head
    k = 6
    model_path = ''
    num_classes = 21              # output size of the final linear layer
    dir = './data'  #
    gpu = 0
    depth = 2
    sort = 'morton'               # 'rnn' selects the sorting RNN; anything else uses Morton sorting
    dataset_stride = 1
    

args = Args()
# `cuda` is derived here rather than declared on the class: enabled only if allowed and available.
args.cuda = not args.no_cuda and torch.cuda.is_available()
model = pcseq_classifier(args).to(device)

In [None]:
import os, time, math, json, random
import numpy as np
import h5py
from copy import deepcopy
from collections import defaultdict

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from classification_model import pcseq_classifier  

# -------------------------
# CONFIG
# (The code consuming most of these constants lies further down the cell; the notes below are
#  inferred from the names and from the helpers defined in this cell — verify against the
#  training code.)
# -------------------------
H5_PATH   = "../Panto_fixed.h5"           # HDF5 dataset file
CKPT_DIR  = "./ckpt_fasthar_panto_5fold"  # checkpoint directory (created below)
MODEL_TAG = "fasthar"
os.makedirs(CKPT_DIR, exist_ok=True)

# Task switches (presumably action vs. user classification).
RUN_ACTION = True
RUN_USER   = True

# Cross-validation layout.
K_FOLDS   = 5
FOLD_SEED = 42
VAL_RATIO = 0.1
VAL_SEED  = 123

# DataLoader settings.
BATCH_SIZE  = 32
NUM_WORKERS = 4
PIN_MEMORY  = True

# Optimisation settings.
EPOCHS    = 200
LR        = 1e-3
WEIGHT_DECAY = 0.0
USE_ONECYCLE = True   # presumably enables a one-cycle LR schedule peaking at MAX_LR — TODO confirm
MAX_LR    = 1e-3

# Early-stopping settings.
PATIENCE  = 12
MIN_DELTA = 1e-4

USE_AMP = True

# Optional class-balanced subset (see build_balanced_pool_indices).
USE_BALANCED_SUBSET = False
PER_CLASS = 450
BALANCE_SEED = 12

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs (including every CUDA device when available)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

def load_h5_dataset_both(filename):
    """
    Read the "data" and "labels" datasets from an HDF5 file.

    Returns (X, action, user): X as float32, and two int64 label arrays. When
    "labels" is 1-D, or 2-D with a single column, the user labels are all zeros;
    otherwise column 0 is the action label and column 1 the user label.
    """
    with h5py.File(filename, "r") as h5:
        X = np.array(h5["data"]).astype(np.float32)          # (B,T,N,3)
        labels = np.array(h5["labels"]).astype(np.int64)     # (B,) or (B,2)=[action,user]

    if labels.ndim == 1:
        action = labels.copy()
        return X, action, np.zeros_like(action)

    action = labels[:, 0].copy()
    if labels.shape[1] > 1:
        user = labels[:, 1].copy()
    else:
        user = np.zeros(labels.shape[0], dtype=np.int64)
    return X, action, user

def _normalize_global(seq_xyz, eps=1e-5):
    pts = seq_xyz.reshape(-1, 3)
    centroid = pts.mean(axis=0)
    pts = pts - centroid
    max_dist = np.linalg.norm(pts, axis=1).max()
    scale = max(max_dist, eps)
    return (pts / scale).reshape(seq_xyz.shape)

def create_sequential_pointcloud_samples(X, normalize=True, per_frame=False, eps=1e-5, verbose=True):
    """
    Return a float32 copy of X (B,T,N,3), optionally normalised.

    normalize=False: plain copy.
    normalize=True, per_frame=True: every frame is centred on its own mean and
        divided by its own largest point distance (floored at `eps`).
    normalize=True, per_frame=False: every sequence is normalised as a whole via
        _normalize_global.

    With verbose=True the elapsed time is printed.
    """
    assert X.ndim == 4 and X.shape[-1] == 3, f"Expect (B,T,N,3), got {X.shape}"
    n = len(X)
    X = X.astype(np.float32, copy=False)
    out = np.empty_like(X, dtype=np.float32)

    started = time.perf_counter()
    if not normalize:
        out[...] = X
    elif per_frame:
        for i in range(n):
            for t in range(X[i].shape[0]):
                centered = X[i, t] - X[i, t].mean(axis=0)
                radius = np.linalg.norm(centered, axis=1).max()
                out[i, t] = centered / max(radius, eps)
    else:
        for i in range(n):
            out[i] = _normalize_global(X[i], eps=eps)
    total = time.perf_counter() - started

    if verbose:
        print(f"[Normalize] {n} samples | total={total:.3f}s | avg={total/max(n,1)*1000:.3f} ms/sample")
    return out

def remap_labels(y_raw):
    """
    Map arbitrary integer labels onto the contiguous range 0..C-1.

    The distinct raw values are sorted (np.unique) and each is replaced by its
    rank. Returns (remapped int64 array, number of classes C).
    """
    raw = np.asarray(y_raw, dtype=np.int64)
    classes = np.unique(raw)
    rank_of = dict(zip(classes, range(len(classes))))
    remapped = np.array([rank_of[label] for label in raw], dtype=np.int64)
    return remapped, int(len(classes))

def make_stratified_folds(pos_indices, y, k=5, seed=42):
    """Partition ``pos_indices`` into ``k`` class-stratified folds.

    For each class present among ``y[pos_indices]`` the member indices are
    shuffled and cut into ``k`` nearly equal chunks; chunk ``f`` of every
    class goes into fold ``f``. Each fold is shuffled once more at the end.
    All randomness comes from a generator seeded with ``seed``.

    Args:
        pos_indices: indices (into ``y``) to distribute.
        y: label array indexable by ``pos_indices``.
        k: number of folds.
        seed: seed for ``np.random.default_rng``.

    Returns:
        A list of ``k`` int64 arrays, one per fold.
    """
    rng = np.random.default_rng(seed)
    pool = np.asarray(pos_indices, dtype=np.int64)
    pool_labels = y[pool]

    def _shuffled_members(cls):
        # Indices of the pool that carry label `cls`, in random order.
        members = pool[pool_labels == cls].copy()
        rng.shuffle(members)
        return members

    per_class = {int(cls): _shuffled_members(cls) for cls in np.unique(pool_labels)}

    buckets = [[] for _ in range(k)]
    for members in per_class.values():
        for bucket, chunk in zip(buckets, np.array_split(members, k)):
            bucket.extend(chunk.tolist())

    folds = []
    for bucket in buckets:
        fold = np.array(bucket, dtype=np.int64)
        rng.shuffle(fold)
        folds.append(fold)
    return folds

def stratified_train_val_split(pos_indices, y, val_ratio=0.1, seed=42):
    """Split ``pos_indices`` into train/validation parts, class by class.

    Within each class the member indices are shuffled and the first
    ``max(1, int(count * val_ratio))`` of them go to validation, so every
    class contributes at least one validation index. Both parts are shuffled
    before being returned.

    Args:
        pos_indices: indices (into ``y``) to split.
        y: label array indexable by ``pos_indices``.
        val_ratio: fraction of each class sent to validation.
        seed: seed for ``np.random.default_rng``.

    Returns:
        (train, val) as int64 arrays.
    """
    rng = np.random.default_rng(seed)
    pool = np.asarray(pos_indices, dtype=np.int64)
    pool_labels = y[pool]

    train_part = []
    val_part = []
    for cls in np.unique(pool_labels):
        members = pool[pool_labels == cls].copy()
        rng.shuffle(members)
        holdout = max(1, int(len(members) * val_ratio))
        ordered = members.tolist()
        val_part += ordered[:holdout]
        train_part += ordered[holdout:]

    rng.shuffle(train_part)
    rng.shuffle(val_part)
    return np.array(train_part, dtype=np.int64), np.array(val_part, dtype=np.int64)

def build_balanced_pool_indices(labels_raw, per_class=450, seed=12):
    """Pick at most ``per_class`` random sample indices from every class.

    Classes are visited in ascending label order; the indices of each class
    are shuffled with a ``RandomState(seed)`` and the first ``per_class`` are
    kept (all of them when the class is smaller).

    Args:
        labels_raw: iterable of integer-convertible labels, one per sample.
        per_class: cap on the number of indices kept per class.
        seed: seed for ``np.random.RandomState``.

    Returns:
        Sorted int64 array of the selected sample indices.
    """
    rng = np.random.RandomState(seed)

    members = {}
    for position, label in enumerate(labels_raw):
        members.setdefault(int(label), []).append(position)

    chosen = []
    for cls in sorted(members):
        candidates = members[cls]
        rng.shuffle(candidates)
        keep = min(per_class, len(candidates))
        chosen += candidates[:keep]

    chosen.sort()
    return np.array(chosen, dtype=np.int64)

class SeqDataset(Dataset):
    """Dataset view over a subset of rows of a shared array ``X``.

    ``idxs`` lists the rows of ``X`` / ``y`` that belong to this dataset;
    ``X`` and ``y`` themselves are stored by reference, not copied. Every
    item reports the same frame count, taken from ``X.shape[1]``.
    """

    def __init__(self, X, y, idxs):
        self.X, self.y = X, y
        self.idxs = np.asarray(idxs, dtype=np.int64)
        self.T = int(X.shape[1])  # frame count shared by all sequences

    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, i):
        """Return ``(sequence tensor, label, frame count, row index in X)``."""
        row = int(self.idxs[i])
        sequence = torch.from_numpy(self.X[row])
        target = int(self.y[row])
        return sequence, target, self.T, row

def collate_fixed(batch):
    """Collate ``(seq, label, frame_len, sample_id)`` items into batch tensors.

    All sequences must share one shape because they are stacked along a new
    leading axis. Labels, frame lengths and sample ids become 1-D long
    tensors.

    Returns:
        (x, y, fl, sid) where ``x`` is the stacked sequences, e.g. (B,T,N,3).
    """
    seqs, labels, flens, sample_ids = zip(*batch)

    def _as_long(values):
        return torch.tensor(values, dtype=torch.long)

    stacked = torch.stack(seqs, dim=0)
    return stacked, _as_long(labels), _as_long(flens), _as_long(sample_ids)


@torch.no_grad()
def eval_acc(model, loader):
    """Compute top-1 accuracy of ``model`` over every batch in ``loader``.

    The model is switched to eval mode and called as ``model(x, fl)``; the
    predicted class is the argmax over dim 1 of its output. Batches are moved
    to the module-level ``device``. An empty loader yields ``0.0``.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    for points, targets, frame_lens, _ in loader:
        points = points.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        frame_lens = frame_lens.to(device, non_blocking=True)

        predictions = model(points, frame_lens).argmax(1)
        n_correct += (predictions == targets).sum().item()
        n_seen += targets.numel()
    # max(..., 1) guards the division when the loader produced no samples.
    return n_correct / max(n_seen, 1)

def train_one_fold(model, train_loader, val_loader):
    """Train ``model`` on one fold with early stopping on validation accuracy.

    Hyper-parameters are read from module-level names that are not defined in
    this function (LR, WEIGHT_DECAY, USE_ONECYCLE, MAX_LR, EPOCHS, USE_AMP,
    MIN_DELTA, PATIENCE) together with the global ``device``.

    Args:
        model: network called as ``model(x, fl)`` and returning class logits.
        train_loader: iterable of ``(x, y, fl, sample_id)`` training batches.
        val_loader: loader passed to ``eval_acc`` after every epoch.

    Returns:
        The best validation accuracy seen, as a float. On return the model
        holds the weights from that best epoch (if any epoch improved).
    """
    model.to(device)
    opt = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    crit = nn.CrossEntropyLoss()

    # OneCycleLR is stepped once per batch, CosineAnnealingLR once per epoch
    # (see the two sch.step() call sites below).
    if USE_ONECYCLE:
        sch = optim.lr_scheduler.OneCycleLR(
            opt, max_lr=MAX_LR, epochs=EPOCHS, steps_per_epoch=max(1, len(train_loader))
        )
    else:
        sch = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=max(1, EPOCHS))

    # Mixed precision is only switched on when requested AND running on CUDA;
    # otherwise both the scaler and autocast are created disabled.
    use_amp = bool(USE_AMP and device.type == "cuda")
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    # Early-stopping bookkeeping. best_val starts below any possible accuracy
    # so the first epoch always counts as an improvement.
    best_val = -1.0
    best_state = None
    no_improve = 0

    for epoch in range(1, EPOCHS + 1):
        model.train()
        tr_ok, tr_tot, tr_loss = 0, 0, 0.0

        for x, y, fl, _sid in train_loader:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            fl = fl.to(device, non_blocking=True)

            opt.zero_grad(set_to_none=True)
            with torch.autocast(device_type="cuda", enabled=use_amp, dtype=torch.float16):
                logits = model(x, fl)
                loss = crit(logits, y)

            # With AMP the loss is scaled before backward and the optimizer
            # step goes through the scaler; otherwise a plain step is taken.
            if scaler.is_enabled():
                scaler.scale(loss).backward()
                scaler.step(opt)
                scaler.update()
            else:
                loss.backward()
                opt.step()

            if USE_ONECYCLE:
                sch.step()

            # tr_loss is the SUM of per-batch losses over the epoch (it is not
            # divided by the number of batches before being printed).
            tr_loss += float(loss.item())
            tr_ok += (logits.argmax(1) == y).sum().item()
            tr_tot += y.numel()

        if not USE_ONECYCLE:
            sch.step()

        tr_acc = tr_ok / max(tr_tot, 1)
        va_acc = eval_acc(model, val_loader)
        print(f"Epoch {epoch:03d}/{EPOCHS} | loss={tr_loss:.4f} | tr={tr_acc:.4f} | va={va_acc:.4f}")

        # An epoch only counts as an improvement when validation accuracy
        # beats the best so far by more than MIN_DELTA.
        if (va_acc - best_val) > MIN_DELTA:
            best_val = va_acc
            # Snapshot the weights so later (worse) epochs cannot overwrite them.
            best_state = deepcopy(model.state_dict())
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= PATIENCE:
                print(f"‚èπÔ∏è Early stop (patience={PATIENCE})")
                break

    # Roll the model back to the best validation checkpoint before returning.
    if best_state is not None:
        model.load_state_dict(best_state)
    return float(best_val)

def run_kfold(task_name, X_proc, action_raw, user_raw):
    """Run stratified K-fold training/evaluation for one classification task.

    For every fold a fresh ``pcseq_classifier`` is trained with
    ``train_one_fold`` (early stopping on a validation split carved out of the
    non-test data), evaluated on the held-out fold, and its weights are saved
    under ``CKPT_DIR``. A JSON summary of all folds is written at the end.

    Configuration is read from module-level names defined outside this
    function (USE_BALANCED_SUBSET, PER_CLASS, BALANCE_SEED, K_FOLDS,
    FOLD_SEED, VAL_RATIO, VAL_SEED, NUM_WORKERS, PIN_MEMORY, BATCH_SIZE,
    CKPT_DIR, MODEL_TAG, ``args`` and ``device``).

    Args:
        task_name: ``"action"`` or ``"user"``; selects which label array is
            the classification target.
        X_proc: sample array indexed by global sample id along axis 0.
        action_raw: raw action label per sample.
        user_raw: raw user label per sample.

    Returns:
        A list with one result dict per fold (task, fold, num_classes,
        best_val, test_acc, sizes).
    """
    assert task_name in ["action", "user"]

    labels_raw = np.asarray(action_raw if task_name == "action" else user_raw)

    if USE_BALANCED_SUBSET:
        pool_indices = build_balanced_pool_indices(labels_raw, per_class=PER_CLASS, seed=BALANCE_SEED)
        print(f"[{task_name}] BALANCED pool={len(pool_indices)} (PER_CLASS={PER_CLASS})")
    else:
        pool_indices = np.arange(len(labels_raw), dtype=np.int64)
        print(f"[{task_name}] pool={len(pool_indices)} (all)")

    # y_pool is indexed by POSITION inside the pool, not by global sample id.
    y_pool_raw = labels_raw[pool_indices]
    y_pool, num_classes = remap_labels(y_pool_raw)

    # SeqDataset looks labels up by GLOBAL sample id, so scatter the remapped
    # pool labels into an array aligned with X_proc. Samples outside the pool
    # keep -1 and are never referenced by any fold. This is fold-invariant,
    # hence built once here rather than inside the loop.
    y_global = np.full((len(X_proc),), -1, dtype=np.int64)
    y_global[pool_indices] = y_pool

    all_pos = np.arange(len(pool_indices), dtype=np.int64)
    folds = make_stratified_folds(all_pos, y_pool, k=K_FOLDS, seed=FOLD_SEED)

    print(f"\n[{task_name.upper()}] classes={num_classes} | K={K_FOLDS} | VAL_RATIO={VAL_RATIO}")

    # Make sure the output directory exists before any training starts, so a
    # completed fold cannot be lost to a failing torch.save.
    if CKPT_DIR:
        os.makedirs(CKPT_DIR, exist_ok=True)

    loader_kw = dict(
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        persistent_workers=(NUM_WORKERS > 0),
        collate_fn=collate_fixed
    )

    fold_results = []
    for fold in range(K_FOLDS):
        test_pos = folds[fold]
        trainval_pos = np.setdiff1d(all_pos, test_pos, assume_unique=False)
        # Vary the split seed per fold so each fold gets its own validation set.
        train_pos, val_pos = stratified_train_val_split(trainval_pos, y_pool, val_ratio=VAL_RATIO, seed=VAL_SEED + fold)

        # Translate pool positions into global sample ids.
        train_idx = pool_indices[train_pos]
        val_idx   = pool_indices[val_pos]
        test_idx  = pool_indices[test_pos]

        ds_tr = SeqDataset(X_proc, y_global, train_idx)
        ds_va = SeqDataset(X_proc, y_global, val_idx)
        ds_te = SeqDataset(X_proc, y_global, test_idx)

        tr_loader = DataLoader(ds_tr, batch_size=BATCH_SIZE, shuffle=True,  **loader_kw)
        va_loader = DataLoader(ds_va, batch_size=BATCH_SIZE, shuffle=False, **loader_kw)
        te_loader = DataLoader(ds_te, batch_size=BATCH_SIZE, shuffle=False, **loader_kw)

        # Copy the shared args so the per-task class count does not leak into
        # other tasks/folds.
        args_task = deepcopy(args)
        args_task.num_classes = int(num_classes)
        model = pcseq_classifier(args_task).to(device)

        print(f"\n----- {task_name.upper()} | FOLD {fold}/{K_FOLDS-1} -----")
        print(f"[SPLIT] train/val/test = {len(ds_tr)}/{len(ds_va)}/{len(ds_te)}")

        # train_one_fold leaves the model at its best-validation weights, so
        # the test accuracy and the checkpoint both refer to that state.
        best_val = train_one_fold(model, tr_loader, va_loader)
        test_acc = eval_acc(model, te_loader)

        ckpt_path = os.path.join(CKPT_DIR, f"best_model_{MODEL_TAG}_{task_name}_fold{fold}.pth")
        torch.save(model.state_dict(), ckpt_path)
        print(f"üíæ Saved: {ckpt_path}")
        print(f"üî• Fold {fold} | BestVal={best_val:.4f} | TestAcc={test_acc:.4f}")

        fold_results.append({
            "task": task_name,
            "fold": int(fold),
            "num_classes": int(num_classes),
            "best_val": float(best_val),
            "test_acc": float(test_acc),
            "sizes": {"train": int(len(ds_tr)), "val": int(len(ds_va)), "test": int(len(ds_te))}
        })

        # Drop the model and the loaders (and with them any persistent worker
        # processes) before the next fold allocates its own.
        del model, tr_loader, va_loader, te_loader
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    test_accs = [r["test_acc"] for r in fold_results]
    print(f"\n==== {task_name.upper()} SUMMARY ====")
    print(f"MEAN¬±STD TestAcc = {float(np.mean(test_accs)):.4f} ¬± {float(np.std(test_accs)):.4f}")

    # NOTE(review): the file name says "5fold" regardless of K_FOLDS; kept
    # as-is because other code may look the file up by this exact name.
    out_json = os.path.join(CKPT_DIR, f"{MODEL_TAG}_{task_name}_5fold_results.json")
    with open(out_json, "w") as f:
        json.dump(fold_results, f, indent=2)
    print(f"üìù Saved: {out_json}")

    return fold_results

# -------------------------
# RUN
# Load the HDF5 dataset, normalise it once, then run K-fold training for each
# enabled task. H5_PATH, RUN_ACTION and RUN_USER are configuration values
# defined elsewhere in the notebook.
# -------------------------
X, y_action_raw, y_user_raw = load_h5_dataset_both(H5_PATH)
# One centroid/scale per sample (per_frame=False); the same normalised array is
# shared by both tasks.
X_proc = create_sequential_pointcloud_samples(X, normalize=True, per_frame=False, verbose=True)

# Per-fold result dicts from every task that was run, in execution order.
all_results = []
if RUN_ACTION:
    all_results.extend(run_kfold("action", X_proc, y_action_raw, y_user_raw))
if RUN_USER:
    all_results.extend(run_kfold("user", X_proc, y_action_raw, y_user_raw))


--------