
# Three-Pipeline Baseline (IMU / Late Fusion / Early Fusion) — Improved, Kaggle-ready

**What you get**
- ✅ Keeps your original script exactly (saved as `original_script.py`)  
- ✅ Splits workflow into clear cells (setup → data → features → models → CV → submit)  
- ✅ 3 pipelines:
  - **Model1**: IMU-only (columns like `ax, ay, az, gx, gy, gz, qx, qy, qz, qw` 등 패턴 자동 탐지)
  - **Model2**: Late Fusion (모달리티별 가지를 따로 학습 후 concat)
  - **Model3**: Early Fusion (특성 전부 합쳐 단일 네트로 학습)
- ✅ 누수 방지: `StratifiedGroupKFold`(subject/session/group 컬럼 자동 감지) → 없으면 `StratifiedKFold`
- ✅ 정규화 일관성 + 윈저라이징(winsorize) + 안정적 스케일링
- ✅ Optional **Temperature Scaling** (logits calibration) + 간단 앙상블
- ✅ Robust 경로 탐지: `/kaggle/input/**`에서 `train.csv / train.parquet / train_labels.csv` 등 자동 탐색

> 이 노트북은 **원본 코드를 보존**하면서, 그 위에 **일반화 잘 되는 세 파이프라인**을 깔끔하게 얹어 실행되도록 구성했습니다.  
> 데이터셋 구조가 특이해도 최대한 **안전하게 작동**하도록 예외와 로그를 넣어놨어요.


In [None]:

# ===============
# 0. Env & Utils
# ===============
import os, sys, gc, math, time, json, random, re, glob, warnings
from pathlib import Path
from collections import defaultdict
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Optional: polars가 있으면 빨라요. 없으면 pandas 사용.
try:
    import polars as pl
    HAS_POLARS = True
except Exception:
    HAS_POLARS = False

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold, GroupKFold, StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss

# Global seed: read from the SEED environment variable when set, otherwise 42.
SEED = int(os.getenv("SEED", 42))
# Seed NumPy, Python's `random`, and PyTorch (CPU generator and all CUDA devices)
# as soon as this cell runs.
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Device string: 'cuda' when PyTorch reports CUDA as available, else 'cpu'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds NumPy, Python's ``random`` module and PyTorch (the CPU generator
    plus all CUDA devices), in that order, and then stores the seed in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value handed to each generator. Defaults to 42.

    Returns:
        None.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    # NOTE(review): presumably this only influences child processes, since the
    # running interpreter reads PYTHONHASHSEED at startup -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Re-apply the seed through the helper, which additionally sets PYTHONHASHSEED.
set_seed(SEED)

def log(s):
    """Print ``s`` converted with ``str`` and flush stdout right away.

    Args:
        s: Any object; its ``str()`` form is what gets printed.
    """
    message = str(s)
    print(message, flush=True)

def exists(p):
    """Return True when ``p`` is not None and names an existing path.

    Args:
        p: A filesystem path, or None.

    Returns:
        False for None; otherwise the result of ``os.path.exists(p)``.
    """
    if p is None:
        return False
    return os.path.exists(p)

def find_files(patterns):
    """Collect the paths matched by any of several glob patterns.

    Each pattern is expanded with ``glob.glob(..., recursive=True)``, so
    ``**`` may be used to descend into sub-directories.

    Args:
        patterns: Iterable of glob pattern strings.

    Returns:
        Sorted list of the distinct matching paths (duplicates that match
        more than one pattern appear once).
    """
    unique_hits = {
        path
        for pattern in patterns
        for path in glob.glob(pattern, recursive=True)
    }
    return sorted(unique_hits)

def winsorize_df(df, cols, lower_q=0.005, upper_q=0.995):
    """Clip the given columns of ``df`` to their own quantile range.

    For every column in ``cols`` the ``lower_q`` and ``upper_q`` quantiles are
    computed from ``df`` itself, and values outside that range are clipped to
    the nearest bound.

    Args:
        df: Input DataFrame; it is not modified.
        cols: Column labels to winsorize.
        lower_q: Lower quantile used as the clipping floor.
        upper_q: Upper quantile used as the clipping ceiling.

    Returns:
        A copy of ``df`` with the selected columns clipped. When ``cols`` is
        empty the original ``df`` object itself is returned (no copy).
    """
    if len(cols) == 0:
        return df

    # Rows are column names, columns are the two quantile levels.
    bounds = df[cols].quantile([lower_q, upper_q]).T
    floor_by_col = bounds[lower_q]
    ceil_by_col = bounds[upper_q]

    result = df.copy()
    for name in cols:
        result[name] = df[name].clip(
            lower=floor_by_col[name],
            upper=ceil_by_col[name],
        )
    return result

def infer_problem_type(y):
    """Guess whether the target ``y`` is a classification or regression target.

    Classification (binary or multi-class) is assumed by default. The target
    is treated as regression only when its dtype is integer, float or
    unsigned integer AND it has more than 20 distinct non-missing values.

    Args:
        y: Target values; must provide ``dropna()`` and ``dtype``
            (e.g. a pandas Series).

    Returns:
        ``'regression'`` or ``'classification'``.
    """
    n_distinct = len(np.unique(y.dropna()))
    numeric_target = y.dtype.kind in 'ifu'
    # Many distinct numeric values -> treat as a continuous target.
    return 'regression' if numeric_target and n_distinct > 20 else 'classification'

def guess_group_column(df):
    """Pick the column of ``df`` to use as the grouping key, if any.

    Candidate names are tried in a fixed priority order and the first one
    present in ``df.columns`` wins.

    Args:
        df: Object exposing ``columns`` (e.g. a pandas DataFrame).

    Returns:
        The matching column name, or None when no candidate is present.
    """
    candidates = ('group', 'subject', 'session', 'user', 'pid', 'sid', 'fold_group')
    return next((name for name in candidates if name in df.columns), None)

def detect_imu_columns(cols):
    """Return the column names that look like IMU sensor channels.

    A name qualifies when any of the regular expressions below matches it,
    case-insensitively.

    Args:
        cols: Iterable of column-name strings.

    Returns:
        Sorted list of the distinct matching names.
    """
    imu_patterns = (
        r'(^|_)a[xyz]$',        # ax, ay, az
        r'(^|_)g[xyz]$',        # gx, gy, gz
        r'(^|_)q[wxyz]$',       # qw, qx, qy, qz
        r'(^|_)acc(_|$)',       # acc, acc_x ...
        r'(^|_)gyro(_|$)',      # gyro
        r'(^|_)mag(_|$)',       # magnetometer and similar
    )
    matched = {
        name
        for name in cols
        if any(re.search(pat, name, flags=re.IGNORECASE) for pat in imu_patterns)
    }
    return sorted(matched)

def split_modalities(cols):
    """Split column names into modalities using simple naming rules.

    IMU columns come from ``detect_imu_columns``; audio, vision and text
    columns are those whose name contains one of the listed keywords at the
    start or right after an underscore (case-insensitive). Each rule is
    applied to every column independently, so a name can appear in more than
    one modality. Anything claimed by no rule goes to ``other``.

    Args:
        cols: Column-name strings.

    Returns:
        Dict with keys ``imu``, ``audio``, ``vision``, ``text`` and ``other``
        (in that order), each mapping to a list of column names.
    """
    keyword_rules = {
        'audio': r'(^|_)(mfcc|audio|mel|spec|rms|zcr)',
        'vision': r'(^|_)(img|vision|resnet|vit|pixel|bbox)',
        'text': r'(^|_)(bert|tfidf|nlp|text|tok|len_)',
    }

    groups = {'imu': detect_imu_columns(cols)}
    for modality, pattern in keyword_rules.items():
        groups[modality] = [name for name in cols if re.search(pattern, name, re.I)]

    # Columns claimed by at least one modality so far.
    claimed = set().union(*groups.values())
    groups['other'] = [name for name in cols if name not in claimed]
    return groups

def softmax_np(x, axis=1):
    """Numerically stable softmax of a NumPy array along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so that
    large logits do not overflow ``np.exp``.

    Args:
        x: Array of logits.
        axis: Axis along which the outputs sum to 1. Defaults to 1, which is
            the previously hard-coded behaviour; pass ``0`` or ``-1`` for
            1-D input, which the fixed ``axis=1`` could not handle.

    Returns:
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=axis, keepdims=True)

In [None]:

# ===============================
# 1. Save Original Script (Frozen)
# ===============================
# Satisfies the requirement to leave the original code untouched: it is saved as-is to a .py file.
# It can be loaded from here for analysis or reference when needed.

# Write under /kaggle/working when a /kaggle directory exists; otherwise use the current directory.
ORIGINAL_PATH = '/kaggle/working/original_script.py' if os.path.exists('/kaggle') else 'original_script.py'
with open(ORIGINAL_PATH, 'w', encoding='utf-8') as f:
    f.write("#!/usr/bin/env python\n# coding: utf-8\n\n# In[ ]:\n\n\n\n# =======================\n# CONFIG & PATH CHECK (Improved Full)\n# =======================\nimport os, glob, pprint, random, numpy as np\n\nCFG = {\n    \"INPUT_ROOT\": \"/kaggle/input\",\n    \"DATASET_DIR\": \"/kaggle/input/cmi-detect-behavior-with-sensor-data\",\n    \"OOF_DIR\": \"/kaggle/input/cmi-oof\",\n    \"CHECKPOINT_GLOBS\": [\n        \"/kaggle/input/**/best*.pt\",\n        \"/kaggle/input/**/model*.pt\",\n        \"/kaggle/input/**/best*.h5\",\n        \"/kaggle/input/**/model*.h5\",\n    ],\n    # toggles\n    \"USE_FP16\": True,\n    \"DETERMINISTIC\": True,\n    \"TTA_ENABLE\": True,\n    \"V3_ENABLE_CONTEXT_NORM\": True,\n    \"V3_WINSOR_P\": 0.01,\n    \"V3_ARM_BIN_THRESHOLD\": 52.0,  # cm\n    \"V3_USE_GROUP_TS\": True,\n    # output\n    \"SUBMISSION_NAME\": \"submission.csv\",\n}\npprint.pprint(CFG)\n\n# required\nrequired = [\n    f'{CFG[\"DATASET_DIR\"]}/test.csv',\n    f'{CFG[\"DATASET_DIR\"]}/test_demographics.csv',\n]\nprint(\"\\n== MUST ==\")\nfor p in required:\n    print(p, \"\u2705\" if os.path.exists(p) else \"\u274c\")\n\n# optional\noptional = [\n    f'{CFG[\"OOF_DIR\"]}/oof_predict1.csv',\n    f'{CFG[\"OOF_DIR\"]}/oof_predict2.csv',\n    f'{CFG[\"OOF_DIR\"]}/oof_predict3.csv',\n    f'{CFG[\"DATASET_DIR\"]}/train_demographics.csv',\n    f'{CFG[\"DATASET_DIR\"]}/train_labels.csv',\n]\nprint(\"\\n== OPTIONAL ==\")\nfor p in optional:\n    print(p, \"\u2705\" if os.path.exists(p) else \"\u274c\")\n\nprint(\"\\n== CHECKPOINT GUESS ==\")\nhits = []\nfor pat in CFG[\"CHECKPOINT_GLOBS\"]:\n    found = glob.glob(pat, recursive=True)[:6]\n    if found:\n        print(pat, \"->\", len(found), \"found; sample:\", found[:3])\n        hits.extend(found)\nif not hits:\n    print(\"No checkpoints found under /kaggle/input/** (attach your weights via 'Add data').\")\n\n# deterministic & precision\ntry:\n    import torch\n    if CFG[\"DETERMINISTIC\"]:\n        seed = 42\n       
 random.seed(seed)\n        np.random.seed(seed)\n        os.environ[\"PYTHONHASHSEED\"] = str(seed)\n        torch.manual_seed(seed)\n        torch.cuda.manual_seed_all(seed)\n        torch.backends.cudnn.deterministic = True\n        torch.backends.cudnn.benchmark = False\n        try:\n            torch.backends.cuda.matmul.allow_tf32 = False\n        except Exception:\n            pass\n    if CFG[\"USE_FP16\"]:\n        print(\"[INFO] FP16/BF16 inference allowed when supported.\")\nexcept Exception as e:\n    print(\"[WARN] Torch setup:\", e)\n\n# bridge toggles for older cells\nTTA_ENABLE = bool(CFG.get(\"TTA_ENABLE\", True))\nV3_ENABLE_CONTEXT_NORM = bool(CFG.get(\"V3_ENABLE_CONTEXT_NORM\", True))\nV3_WINSOR_P = float(CFG.get(\"V3_WINSOR_P\", 0.01))\nV3_ARM_BIN_THRESHOLD = float(CFG.get(\"V3_ARM_BIN_THRESHOLD\", 52.0))\nV3_USE_GROUP_TS = bool(CFG.get(\"V3_USE_GROUP_TS\", True))\n\n\n# ### [CMI - Detect Behavior with Sensor Data](https://www.kaggle.com/competitions/cmi-detect-behavior-with-sensor-data/code?competitionId=102335&sortBy=scoreDescending&excludeNonAccessedDatasources=true)\n# \n# \n# | &nbsp; | &nbsp; | &nbsp; | | &nbsp; | &nbsp; | &nbsp; |\n# | - | - | :-: | :- | :-: | :-: | -: |\n# | [Model 1](#Model_1) | 0.820 | &nbsp; v.6 &nbsp; | [CMI25 . IMU+THM/TOF . 
TF BlendingModel](https://www.kaggle.com/code/hideyukizushi/cmi25-imu-thm-tof-tf-blendingmodel-lb-82) |  &nbsp; grandmaster &nbsp; | [yukiZ](https://www.kaggle.com/hideyukizushi) | Japan |\n# | [Model 2](#Model_2) | 0.829 | &nbsp; v.7 &nbsp; | [5fold single bert model](https://www.kaggle.com/code/wasupandceacar/lb-0-82-5fold-single-bert-model) |  &nbsp; expert &nbsp; | [wasupandceacar](https://www.kaggle.com/wasupandceacar) | United States |\n# | [Model 3](#Model_3) | 0.835 | &nbsp; v.10 &nbsp; | [Gated GRU + Hybrid Ensemble_v02](https://www.kaggle.com/code/pepushi/gated-gru-hybrid-ensemble-v02) |  &nbsp; expert &nbsp; | [uhey](https://www.kaggle.com/pepushi) | Japan |\n# |  |  |  |  |  |  |  |\n# | h-blend | |  | Models.[1,2,3](https://www.kaggle.com/code/nina2025/cmi-ensemble-of-3-solutions-horizontal-blend?scriptVersionId=255523877)&nbsp;=&nbsp; [ 0.271, &nbsp;0.347, &nbsp;0.382 ]| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]|\n# |  |  |  |  |  |  |  |\n# |  |  |  | BEFORE FORK |  | correct weights for Models.[1,2,3](https://www.kaggle.com/code/nina2025/cmi-ensemble-of-3-solutions-horizontal-blend?scriptVersionId=255523877) |  |\n# |  |  |  |  |  |  |  |\n# |21/08| [0.852](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257241619) | v.1 | Model[.3](#predict_3) &nbsp;{'A': 0.51, 'B': 0.19, 'C': 0.30}| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]|\n# |21/08| [0.852](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257241787) | v.2 | Model[.3](#predict_3) &nbsp;{'A': 0.52, 'B': 0.19, 'C': 0.29}| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]|\n# |21/08| [0.853](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257242814) | v.3 | Model[.3](#predict_3) &nbsp;{'A': 0.53, 'B': 0.18, 'C': 0.29}| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]| Top.1\n# |21/08| 
[0.853](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257242928) | v.4 | Model[.3](#predict_3) &nbsp;{'A': 0.54, 'B': 0.18, 'C': 0.28}| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]| Top.2\n# |21/08| [0.853](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257295188) | v.5 | Model[.3](#predict_3) &nbsp;{'A': 0.535, 'B': 0.18, 'C': 0.285}| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]|\n# |22/08| [0.853](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257410653) | v.7 | Model[.3](#predict_3) &nbsp;{'A': 0.531, 'B': 0.1795, 'C': 0.2895}| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]|\n# |22/08| [0.853](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257410730) | v.8 | Model[.3](#predict_3) &nbsp;{'A': 0.532, 'B': 0.1790, 'C': 0.2890}| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]|\n# |22/08| [0.853](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257439028) | v.10 | Model[.3](#predict_3) &nbsp;[directed random scheme 1](#predict_3)| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]|\n# |22/08| [0.853](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257439157) | v.11 | Model[.3](#predict_3) &nbsp;[directed random scheme 2](#predict_3)| [ 70 x 30 ](#predict) |[ +0.0021, -0.0007, -0.0014 ]|\n# |  |  |  |  |  |  |  |\n# |  |  |  | FORK: [CMI . 
ENSEMBLE MODEL](https://www.kaggle.com/code/kerta27/cmi-ensemble-model) BY [KERT](https://www.kaggle.com/kerta27) |  |  |  |\n# |  |  |  |  |  |  |  |\n# |22/08| [0.854](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data?scriptVersionId=257476184) | v.13 | Model[.3](#predict_3) &nbsp;{'A': 0.53, 'B': 0.18, 'C': 0.29}| [ 70 x 30 ](#predict) |[+0.0021, -0.00074, -0.00136]|\n# |23/08| [?](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data) | v.14 | Model[.3](#predict_3) &nbsp;{'A': 0.53, 'B': 0.18, 'C': 0.29}| [ 70 x 30 ](#predict) |[+0.0021, -0.00055, -0.00155]|\n# |23/08| [?](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data) | v.15 | Model[.3](#predict_3) &nbsp;[directed random scheme 1](#predict_3)| [ 70 x 30 ](#predict) |[+0.0021, -0.0007, -0.0014]|\n# |23/08| [?](https://www.kaggle.com/code/nina2025/cmi-detect-behavior-with-sensor-data) | v.16 | Model[.3](#predict_3) &nbsp;[directed random scheme 2](#predict_3)| [ 70 x 30 ](#predict) |[+0.0021, -0.0007, -0.0014]|\n\n# # Model 1\n\n# In[ ]:\n\n\nimport os, json, joblib, numpy as np, pandas as pd\nimport random\nfrom pathlib import Path\nimport warnings \nwarnings.filterwarnings(\"ignore\")\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\nfrom sklearn.utils.class_weight import compute_class_weight\n\nfrom tensorflow.keras.utils import Sequence, to_categorical, pad_sequences\nfrom tensorflow.keras.models import Model, load_model\nfrom tensorflow.keras.layers import (\n    Input, Conv1D, BatchNormalization, Activation, add, MaxPooling1D, Dropout,\n    Bidirectional, LSTM, GlobalAveragePooling1D, Dense, Multiply, Reshape,\n    Lambda, Concatenate, GRU, GaussianNoise\n)\nfrom tensorflow.keras.regularizers import l2\nfrom tensorflow.keras.optimizers import Adam\nfrom tensorflow.keras.callbacks import EarlyStopping\nfrom tensorflow.keras import backend as K\nimport tensorflow as 
tf\nimport polars as pl\nfrom sklearn.model_selection import StratifiedGroupKFold\nfrom scipy.spatial.transform import Rotation as R\n\ndef seed_everything(seed):\n    os.environ['PYTHONHASHSEED'] = str(seed)\n    random.seed(seed)\n    np.random.seed(seed)\n    tf.random.set_seed(seed)\n    tf.experimental.numpy.random.seed(seed)\n    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'\n    os.environ['TF_DETERMINISTIC_OPS'] = '1'\n\nseed_everything(seed=42)\n# (Competition metric will only be imported when TRAINing)\nTRAIN = False                     # \u2190 set to True when you want to train\nRAW_DIR = Path(\"/kaggle/input/cmi-detect-behavior-with-sensor-data\")\nPRETRAINED_DIR = Path(\"/kaggle/input/cmi-d-111\")\nEXPORT_DIR = Path(\"./\")                                    # artefacts will be saved here\nBATCH_SIZE = 64\nPAD_PERCENTILE = 95\nLR_INIT = 5e-4\nWD = 3e-3\nMIXUP_ALPHA = 0.4\nEPOCHS = 160\nPATIENCE = 40\n\nprint(\"\u25b6 imports ready \u00b7 tensorflow\", tf.__version__)\n\n#Tensor Manipulations\ndef time_sum(x):\n    return K.sum(x, axis=1)\n\ndef squeeze_last_axis(x):\n    return tf.squeeze(x, axis=-1)\n\ndef expand_last_axis(x):\n    return tf.expand_dims(x, axis=-1)\n\ndef se_block(x, reduction=8):\n    ch = x.shape[-1]\n    se = GlobalAveragePooling1D()(x)\n    se = Dense(ch // reduction, activation='relu')(se)\n    se = Dense(ch, activation='sigmoid')(se)\n    se = Reshape((1, ch))(se)\n    return Multiply()([x, se])\n\n# Residual CNN Block with SE\ndef residual_se_cnn_block(x, filters, kernel_size, pool_size=2, drop=0.3, wd=1e-4):\n    shortcut = x\n    for _ in range(2):\n        x = Conv1D(filters, kernel_size, padding='same', use_bias=False,\n                   kernel_regularizer=l2(wd))(x)\n        x = BatchNormalization()(x)\n        x = Activation('relu')(x)\n    x = se_block(x)\n    if shortcut.shape[-1] != filters:\n        shortcut = Conv1D(filters, 1, padding='same', use_bias=False,\n                          
kernel_regularizer=l2(wd))(shortcut)\n        shortcut = BatchNormalization()(shortcut)\n    x = add([x, shortcut])\n    x = Activation('relu')(x)\n    x = MaxPooling1D(pool_size)(x)\n    x = Dropout(drop)(x)\n    return x\n\ndef attention_layer(inputs):\n    score = Dense(1, activation='tanh')(inputs)\n    score = Lambda(squeeze_last_axis)(score)\n    weights = Activation('softmax')(score)\n    weights = Lambda(expand_last_axis)(weights)\n    context = Multiply()([inputs, weights])\n    context = Lambda(time_sum)(context)\n    return context\n\n# Normalizes and cleans the time series sequence. \n\ndef preprocess_sequence(df_seq: pd.DataFrame, feature_cols: list[str], scaler: StandardScaler):\n    mat = df_seq[feature_cols].ffill().bfill().fillna(0).values\n    return scaler.transform(mat).astype('float32')\n\n# MixUp the data argumentation in order to regularize the neural network. \n\nclass MixupGenerator(Sequence):\n    def __init__(self, X, y, batch_size, alpha=0.2):\n        self.X, self.y = X, y\n        self.batch = batch_size\n        self.alpha = alpha\n        self.indices = np.arange(len(X))\n    def __len__(self):\n        return int(np.ceil(len(self.X) / self.batch))\n    def __getitem__(self, i):\n        idx = self.indices[i*self.batch:(i+1)*self.batch]\n        Xb, yb = self.X[idx], self.y[idx]\n        lam = np.random.beta(self.alpha, self.alpha)\n        perm = np.random.permutation(len(Xb))\n        X_mix = lam * Xb + (1-lam) * Xb[perm]\n        y_mix = lam * yb + (1-lam) * yb[perm]\n        return X_mix, y_mix\n    def on_epoch_end(self):\n        np.random.shuffle(self.indices)\n\n\ndef remove_gravity_from_acc(acc_data, rot_data):\n\n    if isinstance(acc_data, pd.DataFrame):\n        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values\n    else:\n        acc_values = acc_data\n\n    if isinstance(rot_data, pd.DataFrame):\n        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values\n    else:\n        quat_values = 
rot_data\n\n    num_samples = acc_values.shape[0]\n    linear_accel = np.zeros_like(acc_values)\n    \n    gravity_world = np.array([0, 0, 9.81])\n\n    for i in range(num_samples):\n        if np.all(np.isnan(quat_values[i])) or np.all(np.isclose(quat_values[i], 0)):\n            linear_accel[i, :] = acc_values[i, :] \n            continue\n\n        try:\n            rotation = R.from_quat(quat_values[i])\n            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)\n            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame\n        except ValueError:\n             linear_accel[i, :] = acc_values[i, :]\n             \n    return linear_accel\n\ndef calculate_angular_velocity_from_quat(rot_data, time_delta=1/200): # Assuming 200Hz sampling rate\n    if isinstance(rot_data, pd.DataFrame):\n        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values\n    else:\n        quat_values = rot_data\n\n    num_samples = quat_values.shape[0]\n    angular_vel = np.zeros((num_samples, 3))\n\n    for i in range(num_samples - 1):\n        q_t = quat_values[i]\n        q_t_plus_dt = quat_values[i+1]\n\n        if np.all(np.isnan(q_t)) or np.all(np.isclose(q_t, 0)) or \\\n           np.all(np.isnan(q_t_plus_dt)) or np.all(np.isclose(q_t_plus_dt, 0)):\n            continue\n\n        try:\n            rot_t = R.from_quat(q_t)\n            rot_t_plus_dt = R.from_quat(q_t_plus_dt)\n\n            # Calculate the relative rotation\n            delta_rot = rot_t.inv() * rot_t_plus_dt\n            \n            # Convert delta rotation to angular velocity vector\n            # The rotation vector (Euler axis * angle) scaled by 1/dt\n            # is a good approximation for small delta_rot\n            angular_vel[i, :] = delta_rot.as_rotvec() / time_delta\n        except ValueError:\n            # If quaternion is invalid, angular velocity remains zero\n            pass\n            \n    return angular_vel\n    \ndef 
calculate_angular_distance(rot_data):\n    if isinstance(rot_data, pd.DataFrame):\n        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values\n    else:\n        quat_values = rot_data\n\n    num_samples = quat_values.shape[0]\n    angular_dist = np.zeros(num_samples)\n\n    for i in range(num_samples - 1):\n        q1 = quat_values[i]\n        q2 = quat_values[i+1]\n\n        if np.all(np.isnan(q1)) or np.all(np.isclose(q1, 0)) or \\\n           np.all(np.isnan(q2)) or np.all(np.isclose(q2, 0)):\n            angular_dist[i] = 0 # \u0418\u043b\u0438 np.nan, \u0432 \u0437\u0430\u0432\u0438\u0441\u0438\u043c\u043e\u0441\u0442\u0438 \u043e\u0442 \u0436\u0435\u043b\u0430\u0435\u043c\u043e\u0433\u043e \u043f\u043e\u0432\u0435\u0434\u0435\u043d\u0438\u044f\n            continue\n        try:\n            # \u041f\u0440\u0435\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u0435 \u043a\u0432\u0430\u0442\u0435\u0440\u043d\u0438\u043e\u043d\u043e\u0432 \u0432 \u043e\u0431\u044a\u0435\u043a\u0442\u044b Rotation\n            r1 = R.from_quat(q1)\n            r2 = R.from_quat(q2)\n\n            # \u0412\u044b\u0447\u0438\u0441\u043b\u0435\u043d\u0438\u0435 \u0443\u0433\u043b\u043e\u0432\u043e\u0433\u043e \u0440\u0430\u0441\u0441\u0442\u043e\u044f\u043d\u0438\u044f: 2 * arccos(|real(p * q*)|)\n            # \u0433\u0434\u0435 p* - \u0441\u043e\u043f\u0440\u044f\u0436\u0435\u043d\u043d\u044b\u0439 \u043a\u0432\u0430\u0442\u0435\u0440\u043d\u0438\u043e\u043d q\n            # \u0412 scipy.spatial.transform.Rotation, r1.inv() * r2 \u0434\u0430\u0435\u0442 \u043e\u0442\u043d\u043e\u0441\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0435 \u0432\u0440\u0430\u0449\u0435\u043d\u0438\u0435.\n            # \u0423\u0433\u043e\u043b \u044d\u0442\u043e\u0433\u043e \u043e\u0442\u043d\u043e\u0441\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0433\u043e \u0432\u0440\u0430\u0449\u0435\u043d\u0438\u044f - \u044d\u0442\u043e \u0438 \u0435\u0441\u0442\u044c 
\u0443\u0433\u043b\u043e\u0432\u043e\u0435 \u0440\u0430\u0441\u0441\u0442\u043e\u044f\u043d\u0438\u0435.\n            relative_rotation = r1.inv() * r2\n            \n            # \u0423\u0433\u043e\u043b rotation vector \u0441\u043e\u043e\u0442\u0432\u0435\u0442\u0441\u0442\u0432\u0443\u0435\u0442 \u0443\u0433\u043b\u043e\u0432\u043e\u043c\u0443 \u0440\u0430\u0441\u0441\u0442\u043e\u044f\u043d\u0438\u044e\n            # \u041d\u043e\u0440\u043c\u0430 rotation vector - \u044d\u0442\u043e \u0443\u0433\u043e\u043b \u0432 \u0440\u0430\u0434\u0438\u0430\u043d\u0430\u0445\n            angle = np.linalg.norm(relative_rotation.as_rotvec())\n            angular_dist[i] = angle\n        except ValueError:\n            angular_dist[i] = 0 # \u0412 \u0441\u043b\u0443\u0447\u0430\u0435 \u043d\u0435\u0434\u0435\u0439\u0441\u0442\u0432\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0445 \u043a\u0432\u0430\u0442\u0435\u0440\u043d\u0438\u043e\u043d\u043e\u0432\n            pass\n            \n    return angular_dist\n\ndef build_two_branch_model(pad_len, imu_dim, tof_dim, n_classes, wd=1e-4):\n    inp = Input(shape=(pad_len, imu_dim+tof_dim))\n    imu = Lambda(lambda t: t[:, :, :imu_dim])(inp)\n    tof = Lambda(lambda t: t[:, :, imu_dim:])(inp)\n\n    # IMU deep branch\n    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.1, wd=wd)\n    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.1, wd=wd)\n\n    # TOF/Thermal lighter branch\n    x2 = Conv1D(64, 3, padding='same', use_bias=False, kernel_regularizer=l2(wd))(tof)\n    x2 = BatchNormalization()(x2); x2 = Activation('relu')(x2)\n    x2 = MaxPooling1D(2)(x2); x2 = Dropout(0.2)(x2)\n    x2 = Conv1D(128, 3, padding='same', use_bias=False, kernel_regularizer=l2(wd))(x2)\n    x2 = BatchNormalization()(x2); x2 = Activation('relu')(x2)\n    x2 = MaxPooling1D(2)(x2); x2 = Dropout(0.2)(x2)\n\n    merged = Concatenate()([x1, x2])\n\n    xa = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(wd)))(merged)\n    xb = 
Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=l2(wd)))(merged)\n    xc = GaussianNoise(0.09)(merged)\n    xc = Dense(16, activation='elu')(xc)\n    \n    x = Concatenate()([xa, xb, xc])\n    x = Dropout(0.4)(x)\n    x = attention_layer(x)\n\n    for units, drop in [(256, 0.5), (128, 0.3)]:\n        x = Dense(units, use_bias=False, kernel_regularizer=l2(wd))(x)\n        x = BatchNormalization()(x); x = Activation('relu')(x)\n        x = Dropout(drop)(x)\n\n    out = Dense(n_classes, activation='softmax', kernel_regularizer=l2(wd))(x)\n    return Model(inp, out)\n\ntmp_model = build_two_branch_model(127,7,325,18)\n\ncustom_objs = {\n    'time_sum': time_sum, 'squeeze_last_axis': squeeze_last_axis, 'expand_last_axis': expand_last_axis,\n    'se_block': se_block, 'residual_se_cnn_block': residual_se_cnn_block, 'attention_layer': attention_layer,\n}\n\n# ----------------------------------------------------------------- #\n# Load any Models\n# * is 2 Train Model Load\n# ----------------------------------------------------------------- #\n\nPRETRAINED_DIR = Path(\"/kaggle/input/cmi-d-111\")\nprint(\"\u25b6 INFERENCE MODE 1,2 \u2013 loading artefacts from\", PRETRAINED_DIR)\nfinal_feature_cols = np.load(PRETRAINED_DIR / \"feature_cols.npy\", allow_pickle=True).tolist()\npad_len        = int(np.load(PRETRAINED_DIR / \"sequence_maxlen.npy\"))\nscaler         = joblib.load(PRETRAINED_DIR / \"scaler.pkl\")\ngesture_classes = np.load(PRETRAINED_DIR / \"gesture_classes.npy\", allow_pickle=True)\n\nmodels1 = []\nprint(f\"  Loading models for ensemble inference...\")\nfor fold in range(10):\n    model_path = f\"{PRETRAINED_DIR}/D-111_{fold}.h5\"\n    print(\">>>LoadModel>>>\",model_path)\n    model = load_model(model_path, compile=False, custom_objects=custom_objs)\n    models1.append(model)\nprint(\"-\"*50)\n\nfor fold in range(10):\n    model_path = f\"{PRETRAINED_DIR}/v0629_{fold}.h5\"\n    print(\">>>LoadModel>>>\",model_path)\n    model = 
load_model(model_path, compile=False, custom_objects=custom_objs)\n    models1.append(model)\nprint(\"-\"*50)\nprint(f\"[INFO]NumUseModels:{len(models1)}\")\n\n\nPRETRAINED_DIR = Path(\"/kaggle/input/n-splits-10\")\nprint(\"\u25b6 INFERENCE MODE 3 \u2013 loading artefacts from\", PRETRAINED_DIR)\nfinal_feature_cols = np.load(PRETRAINED_DIR / \"feature_cols.npy\", allow_pickle=True).tolist()\npad_len        = int(np.load(PRETRAINED_DIR / \"sequence_maxlen.npy\"))\nscaler         = joblib.load(PRETRAINED_DIR / \"scaler.pkl\")\ngesture_classes = np.load(PRETRAINED_DIR / \"gesture_classes.npy\", allow_pickle=True)\nfor fold in range(10):\n    model_path = f\"{PRETRAINED_DIR}/gesture_model_fold_{fold}.h5\"\n    print(\">>>LoadModel>>>\",model_path)\n    model = load_model(model_path, compile=False, custom_objects=custom_objs)\n    models1.append(model)\nprint(\"-\"*50)\nprint(f\"[INFO]NumUseModels:{len(models1)}\")\n\nfor fold in range(10):\n    MODEL_DIR = \"/kaggle/input/cmi-data-tensorflow-train\"\n    \n    model_path = f\"{MODEL_DIR}/gesture_model_fold_{fold}.h5\"\n    print(\">>>LoadModel>>>\",model_path)\n    model = load_model(model_path, compile=False, custom_objects=custom_objs)\n    models1.append(model)\nprint(\"-\"*50)\nprint(f\"[INFO]NumUseModels:{len(models1)}\")\n\n# In[ ]:\n\n\ndef predict1(sequence: pl.DataFrame, demographics: pl.DataFrame) -> str:\n    df_seq = sequence.to_pandas()\n    linear_accel = remove_gravity_from_acc(df_seq, df_seq)\n    df_seq['linear_acc_x'], df_seq['linear_acc_y'], df_seq['linear_acc_z'] = linear_accel[:, 0], linear_accel[:, 1], linear_accel[:, 2]\n    df_seq['linear_acc_mag'] = np.sqrt(df_seq['linear_acc_x']**2 + df_seq['linear_acc_y']**2 + df_seq['linear_acc_z']**2)\n    df_seq['linear_acc_mag_jerk'] = df_seq['linear_acc_mag'].diff().fillna(0)\n    angular_vel = calculate_angular_velocity_from_quat(df_seq)\n    df_seq['angular_vel_x'], df_seq['angular_vel_y'], df_seq['angular_vel_z'] = angular_vel[:, 0], angular_vel[:, 
1], angular_vel[:, 2]\n    df_seq['angular_distance'] = calculate_angular_distance(df_seq)\n    \n    for i in range(1, 6):\n        pixel_cols = [f\"tof_{i}_v{p}\" for p in range(64)]; tof_data = df_seq[pixel_cols].replace(-1, np.nan)\n        df_seq[f'tof_{i}_mean'], df_seq[f'tof_{i}_std'], df_seq[f'tof_{i}_min'], df_seq[f'tof_{i}_max'] = tof_data.mean(axis=1), tof_data.std(axis=1), tof_data.min(axis=1), tof_data.max(axis=1)\n        \n    mat_unscaled = df_seq[final_feature_cols].ffill().bfill().fillna(0).values.astype('float32')\n    mat_scaled = scaler.transform(mat_unscaled)\n    pad_input = pad_sequences([mat_scaled], maxlen=pad_len, padding='post', truncating='post', dtype='float32')\n    \n    all_preds = [model.predict(pad_input, verbose=0)[0] for model in models1]\n    avg_pred = np.median(all_preds, axis=0) \n    return avg_pred\n\n# # Model 2\n\n# In[ ]:\n\n\nimport os\nimport torch\nimport kagglehub\nfrom pathlib import Path\nimport numpy as np\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom scipy.spatial.transform import Rotation as R\nfrom collections import defaultdict\nfrom torch.utils.data import Dataset, DataLoader, Subset\nfrom tqdm.notebook import tqdm\nfrom torch.amp import autocast\nimport pandas as pd\nimport polars as pl\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.utils.class_weight import compute_class_weight\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\nfrom transformers import BertConfig, BertModel\n\n\ndef remove_gravity_from_acc(acc_data, rot_data):\n    if isinstance(acc_data, pd.DataFrame):\n        acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values\n    else:\n        acc_values = acc_data\n    if isinstance(rot_data, pd.DataFrame):\n        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values\n    else:\n        quat_values = rot_data\n    num_samples = acc_values.shape[0]\n    linear_accel = np.zeros_like(acc_values)\n    gravity_world = np.array([0, 0, 
9.81])\n    for i in range(num_samples):\n        if np.all(np.isnan(quat_values[i])) or np.all(np.isclose(quat_values[i], 0)):\n            linear_accel[i, :] = acc_values[i, :] \n            continue\n        try:\n            rotation = R.from_quat(quat_values[i])\n            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)\n            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame\n        except ValueError:\n             linear_accel[i, :] = acc_values[i, :]\n    return linear_accel\n\ndef calculate_angular_velocity_from_quat(rot_data, time_delta=1/200): # Assuming 200Hz sampling rate\n    if isinstance(rot_data, pd.DataFrame):\n        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values\n    else:\n        quat_values = rot_data\n    num_samples = quat_values.shape[0]\n    angular_vel = np.zeros((num_samples, 3))\n    for i in range(num_samples - 1):\n        q_t = quat_values[i]\n        q_t_plus_dt = quat_values[i+1]\n        if np.all(np.isnan(q_t)) or np.all(np.isclose(q_t, 0)) or \\\n           np.all(np.isnan(q_t_plus_dt)) or np.all(np.isclose(q_t_plus_dt, 0)):\n            continue\n        try:\n            rot_t = R.from_quat(q_t)\n            rot_t_plus_dt = R.from_quat(q_t_plus_dt)\n            delta_rot = rot_t.inv() * rot_t_plus_dt\n            angular_vel[i, :] = delta_rot.as_rotvec() / time_delta\n        except ValueError:\n            pass\n    return angular_vel\n\ndef calculate_angular_distance(rot_data):\n    if isinstance(rot_data, pd.DataFrame):\n        quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values\n    else:\n        quat_values = rot_data\n    num_samples = quat_values.shape[0]\n    angular_dist = np.zeros(num_samples)\n    for i in range(num_samples - 1):\n        q1 = quat_values[i]\n        q2 = quat_values[i+1]\n        if np.all(np.isnan(q1)) or np.all(np.isclose(q1, 0)) or \\\n           np.all(np.isnan(q2)) or np.all(np.isclose(q2, 0)):\n            
angular_dist[i] = 0\n            continue\n        try:\n            r1 = R.from_quat(q1)\n            r2 = R.from_quat(q2)\n            relative_rotation = r1.inv() * r2\n            angle = np.linalg.norm(relative_rotation.as_rotvec())\n            angular_dist[i] = angle\n        except ValueError:\n            angular_dist[i] = 0 # \u0412 \u0441\u043b\u0443\u0447\u0430\u0435 \u043d\u0435\u0434\u0435\u0439\u0441\u0442\u0432\u0438\u0442\u0435\u043b\u044c\u043d\u044b\u0445 \u043a\u0432\u0430\u0442\u0435\u0440\u043d\u0438\u043e\u043d\u043e\u0432\n            pass\n    return angular_dist\n\n\nclass CMIFeDataset(Dataset):\n    def __init__(self, data_path, config):\n        self.config = config\n        self.init_feature_names(data_path)\n        df = self.generate_features(pd.read_csv(data_path, usecols=set(self.base_cols+self.feature_cols)))\n        self.generate_dataset(df)\n\n    def init_feature_names(self, data_path):\n        self.imu_engineered_features = [\n            'acc_mag', 'rot_angle',\n            'acc_mag_jerk', 'rot_angle_vel',\n            'linear_acc_mag', 'linear_acc_mag_jerk',\n            'angular_vel_x', 'angular_vel_y', 'angular_vel_z',\n            'angular_distance'\n        ]\n\n        self.tof_mode = self.config.get(\"tof_mode\", \"stats\")\n        self.tof_region_stats = ['mean', 'std', 'min', 'max']\n        self.tof_cols = self.generate_tof_feature_names()\n\n        columns = pd.read_csv(data_path, nrows=0).columns.tolist()\n        imu_cols_base = ['linear_acc_x', 'linear_acc_y', 'linear_acc_z']\n        imu_cols_base.extend([c for c in columns if c.startswith('rot_') and c not in ['rot_angle', 'rot_angle_vel']])\n        self.imu_cols = list(dict.fromkeys(imu_cols_base + self.imu_engineered_features))\n        self.thm_cols = [c for c in columns if c.startswith('thm_')]\n        self.feature_cols = self.imu_cols + self.thm_cols + self.tof_cols\n        self.imu_dim = len(self.imu_cols)\n        self.thm_dim = 
len(self.thm_cols)\n        self.tof_dim = len(self.tof_cols)\n        self.base_cols = ['acc_x', 'acc_y', 'acc_z',\n                          'rot_x', 'rot_y', 'rot_z', 'rot_w',\n                          'sequence_id', 'subject', \n                          'sequence_type', 'gesture', 'orientation'] + [c for c in columns if c.startswith('thm_')] + [f\"tof_{i}_v{p}\" for i in range(1, 6) for p in range(64)]\n        self.fold_cols = ['subject', 'sequence_type', 'gesture', 'orientation']\n\n    def generate_tof_feature_names(self):\n        features = []\n        if self.config.get(\"tof_raw\", False):\n            for i in range(1, 6):\n                features.extend([f\"tof_{i}_v{p}\" for p in range(64)])\n        for i in range(1, 6):\n            if self.tof_mode != 0:\n                for stat in self.tof_region_stats:\n                    features.append(f'tof_{i}_{stat}')\n                if self.tof_mode > 1:\n                    for r in range(self.tof_mode):\n                        for stat in self.tof_region_stats:\n                            features.append(f'tof{self.tof_mode}_{i}_region_{r}_{stat}')\n                if self.tof_mode == -1:\n                    for mode in [2, 4, 8, 16, 32]:\n                        for r in range(mode):\n                            for stat in self.tof_region_stats:\n                                features.append(f'tof{mode}_{i}_region_{r}_{stat}')\n        return features\n\n    def compute_features(self, df):\n        df['acc_mag'] = np.sqrt(df['acc_x']**2 + df['acc_y']**2 + df['acc_z']**2)\n        df['rot_angle'] = 2 * np.arccos(df['rot_w'].clip(-1, 1))\n        df['acc_mag_jerk'] = df.groupby('sequence_id')['acc_mag'].diff().fillna(0)\n        df['rot_angle_vel'] = df.groupby('sequence_id')['rot_angle'].diff().fillna(0)\n            \n        linear_accel_list = []\n        for _, group in df.groupby('sequence_id'):\n            acc_data_group = group[['acc_x', 'acc_y', 'acc_z']]\n            rot_data_group = 
group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]\n            linear_accel_group = remove_gravity_from_acc(acc_data_group, rot_data_group)\n            linear_accel_list.append(pd.DataFrame(linear_accel_group, columns=['linear_acc_x', 'linear_acc_y', 'linear_acc_z'], index=group.index))\n        df_linear_accel = pd.concat(linear_accel_list)\n        df = pd.concat([df, df_linear_accel], axis=1)\n        df['linear_acc_mag'] = np.sqrt(df['linear_acc_x']**2 + df['linear_acc_y']**2 + df['linear_acc_z']**2)\n        df['linear_acc_mag_jerk'] = df.groupby('sequence_id')['linear_acc_mag'].diff().fillna(0)\n    \n        angular_vel_list = []\n        for _, group in df.groupby('sequence_id'):\n            rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]\n            angular_vel_group = calculate_angular_velocity_from_quat(rot_data_group)\n            angular_vel_list.append(pd.DataFrame(angular_vel_group, columns=['angular_vel_x', 'angular_vel_y', 'angular_vel_z'], index=group.index))\n        df_angular_vel = pd.concat(angular_vel_list)\n        df = pd.concat([df, df_angular_vel], axis=1)\n    \n        angular_distance_list = []\n        for _, group in df.groupby('sequence_id'):\n            rot_data_group = group[['rot_x', 'rot_y', 'rot_z', 'rot_w']]\n            angular_dist_group = calculate_angular_distance(rot_data_group)\n            angular_distance_list.append(pd.DataFrame(angular_dist_group, columns=['angular_distance'], index=group.index))\n        df_angular_distance = pd.concat(angular_distance_list)\n        df = pd.concat([df, df_angular_distance], axis=1)\n\n        if self.tof_mode != 0:\n            new_columns = {}\n            for i in range(1, 6):\n                pixel_cols = [f\"tof_{i}_v{p}\" for p in range(64)]\n                tof_data = df[pixel_cols].replace(-1, np.nan)\n                new_columns.update({\n                    f'tof_{i}_mean': tof_data.mean(axis=1),\n                    f'tof_{i}_std': tof_data.std(axis=1),\n       
             f'tof_{i}_min': tof_data.min(axis=1),\n                    f'tof_{i}_max': tof_data.max(axis=1)\n                })\n                if self.tof_mode > 1:\n                    region_size = 64 // self.tof_mode\n                    for r in range(self.tof_mode):\n                        region_data = tof_data.iloc[:, r*region_size : (r+1)*region_size]\n                        new_columns.update({\n                            f'tof{self.tof_mode}_{i}_region_{r}_mean': region_data.mean(axis=1),\n                            f'tof{self.tof_mode}_{i}_region_{r}_std': region_data.std(axis=1),\n                            f'tof{self.tof_mode}_{i}_region_{r}_min': region_data.min(axis=1),\n                            f'tof{self.tof_mode}_{i}_region_{r}_max': region_data.max(axis=1)\n                        })\n                if self.tof_mode == -1:\n                    for mode in [2, 4, 8, 16, 32]:\n                        region_size = 64 // mode\n                        for r in range(mode):\n                            region_data = tof_data.iloc[:, r*region_size : (r+1)*region_size]\n                            new_columns.update({\n                                f'tof{mode}_{i}_region_{r}_mean': region_data.mean(axis=1),\n                                f'tof{mode}_{i}_region_{r}_std': region_data.std(axis=1),\n                                f'tof{mode}_{i}_region_{r}_min': region_data.min(axis=1),\n                                f'tof{mode}_{i}_region_{r}_max': region_data.max(axis=1)\n                            })\n            df = pd.concat([df, pd.DataFrame(new_columns)], axis=1)\n        return df\n        \n    def generate_features(self, df):\n        self.le = LabelEncoder()\n        df['gesture_int'] = self.le.transform(df['gesture'])\n        self.class_num = len(self.le.classes_)\n        \n        if all(c in df.columns for c in self.imu_engineered_features) and all(c in df.columns for c in self.tof_cols):\n            print(\"Have 
precomputed, skip compute.\")\n        else:\n            print(\"Not precomputed, do compute.\")\n            df = self.compute_features(df)\n\n        if self.config.get(\"save_precompute\", False):\n            df.to_csv(self.config.get(\"save_filename\", \"train.csv\"))\n        return df\n\n    def scale(self, data_unscaled):\n        scaler_function = self.config.get(\"scaler_function\", StandardScaler())\n        scaler = scaler_function.fit(np.concatenate(data_unscaled, axis=0))\n        return [scaler.transform(x) for x in data_unscaled], scaler\n\n    def pad(self, data_scaled, cols):\n        pad_data = np.zeros((len(data_scaled), self.pad_len, len(cols)), dtype='float32')\n        for i, seq in enumerate(data_scaled):\n            seq_len = min(len(seq), self.pad_len)\n            pad_data[i, :seq_len] = seq[:seq_len]\n        return pad_data\n\n    def get_nan_value(self, data, ratio):\n        max_value = data.max().max()\n        nan_value = -max_value * ratio\n        return nan_value\n\n    def generate_dataset(self, df):\n        seq_gp = df.groupby('sequence_id') \n        imu_unscaled, thm_unscaled, tof_unscaled = [], [], []\n        imu_mask, thm_mask, tof_mask = [], [], []\n        classes, lens = [], []\n        self.imu_nan_value = self.get_nan_value(df[self.imu_cols], self.config[\"nan_ratio\"][\"imu\"])\n        self.thm_nan_value = self.get_nan_value(df[self.thm_cols], self.config[\"nan_ratio\"][\"thm\"])\n        self.tof_nan_value = self.get_nan_value(df[self.tof_cols], self.config[\"nan_ratio\"][\"tof\"])\n\n        self.fold_feats = defaultdict(list)\n        for seq_id, seq_df in seq_gp:\n            imu_data = seq_df[self.imu_cols]\n            if self.config[\"fbfill\"][\"imu\"]:\n                imu_data = imu_data.ffill().bfill()\n            imu_unscaled.append(imu_data.fillna(self.imu_nan_value).values.astype('float32'))\n\n            thm_data = seq_df[self.thm_cols]\n            if self.config[\"fbfill\"][\"thm\"]:\n          
      thm_data = thm_data.ffill().bfill()\n            thm_unscaled.append(thm_data.fillna(self.thm_nan_value).values.astype('float32'))\n\n            tof_data = seq_df[self.tof_cols]\n            if self.config[\"fbfill\"][\"tof\"]:\n                tof_data = tof_data.ffill().bfill()\n            tof_unscaled.append(tof_data.fillna(self.tof_nan_value).values.astype('float32'))\n            \n            classes.append(seq_df['gesture_int'].iloc[0])\n            lens.append(len(imu_data))\n\n            for col in self.fold_cols:\n                self.fold_feats[col].append(seq_df[col].iloc[0])\n            \n        self.dataset_indices = classes\n        self.pad_len = int(np.percentile(lens, self.config.get(\"percent\", 95)))\n        if self.config.get(\"one_scale\", True):\n            x_unscaled = [np.concatenate([imu, thm, tof], axis=1) for imu, thm, tof in zip(imu_unscaled, thm_unscaled, tof_unscaled)]\n            x_scaled, self.x_scaler = self.scale(x_unscaled)\n            x = self.pad(x_scaled, self.imu_cols+self.thm_cols+self.tof_cols)\n            self.imu = x[..., :self.imu_dim]\n            self.thm = x[..., self.imu_dim:self.imu_dim+self.thm_dim]\n            self.tof = x[..., self.imu_dim+self.thm_dim:self.imu_dim+self.thm_dim+self.tof_dim]\n        else:\n            imu_scaled, self.imu_scaler = self.scale(imu_unscaled)\n            thm_scaled, self.thm_scaler = self.scale(thm_unscaled)\n            tof_scaled, self.tof_scaler = self.scale(tof_unscaled)\n            self.imu = self.pad(imu_scaled, self.imu_cols)\n            self.thm = self.pad(thm_scaled, self.thm_cols)\n            self.tof = self.pad(tof_scaled, self.tof_cols)\n        self.precompute_scaled_nan_values()\n        self.class_ = F.one_hot(torch.from_numpy(np.array(classes)).long(), num_classes=len(self.le.classes_)).float().numpy()\n        self.class_weight = torch.FloatTensor(compute_class_weight('balanced', classes=np.arange(len(self.le.classes_)), y=classes))\n\n    def 
precompute_scaled_nan_values(self):\n        dummy_df = pd.DataFrame(\n            np.array([[self.imu_nan_value]*len(self.imu_cols) + \n                     [self.thm_nan_value]*len(self.thm_cols) +\n                     [self.tof_nan_value]*len(self.tof_cols)]),\n            columns=self.imu_cols + self.thm_cols + self.tof_cols\n        )\n        \n        if self.config.get(\"one_scale\", True):\n            scaled = self.x_scaler.transform(dummy_df)\n            self.imu_scaled_nan = scaled[0, :self.imu_dim].mean()\n            self.thm_scaled_nan = scaled[0, self.imu_dim:self.imu_dim+self.thm_dim].mean()\n            self.tof_scaled_nan = scaled[0, self.imu_dim+self.thm_dim:self.imu_dim+self.thm_dim+self.tof_dim].mean()\n        else:\n            self.imu_scaled_nan = self.imu_scaler.transform(dummy_df[self.imu_cols])[0].mean()\n            self.thm_scaled_nan = self.thm_scaler.transform(dummy_df[self.thm_cols])[0].mean()\n            self.tof_scaled_nan = self.tof_scaler.transform(dummy_df[self.tof_cols])[0].mean()\n\n    def get_scaled_nan_tensors(self, imu, thm, tof):\n        return torch.full(imu.shape, self.imu_scaled_nan, device=imu.device), \\\n            torch.full(thm.shape, self.thm_scaled_nan, device=thm.device), \\\n            torch.full(tof.shape, self.tof_scaled_nan, device=tof.device)\n\n    def inference_process(self, sequence):\n        df_seq = sequence.to_pandas().copy()\n        if not all(c in df_seq.columns for c in self.imu_engineered_features):\n            df_seq['acc_mag'] = np.sqrt(df_seq['acc_x']**2 + df_seq['acc_y']**2 + df_seq['acc_z']**2)\n            df_seq['rot_angle'] = 2 * np.arccos(df_seq['rot_w'].clip(-1, 1))\n            df_seq['acc_mag_jerk'] = df_seq['acc_mag'].diff().fillna(0)\n            df_seq['rot_angle_vel'] = df_seq['rot_angle'].diff().fillna(0)\n            if all(col in df_seq.columns for col in ['acc_x', 'acc_y', 'acc_z', 'rot_x', 'rot_y', 'rot_z', 'rot_w']):\n                linear_accel = 
remove_gravity_from_acc(\n                    df_seq[['acc_x', 'acc_y', 'acc_z']], \n                    df_seq[['rot_x', 'rot_y', 'rot_z', 'rot_w']]\n                )\n                df_seq[['linear_acc_x', 'linear_acc_y', 'linear_acc_z']] = linear_accel\n            else:\n                df_seq['linear_acc_x'] = df_seq.get('acc_x', 0)\n                df_seq['linear_acc_y'] = df_seq.get('acc_y', 0)\n                df_seq['linear_acc_z'] = df_seq.get('acc_z', 0)\n            df_seq['linear_acc_mag'] = np.sqrt(df_seq['linear_acc_x']**2 + df_seq['linear_acc_y']**2 + df_seq['linear_acc_z']**2)\n            df_seq['linear_acc_mag_jerk'] = df_seq['linear_acc_mag'].diff().fillna(0)\n            if all(col in df_seq.columns for col in ['rot_x', 'rot_y', 'rot_z', 'rot_w']):\n                angular_vel = calculate_angular_velocity_from_quat(df_seq[['rot_x', 'rot_y', 'rot_z', 'rot_w']])\n                df_seq[['angular_vel_x', 'angular_vel_y', 'angular_vel_z']] = angular_vel\n            else:\n                df_seq[['angular_vel_x', 'angular_vel_y', 'angular_vel_z']] = 0\n            if all(col in df_seq.columns for col in ['rot_x', 'rot_y', 'rot_z', 'rot_w']):\n                df_seq['angular_distance'] = calculate_angular_distance(df_seq[['rot_x', 'rot_y', 'rot_z', 'rot_w']])\n            else:\n                df_seq['angular_distance'] = 0\n\n        if self.tof_mode != 0:\n            new_columns = {} \n            for i in range(1, 6):\n                pixel_cols = [f\"tof_{i}_v{p}\" for p in range(64)]\n                tof_data = df_seq[pixel_cols].replace(-1, np.nan)\n                new_columns.update({\n                    f'tof_{i}_mean': tof_data.mean(axis=1),\n                    f'tof_{i}_std': tof_data.std(axis=1),\n                    f'tof_{i}_min': tof_data.min(axis=1),\n                    f'tof_{i}_max': tof_data.max(axis=1)\n                })\n                if self.tof_mode > 1:\n                    region_size = 64 // self.tof_mode\n         
           for r in range(self.tof_mode):\n                        region_data = tof_data.iloc[:, r*region_size : (r+1)*region_size]\n                        new_columns.update({\n                            f'tof{self.tof_mode}_{i}_region_{r}_mean': region_data.mean(axis=1),\n                            f'tof{self.tof_mode}_{i}_region_{r}_std': region_data.std(axis=1),\n                            f'tof{self.tof_mode}_{i}_region_{r}_min': region_data.min(axis=1),\n                            f'tof{self.tof_mode}_{i}_region_{r}_max': region_data.max(axis=1)\n                        })\n                if self.tof_mode == -1:\n                    for mode in [2, 4, 8, 16, 32]:\n                        region_size = 64 // mode\n                        for r in range(mode):\n                            region_data = tof_data.iloc[:, r*region_size : (r+1)*region_size]\n                            new_columns.update({\n                                f'tof{mode}_{i}_region_{r}_mean': region_data.mean(axis=1),\n                                f'tof{mode}_{i}_region_{r}_std': region_data.std(axis=1),\n                                f'tof{mode}_{i}_region_{r}_min': region_data.min(axis=1),\n                                f'tof{mode}_{i}_region_{r}_max': region_data.max(axis=1)\n                            })\n            df_seq = pd.concat([df_seq, pd.DataFrame(new_columns)], axis=1)\n        \n        imu_unscaled = df_seq[self.imu_cols]\n        if self.config[\"fbfill\"][\"imu\"]:\n            imu_unscaled = imu_unscaled.ffill().bfill()\n        imu_unscaled = imu_unscaled.fillna(self.imu_nan_value).values.astype('float32')\n\n        thm_unscaled = df_seq[self.thm_cols]\n        if self.config[\"fbfill\"][\"thm\"]:\n            thm_unscaled = thm_unscaled.ffill().bfill()\n        thm_unscaled = thm_unscaled.fillna(self.thm_nan_value).values.astype('float32')\n\n        tof_unscaled = df_seq[self.tof_cols]\n        if self.config[\"fbfill\"][\"tof\"]:\n            
tof_unscaled = tof_unscaled.ffill().bfill()\n        tof_unscaled = tof_unscaled.fillna(self.tof_nan_value).values.astype('float32')\n        \n        if self.config.get(\"one_scale\", True):\n            x_unscaled = np.concatenate([imu_unscaled, thm_unscaled, tof_unscaled], axis=1)\n            x_scaled = self.x_scaler.transform(x_unscaled)\n            imu_scaled = x_scaled[..., :self.imu_dim]\n            thm_scaled = x_scaled[..., self.imu_dim:self.imu_dim+self.thm_dim]\n            tof_scaled = x_scaled[..., self.imu_dim+self.thm_dim:self.imu_dim+self.thm_dim+self.tof_dim]\n        else:\n            imu_scaled = self.imu_scaler.transform(imu_unscaled)\n            thm_scaled = self.thm_scaler.transform(thm_unscaled)\n            tof_scaled = self.tof_scaler.transform(tof_unscaled)\n\n        combined = np.concatenate([imu_scaled, thm_scaled, tof_scaled], axis=1)\n        padded = np.zeros((self.pad_len, combined.shape[1]), dtype='float32')\n        seq_len = min(combined.shape[0], self.pad_len)\n        padded[:seq_len] = combined[:seq_len]\n        imu = padded[..., :self.imu_dim]\n        thm = padded[..., self.imu_dim:self.imu_dim+self.thm_dim]\n        tof = padded[..., self.imu_dim+self.thm_dim:self.imu_dim+self.thm_dim+self.tof_dim]\n        \n        return torch.from_numpy(imu).float().unsqueeze(0), torch.from_numpy(thm).float().unsqueeze(0), torch.from_numpy(tof).float().unsqueeze(0)\n\n    def __getitem__(self, idx):\n        return self.imu[idx], self.thm[idx], self.tof[idx], self.class_[idx]\n\n    def __len__(self):\n        return len(self.class_)\n\nclass CMIFoldDataset:\n    def __init__(self, data_path, config, full_dataset_function, n_folds=5, random_seed=0):\n        self.full_dataset = full_dataset_function(data_path=data_path, config=config)\n        self.imu_dim = self.full_dataset.imu_dim\n        self.thm_dim = self.full_dataset.thm_dim\n        self.tof_dim = self.full_dataset.tof_dim\n        self.le = self.full_dataset.le\n        
self.class_names = self.full_dataset.le.classes_\n        self.class_weight = self.full_dataset.class_weight\n        all_indices = np.arange(len(self.full_dataset))\n        self.n_folds = n_folds\n        self.skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)\n        self.folds = list(self.skf.split(all_indices, np.array(self.full_dataset.dataset_indices)))\n    \n    def get_fold_datasets(self, fold_idx):\n        if self.folds is None or fold_idx >= self.n_folds:\n            return None, None\n        fold_train_idx, fold_valid_idx = self.folds[fold_idx]\n        return Subset(self.full_dataset, fold_train_idx), Subset(self.full_dataset, fold_valid_idx)\n\n    def print_fold_stats(self):\n        def get_label_counts(subset):\n            counts = {name: 0 for name in self.class_names}\n            if subset is None:\n                return counts\n            for idx in subset.indices:\n                label_idx = self.full_dataset.dataset_indices[idx]\n                counts[self.class_names[label_idx]] += 1\n            return counts\n        \n        print(\"\\n\u4ea4\u53c9\u9a8c\u8bc1\u6298\u53e0\u7edf\u8ba1:\")\n        for fold_idx in range(self.n_folds):\n            train_fold, valid_fold = self.get_fold_datasets(fold_idx)\n            train_counts = get_label_counts(train_fold)\n            valid_counts = get_label_counts(valid_fold)\n                \n            print(f\"\\nFold {fold_idx + 1}:\")\n            print(f\"{'\u7c7b\u522b':<50} {'\u8bad\u7ec3\u96c6':<10} {'\u9a8c\u8bc1\u96c6':<10}\")\n            for name in self.class_names:\n                print(f\"{name:<50} {train_counts[name]:<10} {valid_counts[name]:<10}\")\n\n\nclass SEBlock(nn.Module):\n    def __init__(self, channels, reduction = 8):\n        super().__init__()\n        self.fc1 = nn.Linear(channels, channels // reduction, bias=True)\n        self.fc2 = nn.Linear(channels // reduction, channels, bias=True)\n        self.sigmoid = 
nn.Sigmoid()\n\n    def forward(self, x):\n        # x: (B, C, L)\n        se = F.adaptive_avg_pool1d(x, 1).squeeze(-1)      # -> (B, C)\n        se = F.relu(self.fc1(se), inplace=True)          # -> (B, C//r)\n        se = self.sigmoid(self.fc2(se)).unsqueeze(-1)    # -> (B, C, 1)\n        return x * se                \n\nclass ResNetSEBlock(nn.Module):\n    def __init__(self, in_channels, out_channels, wd = 1e-4):\n        super().__init__()\n        self.conv1 = nn.Conv1d(in_channels, out_channels,\n                               kernel_size=3, padding=1, bias=False)\n        self.bn1 = nn.BatchNorm1d(out_channels)\n        self.conv2 = nn.Conv1d(out_channels, out_channels,\n                               kernel_size=3, padding=1, bias=False)\n        self.bn2 = nn.BatchNorm1d(out_channels)\n        # SE\n        self.se = SEBlock(out_channels)\n        \n        if in_channels != out_channels:\n            self.shortcut = nn.Sequential(\n                nn.Conv1d(in_channels, out_channels, kernel_size=1,\n                          padding=0, bias=False),\n                nn.BatchNorm1d(out_channels)\n            )\n        else:\n            self.shortcut = nn.Identity()\n\n        self.relu = nn.ReLU(inplace=True)\n\n    def forward(self, x) :\n        identity = self.shortcut(x)              # (B, out, L)\n        out = self.relu(self.bn1(self.conv1(x)))\n        out = self.bn2(self.conv2(out))\n        out = self.se(out)                       # (B, out, L)\n        out = out + identity\n        return self.relu(out)\n\nclass CMIModel(nn.Module):\n    def __init__(self, imu_dim, thm_dim, tof_dim, n_classes, **kwargs):\n        super().__init__()\n        self.imu_branch = nn.Sequential(\n            self.residual_se_cnn_block(imu_dim, kwargs[\"imu1_channels\"], kwargs[\"imu1_layers\"],\n                                       drop=kwargs[\"imu1_dropout\"]),\n            self.residual_se_cnn_block(kwargs[\"imu1_channels\"], kwargs[\"feat_dim\"], 
kwargs[\"imu2_layers\"],\n                                       drop=kwargs[\"imu2_dropout\"])\n        )\n\n        self.thm_branch = nn.Sequential(\n            nn.Conv1d(thm_dim, kwargs[\"thm1_channels\"], kernel_size=3, padding=1, bias=False),\n            nn.BatchNorm1d(kwargs[\"thm1_channels\"]),\n            nn.ReLU(inplace=True),\n            nn.MaxPool1d(2, ceil_mode=True),\n            nn.Dropout(kwargs[\"thm1_dropout\"]),\n            \n            nn.Conv1d(kwargs[\"thm1_channels\"], kwargs[\"feat_dim\"], kernel_size=3, padding=1, bias=False),\n            nn.BatchNorm1d(kwargs[\"feat_dim\"]),\n            nn.ReLU(inplace=True),\n            nn.MaxPool1d(2, ceil_mode=True),\n            nn.Dropout(kwargs[\"thm2_dropout\"])\n        )\n        \n        self.tof_branch = nn.Sequential(\n            nn.Conv1d(tof_dim, kwargs[\"tof1_channels\"], kernel_size=3, padding=1, bias=False),\n            nn.BatchNorm1d(kwargs[\"tof1_channels\"]),\n            nn.ReLU(inplace=True),\n            nn.MaxPool1d(2, ceil_mode=True),\n            nn.Dropout(kwargs[\"tof1_dropout\"]),\n            \n            nn.Conv1d(kwargs[\"tof1_channels\"], kwargs[\"feat_dim\"], kernel_size=3, padding=1, bias=False),\n            nn.BatchNorm1d(kwargs[\"feat_dim\"]),\n            nn.ReLU(inplace=True),\n            nn.MaxPool1d(2, ceil_mode=True),\n            nn.Dropout(kwargs[\"tof2_dropout\"])\n        )\n\n        self.cls_token = nn.Parameter(torch.zeros(1, 1, kwargs[\"feat_dim\"]))\n        self.bert = BertModel(BertConfig(\n            hidden_size=kwargs[\"feat_dim\"],\n            num_hidden_layers=kwargs[\"bert_layers\"],\n            num_attention_heads=kwargs[\"bert_heads\"],\n            intermediate_size=kwargs[\"feat_dim\"]*4\n        ))\n        \n        self.classifier = nn.Sequential(\n            nn.Linear(kwargs[\"feat_dim\"], kwargs[\"cls1_channels\"], bias=False),\n            nn.BatchNorm1d(kwargs[\"cls1_channels\"]),\n            nn.ReLU(inplace=True),\n    
        nn.Dropout(kwargs[\"cls1_dropout\"]),\n            nn.Linear(kwargs[\"cls1_channels\"], kwargs[\"cls2_channels\"], bias=False),\n            nn.BatchNorm1d(kwargs[\"cls2_channels\"]),\n            nn.ReLU(inplace=True),\n            nn.Dropout(kwargs[\"cls2_dropout\"]),\n            nn.Linear(kwargs[\"cls2_channels\"], n_classes)\n        )\n    \n    def residual_se_cnn_block(self, in_channels, out_channels, num_layers, pool_size=2, drop=0.3, wd=1e-4):\n        return nn.Sequential(\n            *[ResNetSEBlock(in_channels=in_channels, out_channels=in_channels) for i in range(num_layers)],\n            ResNetSEBlock(in_channels, out_channels, wd=wd),\n            nn.MaxPool1d(pool_size),\n            nn.Dropout(drop)\n        )\n    \n    def forward(self, imu, thm, tof):\n        imu_feat = self.imu_branch(imu.permute(0, 2, 1))\n        thm_feat = self.thm_branch(thm.permute(0, 2, 1))\n        tof_feat = self.tof_branch(tof.permute(0, 2, 1))\n        \n        bert_input = torch.cat([imu_feat, thm_feat, tof_feat], dim=-1).permute(0, 2, 1)\n        cls_token = self.cls_token.expand(bert_input.size(0), -1, -1)  # (B,1,H)\n        bert_input = torch.cat([cls_token, bert_input], dim=1)  # (B,T+1,H)\n        outputs = self.bert(inputs_embeds=bert_input)\n        pred_cls = outputs.last_hidden_state[:, 0, :]\n\n        return self.classifier(pred_cls)\n\n\nCUDA0 = \"cuda:0\"\nseed = 0\nbatch_size = 64\nnum_workers = 4\nn_folds = 5\n\nroot_dir = Path(\"/kaggle/input/cmi-detect-behavior-with-sensor-data\")\nuniverse_csv_path = Path(\"/kaggle/input/cmi-precompute/pytorch/all/1/tof-1_raw.csv\")\n\ndeterministic = kagglehub.package_import('wasupandceacar/deterministic').deterministic\ndeterministic.init_all(seed)\ndef init_dataset():\n    dataset_config = {\n        \"percent\": 95,\n        \"scaler_config\": StandardScaler(),\n        \"nan_ratio\": {\n            \"imu\": 0,\n            \"thm\": 0,\n            \"tof\": 0,\n        },\n        \"fbfill\": {\n    
        \"imu\": True,\n            \"thm\": True,\n            \"tof\": True,\n        },\n        \"one_scale\": True,\n        \"tof_raw\": True,\n        \"tof_mode\": 16,\n        \"save_precompute\": False,\n    }\n    dataset = CMIFoldDataset(universe_csv_path, dataset_config,\n                             n_folds=n_folds, random_seed=seed, full_dataset_function=CMIFeDataset)\n    dataset.print_fold_stats()\n    return dataset\n\ndef get_fold_dataset(dataset, fold):\n    _, valid_dataset = dataset.get_fold_datasets(fold)\n    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)\n    return valid_loader\n\ndataset = init_dataset()\n\nmodel_function = CMIModel\nmodel_args = {\"feat_dim\": 500,\n              \"imu1_channels\": 219, \"imu1_dropout\": 0.2946731587132302, \"imu2_dropout\": 0.2697745571929592,\n              \"imu1_weight_decay\": 0.0014824054650601245, \"imu2_weight_decay\": 0.002742543773142381,\n              \"imu1_layers\": 0, \"imu2_layers\": 0,\n              \"thm1_channels\": 82, \"thm1_dropout\": 0.2641274454844602, \"thm2_dropout\": 0.302896343020985, \n              \"tof1_channels\": 82, \"tof1_dropout\": 0.2641274454844602, \"tof2_dropout\": 0.3028963430209852, \n              \"bert_layers\": 8, \"bert_heads\": 10,\n              \"cls1_channels\": 937, \"cls2_channels\": 303, \"cls1_dropout\": 0.2281834512100508, \"cls2_dropout\": 0.22502521933558461}\nmodel_args.update({\n    \"imu_dim\": dataset.full_dataset.imu_dim, \n    \"thm_dim\": dataset.full_dataset.thm_dim,\n    \"tof_dim\": dataset.full_dataset.tof_dim,\n    \"n_classes\": dataset.full_dataset.class_num})\nmodel_dir = Path(\"/kaggle/input/cmi-models-public/pytorch/train_fold_model05_tof16_raw/1\")\n\nmodel_dicts = [\n    {\n        \"model_function\": model_function,\n        \"model_args\": model_args,\n        \"model_path\": model_dir / f\"fold{fold}/best_ema.pt\",\n    } for fold in range(n_folds)\n]\n\nmodels2 = 
list()\nfor model_dict in model_dicts:\n    model_function = model_dict[\"model_function\"]\n    model_args = model_dict[\"model_args\"]\n    model_path = model_dict[\"model_path\"]\n    model = model_function(**model_args).to(CUDA0)\n    state_dict = {k.replace(\"_orig_mod.\", \"\"): v for k,v in torch.load(model_path).items()}\n    model.load_state_dict(state_dict)\n    model = model.eval()\n    models2.append(model)\n\n\nmetric_package = kagglehub.package_import('wasupandceacar/cmi-metric')\n\nmetric = metric_package.Metric()\nimu_only_metric = metric_package.Metric()\n\ndef to_cuda(*tensors):\n    return [tensor.to(CUDA0) for tensor in tensors]\n\ndef predict_fold(model, imu, thm, tof):\n    pred = model(imu, thm, tof)\n    return pred\n\ndef valid(model, valid_bar):\n    with torch.no_grad():\n        for imu, thm, tof, y in valid_bar:\n            imu, thm, tof, y = to_cuda(imu, thm, tof, y)\n            with autocast(device_type='cuda', dtype=torch.bfloat16): \n                logits = predict_fold(model, imu, thm, tof)\n            metric.add(dataset.le.classes_[y.argmax(dim=1).cpu()], dataset.le.classes_[logits.argmax(dim=1).cpu()])\n            _, thm, tof = dataset.full_dataset.get_scaled_nan_tensors(imu, thm, tof)\n            with autocast(device_type='cuda', dtype=torch.bfloat16): \n                logits = model(imu, thm, tof)\n            imu_only_metric.add(dataset.le.classes_[y.argmax(dim=1).cpu()], dataset.le.classes_[logits.argmax(dim=1).cpu()])\n\n# for fold, model in enumerate(models2):\n#     valid_loader = get_fold_dataset(dataset, fold)\n#     valid_bar = tqdm(valid_loader, desc=f\"Valid\", position=0, leave=False)\n#     valid(model, valid_bar)\n\n# print(f\"\"\"\n# Normal score: {metric.score()}\n# IMU only score: {imu_only_metric.score()}\n# \"\"\")\n\ndef avg_predict(models, imu, thm, tof):\n    outputs = []\n    with autocast(device_type='cuda'):\n        for model in models:\n            logits = model(imu, thm, tof)\n        
outputs.append(logits)\n    return torch.mean(torch.stack(outputs), dim=0)\n\n# In[ ]:\n\n\ndef predict2(sequence: pl.DataFrame, demographics: pl.DataFrame) -> str:\n    imu, thm, tof = dataset.full_dataset.inference_process(sequence)\n    with torch.no_grad():\n        imu, thm, tof = to_cuda(imu, thm, tof)\n        logits = avg_predict(models2, imu, thm, tof)\n        probabilities = F.softmax(logits, dim=1).cpu().numpy()\n    return probabilities\n\n# # Model 3\n\n# In[ ]:\n\n\nimport os\nimport json\nimport joblib\nimport numpy as np\nimport pandas as pd\nfrom pathlib import Path\nimport warnings\nimport random\nimport math\nimport matplotlib.pyplot as plt\nimport polars as pl\nimport tensorflow as tf\nfrom tensorflow.keras import backend as K\nfrom tensorflow.keras.models import Model, load_model\nfrom tensorflow.keras.layers import (\n    Input, Conv1D, BatchNormalization, Activation, add, MaxPooling1D, Dropout,\n    Bidirectional, GRU, GlobalAveragePooling1D, Dense, Multiply, Reshape,\n    Lambda, Concatenate\n)\nfrom tensorflow.keras.optimizers import Adam as AdamTF\nfrom tensorflow.keras.regularizers import l2\nfrom tensorflow.keras.utils import Sequence, to_categorical, pad_sequences\nfrom tensorflow.keras.callbacks import EarlyStopping\nfrom tensorflow.keras.optimizers.schedules import CosineDecay\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.utils.data import Dataset, DataLoader\nfrom torch.optim import Adam as AdamTorch\nfrom torch.optim.lr_scheduler import CosineAnnealingWarmRestarts\n\nfrom sklearn.model_selection import StratifiedGroupKFold\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\nfrom sklearn.utils.class_weight import compute_class_weight\nfrom scipy.spatial.transform import Rotation as R\nfrom scipy.signal import firwin\n\n# \u8a55\u4fa1\u30e1\u30c8\u30ea\u30af\u30b9\u306f\u30ed\u30fc\u30ab\u30eb\u691c\u8a3c/\u5b66\u7fd2\u6642\u306b\u306e\u307f\u30a4\u30f3\u30dd\u30fc\u30c8\ntry:\n    
from cmi_2025_metric_copy_for_import import CompetitionMetric\nexcept ImportError:\n    CompetitionMetric = None\n    print(\"CompetitionMetric could not be imported. OOF/CV score will not be calculated.\")\n\ndef seed_everything(seed=42):\n    \"\"\"\n    \u5b9f\u884c\u74b0\u5883\u306e\u4e71\u6570\u30b7\u30fc\u30c9\u3092\u7d71\u4e00\u7684\u306b\u8a2d\u5b9a\u3059\u308b\u95a2\u6570\u3002\n    \"\"\"\n    os.environ['PYTHONHASHSEED'] = str(seed)\n    random.seed(seed)\n    np.random.seed(2025)\n    tf.random.set_seed(seed)\n    tf.experimental.numpy.random.seed(seed)\n    torch.manual_seed(seed)\n    torch.cuda.manual_seed(seed)\n    torch.cuda.manual_seed_all(seed)\n    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'\n    os.environ['TF_DETERMINISTIC_OPS'] = '1'\n    # torch.backends.cudnn.deterministic = True # \u30d1\u30d5\u30a9\u30fc\u30de\u30f3\u30b9\u304c\u4f4e\u4e0b\u3059\u308b\u53ef\u80fd\u6027\u304c\u3042\u308b\u305f\u3081\u30b3\u30e1\u30f3\u30c8\u30a2\u30a6\u30c8\n    # torch.backends.cudnn.benchmark = False\n\nseed_everything(seed=42)\nwarnings.filterwarnings(\"ignore\")\n\nTRAIN = False\n\n# --- \u30d1\u30b9\u8a2d\u5b9a ---\nRAW_DIR = Path(\"/kaggle/input/cmi-detect-behavior-with-sensor-data\")\n# YOUR_MODELS_DIR\u306f\u81ea\u5206\u306e\u5b66\u7fd2\u6e08\u307f\u30e2\u30c7\u30eb\u304c\u683c\u7d0d\u3055\u308c\u3066\u3044\u308bKaggle\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u306e\u30d1\u30b9\u306b\u8a2d\u5b9a\u3057\u3066\u304f\u3060\u3055\u3044\nYOUR_MODELS_DIR = Path(\"/kaggle/input/cmi-data-gated-gru\") # \u2605\u2605\u2605 \u81ea\u5206\u306e\u30e2\u30c7\u30eb\u30d1\u30b9\u306b\u5909\u66f4 \u2605\u2605\u2605\nPUBLIC_TF_MODEL_DIR = Path(\"/kaggle/input/lb-0-78-quaternions-tf-bilstm-gru-attention\")\nPUBLIC_PT_MODEL_DIR = Path(\"/kaggle/input/cmi3-models-p\")\nEXPORT_DIR = Path(\"./\") # \u5b66\u7fd2\u6e08\u307f\u30e2\u30c7\u30eb\u3084\u30a2\u30fc\u30c6\u30a3\u30d5\u30a1\u30af\u30c8\u306e\u4fdd\u5b58\u5148\n\n# --- 
\u30e2\u30c7\u30eb\u5b66\u7fd2\u30cf\u30a4\u30d1\u30fc\u30d1\u30e9\u30e1\u30fc\u30bf ---\nBATCH_SIZE = 64          # \u30d0\u30c3\u30c1\u30b5\u30a4\u30ba\nPAD_PERCENTILE = 95      # \u30b7\u30fc\u30b1\u30f3\u30b9\u9577\u306e\u30d1\u30c7\u30a3\u30f3\u30b0\u3092\u6c7a\u3081\u308b\u305f\u3081\u306e\u30d1\u30fc\u30bb\u30f3\u30bf\u30a4\u30eb\u5024\nLR_INIT = 4e-4           # \u5b66\u7fd2\u7387\u306e\u521d\u671f\u5024 (\u5fae\u8abf\u6574)\nWD = 3e-3                # Weight Decay\uff08L2\u6b63\u5247\u5316\uff09\u306e\u4fc2\u6570\nMIXUP_ALPHA = 0.4        # Mixup\u306e\u03b1\u5024\nEPOCHS = 360             # \u6700\u5927\u30a8\u30dd\u30c3\u30af\u6570 (\u5897\u52a0)\nPATIENCE = 50            # EarlyStopping\u306epatience (\u5897\u52a0)\nN_SPLITS = 10             # \u30af\u30ed\u30b9\u30d0\u30ea\u30c7\u30fc\u30b7\u30e7\u30f3\u306e\u5206\u5272\u6570\nMASKING_PROB = 0.25      # \u5b66\u7fd2\u6642\u306bTOF/THM\u30c7\u30fc\u30bf\u3092\u30de\u30b9\u30af\u3059\u308b\u78ba\u7387\nGATE_LOSS_WEIGHT = 0.2   # Gated\u30e2\u30c7\u30eb\u306e\u30b2\u30fc\u30c8\u640d\u5931\u306b\u5bfe\u3059\u308b\u91cd\u307f\n\nprint(f\"\u25b6 \u30e9\u30a4\u30d6\u30e9\u30ea\u306e\u30a4\u30f3\u30dd\u30fc\u30c8\u5b8c\u4e86\")\nprint(f\"  - TensorFlow: {tf.__version__}\")\nprint(f\"  - PyTorch: {torch.__version__}\")\nprint(f\"\u25b6 TRAIN\u30e2\u30fc\u30c9: {TRAIN}\")\n\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n\n# PyTorch\u30e2\u30c7\u30eb\u7528\u306e\u6a19\u6e96\u5316\u30d1\u30e9\u30e1\u30fc\u30bf\nmean_pt = torch.tensor([\n    0, 0, 0, 0, 0, 0, 9.0319e-03, 1.0849e+00, -2.6186e-03, 3.7651e-03,\n    -5.3660e-03, -2.8177e-03, 1.3318e-03, -1.5876e-04, 6.3495e-01,\n    6.2877e-01, 6.0607e-01, 6.2142e-01, 6.3808e-01, 6.5420e-01,\n    7.4102e-03, -3.4159e-03, -7.5237e-03, -2.6034e-02, 2.9704e-02,\n    -3.1546e-02, -2.0610e-03, -4.6986e-03, -4.7216e-03, -2.6281e-02,\n    1.5799e-02, 1.0016e-02\n], dtype=torch.float32).view(1, -1, 1).to(device)\n\nstd_pt = torch.tensor([\n    1, 1, 1, 
1, 1, 1, 0.2067, 0.8583, 0.3162,\n    0.2668, 0.2917, 0.2341, 0.3023, 0.3281, 1.0264, 0.8838, 0.8686, 1.0973,\n    1.0267, 0.9018, 0.4658, 0.2009, 0.2057, 1.2240, 0.9535, 0.6655, 0.2941,\n    0.3421, 0.8156, 0.6565, 1.1034, 1.5577\n], dtype=torch.float32).view(1, -1, 1).to(device) + 1e-8\n\nclass ImuFeatureExtractor(nn.Module):\n    \"\"\"\n    \u2605\u2605\u2605 PyTorch\u30e2\u30c7\u30eb\u7528\u306e\u7279\u5fb4\u91cf\u62bd\u51fa\u5668 \u2605\u2605\u2605\n    \u516c\u958b\u30e2\u30c7\u30eb\u306e\u91cd\u307f\u3068\u4e00\u81f4\u3055\u305b\u308b\u305f\u3081\u3001\u5143\u306e\u6b63\u3057\u3044\u5b9a\u7fa9\u306b\u4fee\u6b63\u3002\n    \"\"\"\n    def __init__(self, fs=100., add_quaternion=False):\n        super().__init__()\n        self.fs = fs\n        self.add_quaternion = add_quaternion\n\n        k = 15\n\n        # \u25bc\u25bc\u25bc\u3010\u3053\u3053\u304c\u4fee\u6b63\u70b9\u3011\u25bc\u25bc\u25bc\n        # \u516c\u958b\u30e2\u30c7\u30eb\u306e\u91cd\u307f\u30d5\u30a1\u30a4\u30eb\u306b\u5b58\u5728\u3059\u308b 'self.lpf' \u5c64\u3092\u518d\u5ea6\u8ffd\u52a0\u3059\u308b\n        self.lpf = nn.Conv1d(6, 6, kernel_size=k, padding=k//2,\n                                 groups=6, bias=False)\n        nn.init.kaiming_uniform_(self.lpf.weight, a=math.sqrt(5))\n        # \u25b2\u25b2\u25b2\u3010\u3053\u3053\u307e\u3067\u304c\u4fee\u6b63\u70b9\u3011\u25b2\u25b2\u25b2\n\n        self.lpf_acc  = nn.Conv1d(3, 3, k, padding=k//2, groups=3, bias=False)\n        self.lpf_gyro = nn.Conv1d(3, 3, k, padding=k//2, groups=3, bias=False)\n\n    def forward(self, imu):\n        acc  = imu[:, 0:3, :]\n        gyro = imu[:, 3:6, :]\n\n        # 1) magnitude\n        acc_mag  = torch.norm(acc,  dim=1, keepdim=True)\n        gyro_mag = torch.norm(gyro, dim=1, keepdim=True)\n\n        # 2) jerk\n        jerk = F.pad(acc[:, :, 1:] - acc[:, :, :-1], (1,0))\n        gyro_delta = F.pad(gyro[:, :, 1:] - gyro[:, :, :-1], (1,0))\n\n        # 3) energy\n        acc_pow  = acc ** 2\n        
gyro_pow = gyro ** 2\n\n        # 4) LPF / HPF\n        # self.lpf \u306f forward\u30d1\u30b9\u3067\u306f\u4f7f\u308f\u308c\u3066\u3044\u306a\u3044\u304c\u3001\u91cd\u307f\u8aad\u307f\u8fbc\u307f\u306e\u305f\u3081\u306b\u5b9a\u7fa9\u304c\u5fc5\u8981\n        acc_lpf  = self.lpf_acc(acc)\n        acc_hpf  = acc - acc_lpf\n        gyro_lpf = self.lpf_gyro(gyro)\n        gyro_hpf = gyro - gyro_lpf\n\n        features = [\n            acc, gyro,\n            acc_mag, gyro_mag,\n            jerk, gyro_delta,\n            acc_pow, gyro_pow,\n            acc_lpf, acc_hpf,\n            gyro_lpf, gyro_hpf,\n        ]\n        return torch.cat(features, dim=1)\n\nclass SEBlock(nn.Module):\n    def __init__(self, channels, reduction=8):\n        super().__init__()\n        self.squeeze = nn.AdaptiveAvgPool1d(1)\n        self.excitation = nn.Sequential(\n            nn.Linear(channels, channels // reduction, bias=False), nn.ReLU(inplace=True),\n            nn.Linear(channels // reduction, channels, bias=False), nn.Sigmoid()\n        )\n    def forward(self, x):\n        b, c, _ = x.size()\n        y = self.squeeze(x).view(b, c)\n        y = self.excitation(y).view(b, c, 1)\n        return x * y.expand_as(x)\n\nclass ResidualSECNNBlock(nn.Module):\n    def __init__(self, in_channels, out_channels, kernel_size, pool_size=2, dropout=0.3):\n        super().__init__()\n        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size//2, bias=False)\n        self.bn1 = nn.BatchNorm1d(out_channels)\n        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, padding=kernel_size//2, bias=False)\n        self.bn2 = nn.BatchNorm1d(out_channels)\n        self.se = SEBlock(out_channels)\n        self.shortcut = nn.Sequential()\n        if in_channels != out_channels:\n            self.shortcut = nn.Sequential(nn.Conv1d(in_channels, out_channels, 1, bias=False), nn.BatchNorm1d(out_channels))\n        self.pool = nn.MaxPool1d(pool_size)\n        
self.dropout = nn.Dropout(dropout)\n    def forward(self, x):\n        out = F.relu(self.bn1(self.conv1(x)))\n        out = self.bn2(self.conv2(out))\n        out = self.se(out)\n        out += self.shortcut(x)\n        return self.dropout(self.pool(F.relu(out)))\n\nclass AttentionLayer(nn.Module):\n    def __init__(self, hidden_dim):\n        super().__init__()\n        self.attention = nn.Linear(hidden_dim, 1)\n    def forward(self, x):\n        scores = torch.tanh(self.attention(x))\n        weights = F.softmax(scores.squeeze(-1), dim=1)\n        return torch.sum(x * weights.unsqueeze(-1), dim=1)\n\nclass TwoBranchModel(nn.Module):\n    def __init__(self, pad_len, imu_dim_raw, tof_dim, n_classes, dropouts=[0.3, 0.3, 0.3, 0.3, 0.4, 0.5, 0.3], feature_engineering=True, **kwargs):\n        super().__init__()\n        self.feature_engineering = feature_engineering\n        imu_dim = 32 if feature_engineering else imu_dim_raw\n        self.imu_fe = ImuFeatureExtractor(**kwargs) if feature_engineering else nn.Identity()\n        self.fir_nchan = 7\n        numtaps = 33\n        fir_kernel = torch.tensor(firwin(numtaps, cutoff=1.0, fs=10.0, pass_zero=False), dtype=torch.float32).view(1, 1, -1).repeat(self.fir_nchan, 1, 1)\n        self.register_buffer(\"fir_kernel\", fir_kernel)\n        self.imu_block1 = ResidualSECNNBlock(imu_dim, 64, 3, dropout=dropouts[0])\n        self.imu_block2 = ResidualSECNNBlock(64, 128, 5, dropout=dropouts[1])\n        self.tof_conv1 = nn.Conv1d(tof_dim, 64, 3, padding=1, bias=False)\n        self.tof_bn1, self.tof_pool1, self.tof_drop1 = nn.BatchNorm1d(64), nn.MaxPool1d(2), nn.Dropout(dropouts[2])\n        self.tof_conv2 = nn.Conv1d(64, 128, 3, padding=1, bias=False)\n        self.tof_bn2, self.tof_pool2, self.tof_drop2 = nn.BatchNorm1d(128), nn.MaxPool1d(2), nn.Dropout(dropouts[3])\n        self.bilstm = nn.LSTM(256, 128, bidirectional=True, batch_first=True)\n        self.lstm_dropout = nn.Dropout(dropouts[4])\n        self.attention = 
AttentionLayer(256)\n        self.dense1, self.bn_dense1, self.drop1 = nn.Linear(256, 256, bias=False), nn.BatchNorm1d(256), nn.Dropout(dropouts[5])\n        self.dense2, self.bn_dense2, self.drop2 = nn.Linear(256, 128, bias=False), nn.BatchNorm1d(128), nn.Dropout(dropouts[6])\n        self.classifier = nn.Linear(128, n_classes)\n\n    def forward(self, x):\n        imu_raw = x[:, :, :self.fir_nchan].transpose(1, 2)\n        tof = x[:, :, self.fir_nchan:].transpose(1, 2)\n        imu_fe = self.imu_fe(imu_raw)\n        filtered = F.conv1d(imu_fe[:, :self.fir_nchan, :], self.fir_kernel, padding=self.fir_kernel.shape[-1] // 2, groups=self.fir_nchan)\n        imu = (torch.cat([filtered, imu_fe[:, self.fir_nchan:, :]], dim=1) - mean_pt) / std_pt\n        x1 = self.imu_block1(imu); x1 = self.imu_block2(x1)\n        x2 = self.tof_drop1(self.tof_pool1(F.relu(self.tof_bn1(self.tof_conv1(tof)))))\n        x2 = self.tof_drop2(self.tof_pool2(F.relu(self.tof_bn2(self.tof_conv2(x2)))))\n        merged = torch.cat([x1, x2], dim=1).transpose(1, 2)\n        lstm_out, _ = self.bilstm(merged); lstm_out = self.lstm_dropout(lstm_out)\n        attended = self.attention(lstm_out)\n        x = self.drop1(F.relu(self.bn_dense1(self.dense1(attended))))\n        x = self.drop2(F.relu(self.bn_dense2(self.dense2(x))))\n        return self.classifier(x)\n\nclass PublicTwoBranchModel(nn.Module):\n    \"\"\"\n    \u2605\u2605\u2605 \u516c\u958b\u3055\u308c\u3066\u3044\u308bPyTorch\u30e2\u30c7\u30eb\uff08\u30e2\u30c7\u30eb\u7fa4C\uff09\u3092\u8aad\u307f\u8fbc\u3080\u305f\u3081\u306e\u3001\u5143\u306e\u30a2\u30fc\u30ad\u30c6\u30af\u30c1\u30e3\u3092\u6301\u3064\u30af\u30e9\u30b9 \u2605\u2605\u2605\n    \"\"\"\n    def __init__(self, pad_len, imu_dim_raw, tof_dim, n_classes, dropouts=[0.3, 0.3, 0.3, 0.3, 0.4, 0.5, 0.3], feature_engineering=True, **kwargs):\n        super().__init__()\n        self.feature_engineering = feature_engineering\n        imu_dim = 32 if feature_engineering else 
imu_dim_raw\n        self.imu_fe = ImuFeatureExtractor(**kwargs) if feature_engineering else nn.Identity()\n        self.fir_nchan = 7\n        numtaps = 33\n        fir_kernel = torch.tensor(firwin(numtaps, cutoff=1.0, fs=10.0, pass_zero=False), dtype=torch.float32).view(1, 1, -1).repeat(self.fir_nchan, 1, 1)\n        self.register_buffer(\"fir_kernel\", fir_kernel)\n        self.imu_block1 = ResidualSECNNBlock(imu_dim, 64, 3, dropout=dropouts[0])\n        self.imu_block2 = ResidualSECNNBlock(64, 128, 5, dropout=dropouts[1])\n        self.tof_conv1 = nn.Conv1d(tof_dim, 64, 3, padding=1, bias=False)\n        self.tof_bn1, self.tof_pool1, self.tof_drop1 = nn.BatchNorm1d(64), nn.MaxPool1d(2), nn.Dropout(dropouts[2])\n        self.tof_conv2 = nn.Conv1d(64, 128, 3, padding=1, bias=False)\n        self.tof_bn2, self.tof_pool2, self.tof_drop2 = nn.BatchNorm1d(128), nn.MaxPool1d(2), nn.Dropout(dropouts[3])\n        self.bilstm = nn.LSTM(256, 128, bidirectional=True, batch_first=True) # GRU\u3067\u306f\u306a\u304fLSTM\n        self.lstm_dropout = nn.Dropout(dropouts[4])\n        self.attention = AttentionLayer(256) # 128*2 for bidirectional\n        self.dense1, self.bn_dense1, self.drop1 = nn.Linear(256, 256, bias=False), nn.BatchNorm1d(256), nn.Dropout(dropouts[5])\n        self.dense2, self.bn_dense2, self.drop2 = nn.Linear(256, 128, bias=False), nn.BatchNorm1d(128), nn.Dropout(dropouts[6])\n        self.classifier = nn.Linear(128, n_classes)\n\n    def forward(self, x):\n        imu_raw = x[:, :, :self.fir_nchan].transpose(1, 2)\n        tof = x[:, :, self.fir_nchan:].transpose(1, 2)\n        imu_fe = self.imu_fe(imu_raw)\n        filtered = F.conv1d(imu_fe[:, :self.fir_nchan, :], self.fir_kernel, padding=self.fir_kernel.shape[-1] // 2, groups=self.fir_nchan)\n        # mean_pt, std_pt \u306f\u4e8b\u524d\u306b\u5b9a\u7fa9\u3055\u308c\u3066\u3044\u308b\u30b0\u30ed\u30fc\u30d0\u30eb\u5909\u6570\n        imu = (torch.cat([filtered, imu_fe[:, self.fir_nchan:, :]], dim=1) - 
mean_pt) / std_pt\n        x1 = self.imu_block1(imu); x1 = self.imu_block2(x1)\n        x2 = self.tof_drop1(self.tof_pool1(F.relu(self.tof_bn1(self.tof_conv1(tof)))))\n        x2 = self.tof_drop2(self.tof_pool2(F.relu(self.tof_bn2(self.tof_conv2(x2)))))\n        merged = torch.cat([x1, x2], dim=1).transpose(1, 2)\n        lstm_out, _ = self.bilstm(merged); lstm_out = self.lstm_dropout(lstm_out)\n        attended = self.attention(lstm_out)\n        x = self.drop1(F.relu(self.bn_dense1(self.dense1(attended))))\n        x = self.drop2(F.relu(self.bn_dense2(self.dense2(x))))\n        return self.classifier(x)\n\ndef pad_sequences_torch3(sequences, maxlen, padding='post', truncating='post', value=0.0):\n    result = []\n    for seq in sequences:\n        if len(seq) >= maxlen: seq = seq[:maxlen] if truncating == 'post' else seq[-maxlen:]\n        else:\n            pad_len = maxlen - len(seq)\n            pad_array = np.full((pad_len, seq.shape[1]), value)\n            seq = np.concatenate([seq, pad_array]) if padding == 'post' else np.concatenate([pad_array, seq])\n        result.append(seq)\n    return np.array(result, dtype=np.float32)\n\n# =============================================================================\n# ## \u7279\u5fb4\u91cf\u30a8\u30f3\u30b8\u30cb\u30a2\u30ea\u30f3\u30b0\u95a2\u6570\n# =============================================================================\ndef remove_gravity_from_acc3(acc_data, rot_data):\n    \"\"\"\u52a0\u901f\u5ea6\u30c7\u30fc\u30bf\u304b\u3089\u91cd\u529b\u6210\u5206\u3092\u9664\u53bb\u3059\u308b\"\"\"\n    acc_values = acc_data[['acc_x', 'acc_y', 'acc_z']].values\n    quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values\n    linear_accel = np.zeros_like(acc_values)\n    gravity_world = np.array([0, 0, 9.81])\n    for i in range(len(acc_values)):\n        if np.all(np.isnan(quat_values[i])):\n            linear_accel[i, :] = acc_values[i, :]\n            continue\n        try:\n            rotation = 
R.from_quat(quat_values[i])\n            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)\n            linear_accel[i, :] = acc_values[i, :] - gravity_sensor_frame\n        except (ValueError, IndexError):\n            linear_accel[i, :] = acc_values[i, :]\n    return linear_accel\n\ndef calculate_angular_velocity_from_quat3(rot_data, time_delta=1/200):\n    \"\"\"\u30af\u30a9\u30fc\u30bf\u30cb\u30aa\u30f3\u304b\u3089\u89d2\u901f\u5ea6\u3092\u8a08\u7b97\u3059\u308b\"\"\"\n    quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values\n    angular_vel = np.zeros((len(quat_values), 3))\n    for i in range(len(quat_values) - 1):\n        q_t, q_t_plus_dt = quat_values[i], quat_values[i+1]\n        if np.all(np.isnan(q_t)) or np.all(np.isnan(q_t_plus_dt)): continue\n        try:\n            rot_t = R.from_quat(q_t)\n            rot_t_plus_dt = R.from_quat(q_t_plus_dt)\n            delta_rot = rot_t.inv() * rot_t_plus_dt\n            angular_vel[i, :] = delta_rot.as_rotvec() / time_delta\n        except (ValueError, IndexError): pass\n    return angular_vel\n\ndef calculate_angular_distance3(rot_data):\n    \"\"\"\u30af\u30a9\u30fc\u30bf\u30cb\u30aa\u30f3\u304b\u3089\u89d2\u8ddd\u96e2\u3092\u8a08\u7b97\u3059\u308b\"\"\"\n    quat_values = rot_data[['rot_x', 'rot_y', 'rot_z', 'rot_w']].values\n    angular_dist = np.zeros(len(quat_values))\n    for i in range(len(quat_values) - 1):\n        q1, q2 = quat_values[i], quat_values[i+1]\n        if np.all(np.isnan(q1)) or np.all(np.isnan(q2)): continue\n        try:\n            r1 = R.from_quat(q1)\n            r2 = R.from_quat(q2)\n            relative_rotation = r1.inv() * r2\n            angular_dist[i] = np.linalg.norm(relative_rotation.as_rotvec())\n        except (ValueError, IndexError): pass\n    return angular_dist\n\ndef time_sum(x): return K.sum(x, axis=1)\ndef squeeze_last_axis(x): return tf.squeeze(x, axis=-1)\ndef expand_last_axis(x): return tf.expand_dims(x, axis=-1)\n\ndef 
se_block(x, reduction=8):\n    \"\"\"Squeeze-and-Excitation\u30d6\u30ed\u30c3\u30af\"\"\"\n    ch = x.shape[-1]\n    se = GlobalAveragePooling1D()(x)\n    se = Dense(ch // reduction, activation='relu')(se)\n    se = Dense(ch, activation='sigmoid')(se)\n    se = Reshape((1, ch))(se)\n    return Multiply()([x, se])\n\ndef residual_se_cnn_block(x, filters, kernel_size, pool_size=2, drop=0.3, wd=1e-4):\n    \"\"\"Residual SE-CNN\u30d6\u30ed\u30c3\u30af\"\"\"\n    shortcut = x\n    # 2\u5c64\u306eConv1D\n    for _ in range(2):\n        x = Conv1D(filters, kernel_size, padding='same', use_bias=False, kernel_regularizer=l2(wd))(x)\n        x = BatchNormalization()(x)\n        x = Activation('relu')(x)\n    # SE\u30d6\u30ed\u30c3\u30af\n    x = se_block(x)\n    # \u30b7\u30e7\u30fc\u30c8\u30ab\u30c3\u30c8\u63a5\u7d9a\n    if shortcut.shape[-1] != filters:\n        shortcut = Conv1D(filters, 1, padding='same', use_bias=False, kernel_regularizer=l2(wd))(shortcut)\n        shortcut = BatchNormalization()(shortcut)\n    x = add([x, shortcut])\n    x = Activation('relu')(x)\n    x = MaxPooling1D(pool_size)(x)\n    x = Dropout(drop)(x)\n    return x\n\ndef attention_layer(inputs):\n    \"\"\"\u30a2\u30c6\u30f3\u30b7\u30e7\u30f3\u5c64\"\"\"\n    score = Dense(1, activation='tanh')(inputs)\n    score = Lambda(squeeze_last_axis)(score)\n    weights = Activation('softmax')(score)\n    weights = Lambda(expand_last_axis)(weights)\n    context = Multiply()([inputs, weights])\n    context = Lambda(time_sum)(context)\n    return context\n\nclass GatedMixupGenerator(Sequence):\n    \"\"\"Mixup\u3068\u30bb\u30f3\u30b5\u30fc\u30de\u30b9\u30ad\u30f3\u30b0\u3092\u9069\u7528\u3059\u308b\u30c7\u30fc\u30bf\u30b8\u30a7\u30cd\u30ec\u30fc\u30bf\"\"\"\n    def __init__(self, X, y, batch_size, imu_dim, class_weight=None, alpha=0.2, masking_prob=0.0):\n        self.X, self.y, self.batch, self.imu_dim = X, y, batch_size, imu_dim\n        self.class_weight, self.alpha, self.masking_prob = class_weight, 
alpha, masking_prob\n        self.indices = np.arange(len(X))\n\n    def __len__(self):\n        return int(np.ceil(len(self.X) / self.batch))\n\n    def __getitem__(self, i):\n        idx = self.indices[i*self.batch:(i+1)*self.batch]\n        Xb, yb = self.X[idx].copy(), self.y[idx].copy()\n\n        sample_weights = np.ones(len(Xb), dtype='float32')\n        if self.class_weight:\n            sample_weights = np.array([self.class_weight.get(i, 1.0) for i in yb.argmax(axis=1)])\n\n        gate_target = np.ones(len(Xb), dtype='float32')\n        if self.masking_prob > 0:\n            for j in range(len(Xb)):\n                if np.random.rand() < self.masking_prob:\n                    Xb[j, :, self.imu_dim:] = 0\n                    gate_target[j] = 0.0\n\n        if self.alpha > 0:\n            lam = np.random.beta(self.alpha, self.alpha)\n            perm = np.random.permutation(len(Xb))\n            X_mix = lam * Xb + (1 - lam) * Xb[perm]\n            y_mix = lam * yb + (1 - lam) * yb[perm]\n            gate_target_mix = lam * gate_target + (1 - lam) * gate_target[perm]\n            sample_weights_mix = lam * sample_weights + (1 - lam) * sample_weights[perm]\n            return X_mix, {'main_output': y_mix, 'tof_gate': gate_target_mix}, sample_weights_mix\n\n        return Xb, {'main_output': yb, 'tof_gate': gate_target}, sample_weights\n\n    def on_epoch_end(self):\n        np.random.shuffle(self.indices)\n\ndef build_gated_two_branch_model(pad_len, imu_dim, tof_dim, n_classes, wd=1e-4):\n    \"\"\"\n    \u81ea\u4f5c\u306eGated Two-Branch\u30e2\u30c7\u30eb\u3092\u69cb\u7bc9\u3059\u308b\u95a2\u6570\u3002\n    [\u6539\u826f\u70b9] LSTM\u3092GRU\u306b\u5909\u66f4\u3001\u5168\u7d50\u5408\u5c64\u30921\u5c64\u8ffd\u52a0\u3002\n    \"\"\"\n    inp = Input(shape=(pad_len, imu_dim + tof_dim))\n    imu = Lambda(lambda t: t[:, :, :imu_dim])(inp)\n    tof = Lambda(lambda t: t[:, :, imu_dim:])(inp)\n\n    # IMU\u30d6\u30e9\u30f3\u30c1 (Deep)\n    x1 = 
residual_se_cnn_block(imu, 64, 3, drop=0.1, wd=wd)\n    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.1, wd=wd)\n\n    # TOF/THM\u30d6\u30e9\u30f3\u30c1 (Light) with Gating\n    x2_base = Conv1D(64, 3, padding='same', use_bias=False, kernel_regularizer=l2(wd))(tof)\n    x2_base = BatchNormalization()(x2_base); x2_base = Activation('relu')(x2_base)\n    x2_base = MaxPooling1D(2)(x2_base); x2_base = Dropout(0.2)(x2_base)\n    x2_base = Conv1D(128, 3, padding='same', use_bias=False, kernel_regularizer=l2(wd))(x2_base)\n    x2_base = BatchNormalization()(x2_base); x2_base = Activation('relu')(x2_base)\n    x2_base = MaxPooling1D(2)(x2_base); x2_base = Dropout(0.2)(x2_base)\n\n    # Gating\u6a5f\u69cb\n    gate_input = GlobalAveragePooling1D()(tof)\n    gate_input = Dense(16, activation='relu')(gate_input)\n    gate = Dense(1, activation='sigmoid', name='tof_gate')(gate_input)\n    x2 = Multiply()([x2_base, gate])\n\n    # \u30d6\u30e9\u30f3\u30c1\u306e\u30de\u30fc\u30b8\u3068\u5f8c\u7d9a\u5c64\n    merged = Concatenate()([x1, x2])\n    # \u2605\u6539\u826f\u70b9: LSTM -> GRU\n    x = Bidirectional(GRU(256, return_sequences=True, kernel_regularizer=l2(wd)))(merged)\n    x = Dropout(0.45)(x)\n    x = attention_layer(x)\n\n    # \u2605\u6539\u826f\u70b9: \u5168\u7d50\u5408\u5c64\u30921\u5c64\u8ffd\u52a0\u3057\u3066\u8868\u73fe\u529b\u3092\u5411\u4e0a\n    for units, drop in [(512, 0.5), (256, 0.4), (128, 0.3)]:\n        x = Dense(units, use_bias=False, kernel_regularizer=l2(wd))(x)\n        x = BatchNormalization()(x)\n        x = Activation('relu')(x)\n        x = Dropout(drop)(x)\n\n    out = Dense(n_classes, activation='softmax', name='main_output', kernel_regularizer=l2(wd))(x)\n\n    return Model(inputs=inp, outputs=[out, gate])\n\n# -----------------------------------------------------------------------------\n# ### \u63a8\u8ad6\u30e2\u30fc\u30c9 (`TRAIN = False`)\n# -----------------------------------------------------------------------------\n\nprint(\"\u25b6 
\u63a8\u8ad6\u30e2\u30fc\u30c9\u958b\u59cb \u2013 \u5b66\u7fd2\u6e08\u307f\u30e2\u30c7\u30eb\u3068\u30a2\u30fc\u30c6\u30a3\u30d5\u30a1\u30af\u30c8\u3092\u8aad\u307f\u8fbc\u307f\u307e\u3059...\")\n\n# --- \u30e2\u30c7\u30eb\u7fa4A (\u81ea\u4f5cTF/Keras\u30e2\u30c7\u30eb) \u306e\u8aad\u307f\u8fbc\u307f ---\nprint(\"  \u30e2\u30c7\u30eb\u7fa4A (\u81ea\u4f5c5-Fold Gated GRU\u30e2\u30c7\u30eb) \u3092\u8aad\u307f\u8fbc\u307f\u4e2d...\")\nfinal_feature_cols_A = np.load(YOUR_MODELS_DIR / \"final_feature_cols.npy\", allow_pickle=True).tolist()\npad_len_A = int(np.load(YOUR_MODELS_DIR / \"sequence_maxlen.npy\"))\nscaler_A = joblib.load(YOUR_MODELS_DIR / \"scaler.pkl\")\ngesture_classes = np.load(YOUR_MODELS_DIR / \"gesture_classes.npy\", allow_pickle=True)\ncustom_objs_A = {'time_sum': time_sum, 'squeeze_last_axis': squeeze_last_axis, 'expand_last_axis': expand_last_axis,\n                 'se_block': se_block, 'residual_se_cnn_block': residual_se_cnn_block, 'attention_layer': attention_layer}\nmodels_A = [load_model(YOUR_MODELS_DIR / f\"final_model_fold_{f}.h5\", compile=False, custom_objects=custom_objs_A) for f in range(N_SPLITS)]\nprint(f\"  > {len(models_A)}\u500b\u306e\u30e2\u30c7\u30eb\u3092\u6b63\u5e38\u306b\u8aad\u307f\u8fbc\u307f\u307e\u3057\u305f\u3002\")\n\n# --- \u30e2\u30c7\u30eb\u7fa4B (\u516c\u958bTF/Keras\u30e2\u30c7\u30eb) \u306e\u8aad\u307f\u8fbc\u307f ---\nprint(\"\\n  \u30e2\u30c7\u30eb\u7fa4B (\u516c\u958bTF/Keras\u30e2\u30c7\u30eb) \u3092\u8aad\u307f\u8fbc\u307f\u4e2d...\")\nfinal_feature_cols_B = np.load(PUBLIC_TF_MODEL_DIR / \"feature_cols.npy\", allow_pickle=True).tolist()\npad_len_B = int(np.load(PUBLIC_TF_MODEL_DIR / \"sequence_maxlen.npy\"))\nscaler_B = joblib.load(PUBLIC_TF_MODEL_DIR / \"scaler.pkl\")\ncustom_objs_B = custom_objs_A # public model\u3082\u540c\u3058\u30ab\u30b9\u30bf\u30e0\u30aa\u30d6\u30b8\u30a7\u30af\u30c8\u3092\u4f7f\u7528\nmodel_B = load_model(PUBLIC_TF_MODEL_DIR / \"gesture_two_branch_mixup.h5\", compile=False, 
custom_objects=custom_objs_B)\nprint(\"  > 1\u500b\u306e\u30e2\u30c7\u30eb\u3092\u6b63\u5e38\u306b\u8aad\u307f\u8fbc\u307f\u307e\u3057\u305f\u3002\")\n\n# --- \u30e2\u30c7\u30eb\u7fa4C (\u516c\u958bPyTorch\u30e2\u30c7\u30eb) \u306e\u8aad\u307f\u8fbc\u307f ---\nprint(\"\\n  \u30e2\u30c7\u30eb\u7fa4C (\u516c\u958bPyTorch\u30e2\u30c7\u30eb) \u3092\u8aad\u307f\u8fbc\u307f\u4e2d...\")\nfinal_feature_cols_C = np.load(PUBLIC_PT_MODEL_DIR / \"feature_cols.npy\", allow_pickle=True).tolist()\npad_len_C = int(np.load(PUBLIC_PT_MODEL_DIR / \"sequence_maxlen.npy\"))\nscaler_C = joblib.load(PUBLIC_PT_MODEL_DIR / \"scaler.pkl\")\n\npt_models = []\nfor f in range(5):\n    checkpoint = torch.load(PUBLIC_PT_MODEL_DIR / f\"gesture_two_branch_fold{f}.pth\", map_location=device)\n    cfg = {'pad_len': checkpoint['pad_len'], 'imu_dim_raw': checkpoint['imu_dim'],\n           'tof_dim': checkpoint['tof_dim'], 'n_classes': checkpoint['n_classes']}\n    m = PublicTwoBranchModel(**cfg).to(device)\n    m.load_state_dict(checkpoint['model_state_dict'])\n    m.eval()\n    pt_models.append(m)\nprint(f\"  > {len(pt_models)}\u500b\u306e\u30e2\u30c7\u30eb\u3092\u6b63\u5e38\u306b\u8aad\u307f\u8fbc\u307f\u307e\u3057\u305f\u3002\")\n\n# ## predict_3\n\n# In[ ]:\n\n\ndef enumerate_weights(i):\n    import random\n    \n    # directed random scheme 1\n    weights00 = {'A': 0.5285, 'B': 0.1770, 'C': 0.2945}\n    weights01 = {'A': 0.5290, 'B': 0.1780, 'C': 0.2930}\n    weights02 = {'A': 0.5295, 'B': 0.1785, 'C': 0.2920}\n    weights03 = {'A': 0.5305, 'B': 0.1790, 'C': 0.2905}\n    weights04 = {'A': 0.5310, 'B': 0.1795, 'C': 0.2895}\n\n    # directed random scheme 2\n    weights05 = {'A': 0.533, 'B': 0.174, 'C': 0.293}\n    weights06 = {'A': 0.534, 'B': 0.176, 'C': 0.290}\n    weights07 = {'A': 0.535, 'B': 0.177, 'C': 0.288}\n    weights08 = {'A': 0.536, 'B': 0.178, 'C': 0.286}\n    weights09 = {'A': 0.537, 'B': 0.179, 'C': 0.284}\n\n    # directed random scheme 3\n    weights10 = {'A': 0.534, 'B': 0.158,  
'C': 0.308}\n    weights11 = {'A': 0.534, 'B': 0.1586, 'C': 0.3074}\n    weights12 = {'A': 0.535, 'B': 0.159,  'C': 0.386}\n    weights13 = {'A': 0.536, 'B': 0.1595, 'C': 0.3045}\n    weights14 = {'A': 0.537, 'B': 0.160,  'C': 0.303}\n\n    # directed random scheme 4\n    weights15 = {'A': 0.527, 'B': 0.185,  'C': 0.288}\n    weights16 = {'A': 0.526, 'B': 0.190,  'C': 0.284}\n    weights17 = {'A': 0.525, 'B': 0.195,  'C': 0.280}\n    weights18 = {'A': 0.524, 'B': 0.200,  'C': 0.276}\n    weights19 = {'A': 0.523, 'B': 0.205,  'C': 0.272}\n\n    import random\n    weights_1 = random.choice([weights00,weights01,weights02,weights03,weights04])\n    weights_2 = random.choice([weights05,weights06,weights07,weights08,weights09])\n    weights_3 = random.choice([weights10,weights11,weights12,weights13,weights14])\n    weights_4 = random.choice([weights15,weights16,weights17,weights18,weights19])\n\n    if i == 1: return weights_1\n    if i == 2: return weights_2\n    if i == 3: return weights_3\n    if i == 4: return weights_4\n        \n    return {'A': 0.53, 'B': 0.18, 'C': 0.29}\n\n# In[ ]:\n\n\n# --- `predict`\u95a2\u6570\u306e\u5b9a\u7fa9 ---\ndef predict3(sequence: pl.DataFrame, demographics: pl.DataFrame) -> str:\n    df_seq_orig = sequence.to_pandas()\n    df_seq_A = df_seq_orig.copy()\n    \n    linear_accel_A = remove_gravity_from_acc3(df_seq_A[['acc_x','acc_y','acc_z']], df_seq_A[['rot_x','rot_y','rot_z','rot_w']])\n    df_seq_A['linear_acc_x'], df_seq_A['linear_acc_y'], df_seq_A['linear_acc_z'] = linear_accel_A[:,0], linear_accel_A[:,1], linear_accel_A[:,2]\n    df_seq_A['linear_acc_mag'] = np.linalg.norm(linear_accel_A, axis=1)\n    df_seq_A['linear_acc_mag_jerk'] = df_seq_A['linear_acc_mag'].diff().fillna(0)\n    angular_vel_A = calculate_angular_velocity_from_quat3(df_seq_A[['rot_x','rot_y','rot_z','rot_w']])\n    df_seq_A['angular_vel_x'], df_seq_A['angular_vel_y'], df_seq_A['angular_vel_z'] = angular_vel_A[:,0], angular_vel_A[:,1], angular_vel_A[:,2]\n    
df_seq_A['angular_distance'] = calculate_angular_distance3(df_seq_A[['rot_x','rot_y','rot_z','rot_w']])\n    for col in ['rot_x', 'rot_y', 'rot_z', 'rot_w']:\n        df_seq_A[f'{col}_diff'] = df_seq_A[col].diff().fillna(0)\n    cols_for_stats=['linear_acc_mag','linear_acc_mag_jerk','angular_distance']\n    for col in cols_for_stats:\n        df_seq_A[f'{col}_skew'], df_seq_A[f'{col}_kurt'] = df_seq_A[col].skew(), df_seq_A[col].kurtosis()\n    for i in range(1,6):\n        if f'tof_{i}_v0' in df_seq_A.columns:\n            pixel_cols=[f\"tof_{i}_v{p}\" for p in range(64)]; tof_data=df_seq_A[pixel_cols].replace(-1,np.nan)\n            df_seq_A[f'tof_{i}_mean'], df_seq_A[f'tof_{i}_std'], df_seq_A[f'tof_{i}_min'], df_seq_A[f'tof_{i}_max'] = tof_data.mean(axis=1),tof_data.std(axis=1),tof_data.min(axis=1),tof_data.max(axis=1)\n    tof_mean_cols=[f'tof_{i}_mean' for i in range(1,6) if f'tof_{i}_mean' in df_seq_A.columns]\n    if tof_mean_cols:\n        df_seq_A['tof_std_across_sensors']=df_seq_A[tof_mean_cols].std(axis=1)\n        df_seq_A['tof_range_across_sensors']=df_seq_A[tof_mean_cols].max(axis=1)-df_seq_A[tof_mean_cols].min(axis=1)\n    thm_cols=[f'thm_{i}' for i in range(1,6) if f'thm_{i}' in df_seq_A.columns]\n    if thm_cols:\n        df_seq_A['thm_std_across_sensors']=df_seq_A[thm_cols].std(axis=1)\n        df_seq_A['thm_range_across_sensors']=df_seq_A[thm_cols].max(axis=1)-df_seq_A[thm_cols].min(axis=1)\n    # (\u63a8\u8ad6 A)\n    mat_A = df_seq_A[final_feature_cols_A].ffill().bfill().fillna(0).values.astype('float32')\n    mat_A = scaler_A.transform(mat_A)\n    pad_input_A = pad_sequences([mat_A], maxlen=pad_len_A, padding='post', dtype='float32')\n    preds_A_folds = [model.predict(pad_input_A, verbose=0)[0] for model in models_A]\n\n    # --- 2. 
\u30e2\u30c7\u30eb\u7fa4B (\u516c\u958bTF\u30e2\u30c7\u30eb) \u306e\u4e88\u6e2c ---\n    df_seq_B = df_seq_orig.copy()\n    # (\u7279\u5fb4\u91cf\u751f\u6210 B)\n    df_seq_B['acc_mag']=np.sqrt(df_seq_B['acc_x']**2+df_seq_B['acc_y']**2+df_seq_B['acc_z']**2)\n    df_seq_B['rot_angle']=2*np.arccos(df_seq_B['rot_w'].clip(-1,1))\n    df_seq_B['acc_mag_jerk']=df_seq_B['acc_mag'].diff().fillna(0)\n    df_seq_B['rot_angle_vel']=df_seq_B['rot_angle'].diff().fillna(0)\n    linear_accel_B=remove_gravity_from_acc3(df_seq_B,df_seq_B)\n    df_seq_B['linear_acc_x'],df_seq_B['linear_acc_y'],df_seq_B['linear_acc_z']=linear_accel_B[:,0],linear_accel_B[:,1],linear_accel_B[:,2]\n    df_seq_B['linear_acc_mag']=np.sqrt(df_seq_B['linear_acc_x']**2+df_seq_B['linear_acc_y']**2+df_seq_B['linear_acc_z']**2)\n    df_seq_B['linear_acc_mag_jerk']=df_seq_B['linear_acc_mag'].diff().fillna(0)\n    angular_vel_B=calculate_angular_velocity_from_quat3(df_seq_B)\n    df_seq_B['angular_vel_x'],df_seq_B['angular_vel_y'],df_seq_B['angular_vel_z']=angular_vel_B[:,0],angular_vel_B[:,1],angular_vel_B[:,2]\n    df_seq_B['angular_distance']=calculate_angular_distance3(df_seq_B)\n    for i in range(1,6):\n        if f'tof_{i}_v0' in df_seq_B.columns:\n            pixel_cols=[f\"tof_{i}_v{p}\" for p in range(64)]; tof_data=df_seq_B[pixel_cols].replace(-1,np.nan)\n            df_seq_B[f\"tof_{i}_mean\"],df_seq_B[f\"tof_{i}_std\"],df_seq_B[f\"tof_{i}_min\"],df_seq_B[f\"tof_{i}_max\"]=tof_data.mean(axis=1),tof_data.std(axis=1),tof_data.min(axis=1),tof_data.max(axis=1)\n    # (\u63a8\u8ad6 B)\n    mat_B = df_seq_B[final_feature_cols_B].ffill().bfill().fillna(0).values.astype('float32')\n    mat_B = scaler_B.transform(mat_B)\n    pad_input_B = pad_sequences([mat_B], maxlen=pad_len_B, padding='post', dtype='float32')\n\n    # --- 3. 
\u30e2\u30c7\u30eb\u7fa4C (\u516c\u958bPyTorch\u30e2\u30c7\u30eb) \u306e\u4e88\u6e2c ---\n    df_seq_C = df_seq_orig.copy() # C\u306f\u7279\u5fb4\u91cf\u751f\u6210\u304c\u4e0d\u8981\u306a\u305f\u3081\u3001\u30b3\u30d4\u30fc\u306e\u307f\n    mat_C = df_seq_C[final_feature_cols_C].ffill().bfill().fillna(0).values.astype('float32')\n    mat_C = scaler_C.transform(mat_C)\n    pad_input_C = pad_sequences_torch3([mat_C], maxlen=pad_len_C, padding='pre', truncating='pre')\n   \n    with torch.no_grad():\n        pt_input = torch.from_numpy(pad_input_C).to(device)\n        preds_C_folds = [model(pt_input) for model in pt_models]\n\n    avg_pred_A = np.mean(preds_A_folds, axis=0)\n    avg_pred_C_logits = torch.median(torch.stack(preds_C_folds), dim=0).values\n    avg_pred_C = torch.softmax(avg_pred_C_logits, dim=1).cpu().numpy()\n    pred_B = model_B.predict(pad_input_B, verbose=0)\n    if isinstance(pred_B, list): pred_B = pred_B[0]\n    \n    # --- 4. \u4e0a\u4f4d\u306b\u8fd1\u3044\u30e9\u30f3\u30c0\u30e0\u306a\u6c7a\u5b9a\u3092\u6307\u793a\u3059\u308b\n    # --- 4. 
Directed random decision close to top\n    # weights = {'A': 0.53, 'B': 0.18, 'C': 0.29}     # current.Top\n\n    weights = enumerate_weights(2)\n    \n    final_pred_proba = (weights['A'] * avg_pred_A + weights['B'] * pred_B + weights['C'] * avg_pred_C)\n\n    return final_pred_proba\n\n# In[ ]:\n\n\n\n# === Enhancement Block: Temperature Scaling + Simple Stacking + Demographics Priors ===\n# This block overrides `predict` by applying:\n# (1) Per-model temperature scaling (if OOF files present) on logits/probs from predict1/2/3\n# (2) A simple meta-stacker (LogisticRegression) trained on OOF, else falls back to weighted average\n# (3) Demographics-based class priors (trained from train_demographics if present)\n\nimport numpy as np\nimport pandas as pd\nimport os\n\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.preprocessing import OneHotEncoder, StandardScaler\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.pipeline import Pipeline\n\nEPS = 1e-12\n\ndef _prob_to_logits(p):\n    p = np.clip(p, EPS, 1.0 - EPS)\n    if p.ndim == 1:\n        p = p[None, :]\n    p /= p.sum(axis=1, keepdims=True)\n    return np.log(p)\n\ndef _logits_to_prob(z):\n    z = z - z.max(axis=1, keepdims=True)\n    ez = np.exp(z)\n    return ez / ez.sum(axis=1, keepdims=True)\n\ndef load_oof(path):\n    if os.path.exists(path):\n        try:\n            return pd.read_csv(path)\n        except Exception:\n            pass\n    return None\n\nCAND_OOF = [\n    \"/kaggle/input/cmi-oof/oof_predict1.csv\",\n    \"/kaggle/input/cmi-oof/oof_predict2.csv\",\n    \"/kaggle/input/cmi-oof/oof_predict3.csv\",\n]\noof_list = [load_oof(p) for p in CAND_OOF]\noof_list = [df for df in oof_list if df is not None]\n\ntemperatures = {\"p1\": 1.0, \"p2\": 1.0, \"p3\": 1.0}\nif len(oof_list) == 3:\n    for key, df in zip([\"p1\",\"p2\",\"p3\"], oof_list):\n        if \"y\" in df.columns:\n            y = df[\"y\"].values\n            prob_cols = [c for c in df.columns if 
c.startswith(\"class_\")]\n            P = df[prob_cols].values\n            logits = _prob_to_logits(P)\n            def nll(T):\n                z = logits / T\n                q = _logits_to_prob(z)\n                q = np.clip(q, EPS, 1.0-EPS)\n                return -np.mean(np.log(q[np.arange(len(y)), y]))\n            Ts = np.linspace(0.5, 5.0, 46)\n            vals = [nll(T) for T in Ts]\n            temperatures[key] = float(Ts[int(np.argmin(vals))])\n\nmeta_model = None\nif len(oof_list) == 3 and all(\"y\" in df.columns for df in oof_list):\n    y = oof_list[0][\"y\"].values\n    feats = []\n    for key, df in zip([\"p1\",\"p2\",\"p3\"], oof_list):\n        prob_cols = [c for c in df.columns if c.startswith(\"class_\")]\n        P = df[prob_cols].values\n        z = _prob_to_logits(P) / temperatures[key]\n        feats.append(z)\n    X_meta = np.concatenate(feats, axis=1)\n    try:\n        meta_model = LogisticRegression(max_iter=200, multi_class=\"multinomial\")\n        meta_model.fit(X_meta, y)\n    except Exception:\n        meta_model = None\n\ndemo_prior_model = None\ndemo_cols_cat, demo_cols_num = [], []\ntry:\n    demo_train_path = \"/kaggle/input/cmi-detect-behavior-with-sensor-data/train_demographics.csv\"\n    label_train_path = \"/kaggle/input/cmi-detect-behavior-with-sensor-data/train_labels.csv\"\n    if os.path.exists(demo_train_path) and os.path.exists(label_train_path):\n        df_demo = pd.read_csv(demo_train_path)\n        df_y = pd.read_csv(label_train_path)\n        df = df_demo.merge(df_y, on=\"sequence_id\", how=\"inner\")\n        for c in df.columns:\n            if c in [\"sequence_id\",\"subject_id\",\"label\"]:\n                continue\n            if df[c].dtype == \"object\":\n                demo_cols_cat.append(c)\n            else:\n                demo_cols_num.append(c)\n        if len(demo_cols_cat)+len(demo_cols_num) > 0:\n            pre = ColumnTransformer([\n                (\"cat\", 
OneHotEncoder(handle_unknown=\"ignore\"), demo_cols_cat),\n                (\"num\", StandardScaler(with_mean=True, with_std=True), demo_cols_num),\n            ])\n            demo_prior_model = Pipeline([(\"pre\", pre),\n                                         (\"clf\", LogisticRegression(max_iter=200, multi_class=\"multinomial\"))])\n            demo_prior_model.fit(df[demo_cols_cat+demo_cols_num], df[\"label\"])\nexcept Exception:\n    demo_prior_model = None\n\ndef apply_demographics_prior(proba, demographics_df):\n    if demo_prior_model is None or demographics_df is None:\n        return proba\n    try:\n        if hasattr(demographics_df, \"to_pandas\"):\n            dp = demographics_df.to_pandas()\n        else:\n            dp = demographics_df\n        import pandas as _pd\n        if isinstance(dp, _pd.DataFrame):\n            x = dp[demo_cols_cat+demo_cols_num].copy()\n            for c in demo_cols_num:\n                if c in x.columns:\n                    x[c] = x[c].astype(float)\n            if len(x) > 1:\n                x_num = x[demo_cols_num].mean(axis=0) if demo_cols_num else _pd.Series([], dtype=float)\n                x_cat = x[demo_cols_cat].iloc[[0]] if demo_cols_cat else _pd.DataFrame(index=[0])\n                x = _pd.concat([x_cat.reset_index(drop=True), _pd.DataFrame([x_num]).reset_index(drop=True)], axis=1)\n            prior = demo_prior_model.predict_proba(x)\n            prior = np.clip(prior, EPS, 1.0)\n            prior = prior / prior.sum(axis=1, keepdims=True)\n            out = proba * prior\n            out = out / out.sum(axis=1, keepdims=True)\n            return out\n    except Exception:\n        pass\n    return proba\n\ndef predict(sequence, demographics):\n    p1 = predict1(sequence, demographics)[0]\n    p2 = predict2(sequence, demographics)[0]\n    p3 = predict3(sequence, demographics)[0]\n\n    z1 = _prob_to_logits(p1[None, :]) / temperatures[\"p1\"]\n    z2 = _prob_to_logits(p2[None, :]) / 
temperatures[\"p2\"]\n    z3 = _prob_to_logits(p3[None, :]) / temperatures[\"p3\"]\n\n    if meta_model is not None:\n        Xq = np.concatenate([z1, z2, z3], axis=1)\n        pq = meta_model.predict_proba(Xq)\n    else:\n        w = np.array([0.30, 0.35, 0.35], dtype=float)\n        pq = _logits_to_prob(w[0]*z1 + w[1]*z2 + w[2]*z3)\n\n    pq2 = apply_demographics_prior(pq, demographics)\n\n    try:\n        cls_idx = int(np.argmax(pq2, axis=1)[0])\n        return dataset.le.classes_[cls_idx]\n    except Exception:\n        return int(np.argmax(pq2, axis=1)[0])\n\nprint(\"[INFO] Enhanced predict() with T-scaling, stacking, and demographics priors is active.\")\n\n\n# # Submission\n\n# In[ ]:\n\n\nimport numpy as np\n\ndef predict(sequence, demographics):\n\n    import copy\n    \n    pred0 = predict1(sequence, demographics)[0]\n    pred1 = predict2(sequence, demographics)[0]\n    pred2 = predict3(sequence, demographics)[0]\n    \n    m_w,da_w,c_w = [0.271, 0.347, 0.382], [0.70, 0.30], [+0.0021,-0.0007,-0.0014]\n\n    m_wts, preds = np.asarray(m_w), []\n    \n    for a,b,c in zip(pred0,pred1,pred2):    \n        l_abc = [\n            { 'wts':m_wts[0], 'pred':a, 'res':0 },\n            { 'wts':m_wts[1], 'pred':b, 'res':0 },\n            { 'wts':m_wts[2], 'pred':c, 'res':0 },\n        ]\n        l_asc  = sorted(copy.deepcopy(l_abc), key=lambda _:_['pred'],reverse=False)\n        l_desc = sorted(copy.deepcopy(l_abc), key=lambda _:_['pred'],reverse=True)\n        \n        for asc, c_wts in zip(l_asc, c_w): asc ['res'] = asc ['pred'] * (asc ['wts'] +c_wts)\n        for desc,c_wts in zip(l_desc,c_w): desc['res'] = desc['pred'] * (desc['wts'] +c_wts)\n\n        result_asc  = sum([asc ['res'] for asc in l_asc])\n        result_desc = sum([desc['res'] for asc in l_desc])\n\n        result = result_asc * da_w[0] + da_w[1] * result_desc\n \n        preds.append(result)\n        \n    avg_pred =  np.asarray(preds)\n                \n    return 
dataset.le.classes_[avg_pred.argmax()]\n\n# In[ ]:\n\n\nimport kaggle_evaluation.cmi_inference_server\ninference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)\n\nif os.getenv('KAGGLE_IS_COMPETITION_RERUN'):\n    inference_server.serve()\nelse:\n    inference_server.run_local_gateway(\n        data_paths=(\n            '/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv',\n            '/kaggle/input/cmi-detect-behavior-with-sensor-data/test_demographics.csv',\n        )\n    )\n\nif not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):\n    print(pd.read_parquet(\"submission.parquet\"))\n\n# In[ ]:\n\n\n\n# === Upgrade v2: + Uncertainty-aware weighting + Test-Time Smoothing (TTA) ===\nimport numpy as np\nimport pandas as pd\nimport re\ntry:\n    import polars as pl\nexcept Exception:\n    pl = None\n\nTTA_ENABLE = True\nSMOOTH_WIN = 3\nEPS = 1e-12\n\ndef _entropy(p):\n    p = np.clip(p, EPS, 1.0)\n    p = p / p.sum(axis=-1, keepdims=True)\n    return -np.sum(p * np.log(p), axis=-1)\n\ndef _median_roll(a, win=3):\n    s = pd.Series(a)\n    out = s.rolling(win, min_periods=1).median()\n    out = out.fillna(method=\"bfill\").fillna(method=\"ffill\")\n    return out.values\n\ndef _smooth_sequence(sequence, win=3):\n    if hasattr(sequence, \"to_pandas\"):\n        df = sequence.to_pandas().copy()\n    else:\n        df = sequence.copy()\n    pat_tof = re.compile(r\"^tof_\\d+_v\\d+$\")\n    pat_thm = re.compile(r\"^(thm_|thermal_)\")\n    for col in list(df.columns):\n        if getattr(df[col], \"dtype\", None) is not None and str(df[col].dtype).startswith((\"float\",\"int\")):\n            if pat_tof.match(col) or pat_thm.match(col):\n                try:\n                    df[col] = _median_roll(df[col].values, win=win)\n                except Exception:\n                    pass\n    if pl is not None:\n        try:\n            return pl.from_pandas(df)\n        except Exception:\n            return df\n    return df\n\ndef 
_prob_to_logits(p):\n    p = np.clip(p, EPS, 1.0 - EPS)\n    if p.ndim == 1:\n        p = p[None, :]\n    p = p / p.sum(axis=1, keepdims=True)\n    return np.log(p)\n\ndef _logits_to_prob(z):\n    z = z - z.max(axis=1, keepdims=True)\n    ez = np.exp(z)\n    return ez / ez.sum(axis=1, keepdims=True)\n\ndef predict(sequence, demographics):\n    p1 = predict1(sequence, demographics)[0]\n    p2 = predict2(sequence, demographics)[0]\n    p3 = predict3(sequence, demographics)[0]\n\n    if TTA_ENABLE:\n        seq_s = _smooth_sequence(sequence, win=SMOOTH_WIN)\n        p1_s = predict1(seq_s, demographics)[0]\n        p2_s = predict2(seq_s, demographics)[0]\n        p3_s = predict3(seq_s, demographics)[0]\n        p1 = 0.5*(p1 + p1_s)\n        p2 = 0.5*(p2 + p2_s)\n        p3 = 0.5*(p3 + p3_s)\n\n    z1 = _prob_to_logits(p1) / temperatures.get(\"p1\", 1.0)\n    z2 = _prob_to_logits(p2) / temperatures.get(\"p2\", 1.0)\n    z3 = _prob_to_logits(p3) / temperatures.get(\"p3\", 1.0)\n\n    if 'meta_model' in globals() and meta_model is not None:\n        Xq = np.concatenate([z1, z2, z3], axis=1)\n        pq = meta_model.predict_proba(Xq)\n    else:\n        base = np.array([0.30, 0.35, 0.35], dtype=float)\n        e1, e2, e3 = float(_entropy(p1)), float(_entropy(p2)), float(_entropy(p3))\n        inv = np.array([1.0/(e1+1e-6), 1.0/(e2+1e-6), 1.0/(e3+1e-6)], dtype=float)\n        inv = inv / inv.sum()\n        w = 0.5*base + 0.5*inv\n        pq = _logits_to_prob(w[0]*z1 + w[1]*z2 + w[2]*z3)\n\n    if 'apply_demographics_prior' in globals():\n        pq2 = apply_demographics_prior(pq, demographics)\n    else:\n        pq2 = pq\n\n    try:\n        cls_idx = int(np.argmax(pq2, axis=1)[0])\n        return dataset.le.classes_[cls_idx]\n    except Exception:\n        return int(np.argmax(pq2, axis=1)[0])\n\nprint(\"[INFO] Upgrade v2 active: uncertainty-aware weighting + TTA smoothing (window=3).\")\n\n\n# In[ ]:\n\n\n\n# =======================\n# Improved predict(): context 
normalization + group-wise T + entropy-weighted ensemble\n# =======================\nimport numpy as np, pandas as pd, os, re\n\ndef _num_cols(df):\n    cols = []\n    for c in df.columns:\n        try:\n            if pd.api.types.is_numeric_dtype(df[c]):\n                cols.append(c)\n        except Exception:\n            pass\n    return cols\n\n_pat_tof = re.compile(r\"^(tof|tof_)\\b|^tof_\\d+(_mean|_std|_min|_max)?$\")\n_pat_thm = re.compile(r\"^(thm_|thermal_)\")\n\ndef _winsorize_inplace(df, cols, p=0.01):\n    for c in cols:\n        try:\n            lo = df[c].quantile(p)\n            hi = df[c].quantile(1-p)\n            df[c] = df[c].clip(lo, hi)\n        except Exception:\n            pass\n\ndef _sequence_baseline(df, cols, k=10):\n    base = {}\n    head = df.head(k)\n    for c in cols:\n        try:\n            base[c] = float(head[c].median(skipna=True))\n        except Exception:\n            base[c] = 0.0\n    return base\n\ndef _context_normalize(sequence, demographics):\n    if not V3_ENABLE_CONTEXT_NORM:\n        return sequence\n    if hasattr(sequence, \"to_pandas\"):\n        df = sequence.to_pandas().copy()\n        to_polars = True\n    else:\n        df = sequence.copy()\n        to_polars = False\n    numc = _num_cols(df)\n    tof_cols = [c for c in numc if _pat_tof.search(c)]\n    thm_cols = [c for c in numc if _pat_thm.search(c)]\n    _winsorize_inplace(df, tof_cols + thm_cols, p=V3_WINSOR_P)\n    base_tof = _sequence_baseline(df, tof_cols, k=10) if len(tof_cols) else {}\n    base_thm = _sequence_baseline(df, thm_cols, k=10) if len(thm_cols) else {}\n    for c in tof_cols:\n        try: df[c] = df[c] - base_tof.get(c, 0.0)\n        except Exception: pass\n    for c in thm_cols:\n        try: df[c] = df[c] - base_thm.get(c, 0.0)\n        except Exception: pass\n\n    arm_len = None\n    try:\n        if demographics is not None:\n            if hasattr(demographics, \"to_pandas\"):\n                demo = demographics.to_pandas()\n 
           else:\n                demo = demographics\n            for cand in [\"shoulder_to_wrist_cm\", \"elbow_to_wrist_cm\"]:\n                if cand in demo.columns and pd.notnull(demo[cand]).any():\n                    val = float(pd.to_numeric(demo[cand], errors=\"coerce\").dropna().mean())\n                    if val > 0: arm_len = val; break\n    except Exception:\n        arm_len = None\n    if arm_len is not None and len(tof_cols):\n        s = max(arm_len, 1e-3)\n        for c in tof_cols:\n            try: df[c] = df[c] / s\n            except Exception: pass\n\n    if 'polars' in globals() and polars is not None and to_polars:\n        try:\n            import polars as pl\n            return pl.from_pandas(df)\n        except Exception:\n            return df\n    return df\n\ndef _prob_to_logits(p, eps=1e-12):\n    p = np.clip(p, eps, 1.0 - eps)\n    if p.ndim == 1: p = p[None, :]\n    p = p / p.sum(axis=1, keepdims=True)\n    return np.log(p)\n\ndef _logits_to_prob(z):\n    z = z - z.max(axis=1, keepdims=True)\n    ez = np.exp(z); return ez / ez.sum(axis=1, keepdims=True)\n\ndef _entropy(p, eps=1e-12):\n    p = np.clip(p, eps, 1.0); p = p / p.sum(axis=-1, keepdims=True)\n    return -np.sum(p * np.log(p), axis=-1)\n\n# (optional) learn group-wise temperatures from OOF + demographics\ntemperatures_by_group = None\nif V3_USE_GROUP_TS:\n    try:\n        oof_paths = [\n            f'{CFG[\"OOF_DIR\"]}/oof_predict1.csv',\n            f'{CFG[\"OOF_DIR\"]}/oof_predict2.csv',\n            f'{CFG[\"OOF_DIR\"]}/oof_predict3.csv',\n        ]\n        oofs = []\n        for p in oof_paths:\n            if os.path.exists(p):\n                df = pd.read_csv(p)\n                if 'sequence_id' in df.columns and 'y' in df.columns:\n                    oofs.append(df)\n        demo_path = f'{CFG[\"DATASET_DIR\"]}/train_demographics.csv'\n        if len(oofs)==3 and os.path.exists(demo_path):\n            demo = 
pd.read_csv(demo_path)[[\"sequence_id\",\"adult_child\",\"shoulder_to_wrist_cm\"]]\n            merged = []\n            for i,df in enumerate(oofs,1):\n                prob_cols = [c for c in df.columns if c.startswith(\"class_\")]\n                tmp = df[['sequence_id','y']+prob_cols].merge(demo, on=\"sequence_id\", how=\"left\")\n                tmp['arm_bin'] = (pd.to_numeric(tmp['shoulder_to_wrist_cm'], errors='coerce')>V3_ARM_BIN_THRESHOLD).astype('Int64')\n                tmp['model'] = f\"p{i}\"; merged.append(tmp)\n            full = pd.concat(merged, ignore_index=True)\n            def _best_T_from_oof(P, y, Ts=np.linspace(0.5,5.0,46)):\n                logits = _prob_to_logits(P)\n                n = len(y); bestT, bestNLL = 1.0, 1e9\n                for T in Ts:\n                    q = _logits_to_prob(logits / T)\n                    nll = -np.mean(np.log(np.clip(q[np.arange(n), y], 1e-12, 1.0)))\n                    if nll < bestNLL: bestNLL, bestT = nll, float(T)\n                return bestT\n            temperatures_by_group = {}\n            for key, sub in full.groupby(['model','adult_child','arm_bin'], dropna=False):\n                ch = sub.dropna(subset=[c for c in sub.columns if c.startswith(\"class_\")]+['y'])\n                if len(ch) > 50:\n                    P = ch[[c for c in ch.columns if c.startswith(\"class_\")]].values\n                    y = ch['y'].astype(int).values\n                    Tbest = _best_T_from_oof(P, y)\n                    temperatures_by_group[key] = Tbest\n            print(\"[INFO] Learned group-wise T:\", len(temperatures_by_group or {}))\n    except Exception as e:\n        temperatures_by_group = None\n        print(\"[WARN] Group-wise T disabled:\", e)\n\ndef _select_T(model_key, demographics):\n    if temperatures_by_group is not None:\n        try:\n            if hasattr(demographics, \"to_pandas\"):\n                d = demographics.to_pandas()\n            else:\n                d = demographics\n 
           ac = d.get(\"adult_child\")\n            if isinstance(ac, pd.Series): ac = ac.iloc[0]\n            ac = int(ac) if pd.notnull(ac) else None\n            ab = None\n            if \"shoulder_to_wrist_cm\" in d:\n                val = pd.to_numeric(d[\"shoulder_to_wrist_cm\"], errors='coerce')\n                val = val.iloc[0] if isinstance(val, pd.Series) else val\n                if pd.notnull(val): ab = int(float(val) > V3_ARM_BIN_THRESHOLD)\n            key = (model_key, ac, ab)\n            if key in temperatures_by_group:\n                return float(temperatures_by_group[key])\n        except Exception:\n            pass\n    return float(temperatures.get(model_key, 1.0))\n\ndef predict(sequence, demographics):\n    # context normalize\n    seqN = _context_normalize(sequence, demographics)\n\n    # base predictions from existing v2 models\n    p1 = predict1(seqN, demographics)[0]\n    p2 = predict2(seqN, demographics)[0]\n    p3 = predict3(seqN, demographics)[0]\n\n    # TTA smoothing if available\n    if TTA_ENABLE and 'SMOOTH_WIN' in globals() and '_smooth_sequence' in globals():\n        try:\n            seq_s = _smooth_sequence(seqN, win=SMOOTH_WIN)\n            p1 = 0.5*(p1 + predict1(seq_s, demographics)[0])\n            p2 = 0.5*(p2 + predict2(seq_s, demographics)[0])\n            p3 = 0.5*(p3 + predict3(seq_s, demographics)[0])\n        except Exception:\n            pass\n\n    # T-scaling (group-wise or global)\n    z1 = _prob_to_logits(p1) / _select_T(\"p1\", demographics)\n    z2 = _prob_to_logits(p2) / _select_T(\"p2\", demographics)\n    z3 = _prob_to_logits(p3) / _select_T(\"p3\", demographics)\n\n    # stacker if present, else entropy-weighted logits blend\n    if 'meta_model' in globals() and meta_model is not None:\n        Xq = np.concatenate([z1, z2, z3], axis=1)\n        pq = meta_model.predict_proba(Xq)\n    else:\n        base = np.array([0.30, 0.35, 0.35], dtype=float)\n        e1, e2, e3 = float(_entropy(p1)), 
float(_entropy(p2)), float(_entropy(p3))\n        inv = np.array([1.0/(e1+1e-6), 1.0/(e2+1e-6), 1.0/(e3+1e-6)], dtype=float)\n        inv = inv / inv.sum()\n        w = 0.5*base + 0.5*inv\n        pq = _logits_to_prob(w[0]*z1 + w[1]*z2 + w[2]*z3)\n\n    # demographic prior if defined upstream\n    if 'apply_demographics_prior' in globals():\n        pq2 = apply_demographics_prior(pq, demographics)\n    else:\n        pq2 = pq\n\n    try:\n        cls_idx = int(np.argmax(pq2, axis=1)[0])\n        return dataset.le.classes_[cls_idx]\n    except Exception:\n        return int(np.argmax(pq2, axis=1)[0])\n\nprint(\"[INFO] Improved predict() ready (context norm + group-T + entropy-weighted).\")\n\n\n# In[ ]:\n\n\n\n# =======================\n# SUBMISSION RUNNER (v2 compatible)\n# =======================\nimport pandas as pd\n\ndef _infer_sequences_and_demo():\n    seqs, demo = None, None\n    # common dataset holders\n    try:\n        if 'dataset' in globals():\n            ds = dataset\n            if hasattr(ds, \"get_test_sequences\"):\n                seqs = ds.get_test_sequences()\n            elif hasattr(ds, \"full_dataset\") and hasattr(ds.full_dataset, \"get_test_sequences\"):\n                seqs = ds.full_dataset.get_test_sequences()\n            elif hasattr(ds, \"test_sequences\"):\n                seqs = ds.test_sequences\n            if hasattr(ds, \"test_demographics\"):\n                demo = ds.test_demographics\n            elif hasattr(ds, \"full_dataset\") and hasattr(ds.full_dataset, \"test_demographics\"):\n                demo = ds.full_dataset.test_demographics\n    except Exception as e:\n        print(\"[WARN] dataset probing:\", e)\n    return seqs, demo\n\ndef _safe_predict_df(test_sequences, test_demographics):\n    rows = []\n    for sid, seq in test_sequences.items():\n        # slice demo by sequence\n        demo = None\n        if test_demographics is not None:\n            try:\n                if 'pl' in globals():\n                
    demo = test_demographics.filter(pl.col(\"sequence_id\")==sid)\n                else:\n                    demo = test_demographics[test_demographics['sequence_id']==sid]\n            except Exception:\n                demo = None\n        y = predict(seq, demo)\n        # subject id lookup best-effort\n        subject_id = 0\n        try:\n            if hasattr(seq, \"to_pandas\"):\n                df = seq.to_pandas()\n            else:\n                df = seq\n            if \"subject_id\" in df.columns:\n                subject_id = int(pd.to_numeric(df[\"subject_id\"], errors=\"coerce\").dropna().iloc[0])\n        except Exception:\n            pass\n        rows.append({\"sequence_id\": sid, \"subject_id\": subject_id, \"label\": y})\n    return pd.DataFrame(rows)\n\nseqs, demo = _infer_sequences_and_demo()\nif seqs is None:\n    print(\"[ERROR] Test sequences not found. Ensure your loader exposes them as dict-like object.\")\nelse:\n    sub = _safe_predict_df(seqs, demo)\n    sub.to_csv(CFG.get(\"SUBMISSION_NAME\",\"submission.csv\"), index=False)\n    print(sub.head())\n    print(\"Saved:\", CFG.get(\"SUBMISSION_NAME\",\"submission.csv\"))\n\n\n\n\n# =============================================================\n# PATCH++  (All-in-One Enhancements)\n# - predict1 -> IMU-only\n# - predict2 -> Late Fusion\n# - predict3 -> Early Fusion (keep original)\n# - Conservative TTA (shift/jitter/mask) for predict1/2/3\n# - Robust logit-space blend + Temperature Scaling (auto-fallback)\n# - Optional MC-Dropout (disabled by default)\n# - No external files needed\n# =============================================================\nprint(\"[PATCH++] Enabling IMU-only/Late/Early mapping + TTA + logit blend + temp scaling\")\n\nimport os as _os, json as _json\nimport numpy as _np\nimport torch as _torch\nimport torch.nn.functional as _F\n\n# -----------------------\n# Config (edit if needed)\n# -----------------------\n_CFG_PATCH = {\n    \"TTA_ENABLE\": True,\n    
\"TTA_RUNS\": 6,\n    \"TTA_JITTER_STD\": 0.01,\n    \"TTA_MAX_SHIFT\": 3,\n    \"TTA_MASK_PROB\": 0.05,\n    \"MC_DROPOUT_RUNS\": 0,         # 0 = off\n    \"BLEND_JSON_CANDIDATES\": [\n        \"./blend_and_temp.json\",\n        \"/kaggle/working/blend_and_temp.json\",\n        \"/kaggle/input/blend-and-temp/blend_and_temp.json\",\n        \"/kaggle/input/blend_and_temp/blend_and_temp.json\"\n    ],\n    \"DEFAULT_WEIGHTS\": [0.30, 0.35, 0.35],  # predict1/predict2/predict3\n    \"DEFAULT_TEMP\": 1.0\n}\n\n# -----------------------\n# Utils\n# -----------------------\ndef _softmax(z):\n    z = z - z.max(axis=1, keepdims=True)\n    ez = _np.exp(z)\n    return ez / ez.sum(axis=1, keepdims=True)\n\ndef _to_logits_from_probs(P, eps=1e-12):\n    P = _np.clip(P, eps, 1 - eps)\n    return _np.log(P)  # multi-class safe\n\ndef _load_blend_and_temp():\n    for p in _CFG_PATCH[\"BLEND_JSON_CANDIDATES\"]:\n        try:\n            if _os.path.exists(p):\n                with open(p, \"r\") as f:\n                    cfg = _json.load(f)\n                w = _np.array(cfg.get(\"weights\", _CFG_PATCH[\"DEFAULT_WEIGHTS\"]), dtype=float)\n                T = float(cfg.get(\"temp\", _CFG_PATCH[\"DEFAULT_TEMP\"]))\n                if _np.all(w >= 0) and w.sum() > 0:\n                    w = w / w.sum()\n                else:\n                    w = _np.array(_CFG_PATCH[\"DEFAULT_WEIGHTS\"], dtype=float); w = w / w.sum()\n                return w, T, p\n        except Exception:\n            pass\n    w = _np.array(_CFG_PATCH[\"DEFAULT_WEIGHTS\"], dtype=float); w = w / w.sum()\n    return w, float(_CFG_PATCH[\"DEFAULT_TEMP\"]), None\n\ndef _maybe_to_pandas(df_like):\n    try:\n        import polars as _pl\n        if isinstance(df_like, _pl.DataFrame):\n            return df_like.to_pandas(), \"pl\"\n    except Exception:\n        pass\n    try:\n        import pandas as _pd\n        if isinstance(df_like, _pd.DataFrame):\n            return df_like.copy(), \"pd\"\n    except 
Exception:\n        pass\n    return None, None\n\ndef _back_from_pandas(df_pd, kind):\n    if kind == \"pl\":\n        try:\n            import polars as _pl\n            return _pl.from_pandas(df_pd)\n        except Exception:\n            return df_pd\n    return df_pd\n\ndef _jitter(arr, std):\n    return arr + _np.random.normal(0.0, std, size=arr.shape).astype(arr.dtype, copy=False)\n\ndef _shift(arr, max_shift):\n    if max_shift <= 0: return arr\n    s = int(_np.random.randint(-max_shift, max_shift+1))\n    if s == 0: return arr\n    if s > 0:\n        return _np.concatenate([_np.zeros_like(arr[:s]), arr[:-s]], axis=0)\n    else:\n        return _np.concatenate([arr[-s:], _np.zeros_like(arr[: -s])], axis=0)\n\ndef _mask(arr, prob):\n    if prob <= 0: return arr\n    m = _np.random.rand(*arr.shape) < prob\n    out = arr.copy()\n    out[m] = 0\n    return out\n\ndef _augment_sequence(df_like, jitter_std, max_shift, mask_prob):\n    df_pd, kind = _maybe_to_pandas(df_like)\n    if df_pd is None:\n        return df_like\n    import pandas as _pd\n    num_cols = [c for c in df_pd.columns if _pd.api.types.is_numeric_dtype(df_pd[c])]\n    if not num_cols:\n        return _back_from_pandas(df_pd, kind)\n    mat = df_pd[num_cols].to_numpy()\n    mat = _jitter(mat, jitter_std)\n    mat = _shift(mat, max_shift)\n    mat = _mask(mat, mask_prob)\n    df_pd[num_cols] = mat\n    return _back_from_pandas(df_pd, kind)\n\ndef _set_models_dropout_train(enable=True):\n    try:\n        groups = []\n        if \"models1\" in globals(): groups.append(models1)\n        if \"models2\" in globals(): groups.append(models2)\n        for ms in groups:\n            for m in ms:\n                for module in m.modules():\n                    if \"dropout\" in module.__class__.__name__.lower():\n                        module.train(enable)\n    except Exception:\n        pass\n\n# -----------------------\n# Base mapping functions\n# -----------------------\nprint(\"[PATCH++] Mapping 
predict1->IMU-only, predict2->LateFusion, predict3->EarlyFusion\")\n\ndef _predict2_late(sequence, demographics):\n    \"\"\"Late fusion via models2/avg_predict (original).\"\"\"\n    imu, thm, tof = dataset.full_dataset.inference_process(sequence)\n    with _torch.no_grad():\n        imu, thm, tof = to_cuda(imu, thm, tof)\n        logits = avg_predict(models2, imu, thm, tof)\n        probabilities = _F.softmax(logits, dim=1).cpu().numpy()\n    return probabilities\n\ndef _predict1_imu(sequence, demographics):\n    \"\"\"IMU-only: zero out THM/TOF; reuse models2 head to keep flow stable.\"\"\"\n    imu, thm, tof = dataset.full_dataset.inference_process(sequence)\n    try:\n        import numpy as _np\n        thm = _np.zeros_like(thm, dtype=thm.dtype)\n        tof = _np.zeros_like(tof, dtype=tof.dtype)\n    except Exception:\n        pass\n    with _torch.no_grad():\n        imu, thm, tof = to_cuda(imu, thm, tof)\n        try:\n            thm.zero_(); tof.zero_()\n        except Exception:\n            pass\n        logits = avg_predict(models2, imu, thm, tof)\n        probabilities = _F.softmax(logits, dim=1).cpu().numpy()\n    return probabilities\n\n# Keep original predict3 (Early Fusion) as defined above.\n_predict3_early = predict3\n\n# -----------------------\n# TTA wrapper\n# -----------------------\ndef _predict_with_tta(base_fn, sequence, demographics):\n    if not _CFG_PATCH[\"TTA_ENABLE\"] or _CFG_PATCH[\"TTA_RUNS\"] <= 1:\n        return base_fn(sequence, demographics)\n\n    preds = []\n    preds.append(base_fn(sequence, demographics))\n    for _ in range(_CFG_PATCH[\"TTA_RUNS\"] - 1):\n        seq_aug = _augment_sequence(\n            sequence,\n            _CFG_PATCH[\"TTA_JITTER_STD\"],\n            _CFG_PATCH[\"TTA_MAX_SHIFT\"],\n            _CFG_PATCH[\"TTA_MASK_PROB\"]\n        )\n        preds.append(base_fn(seq_aug, demographics))\n    P = _np.stack(preds, axis=0).astype(\"float64\")\n    P = _np.clip(P, 1e-8, 1-1e-8)\n    return 
P.mean(axis=0)\n\ndef predict1(sequence, demographics):\n    if _CFG_PATCH[\"MC_DROPOUT_RUNS\"] and _CFG_PATCH[\"MC_DROPOUT_RUNS\"] > 0:\n        _set_models_dropout_train(True)\n        ps = []\n        for _ in range(int(_CFG_PATCH[\"MC_DROPOUT_RUNS\"])):\n            ps.append(_predict_with_tta(_predict1_imu, sequence, demographics))\n        _set_models_dropout_train(False)\n        return _np.mean(_np.stack(ps, 0), 0)\n    return _predict_with_tta(_predict1_imu, sequence, demographics)\n\ndef predict2(sequence, demographics):\n    if _CFG_PATCH[\"MC_DROPOUT_RUNS\"] and _CFG_PATCH[\"MC_DROPOUT_RUNS\"] > 0:\n        _set_models_dropout_train(True)\n        ps = []\n        for _ in range(int(_CFG_PATCH[\"MC_DROPOUT_RUNS\"])):\n            ps.append(_predict_with_tta(_predict2_late, sequence, demographics))\n        _set_models_dropout_train(False)\n        return _np.mean(_np.stack(ps, 0), 0)\n    return _predict_with_tta(_predict2_late, sequence, demographics)\n\ndef predict3(sequence, demographics):\n    if _CFG_PATCH[\"MC_DROPOUT_RUNS\"] and _CFG_PATCH[\"MC_DROPOUT_RUNS\"] > 0:\n        _set_models_dropout_train(True)\n        ps = []\n        for _ in range(int(_CFG_PATCH[\"MC_DROPOUT_RUNS\"])):\n            ps.append(_predict_with_tta(_predict3_early, sequence, demographics))\n        _set_models_dropout_train(False)\n        return _np.mean(_np.stack(ps, 0), 0)\n    return _predict_with_tta(_predict3_early, sequence, demographics)\n\nprint(\"[PATCH++] TTA ready.\")\n\n# -----------------------\n# Final predict() override:\n#  - call predict1/2/3\n#  - logit-space blend\n#  - temperature scaling\n# -----------------------\ndef predict(sequence, demographics):\n    p1 = predict1(sequence, demographics)\n    p2 = predict2(sequence, demographics)\n    p3 = predict3(sequence, demographics)\n\n    def _to_2d(p):\n        p = _np.asarray(p)\n        if p.ndim == 1:\n            p = p[None, :]\n        return p\n    p1 = _to_2d(p1); p2 = _to_2d(p2); p3 = 
_to_2d(p3)\n\n    w, T, src = _load_blend_and_temp()\n    if src:\n        print(f\"[PATCH++] Using blend/temp from: {src} -> w={w.tolist()}, T={T:.3f}\")\n    else:\n        print(f\"[PATCH++] Using default blend/temp -> w={w.tolist()}, T={T:.3f}\")\n\n    Z1, Z2, Z3 = _to_logits_from_probs(p1), _to_logits_from_probs(p2), _to_logits_from_probs(p3)\n    Z = w[0]*Z1 + w[1]*Z2 + w[2]*Z3\n    P = _softmax(Z / max(T, 1e-3))\n    return P\n# =============================================================")
print('Saved original script to:', ORIGINAL_PATH)

In [None]:

# =========================
# 2. Data Discovery/Loading
# =========================
# Automatically discover the file names commonly used in Kaggle datasets.

def auto_discover(root='/kaggle/input'):
    """Search ``root`` recursively for candidate train / label / test files.

    Args:
        root: Directory prefix placed in front of every glob pattern. The
            default reproduces the original hard-coded Kaggle input path.

    Returns:
        A ``(trains, labels, tests)`` tuple. Each element is whatever
        ``find_files`` yields for the corresponding pattern list (presumably a
        list of paths -- confirm against ``find_files``), except that any path
        also reported as a labels file is removed from ``trains``.
    """
    base = str(root).rstrip('/')

    def _patterns(stem):
        # Same six shapes, in the same priority order, for train and test.
        return [
            f'{base}/**/{stem}.parquet',
            f'{base}/**/{stem}.csv',
            f'{base}/**/{stem}_*.csv',
            f'{base}/**/{stem}.feather',
            f'{base}/**/{stem}/*.parquet',
            f'{base}/**/{stem}/*.csv',
        ]

    labels_patterns = [
        f'{base}/**/train_labels.csv',
        f'{base}/**/labels.csv',
        f'{base}/**/y.csv',
    ]

    trains = find_files(_patterns('train'))
    labels = find_files(labels_patterns)
    tests = find_files(_patterns('test'))

    # `train_*.csv` also matches `train_labels.csv`; without this filter the
    # labels file could be picked up (and later loaded) as training data.
    label_keys = {str(p) for p in labels}
    trains = [p for p in trains if str(p) not in label_keys]
    return trains, labels, tests

trains, labels, tests = auto_discover()

# Report how many files of each kind were found, then show up to three
# example paths for every non-empty category.
_discovered = (
    ('Found trains:', trains),
    ('Found labels:', labels),
    ('Found tests :', tests),
)
for _caption, _paths in _discovered:
    print(_caption, len(_paths))
for _caption, _paths in _discovered:
    if len(_paths) > 0:
        print('e.g.,', _paths[:3])

# Simple loader (pandas first; could be swapped for polars on large inputs).
def load_df(path):
    """Read a tabular file into a pandas DataFrame, dispatching on extension.

    The extension is compared case-insensitively, so ``data.PARQUET`` is read
    as parquet instead of silently falling through to the CSV reader.

    Args:
        path: File location as ``str`` or ``Path``.

    Returns:
        The loaded ``pandas.DataFrame``. ``.parquet`` and ``.feather`` use
        their dedicated readers; every other extension is read as CSV.
    """
    suffix = Path(path).suffix.lower()
    if suffix == '.parquet':
        return pd.read_parquet(path)
    if suffix == '.feather':
        # Imported lazily so this module is only needed when a feather file
        # is actually read.
        import pyarrow.feather as feather
        return feather.read_feather(path)
    return pd.read_csv(path)

def merge_train_with_labels(train_df, labels_paths):
    """Attach a label column to ``train_df`` from a separate labels file.

    Args:
        train_df: Training table.
        labels_paths: Candidate label-file paths; may be empty or ``None``.

    Returns:
        ``(df, label_col)``. ``label_col`` is ``None`` (and ``df`` is
        ``train_df`` itself) when no labels file is given or when the join key
        or the label column cannot be identified. If ``train_df`` already has
        a column named like the detected label, it is returned unchanged
        together with that name.
    """
    if not labels_paths:
        return train_df, None

    # Prefer the smallest labels file (there is normally exactly one).
    lab_path = min(labels_paths, key=os.path.getsize)
    ydf = load_df(lab_path)

    # Resolve the label column first so it can never be chosen as join key.
    label_col = next(
        (c for c in ('label', 'target', 'y', 'class') if c in ydf.columns),
        None,
    )

    id_col = next(
        (c for c in ('id', 'Id', 'ID', 'sample_id', 'record_id', 'series_id')
         if c in train_df.columns and c in ydf.columns),
        None,
    )
    if id_col is None:
        # Fall back to any shared column, excluding the label itself.
        shared = [c for c in ydf.columns
                  if c in train_df.columns and c != label_col]
        if shared:
            id_col = shared[0]

    if id_col is None or label_col is None:
        print('(!) labels 병합 실패: id/label 컬럼을 특정할 수 없습니다. labels를 사용하지 않고 진행합니다.')
        return train_df, None

    if label_col in train_df.columns:
        # Merging would produce `<label>_x` / `<label>_y`, leaving the
        # returned column name absent from the frame.
        print(f'(!) `{label_col}` already exists in train; skipping labels merge.')
        return train_df, label_col

    lookup = ydf[[id_col, label_col]]
    if lookup[id_col].duplicated().any():
        # A left merge against duplicated keys would multiply training rows.
        print(f'(!) duplicate `{id_col}` values in labels file; keeping the first label per id.')
        lookup = lookup.drop_duplicates(subset=id_col, keep='first')

    df = train_df.merge(lookup, on=id_col, how='left')
    return df, label_col

# Actual load: stop immediately when discovery found no training file.
if len(trains) == 0:
    raise FileNotFoundError('train 파일을 찾지 못했습니다. /kaggle/input 경로를 확인하세요.')

# The first discovered training file is used; labels are merged in when a
# labels file was found.
train_df = load_df(trains[0])
label_col = None
train_df, label_col = merge_train_with_labels(train_df, labels)

# A test file is optional.
test_df = load_df(tests[0]) if len(tests) > 0 else None

print('train shape:', train_df.shape, '| columns:', len(train_df.columns))
if not label_col:
    print('label_col: None (supervised 학습이 아닐 수 있음)')
else:
    print('label_col:', label_col, '| #classes:', train_df[label_col].nunique())
print('test shape :', test_df.shape if test_df is not None else None)

In [None]:

# ====================================
# 3. Preprocessing & Feature Selection
# ====================================
# 🔧 Winsorize + Scaling (fit on train, apply to train/test 공통)

assert train_df is not None, "train_df is None"

# Underscore-prefixed aliases are kept at module level because the original
# fallback snippet created them here.
import pandas as _pd
import numpy as _np
import glob as _glob
import os as _os

_LABEL_NAME_CANDIDATES = ['label', 'target', 'y', 'class', 'category', 'Class', 'LABEL']
_ID_KEY_CANDIDATES = ('id', 'row_id', 'sample_id', 'series_id', 'subject', 'pid', 'uid')
_LABEL_FILE_GLOBS = [
    'train_labels.csv',
    '/kaggle/input/*/train_labels.csv',
    'labels.csv',
    '/kaggle/input/*/labels.csv',
]


def _first_named_label(frame):
    """Return the first well-known label name present in ``frame``, else None."""
    return next((c for c in _LABEL_NAME_CANDIDATES if c in frame.columns), None)


def _merge_labels_file(frame):
    """Left-merge the first labels CSV found on disk into ``frame``.

    Returns ``frame`` unchanged when no labels file matches the glob list or
    when the file shares no id-like key column with ``frame``.
    """
    path = next((hits[0] for hits in map(_glob.glob, _LABEL_FILE_GLOBS) if hits), None)
    if path is None:
        return frame
    labels_frame = _pd.read_csv(path)
    key = next(
        (c for c in frame.columns
         if c.lower() in _ID_KEY_CANDIDATES and c in labels_frame.columns),
        None,
    )
    if key is None:
        return frame
    return frame.merge(labels_frame, on=key, how='left')


# Resolve the label column. Order of attempts:
#   1) the `label_col` already set upstream,
#   2) heuristic: the last integer column with <= 20 distinct values,
#   3) a well-known label name already present in train_df,
#   4) a labels CSV merged in on an id-like key.
# The original pasted steps 3-4 at column 0 under an empty `else:`, which is a
# syntax error and also left X / y undefined; here every path ends in the
# single X / y split at the bottom.
label_col = globals().get('label_col')
if label_col is None or label_col not in train_df.columns:
    label_col = None
    candidates = [
        c for c in train_df.columns
        if train_df[c].dtype.kind in 'iu' and train_df[c].nunique() <= 20
    ]
    if candidates:
        label_col = candidates[-1]
        print(f'(!) 라벨이 명시되지 않아 임시로 `{label_col}`를 라벨로 사용합니다.')
    else:
        _detected = _first_named_label(train_df)
        if _detected is None:
            train_df = _merge_labels_file(train_df)
            _detected = _first_named_label(train_df)
        if _detected is None:
            raise AssertionError("라벨 컬럼 탐지 실패. 노트북 상단에 `label_col = '정답칼럼명'`을 지정하거나 labels CSV 경로/키를 확인하세요.")
        label_col = _detected
        print(f"(fallback) 라벨 컬럼을 자동 탐지했습니다: '{label_col}'")

# Split the label off the feature table.
y = train_df[label_col]
X = train_df.drop(columns=[label_col])

# Drop id-like columns (usually not useful as model features).
# The regex matches the listed tokens as whole underscore-delimited words.
# NOTE(review): after this step (and the num_cols selection below) test_df no
# longer carries any id column, so later code cannot read test ids from it.
id_like = [c for c in X.columns if re.search(r'(^|_)(id|ID|Id|series|record|sample|session|subject)($|_)', c)]
X = X.drop(columns=id_like, errors='ignore')
if test_df is not None:
    test_df = test_df.drop(columns=id_like, errors='ignore')

# Keep numeric columns only (default).
# NOTE(review): test_df[num_cols] raises KeyError if the test file lacks any
# numeric training column -- confirm train/test schemas match.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
X = X[num_cols].copy()
if test_df is not None:
    test_df = test_df[num_cols].copy()

# Winsorize
# NOTE(review): winsorize_df is called separately on train and test; whether
# the test call reuses train-derived bounds depends on winsorize_df, which is
# defined elsewhere -- TODO confirm.
X = winsorize_df(X, num_cols, 0.005, 0.995)
if test_df is not None:
    test_df = winsorize_df(test_df, num_cols, 0.005, 0.995)

# Scaling: the scaler is fit on the training features only and the same
# fitted scaler is applied to the test features.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
if test_df is not None:
    X_test_scaled = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns, index=test_df.index)
else:
    X_test_scaled = None

# Split columns into modalities (pattern-based); `modal` maps a modality name
# to a list of column names, and the per-modality counts are logged.
modal = split_modalities(X_scaled.columns.tolist())
log({k: len(v) for k,v in modal.items()})
for k, v in modal.items():
    if len(v) > 0:
        log(f'{k:>6}: {v[:12]}{"..." if len(v)>12 else ""}')

# IMU-only view
imu_cols = modal['imu']
if len(imu_cols) == 0:
    # No IMU columns detected: fall back to the (up to) 64 highest-variance
    # features as a stand-in IMU set.
    # NOTE(review): the variance is taken on X_scaled, i.e. after standard
    # scaling, where non-constant columns presumably all have ~unit variance,
    # so this ranking may be close to arbitrary -- consider ranking on X.
    variances = X_scaled.var().sort_values(ascending=False)
    imu_cols = variances.index[: min(64, len(variances))].tolist()
    print(f'(!) IMU 컬럼을 찾지 못해 고분산 Top-{len(imu_cols)} 컬럼을 임시 IMU 세트로 사용합니다.')

X_imu = X_scaled[imu_cols].copy()
X_all = X_scaled.copy()

# Late-fusion branches: one feature table per non-empty modality
# (imu / audio / vision / text / other); if none is non-empty, a single
# 'other' branch holding every feature is used.
branches = {k: X_scaled[v].copy() for k, v in modal.items() if len(v) > 0}
if not branches:
    branches = {'other': X_scaled.copy()}

In [None]:

# =========================
# 4. Dataset & MLP Models
# =========================
class TabDataset(Dataset):
    """Map-style dataset over a pandas feature table with optional labels.

    Features are stored as a float32 array. When labels are supplied each
    item is ``(features, int_label)``; otherwise an item is the feature
    tensor alone.
    """

    def __init__(self, X, y=None):
        has_labels = y is not None
        self.X = X.values.astype(np.float32)
        self.y = y.values if has_labels else None
        # True only for integer-typed label columns.
        self.is_classification = has_labels and y.dtype.kind in 'iu'

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        features = torch.from_numpy(self.X[idx])
        if self.y is None:
            return features
        return features, int(self.y[idx])

class MLP(nn.Module):
    """Feed-forward classifier with two hidden Linear -> ReLU -> Dropout stages.

    Args:
        in_dim: Number of input features.
        hidden: Width of both hidden layers.
        dropout: Dropout probability applied after each hidden activation.
        out_dim: Number of output logits.
    """

    def __init__(self, in_dim, hidden=256, dropout=0.2, out_dim=2):
        super().__init__()
        stages = []
        width = in_dim
        # Two identical hidden stages; layer order (and therefore the
        # `net.<i>` parameter names) matches a hand-written Sequential.
        for _ in range(2):
            stages += [nn.Linear(width, hidden), nn.ReLU(), nn.Dropout(dropout)]
            width = hidden
        stages.append(nn.Linear(width, out_dim))
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)

def train_one_model(X_train, y_train, X_val, y_val, in_dim, n_classes, epochs=40, batch_size=512, lr=1e-3):
    """Train an MLP classifier and return it with its best-validation weights.

    Args:
        X_train, y_train: Training features / integer labels (pandas objects,
            as consumed by ``TabDataset``).
        X_val, y_val: Validation features / labels used for model selection.
        in_dim: Number of input features.
        n_classes: Number of output classes; labels are expected to lie in
            ``0 .. n_classes - 1`` (required by the cross-entropy loss).
        epochs: Number of training epochs.
        batch_size: Mini-batch size for both loaders.
        lr: AdamW learning rate.

    Returns:
        The trained model, restored to the epoch with the lowest validation
        log-loss. If no epoch produced a finite improvement (or ``epochs`` is
        0) the model is returned with its current weights.
    """
    model = MLP(in_dim, hidden=512, dropout=0.3, out_dim=n_classes).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    crit = nn.CrossEntropyLoss()

    ds_tr = TabDataset(X_train, y_train)
    ds_va = TabDataset(X_val, y_val)
    # Pinned host memory only helps (and only applies) when copying to a GPU.
    pin = DEVICE == 'cuda'
    dl_tr = DataLoader(ds_tr, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=pin)
    dl_va = DataLoader(ds_va, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=pin)

    # Passing the full label set keeps log_loss valid when a validation fold
    # happens to miss one of the classes.
    class_ids = np.arange(n_classes)

    best_loss, best_state = float('inf'), None
    for ep in range(epochs):
        model.train()
        total = 0.0
        for xb, yb in dl_tr:
            xb = xb.to(DEVICE); yb = yb.to(DEVICE)
            opt.zero_grad(set_to_none=True)
            logits = model(xb)
            loss = crit(logits, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
            opt.step()
            total += loss.item() * xb.size(0)

        # Validation pass.
        model.eval()
        all_logits = []
        all_y = []
        with torch.no_grad():
            for xb, yb in dl_va:
                xb = xb.to(DEVICE); yb = yb.to(DEVICE)
                lg = model(xb)
                all_logits.append(lg.cpu().numpy())
                all_y.append(yb.cpu().numpy())
        all_logits = np.concatenate(all_logits, axis=0)
        all_y = np.concatenate(all_y, axis=0)
        va_loss = log_loss(all_y, softmax_np(all_logits), labels=class_ids)

        if va_loss < best_loss:
            best_loss = va_loss
            # state_dict() returns references to the live parameters, which
            # keep changing in later epochs; clone to take a real snapshot.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        if (ep+1) % 5 == 0:
            print(f'[ep {ep+1:02d}] train_loss={total/len(ds_tr):.4f} | val_logloss={va_loss:.4f}')

    # Restore the best snapshot, if any epoch produced one.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

def predict_logits(model, X, batch_size=1024):
    """Run ``model`` over ``X`` in evaluation mode and return stacked logits.

    Args:
        model: Trained network; it is switched to eval mode.
        X: Feature table accepted by ``TabDataset``.
        batch_size: Number of rows per forward pass.

    Returns:
        A NumPy array with one row of logits per row of ``X``, in input order.
    """
    loader = DataLoader(
        TabDataset(X, None),
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
    )
    model.eval()
    with torch.no_grad():
        chunks = [model(batch.to(DEVICE)).cpu().numpy() for batch in loader]
    return np.concatenate(chunks, axis=0)

class TemperatureScaler(nn.Module):
    """Divide logits by a single learnable temperature, initialised to 1.

    Reference: Guo et al., "On Calibration of Modern Neural Networks".
    """

    def __init__(self):
        super().__init__()
        initial_temperature = torch.ones(1)
        self.temperature = nn.Parameter(initial_temperature)

    def forward(self, logits):
        scaled = logits / self.temperature
        return scaled

def fit_temperature(logits, labels, max_iter=50):
    """Fit a scalar softmax temperature by minimising cross-entropy.

    Args:
        logits: Array-like of uncalibrated logits.
        labels: Array-like of integer class labels aligned with ``logits``.
        max_iter: Maximum L-BFGS iterations for the single optimiser step.

    Returns:
        The fitted temperature as a Python float.
    """
    z = torch.tensor(logits, dtype=torch.float32)
    targets = torch.tensor(labels, dtype=torch.long)
    objective = nn.CrossEntropyLoss()
    calibrator = TemperatureScaler()
    optimizer = torch.optim.LBFGS([calibrator.temperature], lr=0.01, max_iter=max_iter)

    def _nll_closure():
        # L-BFGS re-evaluates the objective through this closure.
        optimizer.zero_grad()
        nll = objective(calibrator(z), targets)
        nll.backward()
        return nll

    optimizer.step(_nll_closure)
    return calibrator.temperature.detach().item()

In [None]:

# =====================================
# 5. CV Training — 3 Pipelines (Model1/2/3)
# =====================================
assert infer_problem_type(y) == 'classification', "현재 버전은 분류만 지원합니다."

n_classes = int(pd.Series(y).nunique())
print('n_classes =', n_classes)

# Choose the CV splitter: group-aware when a group column is available
# (stratified if there is more than one class), plain stratified otherwise.
group_col = guess_group_column(train_df)
if not (group_col and group_col in train_df.columns):
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    splitter = kf.split(X_all, y)
    print('Using StratifiedKFold')
else:
    groups = train_df[group_col]
    if y.nunique() > 1:
        # StratifiedGroupKFold is available in sklearn >= 1.1.
        kf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
        splitter = kf.split(X_all, y, groups)
        print(f'Using StratifiedGroupKFold by `{group_col}`')
    else:
        kf = GroupKFold(n_splits=5)
        splitter = kf.split(X_all, groups=groups)
        print(f'Using GroupKFold by `{group_col}` (no stratification)')


def _zero_logits(n_rows):
    """Allocate a float32 (n_rows, n_classes) logit buffer filled with zeros."""
    return np.zeros((n_rows, n_classes), dtype=np.float32)


# Out-of-fold logit containers, one per pipeline.
oof_logits_m1, oof_logits_m2, oof_logits_m3 = (_zero_logits(len(X_all)) for _ in range(3))

# Test-time logit accumulators (None when there is no test set).
if X_test_scaled is None:
    test_logits_m1 = test_logits_m2 = test_logits_m3 = None
else:
    test_logits_m1, test_logits_m2, test_logits_m3 = (
        _zero_logits(len(X_test_scaled)) for _ in range(3)
    )

# The fold count is read from the splitter object so the test-time averaging
# below cannot drift from the CV configuration (it was hard-coded as 5.0).
n_folds = kf.get_n_splits()

for fold, (tr_idx, va_idx) in enumerate(splitter, start=1):
    print(f"\n========== FOLD {fold} ==========")
    X_tr_all, X_va_all = X_all.iloc[tr_idx], X_all.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    # ---------- Model1: IMU-only ----------
    X_tr_m1 = X_imu.iloc[tr_idx]; X_va_m1 = X_imu.iloc[va_idx]
    m1 = train_one_model(X_tr_m1, y_tr, X_va_m1, y_va, in_dim=X_tr_m1.shape[1], n_classes=n_classes, epochs=35, lr=5e-4)
    oof_logits_m1[va_idx] = predict_logits(m1, X_va_m1)
    if X_test_scaled is not None:
        test_logits_m1 += predict_logits(m1, X_test_scaled[imu_cols]) / n_folds

    # ---------- Model2: Late Fusion ----------
    # Train one small MLP per modality branch, then feed the concatenated
    # branch logits to a meta classifier.
    branch_logits_train = []
    branch_logits_valid = []
    branch_logits_test = [] if X_test_scaled is not None else None

    branch_models = []
    for bname, bX in branches.items():
        X_tr_b = bX.iloc[tr_idx]; X_va_b = bX.iloc[va_idx]
        if X_tr_b.shape[1] == 0:
            continue
        mb = train_one_model(X_tr_b, y_tr, X_va_b, y_va, in_dim=X_tr_b.shape[1], n_classes=n_classes, epochs=25, lr=1e-3)
        branch_models.append((bname, mb))
        # Using each branch's last hidden layer would require splitting the
        # model; for simplicity the branch logits serve as meta features.
        branch_logits_train.append(predict_logits(mb, X_tr_b))
        branch_logits_valid.append(predict_logits(mb, X_va_b))
        if X_test_scaled is not None:
            branch_logits_test.append(predict_logits(mb, X_test_scaled[bX.columns]))

    # Concatenate branch logits -> meta MLP input. If no branch was trained,
    # fall back to the IMU model's logits; m1 was trained on the IMU view, so
    # it must be given the IMU columns (not the all-feature matrix).
    Z_tr = np.concatenate(branch_logits_train, axis=1) if branch_logits_train else predict_logits(m1, X_tr_m1)
    Z_va = np.concatenate(branch_logits_valid, axis=1) if branch_logits_valid else predict_logits(m1, X_va_m1)
    in_dim_meta = Z_tr.shape[1]
    # Meta learner
    X_tr_meta = pd.DataFrame(Z_tr); X_va_meta = pd.DataFrame(Z_va)
    m2 = train_one_model(X_tr_meta, y_tr, X_va_meta, y_va, in_dim=in_dim_meta, n_classes=n_classes, epochs=25, lr=1e-3)
    oof_logits_m2[va_idx] = predict_logits(m2, X_va_meta)
    if X_test_scaled is not None:
        # Same fallback as above so the meta input width matches training.
        if branch_logits_test:
            Z_te = np.concatenate(branch_logits_test, axis=1)
        else:
            Z_te = predict_logits(m1, X_test_scaled[imu_cols])
        X_te_meta = pd.DataFrame(Z_te)
        test_logits_m2 += predict_logits(m2, X_te_meta) / n_folds

    # ---------- Model3: Early Fusion ----------
    m3 = train_one_model(X_tr_all, y_tr, X_va_all, y_va, in_dim=X_tr_all.shape[1], n_classes=n_classes, epochs=40, lr=7e-4)
    oof_logits_m3[va_idx] = predict_logits(m3, X_va_all)
    if X_test_scaled is not None:
        test_logits_m3 += predict_logits(m3, X_test_scaled) / n_folds

# OOF scores: convert each pipeline's out-of-fold logits to probabilities,
# then score every pipeline with log-loss.
oof_prob_m1, oof_prob_m2, oof_prob_m3 = (
    softmax_np(z) for z in (oof_logits_m1, oof_logits_m2, oof_logits_m3)
)
oof_ll_1, oof_ll_2, oof_ll_3 = (
    log_loss(y, p) for p in (oof_prob_m1, oof_prob_m2, oof_prob_m3)
)
print(f"\nOOF logloss — M1(IMU): {oof_ll_1:.5f} | M2(Late): {oof_ll_2:.5f} | M3(Early): {oof_ll_3:.5f}")

# Optional: Temperature Scaling (on concatenated logits)
# Simplification: no temperature is fitted here; each model's scalar
# temperature is effectively left at 1 (for stability). Per-model scaling
# can be added if needed.
concat_logits = np.concatenate(
    [oof_logits_m1, oof_logits_m2, oof_logits_m3],
    axis=1,
)

# Ensemble (default: plain average of the three probability matrices).
oof_ens = (oof_prob_m1 + oof_prob_m2 + oof_prob_m3) / 3.0
oof_ll_ens = log_loss(y, oof_ens)
print(f"OOF logloss — Ensemble(avg): {oof_ll_ens:.5f}")

In [None]:

# =============================
# 6. Predict & Submission Files
# =============================
out_dir = Path('/kaggle/working') if os.path.exists('/kaggle') else Path('.')
out_dir.mkdir(parents=True, exist_ok=True)

# Save OOF diagnostics (max predicted probability per row, per pipeline).
pd.DataFrame({
    'oof_m1_maxprob': oof_prob_m1.max(axis=1),
    'oof_m2_maxprob': oof_prob_m2.max(axis=1),
    'oof_m3_maxprob': oof_prob_m3.max(axis=1),
}).to_csv(out_dir / 'oof_diagnostics.csv', index=False)

_SUBMISSION_ID_NAMES = ['id', 'ID', 'Id', 'sample_id', 'record_id', 'series_id']


def _recover_test_ids(n_rows):
    """Return an id Series of length ``n_rows`` for the submission files.

    ``test_df`` is checked first. Preprocessing strips id-like columns from
    it, so the raw test file is then re-read to recover the real ids. A
    0..n_rows-1 range is used only when neither source has a usable id column.
    """
    def _pick(frame):
        # Only accept a column whose length matches the predictions.
        if len(frame) != n_rows:
            return None
        for name in _SUBMISSION_ID_NAMES:
            if name in frame.columns:
                return frame[name].reset_index(drop=True)
        return None

    ids = _pick(test_df) if test_df is not None else None
    if ids is None and len(tests) > 0:
        ids = _pick(load_df(tests[0]))
    if ids is None:
        ids = pd.Series(range(n_rows), name='id')
    return ids


# Test predictions
if X_test_scaled is not None:
    prob_m1 = softmax_np(test_logits_m1)
    prob_m2 = softmax_np(test_logits_m2)
    prob_m3 = softmax_np(test_logits_m3)
    prob_ens = (prob_m1 + prob_m2 + prob_m3) / 3.0

    # Build the id / label columns following the usual Kaggle layout.
    test_id = _recover_test_ids(len(prob_ens))

    # Most probable class from the averaged ensemble.
    y_pred_cls = prob_ens.argmax(axis=1)
    sub_basic = pd.DataFrame({'id': test_id, 'label': y_pred_cls})
    sub_basic.to_csv(out_dir / 'submission.csv', index=False)
    print('Saved:', out_dir / 'submission.csv')

    # Requested format: also store the per-pipeline predictions
    # (class indices rather than probabilities).
    sub_p123 = pd.DataFrame({
        'id': test_id,
        'predict1': prob_m1.argmax(axis=1),
        'predict2': prob_m2.argmax(axis=1),
        'predict3': prob_m3.argmax(axis=1),
    })
    sub_p123.to_csv(out_dir / 'submission_predict123.csv', index=False)
    print('Saved:', out_dir / 'submission_predict123.csv')
else:
    print('test_df가 없어 제출 파일을 만들지 않았습니다.')