> __Purpose:__ This notebook implements our MAML functions and tests the episodic dataloader structure. Once everything is running, either this notebook or a copy of it will run hyperparameter optimization (HPO) for the MAML parameters. We include the MoE layer for now (although its placement in the network is probably far from ideal), and we stick with the CNN-MLP architecture because the CNN-LSTM architecture did not perform as well (the latter was probably not tuned sufficiently during its own HPO).

In [1]:
import os

import sys, copy, json, time, joblib
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

#code_dir.April_25.MOE. --> Said not to use this... code_dir isn't an actual package...
# Make sure I don't have files named the same thing...
from MOE_multimodal_model_classes import *
from MOE_quick_cls_heads import *
from MOE_training import *
from MOE_configs import *
from multimodal_data_processing import *  # Needed for load_multimodal_dataloaders()
from mamlpp import *
from maml_multimodal_dataloaders import *

current_directory = os.getcwd()
print(f"The current working directory is: {current_directory}")

# Make the parent directory importable; the `configs.*` and `utils.*` imports
# that follow are resolved through this entry.
# Guarded so that re-running the cell does not keep appending the same path.
parent_directory = os.path.abspath(os.path.join('..'))
if parent_directory not in sys.path:
    sys.path.append(parent_directory)
# sys.path.append never changes the CWD, so report the path that was added
# instead of printing the (unchanged) working directory a second time.
print(f"Parent directory on sys.path: {parent_directory}")

from configs.hyperparam_tuned_configs import *
from utils.DNN_FT_funcs import *
from utils.gesture_dataset_classes import *

from utils.global_seed import set_seed
# Call the project's seeding helper with its default seed (presumably seeds the
# RNGs used for training so runs are repeatable — confirm in utils.global_seed).
set_seed()

# Run identifier in YYYYmmdd_HHMM form; the model builder stores it in config["timestamp"].
# NOTE(review): `datetime` is not imported explicitly in this cell — presumably it
# arrives through one of the star imports above; confirm.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")

#############################################################

# ===================== OPTUNA TUNING SCRIPT =====================

# ====== (3) --- Objective: build model, pretrain, finetune, return metric ======
def build_maml_model(base_config):
    """Build the multimodal MoE classifier together with its fixed MAML config.

    Nothing is tuned here: every hyperparameter is a hand-picked constant. The
    settings are grouped by purpose and merged into the config one group at a
    time.

    Args:
        base_config: Dict of starting values. It is deep-copied, so the caller's
            dict is never mutated. Every key listed below overrides a key of the
            same name coming from ``base_config``.

    Returns:
        (model, config): the ``MultiModalMoEClassifier`` after ``model.to(device)``
        and the fully populated config dict.
    """
    config = copy.deepcopy(base_config)

    # The gate type decides whether the gate needs the user vector, so keep it in
    # a local and derive the dependent flag from it.
    # TODO: figure out a better user+demo+etc gating mechanism.
    gate_type = "feature_only"

    # ----- General experiment settings (from the old base config; some may be unused) -----
    config.update({
        "device": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        "cluster_iter_str": None,
        "feature_engr": "None",
        "time_steps": 1,
        "sequence_length": 64,
        "num_train_gesture_trials": 9,
        "num_ft_gesture_trials": 1,
        "num_pretrain_users": 24,
        "num_testft_users": 4,
        "padding": 0,
        "use_batch_norm": False,
        "timestamp": timestamp,
        "fc_dropout": 0.0,
        "num_classes": 10,
        "use_earlystopping": True,
        "reset_ft_layers": False,
        "verbose": False,
        "num_total_users": 32,
    })

    # ----- Model layout, MoE gate and user/demographic conditioning -----
    # TODO: the novel-user embedding is not trained anywhere in the current code;
    # it was a finetuning-only mechanism and its meta-learning counterpart is open.
    config.update({
        "user_emb_dim": 16,
        "num_experts": 8,
        "top_k": 3,
        "gate_type": gate_type,
        # Only a gate that looks at more than the features needs u_user.
        "gate_requires_u_user": gate_type != "feature_only",
        "use_u_init_warm_start": True,
        # Must stay True for now: with False there is no fallback way to learn
        # user embeddings (and the model may not even run — TODO confirm).
        "use_user_table": True,
        "demo_conditioning": 'concat',
        "u_user_and_demos": "mix",
        "gate_dense_before_topk": True,
        "mix_demo_u_alpha": 0.5,
    })

    # ----- Classification head and regularizers -----
    config.update({
        "head_type": "cosine",
        "init_tau": 8.0,
        "expert_dropout": 0.0,
        "label_smooth": 0.05,
        "gate_balance_coef": 0.05,
    })

    # ----- Outer-loop (meta) optimization -----
    # TODO: learning_rate is consumed by the meta-optimizer; unclear how it
    # interacts with LSLR and maml_alpha_init.
    config.update({
        "learning_rate": 1E-3,
        "weight_decay": 0.0,
        "optimizer": "adamw",
        "lr_scheduler_factor": 0.1,
        "lr_scheduler_patience": 4,
        "earlystopping_patience": 8,
        "earlystopping_min_delta": 0.005,
        "num_epochs": 35,
    })

    # ----- Multimodal encoders -----
    # NOTE: emb_dim must be divisible by groupnorm_num_groups or GroupNorm breaks.
    # Only stride patterns 2-2-1 and 2-1-1 are worth trying: the data is already
    # downsampled to 64 samples. (Open question: must EMG/IMU strides match so
    # the feature maps share a sequence length?)
    config.update({
        "groupnorm_num_groups": 8,   # TODO: is this groupnorm even used?
        "emb_dim": 64,               # TODO: is this the modality embedding dim?
        "emg_stride2": 1,
        "imu_stride2": 2,
        "emg_CNN_capacity_scaling": 1,
        "imu_CNN_capacity_scaling": 2,
        "multimodal": True,
        "emg_in_ch": 16,
        "imu_in_ch": 72,
        "demo_in_dim": 12,
    })

    # ----- Pooling, dropout and expert sizing -----
    config.update({
        "pool_mode": 'max',
        "pdrop": 0.0,                # TODO: dropout, but unclear where it is applied
        "mixture_mode": 'logits',    # 'logits' | 'probs' | 'logprobs' (probably not implemented)
        "moddrop_p": 0.0,            # described as "probability to drop IMU at train time"
        "demo_emb_dim": 16,
        "expert_bigger": False,      # if True, widen the Expert hidden layer
        "expert_bigger_mult": 2,
        "log_each_pid_results": False,
    })

    # ----- MAML / MAML++ -----
    # TODO: do episodes_per_batch_train and episodes_per_epoch_train need to be
    # multiples of each other? Does q_query need to be 9?
    config.update({
        "meta_learning": True,
        "n_way": 10,
        "k_shot": 1,
        "q_query": 9,
        "episodes_per_batch_train": 12,      # meta-learning batch size
        "episodes_per_epoch_train": 1000,    # cap on the number of tasks per epoch
        "num_workers": 0,
        "maml_inner_steps": 3,
        "maml_second_order": True,                       # enables second order once DOA switches on
        "maml_first_order_to_second_order_epoch": 50,    # DOA threshold (epochs <= this are first-order)
        "maml_use_msl": True,                            # multi-step loss on
        "maml_msl_num_epochs": 80,                       # MSL for the first N epochs, then final-step loss only
        "maml_use_lslr": False,                          # learn per-parameter, per-step inner LRs
        "maml_alpha_init": 1E-3,                         # inner-loop LR fallback (also eval alpha without LSLR)
        "enable_inner_loop_optimizable_bn_params": False,  # do NOT adapt BN in the inner loop
        "maml_inner_steps_eval": 3,
        "maml_alpha_init_eval": 1E-3,
        "use_cosine_outer_lr": False,                    # cosine LR annealing for the outer loop
        "use_lslr_at_eval": False,                       # True -> use learned per-parameter step sizes at eval
    })

    # ----- Finetuning, backbone and data locations -----
    # The ft_* optimizer settings, batch_size / ft_batch_size and the LSTM keys
    # are deliberately left unset in this builder.
    # NOTE: the train/val/test participant split is not configured here; per the
    # original author it is hardcoded inside load_multimodal_data_loaders.
    config.update({
        "finetune_strategy": "full",    # TODO: probably unused with MAML
        "num_ft_epochs": 15,            # TODO: probably has no effect with MAML
        "temporal_backbone": "none",    # "none" (TCN-only) | "lstm"
        "use_supportquery_for_ft": True,
        "emg_imu_pkl_full_path": 'C:\\Users\\kdmen\\Box\\Yamagami Lab\\Data\\Meta_Gesture_Project\\filtered_datasets\\metadata_IMU_EMG_allgestures_allusers.pkl',
        "pwmd_xlsx_filepath": "C:\\Users\\kdmen\\Repos\\fl-gestures\\Biosignal gesture questionnaire for participants with disabilities.xlsx",
        "pwoutmd_xlsx_filepath": "C:\\Users\\kdmen\\Repos\\fl-gestures\\Biosignal gesture questionnaire for participants without disabilities.xlsx",
        "dfs_save_path": "C:\\Users\\kdmen\\Repos\\fl-gestures\\April_25\\MOE\\full_datasplit_dfs\\",
        "dfs_load_path": "C:\\Users\\kdmen\\Repos\\fl-gestures\\April_25\\MOE\\full_datasplit_dfs\\Initial_Multimodal\\",
        "saved_df_timestamp": '20250917_1217',
    })

    # ----- Build model -----
    model = MultiModalMoEClassifier(config)

    # Push the configured dropout probability into every expert that has an
    # nn.Dropout layer as its `drop` attribute.
    for expert in model.experts:
        dropout_layer = expert.drop
        if isinstance(dropout_layer, nn.Dropout):
            dropout_layer.p = config["expert_dropout"]

    # Cosine head: use the model's own method (the multimodal version). The free
    # function swap_expert_head_to_cosine(model, emb_dim=..., ...) is the
    # non-multimodal version and is not used here.
    if config["head_type"] == "cosine":
        model.swap_expert_head_to_cosine(init_tau=config["init_tau"])

    model.to(config["device"])
    return model, config

#############################################################

# Load in / create the dataloaders
# --------- base config ----------
base_config = {}  # start empty (previously: emg_moments_only_MOE_config); the builder sets every key it needs
# ---- Build the model and its config ----
model, config = build_maml_model(base_config)
# config["device"] is a torch.device; comparing it to the string "cpu" is not a
# reliable test, so check its .type to make sure the warning actually fires.
if config["device"].type == "cpu":
    print("HPO is happening on the CPU! Probably ought to switch to GPU!")



The current working directory is: c:\Users\kdmen\Repos\fl-gestures\April_25\MOE
CWD after sys path append: c:\Users\kdmen\Repos\fl-gestures\April_25\MOE
DNN_FT_funcs.py: The current working directory is: c:\Users\kdmen\Repos\fl-gestures\April_25\MOE
Global seed set to 17


In [2]:
# ---- Episodic data for meta-training, validation and test ----
# The loader call is unpacked into three episodic loaders (train / val / test).
# The older six-loader unpacking (support/query loaders per split) is kept below
# for reference only.
#train_loader, val_loader, ft_loader, novel_test_loader, test_support_dl, test_query_dl = load_multimodal_data_loaders(config, load_existing_dfs=True)
episodic_train_loader, episodic_val_loader, episodic_test_loader = load_multimodal_data_loaders(config, load_existing_dfs=True)

In [None]:
# Meta-pretraining: train on the episodic train loader, validate on the episodic val loader.
# Returns the trained model plus a result dict.
pretrained_model, pretrain_res_dict = MAMLpp_pretrain(model, config, episodic_train_loader, episodic_val_loader=episodic_val_loader)
# Pull out the entries used for reporting; presumably the best validation
# accuracy seen and the matching weights — confirm in mamlpp.
best_val_acc = pretrain_res_dict["best_val_acc"]
best_state   = pretrain_res_dict["best_state"]


In [None]:
# Regroup the val/test episodic loaders per participant. The cells below iterate
# the result as {pid: (user_val_loader, user_test_loader)}, where either loader may be None.
user_loaders = make_user_loaders_from_dataloaders(episodic_val_loader, episodic_test_loader, config)


In [None]:
# Sanity check: for every participant print two 1/0 flags — whether a
# val (fine-tune) loader exists and whether a test loader exists.
for pid, (user_val_epi_dl, user_test_epi_dl) in user_loaders.items():
    ft = int(user_val_epi_dl is not None)
    tst = int(user_test_epi_dl is not None)
    print(f"{ft}, {tst}")

In [None]:
# Adapt on each test user's support set and score on the query set, collecting
# one accuracy per evaluated episode.
user_accs = []

for pid, (user_val_epi_dl, user_test_epi_dl) in user_loaders.items():
    # Users without a test loader cannot be scored.
    if user_test_epi_dl is None:
        continue

    # For FixedOneShotPerUserIterable, each loader will usually yield exactly 1 episode.
    for episode in user_test_epi_dl:
        # NOTE(review): this evaluates `model`, not the `pretrained_model` returned by
        # MAMLpp_pretrain, and `best_state` is never loaded — confirm that is intended.
        result = mamlpp_finetune_and_eval(
            model=model,
            config=config,
            support_batch=episode["support"],
            query_batch=episode["query"],
        )
        user_accs.append(result["acc"])
        # optionally store result["adapted_params"] per pid

if user_accs:
    mean_acc = float(np.mean(user_accs))
else:
    # np.mean([]) would return NaN with only a generic RuntimeWarning; keep the
    # NaN result but say why it happened.
    mean_acc = float("nan")
    print("No test episodes were evaluated (no user had a test loader, or every loader was empty).")
print(user_accs)
print(f"Mean acc: {mean_acc*100:.2f}%")

In [None]:
# Always raises AssertionError. Presumably a deliberate stop so that "Run All"
# halts before the Optuna HPO cells below — confirm before removing.
assert(False)

In [3]:
import os
# Debugging switches, set through environment variables; presumably intended to
# make CUDA errors surface at the failing call and to include C++ stack traces
# in PyTorch errors.
# NOTE(review): if torch/CUDA was already initialised by an earlier cell in this
# kernel, these may be set too late to have any effect — confirm, or set them
# before the first torch import.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_SHOW_CPP_STACKTRACES"] = "1"

In [4]:
import warnings
from optuna.exceptions import ExperimentalWarning

# Suppress Optuna's ExperimentalWarning for the rest of the session (presumably
# raised by the experimental sampler options used in the study — confirm).
warnings.filterwarnings("ignore", category=ExperimentalWarning)

In [5]:
import sys, copy, json, time, joblib
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

from MOE_multimodal_model_classes import *
from MOE_quick_cls_heads import *
from MOE_training import *
from MOE_configs import *
from multimodal_data_processing import *  # Needed for load_multimodal_dataloaders()
from mamlpp import *
from maml_multimodal_dataloaders import *

current_directory = os.getcwd()
print(f"The current working directory is: {current_directory}")

# Make the parent directory importable; the `configs.*` and `utils.*` imports
# that follow are resolved through this entry.
# Guarded so that re-running the cell does not keep appending the same path.
parent_directory = os.path.abspath(os.path.join('..'))
if parent_directory not in sys.path:
    sys.path.append(parent_directory)
# sys.path.append never changes the CWD, so report the path that was added
# instead of printing the (unchanged) working directory a second time.
print(f"Parent directory on sys.path: {parent_directory}")

from configs.hyperparam_tuned_configs import *
from utils.DNN_FT_funcs import *
from utils.gesture_dataset_classes import *

from utils.global_seed import set_seed
# Call the project's seeding helper with its default seed (presumably seeds the
# RNGs used for training so runs are repeatable — confirm in utils.global_seed).
set_seed()

# Run identifier in YYYYmmdd_HHMM form; the model builder stores it in config["timestamp"].
# NOTE(review): `datetime` is not imported explicitly in this cell — presumably it
# arrives through one of the star imports above; confirm.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")



The current working directory is: c:\Users\kdmen\Repos\fl-gestures\April_25\MOE
CWD after sys path append: c:\Users\kdmen\Repos\fl-gestures\April_25\MOE
Global seed set to 17


## Optuna Hyperparameter Optimization Run

In [6]:
# ===================== OPTUNA TUNING SCRIPT =====================

# ====== (3) --- Objective: build model, pretrain, finetune, return metric ======
def build_model_from_trial(trial, base_config=None):
    """Sample one Optuna configuration and build the matching multimodal MoE model.

    Fixed settings are written straight into the config; tunable ones are drawn
    from ``trial``. The Optuna parameter name is the first argument of each
    ``trial.suggest_*`` call.

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters.
        base_config: Optional dict of starting values. It is deep-copied, so the
            caller's dict is not mutated; keys set below override it.

    Returns:
        (model, config): the ``MultiModalMoEClassifier`` after ``model.to(device)``
        and the config dict it was built from.
    """
    config = {} if base_config is None else copy.deepcopy(base_config)

    # ----- Fixed experiment settings -----
    config["device"] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    config["feature_engr"] = "None"
    config["time_steps"] = 1
    config["sequence_length"] = 64
    config["num_train_gesture_trials"] = 9
    config["num_ft_gesture_trials"] = 1
    config["num_pretrain_users"] = 24
    config["num_testft_users"] = 4
    config["padding"] = 0
    config["use_batch_norm"] = False
    config["timestamp"] = timestamp
    config["fc_dropout"] = 0.0
    config["num_classes"] = 10
    config["use_earlystopping"] = True
    config["reset_ft_layers"] = False
    config["verbose"] = False
    config["num_total_users"] = 32

    # ----- Model layout hyperparams -----
    config["user_emb_dim"] = trial.suggest_int("user_emb_dim", 12, 48)
    config["num_experts"] = trial.suggest_int("num_experts", 2, 10)
    top_k = trial.suggest_categorical("top_k", [None, 1, 2, 3])
    # num_experts can be sampled as 2 while top_k is sampled as 3. Selecting more
    # experts than exist is not meaningful (and a top-k selection over the gate
    # scores would presumably fail), so cap top_k at the number of experts.
    if top_k is not None and top_k > config["num_experts"]:
        top_k = config["num_experts"]
    config["top_k"] = top_k

    # ----- Gate choice -----
    config["gate_type"] = trial.suggest_categorical("gate_type", ["user_aware", "feature_only", "user_only", "film"])  # "bilinear" disabled
    # Only a feature-only gate can run without the user vector.
    config["gate_requires_u_user"] = config["gate_type"] != "feature_only"
    # Must stay True for now: False is broken because WithUserOverride does not
    # accept None as the init vector.
    config["use_u_init_warm_start"] = True
    config["gate_dense_before_topk"] = True

    # ----- Head choice -----
    config["head_type"] = trial.suggest_categorical("head_type", ["linear", "cosine"])
    if config["head_type"] == "cosine":
        config["init_tau"] = 5.0

    # ----- Dropout / regularizers (fixed) -----
    config["expert_dropout"] = 0.25
    config["label_smooth"] = 0.1
    config["gate_balance_coef"] = 0.1

    # ----- Pretraining (outer-loop) optimizer -----
    config["learning_rate"] = trial.suggest_float("pre_lr", 5e-6, 5e-4, log=True)
    config["weight_decay"] = trial.suggest_float("pre_wd", 1e-6, 3e-3, log=True)
    config["optimizer"] = "adamw"
    config["lr_scheduler_factor"] = 0.1
    config["lr_scheduler_patience"] = 6
    config["earlystopping_patience"] = 8
    config["earlystopping_min_delta"] = 0.005

    # ----- Finetuning regime -----
    # TODO: confirm whether the ft_* values are consumed at all by the MAML path;
    # if not, ft_lr / ft_wd only add noise dimensions to the search.
    config["finetune_strategy"] = 'adaptation'
    config["use_dropout_during_peft"] = False
    config["ft_learning_rate"] = trial.suggest_float("ft_lr", 1e-4, 5e-2, log=True)
    config["ft_weight_decay"] = trial.suggest_float("ft_wd", 1e-6, 5e-3, log=True)
    config["ft_lr_scheduler_factor"] = 0.1
    config["ft_lr_scheduler_patience"] = 4
    config["ft_earlystopping_patience"] = 10
    config["ft_earlystopping_min_delta"] = 0.008

    # Batch sizes.
    # TODO: confirm these have no effect with MAML (episode counts should control batching).
    config["batch_size"] = 128
    config["ft_batch_size"] = 10

    # User table (important for novel users). Must stay True: with False there is
    # currently no fallback way to learn user embeddings.
    config["use_user_table"] = True

    # ----- Multimodal encoders -----
    # NOTE: emb_dim must be divisible by groupnorm_num_groups or GroupNorm breaks;
    # every emb_dim choice below is divisible by every group-count choice.
    config["groupnorm_num_groups"] = trial.suggest_categorical("groupnorm_num_groups", [4, 6, 8, 12])
    # EMG and IMU share one embedding size: simpler network, and it avoids an
    # oversized IMU embedding dominating the EMG one.
    config["emb_dim"] = trial.suggest_categorical("emb_dim", [72, 96, 120, 192, 216, 288, 360])

    # Only stride patterns 2-2-1 and 2-1-1 are worth trying: the data is already
    # downsampled to 64 samples. (Open question: must EMG/IMU strides match so
    # the feature maps share a sequence length?)
    config['emg_stride2'] = trial.suggest_int("emg_stride2", 1, 2)
    config['imu_stride2'] = trial.suggest_int("imu_stride2", 1, 2)

    config['emg_CNN_capacity_scaling'] = trial.suggest_categorical("emg_CNN_capacity_scaling", [1, 2, 3])
    config['imu_CNN_capacity_scaling'] = trial.suggest_categorical("imu_CNN_capacity_scaling", [1, 2, 3, 4, 5])

    config["multimodal"] = True
    config['emg_in_ch'] = 16
    config['imu_in_ch'] = 72
    config['demo_in_dim'] = 12
    config['num_epochs'] = 35
    config['num_ft_epochs'] = 15

    config["mix_demo_u_alpha"] = 0.5

    # ----- MAML / MAML++ -----
    config["meta_learning"] = True
    config["n_way"] = 10
    config["k_shot"] = 1
    config["q_query"] = 9  # TODO: does this need to be 9? Lower may be faster but noisier.
    # NOTE(review): the sampled batch size (up to 500) can exceed the sampled
    # episodes-per-epoch cap (as low as 100) — confirm the loader handles that.
    config["episodes_per_batch_train"] = trial.suggest_categorical("episodes_per_batch_train", [10, 100, 500])  # meta-learning batch size
    config["episodes_per_epoch_train"] = trial.suggest_categorical("episodes_per_epoch_train", [100, 1000, 5000])  # cap on tasks per epoch
    config["num_workers"] = 0
    config["maml_inner_steps"] = trial.suggest_int("maml_inner_steps", 1, 5)
    config["maml_second_order"] = trial.suggest_categorical("maml_second_order", [True, False])  # enables second order once DOA switches on
    config["maml_first_order_to_second_order_epoch"] = trial.suggest_categorical("maml_first_order_to_second_order_epoch", [10, 30, 60, 100])  # DOA threshold (epochs <= this are first-order)
    config["maml_use_msl"] = trial.suggest_categorical("maml_use_msl", [True, False])  # multi-step loss
    config["maml_msl_num_epochs"] = trial.suggest_categorical("maml_msl_num_epochs", [10, 30, 60, 100])  # MSL for the first N epochs, then final-step loss only
    config["maml_use_lslr"] = trial.suggest_categorical("maml_use_lslr", [True, False])  # learn per-parameter, per-step inner LRs
    config["maml_alpha_init"] = 1E-3  # inner-loop LR fallback (also eval alpha if LSLR is not used at eval)
    config["enable_inner_loop_optimizable_bn_params"] = False  # do NOT adapt BN in the inner loop
    # Eval-time adaptation
    config["maml_inner_steps_eval"] = trial.suggest_int("maml_inner_steps_eval", 1, 5)
    config["maml_alpha_init_eval"] = 1E-3
    config["use_cosine_outer_lr"] = False  # cosine LR annealing for the outer loop
    config["use_lslr_at_eval"] = False  # True -> use learned per-parameter step sizes at eval

    # ----- Pooling, dropout, demographics and expert sizing -----
    config["pool_mode"] = trial.suggest_categorical("pool_mode", ['avg', 'max', 'avgmax'])
    config["pdrop"] = 0.1  # TODO: unclear where this dropout is applied
    config["mixture_mode"] = 'logits'  # 'logits' | 'probs' | 'logprobs' (probably not implemented)
    config["moddrop_p"] = 0.15  # described as "probability to drop IMU at train time"
    config["demo_emb_dim"] = 16
    config["demo_conditioning"] = trial.suggest_categorical("demo_conditioning", ['concat', 'film'])
    config["expert_bigger"] = False  # if True, widen the Expert hidden layer
    config["expert_bigger_mult"] = 2
    config["u_user_and_demos"] = trial.suggest_categorical("u_user_and_demos", ["demo", "mix", "u_user"])
    config['log_each_pid_results'] = False

    # ----- Temporal backbone (the LSTM variant is not used in this notebook) -----
    config["temporal_backbone"] = "none"  # "none" (TCN-only) | "lstm"

    # ----- Data locations -----
    config["use_supportquery_for_ft"] = True
    config["emg_imu_pkl_full_path"] = 'C:\\Users\\kdmen\\Box\\Yamagami Lab\\Data\\Meta_Gesture_Project\\filtered_datasets\\metadata_IMU_EMG_allgestures_allusers.pkl'
    config["pwmd_xlsx_filepath"] = "C:\\Users\\kdmen\\Repos\\fl-gestures\\Biosignal gesture questionnaire for participants with disabilities.xlsx"
    config["pwoutmd_xlsx_filepath"] = "C:\\Users\\kdmen\\Repos\\fl-gestures\\Biosignal gesture questionnaire for participants without disabilities.xlsx"
    config["dfs_save_path"] = "C:\\Users\\kdmen\\Repos\\fl-gestures\\April_25\\MOE\\full_datasplit_dfs\\"
    config["dfs_load_path"] = "C:\\Users\\kdmen\\Repos\\fl-gestures\\April_25\\MOE\\full_datasplit_dfs\\Initial_Multimodal\\"
    config["saved_df_timestamp"] = '20250917_1217'

    # ----- Build model -----
    model = MultiModalMoEClassifier(config)

    # Push the configured dropout probability into every expert that has an
    # nn.Dropout layer as its `drop` attribute.
    for expert in model.experts:
        if isinstance(expert.drop, nn.Dropout):
            expert.drop.p = config["expert_dropout"]

    # Cosine head: call the model's own method, the same call build_maml_model
    # makes for this model class. The free function
    # swap_expert_head_to_cosine(model, emb_dim=..., num_classes=..., init_tau=...)
    # is the non-multimodal variant and should not be applied to a
    # MultiModalMoEClassifier.
    if config["head_type"] == "cosine":
        model.swap_expert_head_to_cosine(init_tau=config["init_tau"])

    model.to(config["device"])
    return model, config




In [7]:
def objective(trial):
    """Optuna objective: meta-pretrain one sampled model and score it on the test users.

    Steps: build the model/config from ``trial``, load the episodic loaders,
    run MAML++ pretraining, then for every user that has a test loader adapt on
    each episode's support set and evaluate on its query set.

    Args:
        trial: Optuna trial supplying the hyperparameters.

    Returns:
        Mean per-episode accuracy over all evaluated test episodes (the value
        the study maximizes). NaN if no episode could be evaluated.
    """
    # ---- Build model + config for this trial ----
    model, config = build_model_from_trial(trial)

    print(f"CONFIG[GATE_TYPE]: {config['gate_type']}")

    # config["device"] is a torch.device; comparing it to the string "cpu" is not
    # a reliable test, so check its .type to make sure the warning actually fires.
    if config["device"].type == "cpu":
        print("HPO is happening on the CPU! Probably ought to switch to GPU!")

    # ---- Episodic data ----
    episodic_train_loader, episodic_val_loader, episodic_test_loader = load_multimodal_data_loaders(config, load_existing_dfs=True)

    # ---- Meta pretraining ----
    pretrained_model, pretrain_res_dict = MAMLpp_pretrain(model, config, episodic_train_loader, episodic_val_loader=episodic_val_loader)
    best_val_acc = pretrain_res_dict["best_val_acc"]

    # Intermediate reporting for pruning is intentionally disabled: the
    # pretraining score is not the quantity being optimized.
    #trial.report(best_val_acc, step=0)
    #if trial.should_prune():
    #    raise optuna.TrialPruned()

    user_loaders = make_user_loaders_from_dataloaders(episodic_val_loader, episodic_test_loader, config)

    user_accs = []
    for pid, (user_val_epi_dl, user_test_epi_dl) in user_loaders.items():
        # Users without a test loader cannot be scored.
        if user_test_epi_dl is None:
            continue

        # For FixedOneShotPerUserIterable, each loader will usually yield exactly 1 episode.
        for episode in user_test_epi_dl:
            # NOTE(review): this evaluates `model`, not the `pretrained_model`
            # returned above, and pretrain_res_dict["best_state"] is never
            # loaded — confirm that is intended.
            result = mamlpp_finetune_and_eval(
                model=model,
                config=config,
                support_batch=episode["support"],
                query_batch=episode["query"],
            )
            user_accs.append(result["acc"])
            # optionally store result["adapted_params"] per pid

    if user_accs:
        mean_acc = float(np.mean(user_accs))
    else:
        # np.mean([]) would return NaN with only a generic RuntimeWarning; keep
        # the NaN result but say why it happened.
        mean_acc = float("nan")
        print("No test episodes were evaluated (no user had a test loader, or every loader was empty).")
    print(user_accs)
    print(f"Mean acc: {mean_acc*100:.2f}%")

    # ---- Log ancillary info for analysis ----
    trial.set_user_attr("pretrain_val_acc", float(best_val_acc))
    trial.set_user_attr("per_user_accs", user_accs)

    return mean_acc


In [8]:
# ====== (4) --- Main: study, sampler, pruner & run ======
def run_study(study_name="multimodal_moe_ft_HPO", storage=None, n_trials=2):
    """Create (or resume) an Optuna study, optimize `objective`, and report the best trial.

    Args:
        study_name: Name passed to ``optuna.create_study``.
        storage: Optuna storage URL (e.g. ``"sqlite:///optuna_moe.db"``).
            Passed through unchanged; ``None`` is the default.
        n_trials: Number of trials handed to ``study.optimize``.

    Returns:
        The ``optuna.Study`` object after optimization. It is returned even
        when no trial completed, so callers can still inspect or persist it.

    NOTE(review): the ``trial.report`` / ``trial.should_prune`` calls visible
    in ``objective`` are commented out, so the MedianPruner configured here
    presumably never fires -- confirm before relying on pruning.
    """
    sampler = TPESampler(n_startup_trials=12, multivariate=True, group=True)
    pruner = MedianPruner(n_startup_trials=8, n_warmup_steps=0)

    study = optuna.create_study(
        study_name=study_name,
        direction="maximize",
        sampler=sampler,
        pruner=pruner,
        storage=storage,          # e.g., "sqlite:///optuna_moe.db"
        load_if_exists=True,      # resume instead of failing if the name already exists in storage
    )
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True)

    # `best_trial` raises ValueError when the study holds no completed trial
    # (e.g. n_trials=0). Report that instead of crashing and losing the
    # study object.
    try:
        bt = study.best_trial
    except ValueError:
        print("No completed trials; nothing to report.")
        return study

    print("Best trial:")
    print("  value (finetune acc):", bt.value)
    print("  params:")
    for k, v in bt.params.items():
        print(f"    {k}: {v}")
    print("  pretrain_val_acc:", bt.user_attrs.get("pretrain_val_acc"))

    return study


In [None]:
# Launch the HPO study with run_study()'s default arguments and keep the
# returned Study in `study_res` for the analysis cells that follow.
# NOTE(review): the "X trials took Y minutes" placeholder was never filled in;
# the captured output below shows roughly 1640-2650 s per pretraining epoch.

if __name__ == "__main__":
    study_res = run_study()

[I 2025-11-20 20:51:40,710] A new study created in memory with name: multimodal_moe_ft_HPO


CONFIG[GATE_TYPE]: user_aware
MAML Pretraining: Epoch 1 of 35
Checking params to see if they are causing blow up:
meta_opt.param_groups: 1
lslr_params: 0
Train completed in 1638.28s
Train loss/acc: 2.4111, 12.22%
Val completed in 1.36s
Val loss/acc: 2.3488, 12.78%
Epoch completed in 1639.66s

MAML Pretraining: Epoch 2 of 35
Checking params to see if they are causing blow up:
meta_opt.param_groups: 1
lslr_params: 0
Train completed in 1643.55s
Train loss/acc: 2.3810, 12.86%
Val completed in 1.14s
Val loss/acc: 2.3504, 13.06%
Epoch completed in 1644.69s

MAML Pretraining: Epoch 3 of 35
Checking params to see if they are causing blow up:
meta_opt.param_groups: 1
lslr_params: 0
Train completed in 1635.13s
Train loss/acc: 2.3813, 12.89%
Val completed in 1.40s
Val loss/acc: 2.3327, 14.44%
Epoch completed in 1636.54s

MAML Pretraining: Epoch 4 of 35
Checking params to see if they are causing blow up:
meta_opt.param_groups: 1
lslr_params: 0
Train completed in 1637.06s
Train loss/acc: 2.3814, 13

  param_grad = param.grad
  param_grad = param.grad


Val completed in 1.02s
Val loss/acc: 2.3226, 20.28%
Epoch completed in 2643.48s

MAML Pretraining: Epoch 12 of 35
Checking params to see if they are causing blow up:
meta_opt.param_groups: 1
lslr_params: 0
Train completed in 2644.58s
Train loss/acc: 2.2428, 19.94%


  param_grad = param.grad
  param_grad = param.grad


Val completed in 1.70s
Val loss/acc: 2.3266, 19.44%
Epoch completed in 2646.28s

MAML Pretraining: Epoch 13 of 35
Checking params to see if they are causing blow up:
meta_opt.param_groups: 1
lslr_params: 0
Train completed in 2623.21s
Train loss/acc: 2.2097, 22.61%


  param_grad = param.grad
  param_grad = param.grad


Val completed in 1.08s
Val loss/acc: 2.3208, 23.33%
Epoch completed in 2624.30s

MAML Pretraining: Epoch 14 of 35
Checking params to see if they are causing blow up:
meta_opt.param_groups: 1
lslr_params: 0


In [None]:
# Persist the Study object to disk, with the run timestamp in the filename.
# NOTE(review): the destination is an absolute, machine-specific Windows path;
# presumably the directory must already exist -- confirm before running elsewhere.
joblib.dump(study_res, f"C:\\Users\\kdmen\\Repos\\fl-gestures\\April_25\\results\\MOE\\{timestamp}_MultiModalMOE_optuna_study.pkl")


In [None]:
# Print the best trial's objective value followed by each of its sampled
# hyperparameters, one per line.
print("Best trial:") 
trial = study_res.best_trial
print(f"Value: {trial.value}")
print("Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
from optuna.visualization import plot_optimization_history, plot_param_importances


In [None]:
# Render the optimization-history figure for the finished study.
plot_optimization_history(study_res).show()


In [None]:
# Render the hyperparameter-importance figure for the finished study.
plot_param_importances(study_res).show()

In [None]:
# Parallel-coordinate figure over the study; as the cell's last expression
# the returned figure is what the notebook displays.
optuna.visualization.plot_parallel_coordinate(study_res)

In [None]:
# Slice plot for the embedding-size, CNN-capacity-scaling and pooling params.
# NOTE(review): these names must match what objective() suggests (not visible
# here) -- confirm they exist in this MAML study.
optuna.visualization.plot_slice(study_res, params=['emb_dim', 'user_emb_dim', 'emg_CNN_capacity_scaling', 'imu_CNN_capacity_scaling', 'pool_mode'])

In [None]:
# Slice plot for the mixture-of-experts params (expert count, top-k, gate type).
# NOTE(review): confirm these names are actually sampled by objective().
optuna.visualization.plot_slice(study_res, params=['num_experts', 'top_k', 'gate_type'])

In [None]:
# Slice plot for the two fine-tuning strategy params.
# NOTE(review): confirm these names are actually sampled by objective().
optuna.visualization.plot_slice(study_res, params=['alt_or_seq_MOE_user_emb_ft', 'finetune_strategy'])

In [None]:
# Slice plot for the pretraining (pre_*) and fine-tuning (ft_*) lr / wd params.
# NOTE(review): confirm these names are actually sampled by objective().
optuna.visualization.plot_slice(study_res, params=['pre_lr', 'pre_wd', 'ft_lr', 'ft_wd'])

In [None]:
#optuna.visualization.plot_slice(study_res, params=['pre_sched_factor', 'pre_sched_pat', 'pre_es_pat', 'pre_es_delta'])


In [None]:
#optuna.visualization.plot_slice(study_res, params=['ft_sched_factor', 'ft_sched_pat', 'ft_es_pat', 'ft_es_delta'])

In [None]:
import optuna
import pandas as pd
from optuna.trial import TrialState
from optuna.study import StudyDirection

def top_k_trials(study: optuna.Study, k: int = 5, target_attr: str | None = None):
    """Return the best `k` completed trials of `study`.

    Only trials whose state is ``TrialState.COMPLETE`` are considered.

    Ranking:
      * ``target_attr is None`` -- rank by the objective value, best first
        according to the study direction (descending for MAXIMIZE, ascending
        otherwise).
      * otherwise -- rank by ``trial.user_attrs[target_attr]`` in descending
        order; trials missing the attribute are ranked as ``-inf``.

    Returns:
        A list with at most `k` trials (the first `k` of the ranked list).
    """
    completed = []
    for candidate in study.get_trials(deepcopy=False):
        if candidate.state == TrialState.COMPLETE:
            completed.append(candidate)

    if target_attr is None:
        # Objective value: sort order follows the study's direction.
        descending = study.direction == StudyDirection.MAXIMIZE
        ranked = sorted(completed, key=lambda tr: tr.value, reverse=descending)
    else:
        # User attribute: always treated as "higher is better".
        missing = float("-inf")
        ranked = sorted(
            completed,
            key=lambda tr: tr.user_attrs.get(target_attr, missing),
            reverse=True,
        )

    return ranked[:k]

def summarize_top_k(study: optuna.Study, k: int = 5, target_attr: str | None = None):
    """Tabulate the top-k completed trials and aggregate their hyperparameters.

    Args:
        study: Study to summarize.
        k: Maximum number of trials to include (forwarded to ``top_k_trials``).
        target_attr: Optional user-attribute name to rank by instead of the
            objective value (forwarded to ``top_k_trials``).

    Returns:
        A 4-tuple ``(df, numeric, categorical, exact_agreement)``:
          * ``df`` -- DataFrame indexed by trial number with the objective
            value, duration in seconds, ``param.*`` and ``attr.*`` columns
            (plus ``target:<attr>`` when `target_attr` is given), sorted best
            first.
          * ``numeric`` -- median of each parameter whose values are all
            int/float (bools excluded).
          * ``categorical`` -- mode of every other parameter.
          * ``exact_agreement`` -- per parameter, whether all selected trials
            that have it share one value.
        When no trial is selected, an empty frame and three empty dicts are
        returned.
    """
    top = top_k_trials(study, k, target_attr=target_attr)

    if not top:
        # With no rows the frame has no "trial" column, so set_index /
        # sort_values below would raise KeyError. Return an empty summary.
        columns = ["trial", "objective_value", "duration_s"]
        if target_attr is not None:
            columns.append(f"target:{target_attr}")
        return pd.DataFrame(columns=columns).set_index("trial"), {}, {}, {}

    rows = []
    for t in top:
        row = {
            "trial": t.number,
            "objective_value": t.value,  # whatever objective() returned
            "duration_s": (t.datetime_complete - t.datetime_start).total_seconds(),
        }
        if target_attr is not None:
            row[f"target:{target_attr}"] = t.user_attrs.get(target_attr)
        # Comprehension variable named `name` so it does not shadow parameter `k`.
        row.update({f"param.{name}": v for name, v in t.params.items()})
        row.update({f"attr.{name}": v for name, v in t.user_attrs.items()})
        rows.append(row)

    df = pd.DataFrame(rows).set_index("trial")

    # Sort display
    if target_attr is None:
        ascending = (study.direction == StudyDirection.MINIMIZE)
        df = df.sort_values("objective_value", ascending=ascending)
    else:
        df = df.sort_values(f"target:{target_attr}", ascending=False)

    # Union of all params across top-k; sorted so the result dicts have a
    # reproducible key order (plain set iteration order varies between runs).
    all_params = sorted({name for t in top for name in t.params})

    # Medians/modes and exact agreement across the trials that *have* the param.
    numeric, categorical, exact_agreement = {}, {}, {}
    for p in all_params:
        vals = [t.params[p] for t in top if p in t.params]
        # bool is a subclass of int: exclude it so True/False categorical
        # choices are summarized by mode rather than by a median.
        is_numeric = all(
            isinstance(v, (int, float)) and not isinstance(v, bool) for v in vals
        )
        if is_numeric:
            numeric[p] = pd.Series(vals).median()
        else:
            categorical[p] = pd.Series(vals).mode().iloc[0]
        exact_agreement[p] = len(set(vals)) == 1

    return df, numeric, categorical, exact_agreement


In [None]:
# Summarize the five best completed trials by objective value; the three
# aggregate dicts (medians, modes, exact) are printed in the next cell.
df_top5, medians, modes, exact = summarize_top_k(study_res, k=5)

print("Top-5 trials:")
df_top5.head()


In [None]:
# Print the aggregates returned by summarize_top_k; the last line keeps only
# the parameters whose agreement flag is truthy.
print("\nNumeric parameter medians (top-5):", medians)
print("Categorical parameter modes (top-5):", modes)
print("Exact agreement across all top-5:", {p: a for p, a in exact.items() if a})

In [None]:
# Dump everything recorded on the best trial, for a quick sanity check.
t = study_res.best_trial
print("direction:", study_res.direction)
print("value:", t.value)                  # <- this is the metric you returned
print("params:", t.params)
print("user_attrs:", t.user_attrs)        # <- anything you stored manually
print("intermediate values:", t.intermediate_values)  # pruning steps, if any


In [None]:
from optuna.visualization import plot_contour

# Each entry maps a label to the hyperparameters drawn together in one
# contour figure.
groups = {
    "rep": ["alt_or_seq_MOE_user_emb_ft", "finetune_strategy", "num_experts", "top_k"],
}
for name, params in groups.items():
    print(f"{name}: {params}")
    fig = plot_contour(study_res, params=params)
    # Resize and show inside the loop so every group gets its own figure.
    # Previously these ran once after the loop, so only the last group was
    # displayed and an empty `groups` raised NameError.
    fig.update_layout(width=1200, height=1400)  # 2–3× taller/wider
    fig.show()