<a href="https://colab.research.google.com/github/kotaro-desu/google-colab/blob/main/Untitled40.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna timm

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [2]:
#!/usr/bin/env python3
"""
EfficientNetV2-S Multi-Output Binary Classification with Optuna (Dataset v5 - Fixed Resolution)
Predicts rainfall presence (Rain/Clear) for 9 regions (3x3 grid) simultaneously.

[V5 Changes Applied]
- Fixed Resolution: 300px (Progressive learning removed)
- LR Scheduler: CosineAnnealingLR or OneCycleLR
- Updated Hyperparameters: Dropout, Augment Magnitude, Freeze Ratio (0.0-1.0)
- Epoch Monitor: Warns if too many trials reach max epochs.
- Robust Integrity Check & Caching preserved.
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torchvision import transforms
import timm
import numpy as np
import pickle
from pathlib import Path
from tqdm import tqdm
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import argparse
import json
from PIL import Image
import io
import sys
import time
import gc
import shutil
import random
import os
import sqlite3
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# --- GLOBAL CONFIG (V5) ---
LOCAL_CACHE_DIR = Path('/content/temp_dataset_cache')
RESOLUTION = 300
MAX_EPOCHS = 100
PATIENCE = 5
EPOCH_MONITOR_START = 15
EPOCH_MONITOR_THRESHOLD = 0.5

def set_seed(seed=42):
    """Sets the seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    print(f"Global seed set to: {seed}")

# --- V5: AUGMENTATION (Dynamic Magnitude) ---
def get_transforms(augment=True, magnitude=10):
    if augment:
        return transforms.Compose([
            transforms.RandAugment(num_ops=2, magnitude=magnitude),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    else:
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

# --- HELPER: FREEZE LAYERS ---
def freeze_layers(model, freeze_ratio=0.0):
    if freeze_ratio <= 0.0:
        for param in model.parameters():
            param.requires_grad = True
        return

    base = model.base_model
    conv_layers = []
    for name, module in base.named_modules():
        if isinstance(module, nn.Conv2d):
            conv_layers.append((name, module))

    num_layers = len(conv_layers)
    num_freeze = int(num_layers * freeze_ratio)

    frozen_names = set()
    for name, module in conv_layers[:num_freeze]:
        for param in module.parameters():
            param.requires_grad = False
        frozen_names.add(name.split('.')[0])

    for name, param in base.named_parameters():
        block_name = name.split('.')[0]
        if block_name in frozen_names:
            param.requires_grad = False

# --- INTEGRITY CHECK ---
def check_pickle_integrity(path):
    if not path.exists():
        return False
    try:
        with open(path, 'rb') as f:
            _ = pickle.load(f)
        return True
    except (EOFError, pickle.UnpicklingError, Exception) as e:
        print(f"    [Corrupt] File is broken: {path} ({e})")
        return False

# --- DATA LOADING FUNCTIONS ---
def get_file_list(base_dir, split, resolution):
    data_dir = Path(base_dir) / 'npz_datasets_v4' / f'npz_datasets_v4_universal_{resolution}px'
    if split == 'train':
        patterns = ['train_rain_*.npz', 'train_clear_*.npz']
    elif split == 'val_balanced':
        patterns = ['val_balanced_rain_*.npz', 'val_balanced_clear_*.npz']
    else:
        patterns = [f'{split}_*.npz']
    files = []
    for pat in patterns:
        files.extend(sorted(data_dir.glob(pat)))
    return files

def create_subset_two_pass(resolution, org_name, base_dir, split, target_size='3x', seed=42):
    np.random.seed(seed)
    random.seed(seed)

    files = get_file_list(base_dir, split, resolution)
    if not files:
        print(f"      [WARNING] No files found for {split} {resolution}px")
        return None, None

    # --- PASS 1: SCAN METADATA ---
    print(f"      [Pass 1] Scanning {split} labels...")
    file_map = []
    all_univ_levels = []
    all_org_labels = []
    key_error_reported = False

    for f_idx, chunk_file in enumerate(tqdm(files, desc="Scanning Labels", leave=False)):
        try:
            with np.load(chunk_file, allow_pickle=True) as data:
                levels_dict = data['levels'].item()
                if org_name not in levels_dict:
                    if not key_error_reported:
                        print(f"      [SKIP] Key '{org_name}' missing")
                        key_error_reported = True
                    continue
                lbls = levels_dict[org_name]
                univ = levels_dict['Universal']
                file_map.append({'path': chunk_file, 'count': len(lbls)})
                all_org_labels.append(lbls)
                all_univ_levels.append(univ)
        except Exception as e:
            print(f"      [ERROR] reading {chunk_file}: {e}")
            continue

    if not all_org_labels: return None, None

    full_org_labels = np.concatenate(all_org_labels, axis=0)
    full_univ_levels = np.concatenate(all_univ_levels, axis=0)
    total_samples = len(full_org_labels)

    base_count = 10000
    if target_size == '1x': target_count = base_count * 2
    elif target_size == '2x': target_count = base_count * 4
    elif target_size == '3x': target_count = base_count * 6
    else: target_count = base_count * 2

    if split == 'val_balanced': target_count = 20000
    if split == 'val_imbalanced': target_count = total_samples

    if split == 'val_imbalanced':
        if total_samples > target_count:
            selected_global_indices = np.random.choice(total_samples, target_count, replace=False)
        else:
            selected_global_indices = np.arange(total_samples)
    else:
        target_per_class = target_count // 2
        max_univ = full_univ_levels.max(axis=1)
        rain_indices = np.where(max_univ > 0)[0]
        clear_indices = np.where(max_univ == 0)[0]

        if len(rain_indices) >= target_per_class:
            sel_rain = np.random.choice(rain_indices, target_per_class, replace=False)
        else:
            sel_rain = np.concatenate([rain_indices, np.random.choice(rain_indices, target_per_class - len(rain_indices), replace=True)])

        if len(clear_indices) >= target_per_class:
            sel_clear = np.random.choice(clear_indices, target_per_class, replace=False)
        else:
            sel_clear = np.concatenate([clear_indices, np.random.choice(clear_indices, target_per_class - len(clear_indices), replace=True)])

        selected_global_indices = np.concatenate([sel_rain, sel_clear])

    np.random.shuffle(selected_global_indices)
    final_count = len(selected_global_indices)
    print(f"      Selected {final_count} samples (Target: {target_size})")

    del full_univ_levels, all_univ_levels, all_org_labels
    gc.collect()

    # --- PASS 2: EXTRACT IMAGES ---
    print(f"      [Pass 2] Extracting images...")
    files_to_load = {}
    sorted_indices = np.sort(selected_global_indices)

    current_file_idx = 0
    current_file_start = 0
    current_file_end = file_map[0]['count']

    for global_idx in sorted_indices:
        while global_idx >= current_file_end:
            current_file_idx += 1
            if current_file_idx >= len(file_map): break
            current_file_start = current_file_end
            current_file_end += file_map[current_file_idx]['count']

        if current_file_idx >= len(file_map): break
        local_idx = global_idx - current_file_start
        f_path = file_map[current_file_idx]['path']
        if f_path not in files_to_load: files_to_load[f_path] = []
        files_to_load[f_path].append(local_idx)

    sample_path = list(files_to_load.keys())[0]
    with np.load(sample_path, allow_pickle=True) as data:
        img_shape = data['images'][0].shape

    print(f"      Allocating memory for {final_count} images of shape {img_shape}...")
    final_images_arr = np.empty((final_count, *img_shape), dtype=np.uint8)
    final_labels_arr = np.empty((final_count, 9), dtype=np.int64)

    current_fill_idx = 0
    for f_path, local_indices in tqdm(files_to_load.items(), desc="Extracting", leave=False):
        try:
            with np.load(f_path, allow_pickle=True) as data:
                raw_imgs = data['images']
                levels_dict = data['levels'].item()
                raw_lbls = levels_dict[org_name]
                idx_arr = np.array(local_indices)
                batch_size = len(idx_arr)
                final_images_arr[current_fill_idx : current_fill_idx + batch_size] = raw_imgs[idx_arr]
                final_labels_arr[current_fill_idx : current_fill_idx + batch_size] = raw_lbls[idx_arr]
                current_fill_idx += batch_size
        except Exception: continue

    np.random.seed(seed)
    shuffle_idx = np.arange(final_count)
    np.random.shuffle(shuffle_idx)
    final_images_arr = final_images_arr[shuffle_idx]
    final_labels_arr = final_labels_arr[shuffle_idx]

    gc.collect()
    return final_images_arr, final_labels_arr

def prepare_and_cache_split(split, resolution, org_name, base_dir, drive_cache_dir, seed, target_size=None):
    file_name = f"{split}_{resolution}.pkl"
    local_path = LOCAL_CACHE_DIR / file_name
    drive_path = drive_cache_dir / file_name

    if local_path.exists():
        if check_pickle_integrity(local_path):
            print(f"    [Found] Local cache for {split} {resolution}px")
            return
        else:
            local_path.unlink()

    if drive_path.exists():
        print(f"    [Found] Drive cache for {split} {resolution}px. Copying...")
        try:
            shutil.copy(drive_path, local_path)
            if check_pickle_integrity(local_path): return
            else: local_path.unlink()
        except Exception: pass

    print(f"    [Create] Generating {split} {resolution}px data...")
    imgs, lbls = create_subset_two_pass(resolution, org_name, base_dir, split, target_size=target_size, seed=seed)

    if imgs is not None:
        with open(local_path, 'wb') as f:
            pickle.dump({'images': imgs, 'labels': lbls}, f, protocol=4)
        try:
            shutil.copy(local_path, drive_path)
        except Exception: pass
        del imgs, lbls
        gc.collect()

def prepare_all_resolutions(org_name, base_dir, seed=42):
    """V5: Prepares ONLY the fixed resolution (300px)."""
    print(f"\n{'='*20} PRE-CACHING DATASET (V5 Fixed Res, Seed={seed}) {'='*20}")
    LOCAL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    drive_cache_dir = Path(base_dir) / 'dataset_cache'
    drive_cache_dir.mkdir(parents=True, exist_ok=True)

    res = RESOLUTION
    print(f"\n>> Checking Resolution: {res}px")
    prepare_and_cache_split('train', res, org_name, base_dir, drive_cache_dir, seed, target_size='3x')
    prepare_and_cache_split('val_balanced', res, org_name, base_dir, drive_cache_dir, seed, target_size='1x')
    prepare_and_cache_split('val_imbalanced', res, org_name, base_dir, drive_cache_dir, seed)
    print(f"\n{'='*20} CACHING CHECK COMPLETE {'='*20}\n")

class CachedBinaryDataset(Dataset):
    def __init__(self, split, resolution, transform=None, dataset_size='1x', seed=42):
        self.transform = transform
        self.seed = seed
        cache_path = LOCAL_CACHE_DIR / f"{split}_{resolution}.pkl"
        if not cache_path.exists():
            raise FileNotFoundError(f"Cache not found: {cache_path}")
        with open(cache_path, 'rb') as f:
            data = pickle.load(f)
        self.images = data['images']
        self.labels = data['labels']
        if split == 'train' and dataset_size != '3x':
            self._downsample_from_cache(dataset_size)
        # Binary Classification: > 0 is Rain (1), else Clear (0)
        self.labels_binary = (self.labels > 0).astype(np.int64)

    def _downsample_from_cache(self, target_size):
        np.random.seed(self.seed)
        total_available = len(self.images)
        target_count = 20000 if target_size == '1x' else 40000 if target_size == '2x' else None
        if target_count and target_count < total_available:
            indices = np.random.permutation(total_available)[:target_count]
            self.images = self.images[indices]
            self.labels = self.labels[indices]

    def __len__(self): return len(self.images)
    def __getitem__(self, idx):
        img = Image.fromarray(self.images[idx])
        if self.transform: img = self.transform(img)
        return img, torch.from_numpy(self.labels_binary[idx]).long()

class MultiOutputEfficientNetBinary(nn.Module):
    def __init__(self, model_name, dropout=0.0, pretrained=True,
                 head_layers=1, head_hidden_dim=512, head_dropout=0.2):
        super().__init__()
        self.base_model = timm.create_model(model_name, pretrained=pretrained, drop_rate=dropout, num_classes=0)
        in_features = self.base_model.num_features

        if head_layers == 1:
            self.head = nn.Linear(in_features, 9 * 2)
        elif head_layers == 2:
            self.head = nn.Sequential(
                nn.Linear(in_features, head_hidden_dim),
                nn.BatchNorm1d(head_hidden_dim),
                nn.SiLU(),
                nn.Dropout(head_dropout),
                nn.Linear(head_hidden_dim, 9 * 2)
            )
        elif head_layers == 3:
            self.head = nn.Sequential(
                nn.Linear(in_features, head_hidden_dim),
                nn.BatchNorm1d(head_hidden_dim),
                nn.SiLU(),
                nn.Dropout(head_dropout),
                nn.Linear(head_hidden_dim, head_hidden_dim),
                nn.BatchNorm1d(head_hidden_dim),
                nn.SiLU(),
                nn.Dropout(head_dropout),
                nn.Linear(head_hidden_dim, 9 * 2)
            )

    def forward(self, x):
        features = self.base_model(x)
        logits = self.head(features)
        return logits.view(-1, 9, 2)

# --- V5: EPOCH MONITOR CALLBACK ---
class EpochMonitorCallback:
    def __init__(self, start_trial=EPOCH_MONITOR_START, threshold=EPOCH_MONITOR_THRESHOLD):
        self.start_trial = start_trial
        self.threshold = threshold

    def __call__(self, study, trial):
        completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
        if len(completed_trials) < self.start_trial:
            return

        max_epoch_count = 0
        for t in completed_trials:
            if 'final_epoch' in t.user_attrs and t.user_attrs['final_epoch'] >= MAX_EPOCHS:
                max_epoch_count += 1

        ratio = max_epoch_count / len(completed_trials)
        if ratio > self.threshold:
            print(f"\n[WARNING] {ratio*100:.1f}% of trials reached max epochs! Consider increasing MAX_EPOCHS.")

# --- V5: TRAIN FUNCTION (With Scheduler) ---
def train_one_epoch(model, loader, criterion, optimizer, scheduler, scaler, device):
    model.train()
    running_loss = 0.0
    all_preds = []
    all_targets = []

    for images, labels in tqdm(loader, desc="Training", leave=False):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad(set_to_none=True)

        with autocast():
            outputs = model(images)
            loss = criterion(outputs.view(-1, 2), labels.view(-1))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # OneCycleLR steps per batch
        if scheduler is not None and isinstance(scheduler, optim.lr_scheduler.OneCycleLR):
            scheduler.step()

        running_loss += loss.item()
        with torch.no_grad():
            probs = torch.softmax(outputs, dim=2)[:, :, 1]
            preds = (probs > 0.5).long()
            all_preds.append(preds.cpu().numpy())
            all_targets.append(labels.cpu().numpy())

    all_preds = np.concatenate(all_preds).flatten()
    all_targets = np.concatenate(all_targets).flatten()
    acc = (all_preds == all_targets).mean() * 100.0
    return running_loss / len(loader), acc

def validate(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_probs = []
    all_targets = []

    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Validation", leave=False):
            images, labels = images.to(device), labels.to(device)
            with autocast():
                outputs = model(images)
                loss = criterion(outputs.view(-1, 2), labels.view(-1))
            running_loss += loss.item()
            probs = torch.softmax(outputs, dim=2)[:, :, 1]
            all_probs.append(probs.cpu().numpy())
            all_targets.append(labels.cpu().numpy())

    all_probs = np.concatenate(all_probs).flatten()
    all_targets = np.concatenate(all_targets).flatten()

    best_f1 = 0.0
    best_thresh = 0.5
    best_metrics = {}

    thresholds = np.arange(0.1, 0.95, 0.05)
    for th in thresholds:
        preds = (all_probs > th).astype(int)
        f1 = f1_score(all_targets, preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = th
            best_metrics = {
                'acc': (preds == all_targets).mean() * 100.0,
                'precision': precision_score(all_targets, preds, zero_division=0),
                'recall': recall_score(all_targets, preds, zero_division=0)
            }

    avg_loss = running_loss / len(loader)
    try:
        roc_auc = roc_auc_score(all_targets, all_probs)
        pr_auc = average_precision_score(all_targets, all_probs)
    except: roc_auc, pr_auc = 0.0, 0.0

    return avg_loss, best_metrics.get('acc', 0), best_metrics.get('precision', 0), \
           best_metrics.get('recall', 0), best_f1, roc_auc, pr_auc, best_thresh

# --- V5: OPTUNA OBJECTIVE ---
def objective(trial, device, seed):
    # --- V5 Hyperparameters ---
    dataset_size = trial.suggest_categorical('dataset_size', ['1x', '2x', '3x'])
    lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [64, 128])
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'AdamW', 'SGD'])
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)

    # New V5 Params
    dropout = trial.suggest_float('dropout', 0.1, 0.4)
    freeze_ratio = trial.suggest_float('freeze_ratio', 0.0, 1.0)
    augment_magnitude = trial.suggest_int('augment_magnitude', 5, 15)
    scheduler_type = trial.suggest_categorical('scheduler', ['cosine', 'onecycle'])

    head_layers = trial.suggest_int('head_layers', 1, 3)
    head_hidden_dim = trial.suggest_categorical('head_hidden_dim', [512, 768, 1024, 1280])
    head_dropout = trial.suggest_float('head_dropout', 0.1, 0.5)

    print(f"\nTrial {trial.number}: size={dataset_size}, lr={lr:.2e}, batch={batch_size}, opt={optimizer_name}, "
          f"sched={scheduler_type}, mag={augment_magnitude}, drop={dropout:.2f}, freeze={freeze_ratio:.2f}")

    # Initialize Model with Fixed Resolution V5 Config
    model = MultiOutputEfficientNetBinary('tf_efficientnetv2_s', dropout=dropout, pretrained=True,
                                          head_layers=head_layers, head_hidden_dim=head_hidden_dim,
                                          head_dropout=head_dropout).to(device)
    freeze_layers(model, freeze_ratio)
    criterion = nn.CrossEntropyLoss()

    if optimizer_name == 'Adam': optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    elif optimizer_name == 'AdamW': optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    else: optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)

    # Compile for speed (if available)
    try:
        model = torch.compile(model, mode='reduce-overhead')
    except Exception: pass

    try:
        train_ds = CachedBinaryDataset('train', RESOLUTION, get_transforms(True, augment_magnitude), dataset_size, seed)
        val_bal_ds = CachedBinaryDataset('val_balanced', RESOLUTION, get_transforms(False), seed=seed)
        val_imbal_ds = CachedBinaryDataset('val_imbalanced', RESOLUTION, get_transforms(False), seed=seed)
    except FileNotFoundError: raise optuna.TrialPruned()

    num_workers = 4
    train_loader = DataLoader(train_ds, batch_size, True, num_workers=num_workers, pin_memory=True, persistent_workers=True)
    val_bal_loader = DataLoader(val_bal_ds, batch_size*2, False, num_workers=num_workers, pin_memory=True)
    val_imbal_loader = DataLoader(val_imbal_ds, batch_size*2, False, num_workers=num_workers, pin_memory=True)

    # Scheduler Setup
    if scheduler_type == 'cosine':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=MAX_EPOCHS, eta_min=1e-6)
    else:
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, epochs=MAX_EPOCHS, steps_per_epoch=len(train_loader))

    scaler = GradScaler()
    best_val_f1 = 0.0
    patience_counter = 0
    final_epoch = 0

    # Single Loop Training (V5)
    for epoch in range(MAX_EPOCHS):
        final_epoch = epoch + 1

        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, scheduler, scaler, device)

        # Step scheduler (Cosine only, OneCycle steps per batch)
        if scheduler is not None and not isinstance(scheduler, optim.lr_scheduler.OneCycleLR):
            scheduler.step()

        # Validation
        vb_metrics = validate(model, val_bal_loader, criterion, device)
        # vi_metrics = validate(model, val_imbal_loader, criterion, device) # Optional to save time

        vb_loss, vb_acc, vb_prec, vb_rec, vb_f1, vb_auc, vb_pr, vb_th = vb_metrics

        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch+1}/{MAX_EPOCHS}: Loss:{train_loss:.4f} | LR:{current_lr:.2e} | ValF1:{vb_f1:.4f}(@{vb_th:.2f})")

        trial.report(vb_f1, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

        if vb_f1 > best_val_f1:
            best_val_f1 = vb_f1
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= PATIENCE:
            print(f"  Early stopping at epoch {epoch+1}")
            break

    trial.set_user_attr('final_epoch', final_epoch)

    del train_ds, val_bal_ds, val_imbal_ds, train_loader, val_bal_loader, val_imbal_loader
    gc.collect(); torch.cuda.empty_cache()

    return best_val_f1

def main(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--org', type=str, required=True)
    parser.add_argument('--n_trials', type=int, default=100)
    parser.add_argument('--study_name', type=str, default=None)
    parser.add_argument('--base_dir', type=str, default='/content/drive/MyDrive/XRAIN/yano/20250601~20251020_dataset/')
    parser.add_argument('--seed', type=int, default=42)

    if argv is not None: args = parser.parse_args(argv)
    else: args = parser.parse_args()

    set_seed(args.seed)
    study_name = args.study_name or f'effv2_s_{args.org}_v5_binary_fixed'
    print(f"Study: {study_name}, Seed: {args.seed}, Org: {args.org}, Res: {RESOLUTION}px")

    # 1. Prepare Cache (V5: Fixed 300px)
    prepare_all_resolutions(args.org, args.base_dir, seed=args.seed)

    # 2. Run Optuna
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    output_dir = Path(args.base_dir) / 'output'
    progress_dir = output_dir / 'progress' / 'binary_v5'
    progress_dir.mkdir(parents=True, exist_ok=True)

    db_path = progress_dir / f"{study_name}.db"
    storage_url = f"sqlite:///{db_path}"

    try:
        study = optuna.create_study(
            study_name=study_name,
            storage=storage_url,
            load_if_exists=True,
            direction='maximize',
            sampler=TPESampler(seed=args.seed),
            pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10)
        )
    except Exception:
        if db_path.exists(): shutil.move(str(db_path), str(db_path.with_suffix('.db.bak')))
        study = optuna.create_study(
            study_name=study_name,
            storage=storage_url,
            direction='maximize',
            sampler=TPESampler(seed=args.seed),
            pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10)
        )

    remaining = args.n_trials - len(study.trials)
    if remaining > 0:
        study.optimize(lambda t: objective(t, device, args.seed),
                       n_trials=remaining,
                       callbacks=[EpochMonitorCallback()],
                       catch=(RuntimeError,))

    if len([t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]) > 0:
        print(f"Best F1: {study.best_value:.4f}")
        print(f"Best params: {study.best_params}")
        with open(output_dir / f'optuna_results_{args.org}_v5_binary.json', 'w') as f:
            json.dump({
                'org': args.org,
                'best_trial': study.best_trial.number,
                'best_f1': study.best_value,
                'best_params': study.best_params,
                'seed': args.seed
            }, f, indent=2)

if __name__ == '__main__':
    if 'ipykernel' in sys.modules:
        main(['--org', 'JMA', '--seed', '42'])
    else:
        main()

Global seed set to: 42
Study: effv2_s_JMA_v5_binary_fixed, Seed: 42, Org: JMA, Res: 300px


>> Checking Resolution: 300px
    [Found] Drive cache for train 300px. Copying...
    [Found] Drive cache for val_balanced 300px. Copying...
    [Found] Drive cache for val_imbalanced 300px. Copying...




[I 2025-11-29 01:43:15,007] Using an existing study with name 'effv2_s_JMA_v5_binary_fixed' instead of creating a new one.



Trial 74: size=3x, lr=7.15e-04, batch=128, opt=Adam, sched=cosine, mag=6, drop=0.31, freeze=0.66


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/86.5M [00:00<?, ?B/s]

  scaler = GradScaler()
  with autocast():
  return torch._C._get_cublas_allow_tf32()
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


Epoch 1/100: Loss:0.4504 | LR:7.15e-04 | ValF1:0.7563(@0.45)


  with autocast():
  with autocast():
  with autocast():


Epoch 2/100: Loss:0.4202 | LR:7.14e-04 | ValF1:0.7628(@0.35)


  with autocast():
  with autocast():


Epoch 3/100: Loss:0.4091 | LR:7.13e-04 | ValF1:0.7632(@0.45)


  with autocast():
  with autocast():


Epoch 4/100: Loss:0.4031 | LR:7.12e-04 | ValF1:0.7678(@0.40)


  with autocast():
[W 2025-11-29 01:56:56,224] Trial 74 failed with parameters: {'dataset_size': '3x', 'lr': 0.000714803094926685, 'batch_size': 128, 'optimizer': 'Adam', 'weight_decay': 3.5300280258141837e-06, 'dropout': 0.309553345371676, 'freeze_ratio': 0.6645870472609519, 'augment_magnitude': 6, 'scheduler': 'cosine', 'head_layers': 3, 'head_hidden_dim': 1280, 'head_dropout': 0.17765512344617737} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipython-input-4070541459.py", line 620, in <lambda>
    study.optimize(lambda t: objective(t, device, args.seed),
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-4070541459.py", line 536, in objective
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer,

KeyboardInterrupt: 