In [15]:
# 📦 导入库
import os
import socket
from pathlib import Path
from datetime import datetime, timedelta
import json
import cv2
import h5py
import time
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import numpy as np
from PIL import Image
from pathlib import Path
import torch
from torch.utils.data import Dataset
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2

from torchvision import transforms, models
from torchvision.datasets import ImageFolder
from torchvision.models import get_model_weights

from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import r2_score
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import GroupKFold


# =========================================
# ⚙️ Global configuration
# =========================================
# Use the GPU when CUDA reports itself as available, otherwise run on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"✅ 使用设备: {device}")


✅ 使用设备: cuda


In [16]:
# Path setup: pick the directory layout from the machine the notebook runs on.
if socket.gethostname() != 'hao-2':
    # Any host other than the local workstation uses the Kaggle layout.
    dir = Path('/kaggle/input/csiro-biomass')
    _model_root = Path('/kaggle/input', "dualstream-multihead-model")
    _data_root = Path("/kaggle/working/")
else:
    # Local workstation layout.
    dir = Path('D:/DATA_hao/Kaggle_/csiro-biomass/')
    _model_root = Path(dir, "DualStream_multihead")
    _data_root = Path(dir)

# Insertion order matters only for the listing printed below.
DIRS = {
    "dir": dir,
    "train": Path(dir, "train"),
    "test": Path(dir, "test"),
    "model": _model_root,
    "data": _data_root,
}

# Print one path per line.
print("✅ 路径：\n")
for key, path in DIRS.items():
    print(f"{key:<12} : {path}")

✅ 路径：

dir          : D:\DATA_hao\Kaggle_\csiro-biomass
train        : D:\DATA_hao\Kaggle_\csiro-biomass\train
test         : D:\DATA_hao\Kaggle_\csiro-biomass\test
model        : D:\DATA_hao\Kaggle_\csiro-biomass\DualStream_multihead
data         : D:\DATA_hao\Kaggle_\csiro-biomass


In [17]:
# 小函数
def show_df_info(df, name: str):
    """
    Print a one-line summary (shape and column names) of a DataFrame.

    Args:
        df   : pandas.DataFrame to describe.
        name : label shown at the start of the line.
    """
    shape_text = str(df.shape)
    column_names = df.columns.tolist()
    line = "📊 {:<16} shape: {:<16}  列名: {}".format(name, shape_text, column_names)
    print(line)



def move_column_first(df, col_name):
    """
    Return a view of ``df`` whose first column is ``col_name``.

    Args:
        df (pd.DataFrame): source frame (not modified).
        col_name (str): column to place first.

    Returns:
        pd.DataFrame: ``df`` re-indexed so ``col_name`` leads and the other
        columns keep their relative order.

    Raises:
        ValueError: if ``col_name`` is not a column of ``df``.
    """
    if col_name in df.columns:
        trailing = [c for c in df.columns if c != col_name]
        return df[[col_name, *trailing]]

    raise ValueError(f"列 '{col_name}' 不存在于 DataFrame 中。")



# 🧮 后处理函数（恢复 5 个目标）
def recover_all_targets(df_pred_3):
    """
    Derive the two missing targets from the three predicted ones.

    ``Dry_Clover_g`` is ``GDM_g - Dry_Green_g`` and ``Dry_Dead_g`` is
    ``Dry_Total_g - GDM_g``, each floored at 0. The input frame is left
    untouched; the result holds exactly the five target columns in the order
    Green, Dead, Clover, GDM, Total.
    """
    completed = df_pred_3.copy()
    gdm = completed["GDM_g"]
    completed["Dry_Clover_g"] = np.maximum(0, gdm - completed["Dry_Green_g"])
    completed["Dry_Dead_g"] = np.maximum(0, completed["Dry_Total_g"] - gdm)

    output_order = ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]
    return completed[output_order]



# 数据集、模型、训练 定义

In [18]:
# 🧠 MyDualStreamModel：双流 + 多头回归 + 内部训练逻辑
class WeightedSmoothL1Loss(nn.Module):
    """Smooth-L1 loss with one scalar weight per output column.

    The weights are taken from ``weights.values()`` in dict order, so that
    order has to match the column order of ``pred`` / ``target``.
    """

    def __init__(self, weights):
        super().__init__()
        # Kept as a plain Python list (not a buffer), so it adds nothing to state_dict.
        self.weights = list(weights.values())
        self.loss_fn = nn.SmoothL1Loss(reduction="none")

    def forward(self, pred, target):
        """Return the batch mean of the per-sample weighted column losses."""
        elementwise = self.loss_fn(pred, target)
        per_sample = 0
        for col, col_weight in enumerate(self.weights):
            per_sample = per_sample + elementwise[:, col] * col_weight
        return per_sample.mean()





class MyDualStreamModel(nn.Module):
    """Two-stream regressor: one shared backbone, concatenated features, three heads.

    Both input images go through the same backbone. The two feature vectors
    are concatenated and fed to three independent MLP heads, each producing
    one scalar. ``forward`` returns them stacked as ``[green, gdm, total]``.
    """

    def __init__(self,
                 backbone_name="convnext_tiny",
                 pretrained=True,
                 freeze_ratio=0.8,
                 weights_dict=None):
        """
        Args:
            backbone_name: timm model name (e.g. convnext_tiny, resnet50).
            pretrained: forwarded to ``timm.create_model``.
            freeze_ratio: fraction (0~1) of the backbone's parameter tensors,
                counted from the start of ``parameters()``, that are frozen.
            weights_dict: per-target loss weights. When truthy a
                ``WeightedSmoothL1Loss`` is used, otherwise ``nn.SmoothL1Loss``.
        """
        super().__init__()

        # 1) Shared backbone, created with num_classes=0.
        self.backbone = timm.create_model(backbone_name, pretrained=pretrained, num_classes=0)
        feature_dim = self.backbone.num_features

        # 2) Freeze the leading parameter tensors; the rest stay trainable.
        backbone_params = list(self.backbone.parameters())
        cutoff = int(len(backbone_params) * freeze_ratio)
        for position, param in enumerate(backbone_params):
            param.requires_grad = position >= cutoff

        # 3) Left and right features are concatenated, hence twice the width.
        self.fusion_dim = feature_dim * 2

        # 4) One regression head per predicted target.
        self.head_total = self._build_head()
        self.head_gdm = self._build_head()
        self.head_green = self._build_head()

        # 5) Loss: weighted Smooth-L1 when weights are given, plain Smooth-L1 otherwise.
        if weights_dict:
            self.loss_fn = WeightedSmoothL1Loss(weights_dict)
        else:
            self.loss_fn = nn.SmoothL1Loss()

    def _build_head(self):
        """Create one fusion_dim -> 512 -> 128 -> 1 MLP head."""
        layers = [
            nn.Linear(self.fusion_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        ]
        return nn.Sequential(*layers)

    # ------------------------------------------------------------
    # 🔁 Forward
    # ------------------------------------------------------------
    def forward(self, img_left, img_right):
        """Encode both images, fuse them, and return predictions of shape [batch, 3]."""
        streams = [self.backbone(img_left), self.backbone(img_right)]
        fused = torch.cat(streams, dim=1)

        # Heads are evaluated in the order total, gdm, green ...
        total = self.head_total(fused)
        gdm = self.head_gdm(fused)
        green = self.head_green(fused)

        # ... but the output columns are ordered green, gdm, total.
        return torch.cat([green, gdm, total], dim=1)

    # ------------------------------------------------------------
    # 🧮 Loss (thin wrapper around the configured criterion)
    # ------------------------------------------------------------
    def compute_loss(self, preds, targets):
        """Apply ``self.loss_fn`` to ``preds`` and ``targets``."""
        return self.loss_fn(preds, targets)


In [19]:
# 数据集加载定义
class DualStreamDataset(Dataset):
    """Dataset that splits every image into a left and a right half.

    Each item is ``(img_left, img_right)`` or, when ``target_cols`` is given,
    ``(img_left, img_right, targets)``.
    """

    # Shape of the all-zero placeholder used when an image is missing or unreadable.
    _PLACEHOLDER_SHAPE = (1000, 2000, 3)

    def __init__(self, df, image_dir, target_cols=None, transform=None):
        """
        Args:
            df: DataFrame with an ``image_path`` column.
            image_dir: directory that ``image_path`` is resolved against.
            target_cols: target column names; pass them for labelled data.
            transform: callable invoked as ``transform(image=...)["image"]``
                (Albumentations style); applied to each half separately.
        """
        self.df = df
        self.image_dir = image_dir
        self.target_cols = target_cols
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def _read_pixels(self, img_path):
        """Load ``img_path`` as an RGB array, or return a zero placeholder on failure."""
        loaded = None
        if img_path.exists():
            try:
                loaded = Image.open(img_path).convert("RGB")
            except Exception as e:
                # Best effort: report the problem and continue with the placeholder.
                print(f"⚠️ 无法读取图片: {img_path} ({e})")
        else:
            print(f"⚠️ 图片不存在: {img_path}")

        if loaded is None:
            return np.zeros(self._PLACEHOLDER_SHAPE, dtype=np.uint8)
        return np.array(loaded)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        pixels = self._read_pixels(Path(self.image_dir, str(record["image_path"])))

        # Cut the image down the middle of its width axis.
        _, width, _ = pixels.shape
        half = width // 2
        img_left, img_right = pixels[:, :half], pixels[:, half:]

        # The transform is called once per half, so random ops are sampled independently.
        if self.transform:
            img_left = self.transform(image=img_left)["image"]
            img_right = self.transform(image=img_right)["image"]

        if self.target_cols is None:
            return img_left, img_right

        target_values = record[self.target_cols].astype(float).values
        targets = torch.tensor(target_values, dtype=torch.float32)
        return img_left, img_right, targets


In [20]:
# Albumentations 变换   训练集、验证集、测试TTA
def get_train_transforms(size=768):
    """Build the training pipeline: resize, random flips/rotation/colour jitter, normalize, to tensor."""
    geometric_and_colour = [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomRotate90(p=0.5),
        A.ColorJitter(p=0.3),
    ]
    pipeline = [
        A.Resize(size, size),
        *geometric_and_colour,
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    return A.Compose(pipeline)


def get_valid_transforms(size=768):
    """Build the deterministic validation pipeline: resize, normalize, to tensor."""
    pipeline = [
        A.Resize(size, size),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    return A.Compose(pipeline)



# Test-time-augmentation variants. Each is resize -> (optional flip) -> normalize -> tensor.
_TTA_EXTRA_OPS = {
    "base": [],
    "hflip": [A.HorizontalFlip(p=1)],
    "vflip": [A.VerticalFlip(p=1)],
}

tta_transforms = {
    variant: A.Compose([
        A.Resize(768, 768),
        *extra_ops,
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ])
    for variant, extra_ops in _TTA_EXTRA_OPS.items()
}

In [21]:
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd

def compute_fold_cv_score(valid_df, all_preds, all_targets):
    """
    Compute the weighted R² of one fold over all five targets.

    The model predicts three targets; the other two are derived from them
    with the same rule as ``recover_all_targets`` (difference floored at 0),
    so the local score is computed on the same post-processed values that
    end up in the submission.

    Args:
        valid_df    : validation DataFrame of the fold; must contain the true
                      ``Dry_Dead_g`` and ``Dry_Clover_g`` columns.
        all_preds   : list of prediction arrays, each of shape [n, 3] with
                      columns ordered Dry_Green_g, GDM_g, Dry_Total_g.
        all_targets : list of target arrays in the same layout and row order
                      as ``valid_df``.

    Returns:
        weighted_r2 : weighted sum of the per-target R² values.
        r2_each     : dict mapping each target name to its R².
    """
    preds_array = np.concatenate(all_preds)
    targets_array = np.concatenate(all_targets)

    # Ground-truth table: the three modelled targets come from the batches,
    # the remaining two are read from valid_df.
    df_val = valid_df.copy()
    df_val[["Dry_Green_g", "GDM_g", "Dry_Total_g"]] = targets_array

    # Prediction table.
    df_pred = df_val.copy()
    df_pred["Dry_Green_g"] = preds_array[:, 0]
    df_pred["GDM_g"]       = preds_array[:, 1]
    df_pred["Dry_Total_g"] = preds_array[:, 2]

    # Derived targets, floored at 0 exactly as recover_all_targets does at
    # inference time. Without the floor the CV scored negative values that
    # are never submitted.
    df_pred["Dry_Clover_g"] = np.maximum(0, df_pred["GDM_g"] - df_pred["Dry_Green_g"])
    df_pred["Dry_Dead_g"]   = np.maximum(0, df_pred["Dry_Total_g"] - df_pred["GDM_g"])

    # Per-target R².
    target_cols = ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]
    r2_each = {col: r2_score(df_val[col], df_pred[col]) for col in target_cols}

    # Weighted combination of the per-target scores.
    weights = {
        "Dry_Green_g": 0.1,
        "Dry_Dead_g": 0.1,
        "Dry_Clover_g": 0.1,
        "GDM_g": 0.2,
        "Dry_Total_g": 0.5,
    }
    weighted_r2 = sum(r2_each[k] * w for k, w in weights.items())
    return weighted_r2, r2_each


In [22]:
# 🔹 单轮训练
def train_one_epoch(model, dataloader, optimizer, device, scaler):
    """
    Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Every batch is ``(img_left, img_right, targets)``. The forward pass and
    the loss run inside ``autocast()``; the backward pass and optimizer step
    go through ``scaler``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        img_left, img_right, targets = (
            tensor.to(device, non_blocking=True) for tensor in batch
        )

        # set_to_none=True drops the gradient tensors instead of zero-filling them.
        optimizer.zero_grad(set_to_none=True)

        # Mixed-precision region: forward pass and loss only.
        with autocast():
            loss = model.compute_loss(model(img_left, img_right), targets)

        scaler.scale(loss).backward()
        # A scaler.unscale_(optimizer) call would go here if gradient clipping is added.
        scaler.step(optimizer)
        scaler.update()

        batch_losses.append(loss.item())

    return float(np.mean(batch_losses))

In [23]:
# 🔹 单轮验证 + 本地CV
def validate_one_epoch(model, dataloader, valid_df, device):
    """
    Evaluate ``model`` on ``dataloader`` without gradients.

    Returns:
        (avg_val_loss, weighted_r2): mean batch loss and the fold score from
        ``compute_fold_cv_score`` computed on all collected predictions.
    """
    model.eval()
    batch_losses = []
    pred_chunks = []
    target_chunks = []

    with torch.no_grad():
        for batch in dataloader:
            img_left, img_right, targets = (
                tensor.to(device, non_blocking=True) for tensor in batch
            )
            preds = model(img_left, img_right)
            batch_losses.append(model.compute_loss(preds, targets).item())

            # Keep CPU copies so the fold score can be computed with numpy/pandas.
            pred_chunks.append(preds.cpu().numpy())
            target_chunks.append(targets.cpu().numpy())

    avg_val_loss = float(np.mean(batch_losses))
    weighted_r2, _ = compute_fold_cv_score(valid_df, pred_chunks, target_chunks)
    return avg_val_loss, weighted_r2



In [None]:
# 🔹 主函数：KFold 训练
def train_with_groupkfold(
    df_train,
    save_dir,
    model_target_cols,
    get_train_transforms,
    get_valid_transforms,
    weights,
    freeze_ratio=0.8,
    batch_size=32,
    epochs=50,
    lr=1e-4,
    device=None,
    n_splits=5,
    save_interval=20,
):
    """
    Train one ``MyDualStreamModel`` per GroupKFold fold and log the metrics.

    Folds are grouped by the ``Sampling_Date`` column. For every fold a fresh
    ``convnext_tiny`` model is trained for ``epochs`` epochs with AdamW and a
    ``GradScaler``. Weights are written to ``save_dir`` every
    ``save_interval`` epochs and once more at the end of the fold. The
    per-epoch metrics of all folds are written to ``fold_metrics.xlsx``.

    Images are read relative to the module-level ``DIRS["dir"]`` and both
    transform factories are called with a size of 768.

    Args:
        df_train: one row per image; needs ``image_path``, ``Sampling_Date``,
            the columns in ``model_target_cols`` and the columns that
            ``compute_fold_cv_score`` reads from the validation frame.
        save_dir: output directory (str or Path); created if missing.
        model_target_cols: target columns fed to the dataset.
        get_train_transforms / get_valid_transforms: factories taking a size.
        weights: per-target loss weights passed to the model.
        freeze_ratio, batch_size, epochs, lr, n_splits, save_interval:
            training hyper-parameters.
        device: torch device; when ``None`` CUDA is used if available,
            otherwise the CPU.

    Returns:
        (fold_train_losses, fold_val_losses): two lists with one entry per
        fold, each entry being the list of per-epoch losses of that fold.
    """
    start_time = time.time()

    # Resolve the default so that `.to(device)` always gets a real device.
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Accept plain strings and make sure checkpoints have somewhere to go.
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    gkf = GroupKFold(n_splits=n_splits)

    df = df_train.copy()
    groups = df["Sampling_Date"]

    # Per-fold history: training loss, validation loss, local CV score.
    fold_train_losses, fold_val_losses, fold_cv_scores = [], [], []

    for fold, (train_idx, val_idx) in enumerate(gkf.split(df, groups=groups)):
        train_df = df.iloc[train_idx].reset_index(drop=True)
        valid_df = df.iloc[val_idx].reset_index(drop=True)

        train_dataset = DualStreamDataset(train_df, DIRS["dir"], model_target_cols, transform=get_train_transforms(768))
        valid_dataset = DualStreamDataset(valid_df, DIRS["dir"], model_target_cols, transform=get_valid_transforms(768))

        # pin_memory is requested for the host-to-device copies done with non_blocking=True.
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

        # A fresh model per fold, stored in channels_last memory format.
        model = MyDualStreamModel("convnext_tiny", pretrained=True, freeze_ratio=freeze_ratio, weights_dict=weights)
        model = model.to(device).to(memory_format=torch.channels_last)

        # Only the parameters left trainable by the freeze step are optimised.
        optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

        # Gradient scaler used together with autocast in train_one_epoch.
        scaler = GradScaler()

        # History of the current fold.
        train_losses, val_losses, cv_scores = [], [], []

        for epoch in range(epochs):

            avg_train_loss = train_one_epoch(model, train_loader, optimizer, device, scaler)
            avg_val_loss, weighted_r2 = validate_one_epoch(model, valid_loader, valid_df, device)

            train_losses.append(avg_train_loss)
            val_losses.append(avg_val_loss)
            cv_scores.append(weighted_r2)

            # Periodic checkpoint.
            if (epoch + 1) % save_interval == 0:
                save_path = save_dir / f"model_weights_fold{fold}_epoch{epoch+1}.pt"
                torch.save(model.state_dict(), save_path)

            # ETA from the average time per finished epoch across all folds.
            elapsed = time.time() - start_time
            progress = (epoch + 1) + fold * epochs
            all_progress = epochs * n_splits
            eta_seconds = elapsed / progress * (all_progress - progress)
            eta_time = datetime.now() + timedelta(seconds=eta_seconds)
            now_str = datetime.now().strftime("%H:%M:%S")
            eta_str = eta_time.strftime("%H:%M:%S")

            # Single status line, rewritten in place via end="\r".
            print(
                f"[{now_str}]🧩[{progress/all_progress*100:.2f}%] Fold{fold+1:2d}/{n_splits} "
                f"Epoch{epoch+1:3d}/{epochs} | "
                f"Train={avg_train_loss:.4f} | "
                f"Val={avg_val_loss:.4f} | "
                f"CV={weighted_r2:.4f} | "
                f"{elapsed / progress:.2f}s/it | "
                f"ETA≈{eta_str}",
                end="\r",
                flush=True
            )

        # Final weights of the fold.
        torch.save(model.state_dict(), save_dir / f"model_weights_fold{fold}_final.pt")
        fold_train_losses.append(train_losses)
        fold_val_losses.append(val_losses)
        fold_cv_scores.append(cv_scores)

    # Metrics table: one row per epoch, three columns per fold.
    max_epochs = max(len(x) for x in fold_train_losses)
    df_out = pd.DataFrame({"Epoch": range(1, max_epochs + 1)})

    for i, (train_list, val_list, cv_list) in enumerate(zip(fold_train_losses, fold_val_losses, fold_cv_scores), start=1):
        # Shorter histories are padded with None so every column has max_epochs rows.
        df_out[f"Train_Loss_Fold{i}"] = train_list + [None]*(max_epochs-len(train_list))
        df_out[f"Val_Loss_Fold{i}"]   = val_list   + [None]*(max_epochs-len(val_list))
        df_out[f"CV_Fold{i}"]         = cv_list    + [None]*(max_epochs-len(cv_list))

    out_path = Path(save_dir, "fold_metrics.xlsx")
    df_out.to_excel(out_path, index=False)
    print(f"✅ 训练日志已保存: {out_path}")

    # The notebook unpacks two values from this call; previously nothing was
    # returned, so a finished training run ended with a TypeError.
    return fold_train_losses, fold_val_losses


In [25]:
# ⚙️ Model and training configuration

# 1) Loss weights for the three targets the model predicts directly.
weights = dict(
    Dry_Green_g=0.1,
    GDM_g=0.2,
    Dry_Total_g=0.5,
)

# 2) Targets the model is trained on, in the same order as the loss weights.
model_target_cols = list(weights)

# All five targets, in the order used for the recovered predictions.
target_cols = ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]

# 3) Training hyper-parameters.
config = dict(
    epochs=300,
    freeze_ratio=0.8,
    batch_size=32,
    lr=1e-4,
    n_splits=5,
    save_interval=20,
)


In [26]:
# 📘 Load the training table and reshape it to one row per image

# 1) Read the long-format CSV (one row per image/target pair).
df_file_path = Path(DIRS["dir"]) / "train.csv"
df = pd.read_csv(df_file_path)
show_df_info(df, "train.csv")

# 2) + 3) Image ID is the part of sample_id before "__"
#         (e.g. "ID1011485656__Dry_Green_g" -> "ID1011485656"); put it first.
df["ID"] = df["sample_id"].str.split("__").str.get(0)
df = move_column_first(df, "ID")
show_df_info(df, "df")

# 4) Long -> wide: one column per target_name.
df_targets = df.pivot_table(
    index="ID",
    columns="target_name",
    values="target",
    aggfunc="first",
).reset_index()
df_targets.columns.name = None  # drop the leftover "target_name" axis label
show_df_info(df_targets, "df_targets")

# 5) Metadata: keep one row per ID.
meta_cols = [
    "ID", "image_path", "Sampling_Date", "State",
    "Species", "Pre_GSHH_NDVI", "Height_Ave_cm"
]
df_meta = df.loc[:, meta_cols].drop_duplicates(subset="ID")
show_df_info(df_meta, "df_meta")

# 6) Attach the wide targets to the metadata.
df_train = df_meta.merge(df_targets, on="ID", how="left")
show_df_info(df_train, "df_train")


📊 train.csv        shape: (1785, 9)         列名: ['sample_id', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm', 'target_name', 'target']
📊 df               shape: (1785, 10)        列名: ['ID', 'sample_id', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm', 'target_name', 'target']
📊 df_targets       shape: (357, 6)          列名: ['ID', 'Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']
📊 df_meta          shape: (357, 7)          列名: ['ID', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
📊 df_train         shape: (357, 12)         列名: ['ID', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm', 'Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']


# 训练部分 本地运行

In [27]:
# Launch training 🚀 (local workstation only)
if socket.gethostname() == 'hao-2':
    time_str = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    print(time_str)

    # Every run gets its own timestamped directory below the model root.
    history_DIR = Path(DIRS['model'], time_str)
    history_DIR.mkdir(parents=True, exist_ok=True)

    # Store the hyper-parameters next to the checkpoints.
    config_path = history_DIR / "config.json"
    config_path.write_text(
        json.dumps(config, indent=4, ensure_ascii=False),
        encoding="utf-8",
    )
    print(f"✅ 配置文件已保存到: {config_path}")

    # Hyper-parameters forwarded from the config dict.
    _hyper_keys = ("freeze_ratio", "batch_size", "epochs", "lr", "n_splits", "save_interval")
    _hyper_kwargs = {key: config[key] for key in _hyper_keys}

    # NOTE(review): this unpacking only works if train_with_groupkfold
    # returns two values — verify against its definition.
    fold_train_losses, fold_val_losses = train_with_groupkfold(
        df_train=df_train,
        save_dir=history_DIR,
        model_target_cols=model_target_cols,
        get_train_transforms=get_train_transforms,
        get_valid_transforms=get_valid_transforms,
        weights=weights,
        device=device,
        **_hyper_kwargs,
    )

    print("\n✅ 全部训练完成！结果保存在：", history_DIR)


2025-11-01 18-29-10
✅ 配置文件已保存到: D:\DATA_hao\Kaggle_\csiro-biomass\DualStream_multihead\2025-11-01 18-29-10\config.json
[18:31:26] [0.33%]🧩 Fold  1/5 Epoch   5/300 | Train=15.8310 | Val=10.5612 | CV=0.0657 | 27.13s/it | ETA≈05:47:194

KeyboardInterrupt: 

# 预测部分 

In [None]:
# 📘 Load the test table and reshape it the same way as the training table

# 1) Read the long-format CSV.
df_file_path = Path(DIRS["dir"]) / "test.csv"
df = pd.read_csv(df_file_path)
show_df_info(df, "test.csv")

# 2) + 3) Image ID is the part of sample_id before "__"; put it first.
df["ID"] = df["sample_id"].str.split("__").str.get(0)
df = move_column_first(df, "ID")

# 4) Placeholder target column, set to 0 for every row.
df["target"] = 0
show_df_info(df, "df")

# 5) Long -> wide, mirroring the training layout.
df_targets = df.pivot_table(
    index="ID",
    columns="target_name",
    values="target",
    aggfunc="first",
).reset_index()
df_targets.columns.name = None  # drop the leftover "target_name" axis label
show_df_info(df_targets, "df_targets")

# 6) Metadata: keep one row per ID.
meta_cols = ["ID", "image_path"]
df_meta = df.loc[:, meta_cols].drop_duplicates(subset="ID")
show_df_info(df_meta, "df_meta")

# 7) Attach the wide placeholder targets to the metadata.
df_test = df_meta.merge(df_targets, on="ID", how="left")
show_df_info(df_test, "df_test")


In [None]:
# 基于 model  transform  model_dir  预测

def predict_ensemble_df(df_test, transform, model, model_target_cols, model_dir, device, batch_size=32, img_size=768):
    """
    Predict with every checkpoint in ``model_dir`` and average the results.

    Every file matching ``model_weights_fold*.pt`` is loaded into ``model``
    in turn (periodic epoch checkpoints match this pattern too, not only the
    ``_final`` ones). The three predicted targets are averaged over all
    checkpoints and then expanded to five targets with ``recover_all_targets``.

    Args:
        df_test: DataFrame with ``ID`` and ``image_path`` columns.
        transform: transform handed to ``DualStreamDataset``.
        model: already constructed network, placed on ``device``; its weights
            are overwritten by each checkpoint.
        model_target_cols: names of the model's output columns, in output order.
        model_dir: directory (str or Path) that holds the checkpoints.
        device: device used for loading checkpoints and for inference.
        batch_size: inference batch size.
        img_size: unused; kept so existing calls stay valid.

    Returns:
        DataFrame with ``ID`` followed by the five target columns, one row
        per row of ``df_test`` in the same order.

    Raises:
        FileNotFoundError: if ``model_dir`` does not exist or holds no
            matching checkpoint.
    """
    model_dir = Path(model_dir)
    print(f"模型目录: {model_dir}")
    # Explicit raise instead of assert: asserts are removed under `python -O`.
    if not model_dir.exists():
        raise FileNotFoundError(f"❌ 模型目录不存在: {model_dir}")

    # Collect all checkpoints in a stable (sorted) order.
    model_paths = sorted(model_dir.glob("model_weights_fold*.pt"))
    if not model_paths:
        raise FileNotFoundError(f"❌ 未找到模型文件: {model_dir}/model_weights_fold*.pt")

    print(f"🔹 检测到 {len(model_paths)} 个模型:")
    for p in model_paths:
        print("   -", p.name)

    # Unlabelled dataset; images are resolved against the module-level DIRS["dir"].
    test_dataset = DualStreamDataset(
        df_test,
        image_dir=DIRS["dir"],
        target_cols=None,
        transform=transform
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,  # keep row order so predictions line up with df_test
        num_workers=0,
        pin_memory=True
    )

    # One [n_rows, 3] prediction array per checkpoint.
    fold_preds = []

    for fold, model_path in enumerate(model_paths):
        print(f"🚀 加载模型 {fold+1}/{len(model_paths)}: {model_path.name}")

        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        model.eval()

        preds_list = []
        with torch.no_grad():
            for img_left, img_right in test_loader:
                img_left, img_right = img_left.to(device, non_blocking=True), img_right.to(device, non_blocking=True)
                preds = model(img_left, img_right)
                preds_list.append(preds.cpu().numpy())

        fold_preds.append(np.concatenate(preds_list, axis=0))

    # Average over checkpoints.
    preds_mean = np.mean(fold_preds, axis=0)
    df_pred3 = pd.DataFrame(preds_mean, columns=model_target_cols)

    # Expand to the full five targets.
    df_pred5 = recover_all_targets(df_pred3)
    show_df_info(df_pred5, "df_pred5 ")

    # Attach IDs by position. Assigning the Series directly would align on the
    # index and yield NaN or misplaced IDs when df_test has a non-default index.
    # The target column order is taken from the recovered frame itself, so no
    # module-level column list is needed.
    recovered_cols = list(df_pred5.columns)
    df_pred5 = df_pred5.assign(ID=df_test["ID"].to_numpy())[["ID"] + recovered_cols]

    # Summary of the final frame.
    show_df_info(df_pred5, "final df_pred5")

    return df_pred5


In [None]:
# 🧠 Model construction and TTA inference

# 1) Build the network skeleton; the weights are loaded later from the checkpoints.
model = MyDualStreamModel(
    "convnext_tiny",
    pretrained=False,
    freeze_ratio=config["freeze_ratio"],
    weights_dict=weights,
).to(device).to(memory_format=torch.channels_last)

# 2) Checkpoint directory: a specific run locally, the model root elsewhere.
model_dir = (
    Path(DIRS["model"], "2025-10-31 23-01-43")
    if socket.gethostname() == "hao-2"
    else DIRS["model"]
)

# 3) One ensemble prediction per TTA variant.
tta_preds = []

for name, tform in tta_transforms.items():
    print(f"\n🚀 Running TTA: {name}")

    transform = tform
    df_pred5 = predict_ensemble_df(
        df_test=df_test,
        transform=transform,
        model=model,
        model_target_cols=model_target_cols,
        model_dir=model_dir,
        device=device,
    )

    # Preview of this variant's predictions.
    print(f"\n📄 当前 TTA 模式 [{name}] 的预测结果预览：")
    print(df_pred5.head())

    tta_preds.append(df_pred5[target_cols].values)

    print(f"\n📦 当前已收集的 TTA 结果数量：{len(tta_preds)}")
    print(f"📊 当前累计结果形状：{np.array(tta_preds).shape}")
    print("-" * 60)
    print("\n\n\n")

# 4) Average the variants element-wise.
print("\n📦 聚合全部 TTA 结果：")
print(f"共有 {len(tta_preds)} 组预测结果。")
for i, arr in enumerate(tta_preds):
    print(f"  └─ 第 {i+1} 组预测: {arr}")

mean_preds = np.mean(tta_preds, axis=0)

print("\n🧮 计算平均值完成：")
print(mean_preds)
print(f"\n✅ 聚合完成，mean_preds 形状：{mean_preds.shape}")

# 5) Final frame: reuse the last variant's frame (for its ID column) and
#    overwrite the target columns with the TTA average.
df_pred_final = df_pred5.copy()
df_pred_final[target_cols] = mean_preds

print("\n🧾 最终预测 DataFrame 预览：")
print(df_pred_final.head())
show_df_info(df_pred_final, "df_pred_final")






In [None]:
# 📤 Build the Kaggle submission file submission.csv

df = df_pred_final

# Order in which the target columns are unpivoted.
ordered_target_cols = [
    "Dry_Clover_g",
    "Dry_Dead_g",
    "Dry_Green_g",
    "Dry_Total_g",
    "GDM_g",
]

# Wide -> long: one row per (ID, target_name) pair.
df_submit = df.melt(
    id_vars="ID",
    value_vars=ordered_target_cols,
    var_name="target_name",
    value_name="target",
)

# sample_id is "<ID>__<target_name>".
df_submit["sample_id"] = df_submit["ID"] + "__" + df_submit["target_name"]

# Keep only the two columns that are written to the file, sample_id first.
df_submit = df_submit.loc[:, ["sample_id", "target"]]

# Optional: sort by sample_id
# df_submit = df_submit.sort_values("sample_id").reset_index(drop=True)

# Write the file.
df_submit.to_csv("submission.csv", index=False)
print("✅ 已生成提交文件 submission.csv")
