In [1]:
import os
import socket
from pathlib import Path

import pandas as pd
import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.models as models
import h5py

import torch
from torchvision import models
from torchvision.models import get_model_weights
from pathlib import Path
import os

from tqdm import tqdm
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ 使用设备: {device}")

  from .autonotebook import tqdm as notebook_tqdm


✅ 使用设备: cuda


In [2]:
# 初始化
if socket.gethostname() == 'hao-2':
    dir = Path('D:/DATA_hao/Kaggle_/csiro-biomass/')
else:
    dir = Path(os.getcwd())

DIRS = {
    "dir":        dir,                                       
    "train":     Path(dir, "train"),                              
    "test":     Path(dir, "test"),                              
    "model":     Path(dir, "feature_extractor_models"),   
}
# 自动创建目录
for key, path in DIRS.items():
    os.makedirs(path, exist_ok=True)

# 打印时一行一个地址
print("✅ 路径已创建：\n")
for key, path in DIRS.items():
    print(f"{key:<12} : {path}")

def move_column_first(df, col_name):
    """
    将 DataFrame 中指定列移动到最前面。
    参数:
        df (pd.DataFrame): 原始数据框
        col_name (str): 要移动到最前面的列名
    返回:
        pd.DataFrame: 调整后的新 DataFrame
    """
    if col_name not in df.columns:
        raise ValueError(f"列 '{col_name}' 不存在于 DataFrame 中。")

    cols = [col_name] + [c for c in df.columns if c != col_name]
    return df[cols]



✅ 路径已创建：

dir          : D:\DATA_hao\Kaggle_\csiro-biomass
train        : D:\DATA_hao\Kaggle_\csiro-biomass\train
test         : D:\DATA_hao\Kaggle_\csiro-biomass\test
model        : D:\DATA_hao\Kaggle_\csiro-biomass\feature_extractor_models


In [None]:
# 加载保存 图像->特征 提取模型与预训练参数

# | 模型                | 输出维度 | 若拆成两块后拼接 | 备注         |
# | ----------------- | ---- | -------- | ---------- |
# | `resnet50`        | 2048 | **4096** | 特征最丰富，语义层强 |
# | `efficientnet_b3` | 1536 | **3072** | 结构更轻，泛化好   |
# | `convnext_tiny`   | 768  | **1536** | 更现代架构，轻量快速 |



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ 使用设备: {device}")


# =========================================
# 🧠 2️⃣ 加载模型与权重
# =========================================
# ✅ ResNet50
resnet_weights = get_model_weights("resnet50").IMAGENET1K_V1
resnet50 = models.resnet50(weights=resnet_weights)
resnet50.fc = torch.nn.Identity()  # 去掉分类头
resnet50.eval().to(device)

# ✅ EfficientNet-B3
effnet_weights = get_model_weights("efficientnet_b3").IMAGENET1K_V1
effnetb3 = models.efficientnet_b3(weights=effnet_weights)
effnetb3.classifier = torch.nn.Identity()
effnetb3.eval().to(device)

# ✅ ConvNeXt-Tiny
convnext_weights = get_model_weights("convnext_tiny").IMAGENET1K_V1
convnext_t = models.convnext_tiny(weights=convnext_weights)
convnext_t.classifier = torch.nn.Identity()
convnext_t.eval().to(device)

print("✅ 所有模型加载完成，已设置为特征提取模式。")


# =========================================
# 💾 3️⃣ 保存模型（结构 + 参数）
# =========================================
base_dir = DIRS["model"]
os.makedirs(base_dir, exist_ok=True)

torch.save(resnet50, Path(base_dir , "resnet50_feature_extractor.pth"))
torch.save(effnetb3, Path(base_dir , "efficientnet_b3_feature_extractor.pth"))
torch.save(convnext_t, Path(base_dir , "convnext_tiny_feature_extractor.pth"))

print(f"💾 模型已保存到本地路径: {base_dir.resolve()}")


✅ 使用设备: cuda
✅ 所有模型加载完成，已设置为特征提取模式。
💾 模型已保存到本地路径: D:\DATA_hao\Kaggle_\csiro-biomass\feature_extractor_models


In [14]:
def show_df_info(df, name: str):
    """
    打印单个 DataFrame 的形状与列名信息。
    参数:
        df   : pandas.DataFrame
        name : 显示名称（字符串）
    """
    print(f"📊 {name:<16} shape: {str(df.shape):<16}  列名: {df.columns.tolist()}")



def move_column_first(df, col_name):
    """
    将 DataFrame 中指定列移动到最前面。
    参数:
        df (pd.DataFrame): 原始数据框
        col_name (str): 要移动到最前面的列名
    返回:
        pd.DataFrame: 调整后的新 DataFrame
    """
    if col_name not in df.columns:
        raise ValueError(f"列 '{col_name}' 不存在于 DataFrame 中。")

    cols = [col_name] + [c for c in df.columns if c != col_name]
    return df[cols]


from sklearn.metrics import r2_score

def weighted_r2(y_true_df, y_pred_df):
    weights = {
        "Dry_Green_g": 0.1,
        "Dry_Dead_g": 0.1,
        "Dry_Clover_g": 0.1,
        "GDM_g": 0.2,
        "Dry_Total_g": 0.5
    }

    r2_dict = {}
    for col in weights.keys():
        r2_dict[col] = r2_score(y_true_df[col], y_pred_df[col])

    weighted_score = sum(r2_dict[k] * w for k, w in weights.items())
    return weighted_score, r2_dict



处理文本数据

In [11]:
df_train = pd.read_csv(DIRS["dir"] / "train.csv")


print(df_train.columns.tolist())

df_train.head(20)


['sample_id', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm', 'target_name', 'target']


Unnamed: 0,sample_id,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,target_name,target
0,ID1011485656__Dry_Clover_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Clover_g,0.0
1,ID1011485656__Dry_Dead_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Dead_g,31.9984
2,ID1011485656__Dry_Green_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Green_g,16.2751
3,ID1011485656__Dry_Total_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Total_g,48.2735
4,ID1011485656__GDM_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,GDM_g,16.275
5,ID1012260530__Dry_Clover_g,train/ID1012260530.jpg,2015/4/1,NSW,Lucerne,0.55,16.0,Dry_Clover_g,0.0
6,ID1012260530__Dry_Dead_g,train/ID1012260530.jpg,2015/4/1,NSW,Lucerne,0.55,16.0,Dry_Dead_g,0.0
7,ID1012260530__Dry_Green_g,train/ID1012260530.jpg,2015/4/1,NSW,Lucerne,0.55,16.0,Dry_Green_g,7.6
8,ID1012260530__Dry_Total_g,train/ID1012260530.jpg,2015/4/1,NSW,Lucerne,0.55,16.0,Dry_Total_g,7.6
9,ID1012260530__GDM_g,train/ID1012260530.jpg,2015/4/1,NSW,Lucerne,0.55,16.0,GDM_g,7.6


In [15]:
df_train["ID"] = df_train["sample_id"].str.split("__").str[0]
df_train = move_column_first(df_train, "ID")

show_df_info(df_train, "训练集")

📊 训练集              shape: (1785, 10)        列名: ['ID', 'sample_id', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm', 'target_name', 'target']


In [16]:
# 1️⃣ 首先行转列（pivot目标部分）
df_targets = (
    df_train
    .pivot_table(index="ID", columns="target_name", values="target", aggfunc="first")
    .reset_index()
)
df_targets.columns.name = None  # 清理多级列名

# 2️⃣ 从 df_train 中提取每个 ID 的其他非目标信息（它们五行是相同的，取 first 即可）
meta_cols = ['ID', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
df_meta = df_train[meta_cols].drop_duplicates(subset='ID')  # 或 groupby('ID').first().reset_index()

# 3️⃣ 合并两部分
df_wide = pd.merge(df_meta, df_targets, on='ID', how='left')

df_wide.head(20)


Unnamed: 0,ID,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,Dry_Clover_g,Dry_Dead_g,Dry_Green_g,Dry_Total_g,GDM_g
0,ID1011485656,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,0.0,31.9984,16.2751,48.2735,16.275
1,ID1012260530,train/ID1012260530.jpg,2015/4/1,NSW,Lucerne,0.55,16.0,0.0,0.0,7.6,7.6,7.6
2,ID1025234388,train/ID1025234388.jpg,2015/9/1,WA,SubcloverDalkeith,0.38,1.0,6.05,0.0,0.0,6.05,6.05
3,ID1028611175,train/ID1028611175.jpg,2015/5/18,Tas,Ryegrass,0.66,5.0,0.0,30.9703,24.2376,55.2079,24.2376
4,ID1035947949,train/ID1035947949.jpg,2015/9/11,Tas,Ryegrass,0.54,3.5,0.4343,23.2239,10.5261,34.1844,10.9605
5,ID1036339023,train/ID1036339023.jpg,2015/9/30,Vic,Phalaris_Clover,0.82,7.0,23.0755,2.6135,32.191,57.88,55.2665
6,ID1049634115,train/ID1049634115.jpg,2015/7/2,Vic,Ryegrass_Clover,0.72,2.0,1.5083,3.0167,13.575,18.1,15.0833
7,ID1051144034,train/ID1051144034.jpg,2015/9/1,WA,SubcloverLosa,0.8,1.0,55.32,0.0,0.0,55.32,55.32
8,ID1052620238,train/ID1052620238.jpg,2015/5/18,Tas,Ryegrass,0.68,5.0,0.0,11.2291,20.1707,31.3998,20.1707
9,ID105271783,train/ID105271783.jpg,2015/6/30,Vic,Phalaris_Clover,0.86,4.0,5.2698,8.5635,27.6667,41.5,32.9365


In [17]:
df_wide["my_file_path"] = df_wide["image_path"].apply(lambda x: Path(DIRS["train"], x.split("/")[-1]))


df_wide["my_file_path"][1]



WindowsPath('D:/DATA_hao/Kaggle_/csiro-biomass/train/ID1012260530.jpg')

In [18]:
import os
from PIL import Image
from collections import Counter

def analyze_image_sizes(folder_path):
    """
    统计指定文件夹中所有图片的尺寸分布
    参数：
        folder_path (str): 图片所在文件夹路径
    返回：
        size_counter (Counter): 尺寸分布计数器
    """
    size_counter = Counter()
    total = 0

    for fname in os.listdir(folder_path):
        if not fname.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        fpath = os.path.join(folder_path, fname)
        try:
            with Image.open(fpath) as img:
                size = img.size  # (width, height)
                size_counter[size] += 1
                total += 1
        except Exception as e:
            print(f"⚠️ 无法读取图片: {fname} ({e})")

    print(f"\n📊 共统计到 {total} 张图片。")
    print(f"🔹 不同尺寸数量: {len(size_counter)} 种\n")

    # 显示前几个最常见的尺寸
    for (w, h), count in size_counter.most_common(10):
        print(f"  尺寸 {w}×{h} ：{count} 张")

    return size_counter


# =============================
# 🎯 使用示例
## 替换为你的图片文件夹路径
size_distribution = analyze_image_sizes(DIRS["train"])    


📊 共统计到 357 张图片。
🔹 不同尺寸数量: 1 种

  尺寸 2000×1000 ：357 张


In [None]:

# ======================================
# 1️⃣ 从本地加载模型
# ======================================
def load_saved_model(name, base_dir, device):
    path = base_dir / f"{name}_feature_extractor.pth"
    if not path.exists():
        raise FileNotFoundError(f"❌ 未找到模型文件: {path}")
    model = torch.load(path, map_location=device)
    model.eval().to(device)
    print(f"✅ 已加载模型: {name}")
    return model

resnet50   = load_saved_model("resnet50", DIRS["model"], device)
effnetb3   = load_saved_model("efficientnet_b3", DIRS["model"], device)
convnext_t = load_saved_model("convnext_tiny", DIRS["model"], device)

# ======================================
# 2️⃣ 图像预处理（官方推荐）
# ======================================
# 这些参数与你当初下载权重时默认 transforms 一致
resnet_transform = transforms.Compose([
    transforms.Resize(232),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
effnet_transform = transforms.Compose([
    transforms.Resize(300),
    transforms.CenterCrop(300),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
convnext_transform = transforms.Compose([
    transforms.Resize(232),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# ======================================
# 3️⃣ 单张图像的特征提取函数
# ======================================
def extract_feature(image_path, model, transform):
    try:
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        print(f"⚠️ 无法读取图片: {image_path} ({e})")
        return None

    x = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        feat = model(x).squeeze().cpu().numpy()
    return feat

# ======================================
# 4️⃣ 批量特征提取
# ======================================
def extract_all_features(df, path_col, models_dict, transforms_dict):
    all_features, ids = [], []

    for i, row in tqdm(df.iterrows(), total=len(df), desc="Extracting multi-model features"):
        img_path = row[path_col]
        if not os.path.exists(img_path):
            print(f"⚠️ 图片不存在: {img_path}")
            continue

        feats = []
        for name in models_dict.keys():
            model = models_dict[name]
            transform = transforms_dict[name]
            f = extract_feature(img_path, model, transform)
            if f is not None:
                feats.append(f)

        # 拼接多个模型特征
        if len(feats) > 0:
            all_features.append(np.concatenate(feats))
            ids.append(row["ID"])

    # 保存 npy 文件
    all_features = np.vstack(all_features)
    print(f"\n✅ 特征处理完成 shape = {all_features.shape}")
    return all_features, ids


# ======================================
# 5️⃣ 模型 & 变换配置
# ======================================
models_dict = {
    "resnet50": resnet50,
    "effnetb3": effnetb3,
    "convnext_t": convnext_t
}
transforms_dict = {
    "resnet50": resnet_transform,
    "effnetb3": effnet_transform,
    "convnext_t": convnext_transform
}

# ======================================
# 6️⃣ 运行特征提取
# ======================================
# ⚠️ 确认 df_wide["my_file_path"] 是图片完整路径
# 示例: "D:/data/Kaggle_/csiro-biomass/train/ID1011485656.jpg"

features, ids = extract_all_features(
    df=df_wide,
    path_col="my_file_path",
    models_dict=models_dict,
    transforms_dict=transforms_dict
)

# ======================================
# 7️⃣ 保存为 DataFrame
# ======================================
# 将特征矩阵转为 DataFrame
df_img_feats = pd.DataFrame(features, columns=[f"img_feat_{i + 1}" for i in range(features.shape[1])])

# 添加 ID 列
df_img_feats["ID"] = ids

# 调整列顺序，让 ID 放在最前
df_img_feats = move_column_first(df_img_feats, "ID")

# 保存为 Feather 文件（高效的二进制格式）
df_img_feats.to_feather(Path(DIRS["dir"] / "train_img_feats.feather"))

show_df_info(df_img_feats, "图像特征")
print("✅ 多模型特征提取完成并已保存！")


✅ 已加载模型: resnet50
✅ 已加载模型: efficientnet_b3
✅ 已加载模型: convnext_tiny


Extracting multi-model features: 100%|██████████| 357/357 [00:44<00:00,  8.10it/s]



✅ 特征处理完成 shape = (357, 4352)
📦 df_img_feats shape: (357, 4353)
✅ 多模型特征提取完成并已保存！


In [34]:
import pandas as pd
df_img_feats = pd.read_feather(Path(DIRS["dir"]) / "train_img_feats.feather")
show_df_info(df_img_feats, "图像特征")

📊 图像特征             shape: (357, 4353)       列名: ['ID', 'img_feat_1', 'img_feat_2', 'img_feat_3', 'img_feat_4', 'img_feat_5', 'img_feat_6', 'img_feat_7', 'img_feat_8', 'img_feat_9', 'img_feat_10', 'img_feat_11', 'img_feat_12', 'img_feat_13', 'img_feat_14', 'img_feat_15', 'img_feat_16', 'img_feat_17', 'img_feat_18', 'img_feat_19', 'img_feat_20', 'img_feat_21', 'img_feat_22', 'img_feat_23', 'img_feat_24', 'img_feat_25', 'img_feat_26', 'img_feat_27', 'img_feat_28', 'img_feat_29', 'img_feat_30', 'img_feat_31', 'img_feat_32', 'img_feat_33', 'img_feat_34', 'img_feat_35', 'img_feat_36', 'img_feat_37', 'img_feat_38', 'img_feat_39', 'img_feat_40', 'img_feat_41', 'img_feat_42', 'img_feat_43', 'img_feat_44', 'img_feat_45', 'img_feat_46', 'img_feat_47', 'img_feat_48', 'img_feat_49', 'img_feat_50', 'img_feat_51', 'img_feat_52', 'img_feat_53', 'img_feat_54', 'img_feat_55', 'img_feat_56', 'img_feat_57', 'img_feat_58', 'img_feat_59', 'img_feat_60', 'img_feat_61', 'img_feat_62', 'img_feat_63', 'img_feat

In [35]:
# 按 ID 合并图像特征与表格特征


show_df_info(df_wide, "df_wide")
show_df_info(df_img_feats, "df_img_feats")


df_merged = df_wide.merge(df_img_feats, on="ID", how="left")

print(f"✅ 合并完成: {df_merged.shape}")
show_df_info(df_merged, "df_merged")




📊 df_wide          shape: (357, 13)         列名: ['ID', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm', 'Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g', 'my_file_path']
📊 df_img_feats     shape: (357, 4353)       列名: ['ID', 'img_feat_1', 'img_feat_2', 'img_feat_3', 'img_feat_4', 'img_feat_5', 'img_feat_6', 'img_feat_7', 'img_feat_8', 'img_feat_9', 'img_feat_10', 'img_feat_11', 'img_feat_12', 'img_feat_13', 'img_feat_14', 'img_feat_15', 'img_feat_16', 'img_feat_17', 'img_feat_18', 'img_feat_19', 'img_feat_20', 'img_feat_21', 'img_feat_22', 'img_feat_23', 'img_feat_24', 'img_feat_25', 'img_feat_26', 'img_feat_27', 'img_feat_28', 'img_feat_29', 'img_feat_30', 'img_feat_31', 'img_feat_32', 'img_feat_33', 'img_feat_34', 'img_feat_35', 'img_feat_36', 'img_feat_37', 'img_feat_38', 'img_feat_39', 'img_feat_40', 'img_feat_41', 'img_feat_42', 'img_feat_43', 'img_feat_44', 'img_feat_45', 'img_feat_46', 'img_feat_47', 'img_feat_48', 'img_feat

In [None]:
'ID', 'image_path','my_file_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm',
'Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g',
'img_feat_1',

0.008888091

✅ 特征矩阵 X: (357, 4354), 目标矩阵 y: (357, 5)
📊 特征矩阵 X:          shape: (357, 4354)       列名: ['Pre_GSHH_NDVI', 'Height_Ave_cm', 'img_feat_1', 'img_feat_2', 'img_feat_3', 'img_feat_4', 'img_feat_5', 'img_feat_6', 'img_feat_7', 'img_feat_8', 'img_feat_9', 'img_feat_10', 'img_feat_11', 'img_feat_12', 'img_feat_13', 'img_feat_14', 'img_feat_15', 'img_feat_16', 'img_feat_17', 'img_feat_18', 'img_feat_19', 'img_feat_20', 'img_feat_21', 'img_feat_22', 'img_feat_23', 'img_feat_24', 'img_feat_25', 'img_feat_26', 'img_feat_27', 'img_feat_28', 'img_feat_29', 'img_feat_30', 'img_feat_31', 'img_feat_32', 'img_feat_33', 'img_feat_34', 'img_feat_35', 'img_feat_36', 'img_feat_37', 'img_feat_38', 'img_feat_39', 'img_feat_40', 'img_feat_41', 'img_feat_42', 'img_feat_43', 'img_feat_44', 'img_feat_45', 'img_feat_46', 'img_feat_47', 'img_feat_48', 'img_feat_49', 'img_feat_50', 'img_feat_51', 'img_feat_52', 'img_feat_53', 'img_feat_54', 'img_feat_55', 'img_feat_56', 'img_feat_57', 'img_feat_58', 'img_feat_59', '

In [48]:
# =========================================
# 📦 导入依赖
# =========================================
import os
import torch
import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score


# =========================================
# ⚙️ 0️⃣ 全局参数配置
# =========================================
weights = {
    "Dry_Green_g": 0.1,
    "Dry_Dead_g": 0.1,
    "Dry_Clover_g": 0.1,
    "GDM_g": 0.2,
    "Dry_Total_g": 0.5
}


params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "max_depth": 8,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 2.0,
    "alpha": 0.5,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "seed": 42
}

n_splits = 5
num_boost_round = 5000
early_stopping_rounds = 200




# =========================================
# 🧩 1️⃣ 数据准备
# =========================================
# ⚠️ 请确保 df_merged 已经存在，并包含以下列：
# ['ID', 'image_path', 'my_file_path', 'Sampling_Date', 'State', 'Species'] + target_cols + 特征列

target_cols = ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]
exclude_cols = target_cols + ['ID', 'image_path', 'my_file_path', 'Sampling_Date', 'State', 'Species']
feature_cols = [c for c in df_merged.columns if c not in exclude_cols]

X = df_merged[feature_cols]
y = df_merged[target_cols]

show_df_info(X, "特征矩阵 X:")
show_df_info(y, "目标矩阵 y:")


# =========================================
# 🧮 2️⃣ Weighted R² 函数
# =========================================
def weighted_r2(y_true_df, y_pred_df, weights):
    r2_dict = {}
    for col in weights.keys():
        r2_dict[col] = r2_score(y_true_df[col], y_pred_df[col])
    weighted_score = sum(r2_dict[k] * w for k, w in weights.items())
    return weighted_score, r2_dict


# =========================================
# 🔁 3️⃣ KFold 交叉验证训练
# =========================================
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = np.zeros_like(y)
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=================== Fold {fold+1}/{n_splits} ===================")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    val_preds = np.zeros_like(y_val)

    # 针对每个目标独立训练模型
    for t, target in enumerate(target_cols):
        dtrain = xgb.DMatrix(X_train, label=y_train[target])
        dval = xgb.DMatrix(X_val, label=y_val[target])

        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=[(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False
        )
        print(f"   ✅ 目标 '{target}' 模型训练完成，最佳迭代次数: {model.best_iteration}")

        # 验证集预测
        val_preds[:, t] = model.predict(dval)

    # 本折验证表现
    y_val_df = pd.DataFrame(y_val.values, columns=target_cols)
    y_pred_df = pd.DataFrame(val_preds, columns=target_cols)

    weighted_score, detail = weighted_r2(y_val_df, y_pred_df, weights)
    fold_scores.append(weighted_score)

    print(f"🎯 Fold {fold+1} Weighted R²: {weighted_score:.4f}")
    for k, v in detail.items():
        print(f"   {k}: {v:.4f}")

    # 记录 OOF 预测
    oof_preds[val_idx] = val_preds


# =========================================
# 📈 4️⃣ 全部折的综合评估
# =========================================
y_true_df = pd.DataFrame(y.values, columns=target_cols)
y_pred_df = pd.DataFrame(oof_preds, columns=target_cols)

overall_score, overall_detail = weighted_r2(y_true_df, y_pred_df, weights)

print("\n=================== 最终结果 ===================")
print(f"平均 Fold Weighted R²: {np.mean(fold_scores):.4f}")
print(f"整体 OOF Weighted R²: {overall_score:.4f}")
for k, v in overall_detail.items():
    print(f"  {k}: {v:.4f}")

# 保存预测
y_pred_df["ID"] = df_merged["ID"]
y_pred_df.to_csv("oof_predictions.csv", index=False)
print("\n✅ 已保存 OOF 预测结果到 oof_predictions.csv")


📊 特征矩阵 X:          shape: (357, 4354)       列名: ['Pre_GSHH_NDVI', 'Height_Ave_cm', 'img_feat_1', 'img_feat_2', 'img_feat_3', 'img_feat_4', 'img_feat_5', 'img_feat_6', 'img_feat_7', 'img_feat_8', 'img_feat_9', 'img_feat_10', 'img_feat_11', 'img_feat_12', 'img_feat_13', 'img_feat_14', 'img_feat_15', 'img_feat_16', 'img_feat_17', 'img_feat_18', 'img_feat_19', 'img_feat_20', 'img_feat_21', 'img_feat_22', 'img_feat_23', 'img_feat_24', 'img_feat_25', 'img_feat_26', 'img_feat_27', 'img_feat_28', 'img_feat_29', 'img_feat_30', 'img_feat_31', 'img_feat_32', 'img_feat_33', 'img_feat_34', 'img_feat_35', 'img_feat_36', 'img_feat_37', 'img_feat_38', 'img_feat_39', 'img_feat_40', 'img_feat_41', 'img_feat_42', 'img_feat_43', 'img_feat_44', 'img_feat_45', 'img_feat_46', 'img_feat_47', 'img_feat_48', 'img_feat_49', 'img_feat_50', 'img_feat_51', 'img_feat_52', 'img_feat_53', 'img_feat_54', 'img_feat_55', 'img_feat_56', 'img_feat_57', 'img_feat_58', 'img_feat_59', 'img_feat_60', 'img_feat_61', 'img_feat_6

In [49]:
# =========================================
# 📦 导入依赖
# =========================================
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from torch import nn
from torch.utils.data import Dataset, DataLoader


# =========================================
# ⚙️ 0️⃣ 全局参数配置
# =========================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ 使用设备: {device}")

weights = {
    "Dry_Green_g": 0.1,
    "Dry_Dead_g": 0.1,
    "Dry_Clover_g": 0.1,
    "GDM_g": 0.2,
    "Dry_Total_g": 0.5
}
# 网络与训练参数
n_splits = 5
epochs = 150
batch_size = 32
lr = 1e-3
early_stop_patience = 15
save_dir = "./nn_models"
os.makedirs(save_dir, exist_ok=True)


# =========================================
# 🧩 1️⃣ 数据准备
# =========================================
# ⚠️ df_merged 已包含: ['ID', 'Sampling_Date', 'State', 'Species', ...] + target_cols


target_cols = ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]

exclude_cols = ['ID', 'image_path', 'my_file_path', 'Sampling_Date', 'State', 'Species'] + target_cols
feature_cols = [c for c in df_merged.columns if c not in exclude_cols]

X = df_merged[feature_cols].astype(np.float32)
y = df_merged[target_cols].astype(np.float32)

print(f"✅ 特征维度: {X.shape}, 目标维度: {y.shape}")


# =========================================
# 🧮 2️⃣ Weighted R² 函数
# =========================================
def weighted_r2(y_true, y_pred, weights):
    r2_dict = {}
    for i, col in enumerate(weights.keys()):
        r2_dict[col] = r2_score(y_true[:, i], y_pred[:, i])
    weighted_score = sum(r2_dict[k] * w for k, w in weights.items())
    return weighted_score, r2_dict


# =========================================
# 🧠 3️⃣ 自定义 Dataset
# =========================================
class PastureDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        else:
            return self.X[idx], self.y[idx]


# =========================================
# 🧩 4️⃣ 定义多层感知机模型 (MLP)
# =========================================
class MLPRegressor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(256, 128),
            nn.ReLU(),

            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        return self.net(x)


# =========================================
# 🔁 5️⃣ KFold 交叉验证训练
# =========================================
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
oof_preds = np.zeros_like(y)
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\n=================== Fold {fold+1}/{n_splits} ===================")

    # 数据划分
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_ds = PastureDataset(X_train, y_train)
    val_ds = PastureDataset(X_val, y_val)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0)

    # 初始化模型
    model = MLPRegressor(input_dim=X.shape[1], output_dim=len(target_cols)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    best_val_loss = np.inf
    best_epoch = 0
    patience_counter = 0

    # ====== 训练循环 ======
    for epoch in range(epochs):
        model.train()
        train_losses = []

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        # ===== 验证 =====
        model.eval()
        val_losses = []
        preds_list, true_list = [], []

        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                val_losses.append(criterion(preds, yb).item())
                preds_list.append(preds.cpu().numpy())
                true_list.append(yb.cpu().numpy())

        val_loss = np.mean(val_losses)
        print(f"Epoch {epoch+1}/{epochs} | val_loss: {val_loss:.5f}")

        # ===== 早停机制 =====
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            patience_counter = 0
            torch.save(model.state_dict(), os.path.join(save_dir, f"fold{fold+1}_best.pt"))
        else:
            patience_counter += 1
            if patience_counter >= early_stop_patience:
                print(f"⏸️ 早停于第 {epoch+1} 轮 (最佳: {best_epoch+1}, val_loss={best_val_loss:.5f})")
                break

    # ===== 加载最佳模型并预测验证集 =====
    model.load_state_dict(torch.load(os.path.join(save_dir, f"fold{fold+1}_best.pt")))
    model.eval()
    with torch.no_grad():
        val_preds = model(torch.tensor(X_val.values, dtype=torch.float32).to(device)).cpu().numpy()

    oof_preds[val_idx] = val_preds

    # ===== 计算当前折的 R² =====
    weighted_score, detail = weighted_r2(y_val.values, val_preds, weights)
    fold_scores.append(weighted_score)

    print(f"🎯 Fold {fold+1} Weighted R²: {weighted_score:.4f}")
    for k, v in detail.items():
        print(f"   {k}: {v:.4f}")


# =========================================
# 📈 6️⃣ 全部折的综合评估
# =========================================
overall_score, overall_detail = weighted_r2(y.values, oof_preds, weights)
print("\n=================== 最终结果 ===================")
print(f"平均 Fold Weighted R²: {np.mean(fold_scores):.4f}")
print(f"整体 OOF Weighted R²: {overall_score:.4f}")
for k, v in overall_detail.items():
    print(f"  {k}: {v:.4f}")

# 保存预测结果
df_pred = pd.DataFrame(oof_preds, columns=target_cols)
df_pred["ID"] = df_merged["ID"]
df_pred.to_csv("nn_oof_predictions.csv", index=False)
print("\n✅ 已保存 OOF 预测结果到 nn_oof_predictions.csv")


✅ 使用设备: cuda
✅ 特征维度: (357, 4354), 目标维度: (357, 5)

Epoch 1/150 | val_loss: 913.23733
Epoch 2/150 | val_loss: 721.94633
Epoch 3/150 | val_loss: 465.04848
Epoch 4/150 | val_loss: 401.18000
Epoch 5/150 | val_loss: 272.13526
Epoch 6/150 | val_loss: 220.03103
Epoch 7/150 | val_loss: 165.04851
Epoch 8/150 | val_loss: 150.93578
Epoch 9/150 | val_loss: 140.33591
Epoch 10/150 | val_loss: 150.60328
Epoch 11/150 | val_loss: 126.39777
Epoch 12/150 | val_loss: 122.14347
Epoch 13/150 | val_loss: 112.30439
Epoch 14/150 | val_loss: 102.55896
Epoch 15/150 | val_loss: 104.96118
Epoch 16/150 | val_loss: 91.26711
Epoch 17/150 | val_loss: 88.75067
Epoch 18/150 | val_loss: 87.85561
Epoch 19/150 | val_loss: 93.68116
Epoch 20/150 | val_loss: 102.09514
Epoch 21/150 | val_loss: 93.44430
Epoch 22/150 | val_loss: 95.35359
Epoch 23/150 | val_loss: 89.92953
Epoch 24/150 | val_loss: 91.57091
Epoch 25/150 | val_loss: 98.51040
Epoch 26/150 | val_loss: 104.06390
Epoch 27/150 | val_loss: 86.18943
Epoch 28/150 | val_loss: