#### 组合优化-多目标优化————采选策略

<h5> 1.创建数据

In [5]:
import random
import numpy as np

def generate_feature_stats(num_points=100):
    """生成特征统计字典模板，包含分布数据"""
    mean = random.uniform(0, 1)
    std = random.uniform(0, 0.5)
    min_val = random.uniform(0, 0.5)
    max_val = random.uniform(0.5, 1)
    
    # 生成符合正态分布的随机数据（截断到[min_val, max_val]范围内）
    distribution = np.random.normal(mean, std, num_points)
    distribution = np.clip(distribution, min_val, max_val).tolist()
    
    return {
        'mean': mean,
        'std': std,
        'min': min_val,
        'max': max_val,
        'distribution': distribution
    }

def generate_batch_stats(num_points=100):
    """生成批次统计信息，包含分布数据"""
    mean = random.randint(100, 300)
    std = random.uniform(50, 100)
    min_val = random.randint(20, 50)
    max_val = random.randint(400, 600)
    
    # 生成符合正态分布的随机数据（截断到[min_val, max_val]范围内）
    distribution = np.random.normal(mean, std, num_points)
    distribution = np.clip(distribution, min_val, max_val).tolist()
    
    return {
        'mean': mean,
        'std': std,
        'min': min_val,
        'max': max_val,
        'distribution': distribution
    }

def build_train_stats():
    """构建训练集统计信息"""
    return {
        'features': {
            'ssc_score': generate_feature_stats(),
            'occlusion_score': generate_feature_stats(),
            'crown_count_score': generate_feature_stats(),
            'diversity_score': generate_feature_stats(),
            'area_var_score': generate_feature_stats(),
            'density_var_score': generate_feature_stats()
        },
        'batch_statistics': generate_batch_stats()
    }

def build_unlabeled_stats(num_images=100):
    """构建未标注池统计信息"""
    # 生成图片映射关系
    image_names = [f"image{str(i).zfill(3)}.jpg" for i in range(num_images)]
    
    return {
        'features': {
            'ssc_score': generate_feature_stats(),
            'occlusion_score': generate_feature_stats(),
            'crown_count_score': generate_feature_stats(),
            'diversity_score': generate_feature_stats(),
            'area_var_score': generate_feature_stats(),
            'density_var_score': generate_feature_stats()
        },
        'image_mapping': {
            'image_names': image_names,
            'indices': {name: idx for idx, name in enumerate(image_names)}
        }
    }

# 使用示例 ----------------------
if __name__ == "__main__":
    # 生成训练集统计信息
    train_data = build_train_stats()
    
    # 生成未标注池统计信息（假设包含100张图片）
    unlabeled_data = build_unlabeled_stats(num_images=100)
    
    # 查看结构
    print("=== Train Stats ===")
    print(train_data['features']['ssc_score'].keys())  # 检查一个特征的键
    
    print("\n=== Unlabeled Stats ===")
    print(unlabeled_data['features']['ssc_score'].keys())  # 检查一个特征的键
    
    # 打印一个特征的分布数据示例
    print("\n=== Example Distribution ===")
    print(f"Length of distribution: {len(train_data['features']['ssc_score']['distribution'])}")
    print(f"First 5 values: {train_data['features']['ssc_score']['distribution'][:5]}")

=== Train Stats ===
dict_keys(['mean', 'std', 'min', 'max', 'distribution'])

=== Unlabeled Stats ===
dict_keys(['mean', 'std', 'min', 'max', 'distribution'])

=== Example Distribution ===
Length of distribution: 100
First 5 values: [0.5219020058988002, 0.5219020058988002, 0.35369467660522885, 0.35369467660522885, 0.5219020058988002]


In [6]:
train_data

{'features': {'ssc_score': {'mean': 0.6015779086148545,
   'std': 0.4837145105410226,
   'min': 0.35369467660522885,
   'max': 0.5219020058988002,
   'distribution': [0.5219020058988002,
    0.5219020058988002,
    0.35369467660522885,
    0.35369467660522885,
    0.5219020058988002,
    0.5219020058988002,
    0.35369467660522885,
    0.5219020058988002,
    0.5083530568514337,
    0.5219020058988002,
    0.5219020058988002,
    0.5219020058988002,
    0.5219020058988002,
    0.5219020058988002,
    0.5219020058988002,
    0.35369467660522885,
    0.5219020058988002,
    0.5219020058988002,
    0.35369467660522885,
    0.5219020058988002,
    0.35369467660522885,
    0.35369467660522885,
    0.35369467660522885,
    0.5219020058988002,
    0.5219020058988002,
    0.35369467660522885,
    0.5219020058988002,
    0.5219020058988002,
    0.44103670895132496,
    0.5219020058988002,
    0.4415922505154477,
    0.35369467660522885,
    0.5219020058988002,
    0.5219020058988002,
    0.3536

In [7]:
unlabeled_data

{'features': {'ssc_score': {'mean': 0.4431163232809183,
   'std': 0.30989690844568696,
   'min': 0.026560484188134392,
   'max': 0.8102752020289621,
   'distribution': [0.7557244975504172,
    0.5473350678891128,
    0.28891571716803255,
    0.7692137153666828,
    0.7419180685276283,
    0.48911560608645355,
    0.4454711594801215,
    0.22646000042367795,
    0.18010722916387994,
    0.5861623680335799,
    0.5382165327449027,
    0.7521374453817198,
    0.4188511665362323,
    0.16116128297644072,
    0.13072119925977732,
    0.5488470736209836,
    0.500673533570888,
    0.6922279680377822,
    0.8102752020289621,
    0.1559662728268778,
    0.5336345297085888,
    0.33788066432877006,
    0.179489287310589,
    0.8102752020289621,
    0.34032287957505125,
    0.44352173245462195,
    0.3304813307289298,
    0.42817218694692005,
    0.6659115681973231,
    0.7015463661895718,
    0.8102752020289621,
    0.5079969478924097,
    0.027576770888232793,
    0.6049094521142914,
    0.427

<h4>方案 1：组合优化（PuLP库）

In [12]:
from pulp import LpProblem, LpMaximize, LpVariable, lpSum
import numpy as np

def combinatorial_selection(train_stats, unlabeled_stats, select_num=5, 
                          feature_weights=None, ssc_weight=1.0):
    """
    改进版组合优化选择器 - 同时考虑：
    1. 最大化未标注样本的ssc_score
    2. 最小化与训练集的特征分布差异
    
    Args:
        train_stats: 训练集统计信息
        unlabeled_stats: 未标注池统计信息
        select_num: 选择数量
        feature_weights: 特征权重字典，格式如 {'occlusion_score': 0.5, ...}
        ssc_weight: ssc_score在目标函数中的权重
    """
    # 数据准备
    ssc_scores = np.array(unlabeled_stats['features']['ssc_score']['distribution'])
    feature_names = ['occlusion_score', 'crown_count_score', 
                    'diversity_score', 'area_var_score', 
                    'density_var_score']
    
    # 创建优化问题
    prob = LpProblem("EnhancedActiveLearning", LpMaximize)
    
    # 定义决策变量
    n_samples = len(ssc_scores)
    x = [LpVariable(f"x_{i}", cat='Binary') for i in range(n_samples)]
    
    # ========== 目标函数 ==========
    # 1. SSC得分部分
    ssc_part = lpSum([x[i] * ssc_scores[i] for i in range(n_samples)])
    
    # 2. 特征分布匹配部分（关键改进）
    distribution_penalty = 0
    for f_name in feature_names:
        # 训练集特征均值和标准差
        train_mean = train_stats['features'][f_name]['mean']
        train_std = train_stats['features'][f_name]['std'] + 1e-6  # 避免除零
        
        # 未标注池特征值
        unlabeled_values = np.array(unlabeled_stats['features'][f_name]['distribution'])
        
        # 标准化后的差异（Mahalanobis距离思想）
        normalized_diff = [(unlabeled_values[i] - train_mean)/train_std 
                          for i in range(n_samples)]
        
        # 加权差异惩罚
        weight = feature_weights.get(f_name, 0.2)  # 默认权重0.2
        distribution_penalty += weight * lpSum([x[i] * abs(normalized_diff[i]) 
                                              for i in range(n_samples)])
    
    # 综合目标函数
    prob += ssc_weight * ssc_part - (1 - ssc_weight) * distribution_penalty
    
    # ========== 约束条件 ==========
    # 1. 选择数量约束
    prob += lpSum(x) == select_num
    
    # 2. 可选：确保至少选择每个特征的一定比例
    # for f_name in feature_names:
    #     prob += lpSum([x[i] * unlabeled_stats['features'][f_name]['distribution'][i] 
    #                   for i in range(n_samples)]) >= min_requirement[f_name]
    
    # 求解问题
    prob.solve()
    
    # 结果解析
    selected_indices = [i for i in range(n_samples) if x[i].value() == 1]
    selected_names = [unlabeled_stats['image_mapping']['image_names'][i] 
                     for i in selected_indices]
    
    # 生成选择报告
    report = {
        'selected_count': len(selected_indices),
        'avg_ssc': np.mean(ssc_scores[selected_indices]),
        'feature_changes': {}
    }
    
    for f_name in feature_names:
        original_mean = train_stats['features'][f_name]['mean']
        new_mean = np.mean([unlabeled_stats['features'][f_name]['distribution'][i] 
                          for i in selected_indices])
        report['feature_changes'][f_name] = {
            'change': new_mean - original_mean,
            'change_pct': (new_mean - original_mean) / (original_mean + 1e-6) * 100
        }
    
    return selected_names, report

# 使用示例
if __name__ == "__main__":
    # 假设已经加载了train_stats和unlabeled_stats
    # feature_weights = {
    #     'occlusion_score': 1,
    #     'crown_count_score': 1,
    #     'diversity_score': 1,
    #     'area_var_score': 1,
    #     'density_var_score': 1
    # }
    feature_weights = {
        'occlusion_score': 0.3,    # 遮挡分数
        'crown_count_score': 0.25, # 树冠数量分数
        'diversity_score': 0.2,    # 多样性分数
        'area_var_score': 0.15,    # 区域变异分数
        'density_var_score': 0.1   # 密度变异分数
    }
    selected_samples, report = combinatorial_selection(
        train_stats=train_data,
        unlabeled_stats=unlabeled_data,
        select_num=10,
        feature_weights=feature_weights,
        ssc_weight=0.7  # 70%权重给ssc_score
    )
    
    print(f"选中的样本: {selected_samples}")
    print("\n选择报告:")
    print(f"选中样本数量: {report['selected_count']}")
    print(f"平均SSC得分: {report['avg_ssc']:.2f}")
    print("\n特征变化情况:")
    for metric, data in report['feature_changes'].items():
        print(f"{metric}: 变化量 {data['change']:.4f} (变化百分比 {data['change_pct']:.2f}%)")

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /home/a6000/miniconda3/envs/sual/lib/python3.9/site-packages/pulp/apis/../solverdir/cbc/linux/i64/cbc /tmp/14a2c6d5be884e308cd3ba90e5eb363c-pulp.mps -max -timeMode elapsed -branch -printingOptions all -solution /tmp/14a2c6d5be884e308cd3ba90e5eb363c-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 6 COLUMNS
At line 407 RHS
At line 409 BOUNDS
At line 510 ENDATA
Problem MODEL has 1 rows, 100 columns and 100 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 1.25806 - 0.00 seconds
Cgl0004I processed model has 1 rows, 100 columns (100 integer (100 of which binary)) and 100 elements
Cbc0038I Initial state - 0 integers unsatisfied sum - 0
Cbc0038I Solution found of -1.25806
Cbc0038I Before mini branch and bound, 100 integers at bound fixed and 0 continuous
Cbc0038I Mini branch and bound did not improv

1. 方差分数的本质

    方差（Variance）：衡量数据的离散程度，值越大表示数据分布越不均匀（差异越大）。

    在树冠检测中的意义：

        area_var_score：树冠面积的方差（如大/小树冠的混合程度）。

        density_var_score：树冠密度的方差（如稀疏/密集区域的分布差异）。

2. 正变化（如 +24.42%, +60.78%）
含义

选中样本的方差 高于 原始未标注池的平均水平。
可能原因

    主动选择了极端样本（如极大/极小树冠、极密/极疏区域）。

    数据集原本分布较均匀，但选择策略故意引入更多变异。

应用意义

    正向作用：

        提升模型对极端情况的识别能力（如检测异常树冠）。

        增强对复杂场景的泛化性（如森林中不同密度区域）。

    潜在风险：

        过度关注变异可能导致模型忽略常见模式。

示例

若 density_var_score 增加60.78%，说明选中的样本包含：

    更多密集树冠（如热带雨林）和稀疏树冠（如干旱地区）的混合。

3. 负变化（如未出现，但假设为 -X%）
含义

选中样本的方差 低于 原始未标注池的平均水平。
可能原因

    主动选择了分布均匀的样本（如树冠大小/密度相近的区域）。

    数据集原本差异大，但选择策略倾向于平滑分布。

应用意义

    正向作用：

        减少模型对极端值的敏感度，提高稳定性。

        更适合训练基础特征（如平均树冠大小）。

    潜在风险：

        模型可能无法处理真实场景中的复杂变异。

示例

若 area_var_score 降低，说明选中的样本中：

    树冠面积接近平均值（如人工林），缺乏超大/超小树冠。

4. 与其他指标的关联分析

报告中其他指标的变化需结合方差分数综合理解：

    crown_count_score 大幅增加（+271.47%）：

        可能选择了树冠数量多且空间分布极不均匀的区域（如森林边缘+内部混合）。

    diversity_score 降低（-53.87%）：

        虽然方差增加，但树种/形态多样性可能下降（如选择同一树种的不同密度区域）。

5. 实际应用建议

    若需检测复杂场景：

        接受方差正变化，但需监控模型对均匀区域的性能。

    若需通用模型：

        平衡方差变化（如通过加权目标函数控制方差与其他指标的权衡）。

<h4> 方案 2：多目标优化（Pymoo库）

In [None]:
from pymoo.core.problem import ElementwiseProblem
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.optimize import minimize

class MultiObjectiveProblem(ElementwiseProblem):
    def __init__(self, ssc_scores, features, select_num):
        self.ssc = ssc_scores
        self.features = features
        self.select_num = select_num
        n_var = len(ssc_scores)
        super().__init__(n_var=n_var, n_obj=2, n_constr=1, xl=0, xu=1)

    def _evaluate(self, x, out, *args, **kwargs):
        # 目标1: 最大化 ssc_score
        obj1 = -np.sum(x * self.ssc)  # 负号转为最小化
        
        # 目标2: 最小化特征方差
        var_penalty = 0
        for f in self.features.values():
            selected = f[x > 0.5]
            var_penalty += np.var(selected)
        obj2 = var_penalty
        
        # 约束：必须选择指定数量
        count = np.sum(x > 0.5)
        out["F"] = [obj1, obj2]
        out["G"] = [abs(count - self.select_num)]

problem = MultiObjectiveProblem(ssc_scores, features, select_num=5)
algorithm = NSGA2(pop_size=50)
res = minimize(problem, algorithm, ('n_gen', 100))