In [None]:
from mlutils import *
import shap
from tqdm import tqdm

In [2]:
# Load the test and train splits through x_y_split, handing each call the
# persisted scaler read from scaler_model_path.
# NOTE(review): x_y_split, joblib and the *_path names are presumably provided
# by the `from mlutils import *` star-import — confirm.
# NOTE(review): the scaler file is deserialized twice; if x_y_split does not
# modify the scaler it receives, one joblib.load could be shared — confirm.
test_x,test_y=x_y_split(test_data_path, scaler=joblib.load(scaler_model_path))
train_x,train_y=x_y_split(train_data_path, scaler=joblib.load(scaler_model_path))

In [3]:
# Compute the 4th percentile of each column, intended to make the final unqualified ratio about 10%
# A, B, C = np.percentile(test_y, 4, axis=0)

# print(f"A: {A}, B: {B}, C: {C}")

# # Find the unqualified samples
# unqualified_samples = np.sum(test_y < [A, B, C], axis=1) 
# # Get the indices of the unqualified samples
# unqualified_index = np.where(unqualified_samples > 0)

# unqualified_ratio = np.mean(unqualified_samples)

# print(f"不合格样本占比: {unqualified_ratio}")

In [4]:
class Config:
    """Tunable constants for the SHAP-guided Bayesian optimization workflow."""

    # Performance targets (adjust as required): the minimum acceptable value
    # for each of the three performance metrics.
    TARGETS = np.array([134.0, 278.0, 22.5])

    # --- SHAP settings ---
    SAMPLE_RATIO = 0.1  # fraction of the data sampled as SHAP background
    N_SUMMARY = 100     # size the SHAP background data is compressed to
    TOP_K = 5           # number of key parameters selected per sample

    # --- Bayesian optimization settings ---
    N_INIT = 5          # number of initial random probes
    N_ITER = 20         # number of optimization iterations
    PARAM_BUFFER = 0.1  # parameter-range buffer ratio (historical data range +/-10%)

In [5]:
# Identify unqualified samples
def find_unqualified_samples(y, targets):
    """Return the row indices of samples that miss at least one target.

    A row is unqualified when any of its values is strictly below the
    corresponding entry of ``targets``.
    """
    below_target = y < targets
    row_fails = np.any(below_target, axis=1)
    return np.nonzero(row_fails)[0]

In [6]:
model_name='Random Forest'
# Load the persisted estimator for this model name; this is the object every
# later cell uses as `model`. (A preceding `models[model_name]` lookup was a
# dead store, overwritten immediately, so it is not performed.)
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
model=joblib.load(pre_model_path + model_name + '.pkl')

In [7]:
# Draw a random subset of train_x rows (Config.SAMPLE_RATIO of them, at least one).
# A dedicated seeded generator keeps the subset — and everything derived from
# it (SHAP background, parameter bounds) — reproducible across kernel restarts.
_background_rng = np.random.default_rng(0)
_n_background = max(1, int(train_x.shape[0] * Config.SAMPLE_RATIO))
train_x_sample = train_x[_background_rng.choice(train_x.shape[0], _n_background, replace=False)]
# Compress the subset to at most Config.N_SUMMARY background points for SHAP
train_x_sample_summary = shap.sample(train_x_sample, Config.N_SUMMARY)
# Build a SHAP Kernel explainer around the model's predict function
explainer = shap.KernelExplainer(model.predict, train_x_sample_summary)
def cal_shap_values(x):
    """Compute SHAP values for ``x`` using the module-level ``explainer``."""
    shap_values = explainer.shap_values(x)
    return shap_values
# Compute the SHAP values for the first test-set sample
# shap_values=cal_shap_values(test_x[0])
# print(shap_values.shape)  
# top_indices = np.argsort(np.abs(shap_values).mean(1))[::-1][:Config.TOP_K]
# print(top_indices)

In [8]:
def generate_param_bounds(sample, X_data, top_indices, buffer_ratio):
    """Build the search interval for each parameter selected for optimization.

    For every index in ``top_indices`` the interval is the window
    ``current value +/- buffer_ratio * (historical max - historical min)``,
    clipped to the historical ``[min, max]`` of that column in ``X_data``.

    If the current value lies so far outside the historical range that the
    window does not intersect it, the window is snapped to the nearest edge
    of the historical range, so the lower bound never exceeds the upper bound.

    Args:
        sample: 1-D array holding the current sample.
        X_data: 2-D array of historical data (rows are samples).
        top_indices: iterable of column indices to optimize.
        buffer_ratio: half-width of the window as a fraction of the
            historical range of each column.

    Returns:
        dict mapping ``'x<index>'`` to a ``(min_val, max_val)`` tuple.
    """
    bounds = {}
    for idx in top_indices:
        # Historical range of this column (treated as the process limits)
        column = X_data[:, idx]
        global_min = column.min()
        global_max = column.max()

        # Current value of the parameter in the sample being optimized
        current_val = sample[idx]

        # Window around the current value, clipped to the historical range
        buffer_range = (global_max - global_min) * buffer_ratio
        min_val = max(global_min, current_val - buffer_range)
        max_val = min(global_max, current_val + buffer_range)

        if min_val > max_val:
            # The window misses the historical range entirely; fall back to a
            # window of the same width anchored at the nearest range edge.
            if current_val > global_max:
                min_val = max(global_min, global_max - buffer_range)
                max_val = global_max
            else:
                min_val = global_min
                max_val = min(global_max, global_min + buffer_range)

        bounds[f'x{idx}'] = (min_val, max_val)
    return bounds

In [9]:
from bayes_opt import BayesianOptimization
def _apply_params(sample, params):
    """Return a copy of ``sample`` with the entries named in ``params`` replaced.

    Keys follow the ``'x<index>'`` naming used for the optimizer bounds, so the
    integer after the leading character is the position to overwrite.
    """
    x_new = sample.copy()
    for key, val in params.items():
        x_new[int(key[1:])] = val
    return x_new


def bayesian_optimization(sample, model, X_data, buffer_ratio=None):
    """Tune the most influential parameters of one sample via Bayesian optimization.

    The Config.TOP_K parameters with the largest mean absolute SHAP value are
    varied within bounds produced by generate_param_bounds; the objective is
    the smallest margin between the model prediction and Config.TARGETS, so a
    non-negative optimum means every target is met.

    Args:
        sample: 1-D array, the sample to optimize.
        model: estimator exposing ``predict``.
        X_data: historical data used to derive the parameter bounds.
        buffer_ratio: window half-width passed to generate_param_bounds;
            defaults to Config.PARAM_BUFFER when omitted.

    Returns:
        ``(x_optimized, status, top_indices)`` where ``status`` is
        ``"success"`` or ``"failed"`` and ``x_optimized`` is ``None`` on failure.
    """
    if buffer_ratio is None:
        buffer_ratio = Config.PARAM_BUFFER

    # SHAP values of this single sample
    shap_values = cal_shap_values(sample)
    # Rank features by mean absolute SHAP value and keep the top Config.TOP_K.
    # mean(1) averages over the target columns; per the original author's note
    # shap_values has shape (n_features, 3) here — confirm for the installed
    # shap version.
    top_indices = np.argsort(np.abs(shap_values).mean(1))[::-1][:Config.TOP_K]

    # Search interval for each selected parameter
    bounds = generate_param_bounds(sample, X_data, top_indices, buffer_ratio)

    def optimization_function(**params):
        """Worst-case margin between predicted performance and the targets."""
        x_new = _apply_params(sample, params)
        y_pred = model.predict(x_new.reshape(1, -1))[0]
        diffs = y_pred - Config.TARGETS
        return min(diffs)  # maximizing this pushes the weakest metric to >= 0

    optimizer = BayesianOptimization(
        f=optimization_function,
        pbounds=bounds,
        verbose=0,
        random_state=1,
    )
    optimizer.maximize(init_points=Config.N_INIT, n_iter=Config.N_ITER)

    best = optimizer.max
    if best['target'] >= 0:
        # Every target is met at the best point found
        x_optimized = _apply_params(sample, best['params'])
        status = "success"
    else:
        # No probed point satisfies all targets
        x_optimized = None
        status = "failed"
    return x_optimized, status, top_indices
    


In [10]:
# # Optimize a single unqualified sample
# unoptimized_sample = test_x[13]
# optimized_sample, status,top_indices = bayesian_optimization(unoptimized_sample, model,train_x)
# print(f"优化状态: {status}")
# optimized_pred = model.predict(optimized_sample.reshape(1, -1))[0]
# print(f"优化前性能: {model.predict(unoptimized_sample.reshape(1, -1))[0]}")
# print(f"优化后性能: {optimized_pred}")

In [11]:
def _param_change(p_idx, orig_val, opt_val):
    """Describe how one parameter moved, as a record for the results table."""
    if orig_val != 0:
        change_pct = (opt_val - orig_val) / orig_val * 100
    else:
        # A relative change from zero is undefined; report NaN rather than
        # dividing by zero.
        change_pct = float('nan')
    return {
        'param_index': p_idx,
        'original': orig_val,
        'optimized': opt_val,
        'change_pct': change_pct
    }


def optimize_many(output_path="optimization_results.csv"):
    """Optimize every unqualified test sample and collect the outcomes.

    Reads the module-level ``test_x``, ``test_y``, ``model`` and
    ``train_x_sample``. Each sample whose measured performance misses a
    target in Config.TARGETS is passed to bayesian_optimization; the outcome
    is recorded, the success rate is printed and the table is written to CSV.

    Args:
        output_path: destination of the CSV export.

    Returns:
        pandas.DataFrame with one row per unqualified sample.
    """
    columns = [
        'sample_id', 'status', 'original_performance',
        'optimized_performance', 'param_changes', 'top_params',
    ]

    # Samples failing at least one target
    unqualified_index = find_unqualified_samples(test_y, Config.TARGETS)

    results = []
    for idx in tqdm(unqualified_index, desc="Processing samples"):
        original_sample = test_x[idx]
        original_perf = test_y[idx]

        # NOTE(review): bounds are derived from the sampled subset
        # train_x_sample, whereas the commented-out single-sample example
        # passes the full train_x — confirm which is intended.
        optimized_sample, status, top_indices = bayesian_optimization(
            original_sample, model, train_x_sample
        )

        # Failed runs keep None for the optimized fields
        optimized_perf = None
        param_changes = None
        if status == "success":
            optimized_perf = model.predict(optimized_sample.reshape(1, -1))[0]
            param_changes = [
                _param_change(p_idx, original_sample[p_idx], optimized_sample[p_idx])
                for p_idx in top_indices
            ]

        results.append({
            'sample_id': idx,
            'status': status,
            'original_performance': original_perf,
            'optimized_performance': optimized_perf,
            'param_changes': param_changes,
            'top_params': top_indices.tolist()
        })

    # Explicit columns keep the frame well-formed when nothing was unqualified
    df_results = pd.DataFrame(results, columns=columns)

    # Summary statistics
    success_rate = (df_results['status'] == 'success').mean()
    print(f"\n优化完成！成功率：{success_rate:.1%}")

    # Persist the table (nested values are written as their string repr)
    df_results.to_csv(output_path, index=False)

    return df_results

In [12]:
# Run the optimization over every unqualified test sample. This is a
# long-running cell: the recorded run below took about 55 minutes for 408 samples.
optimize_many()

Processing samples: 100%|██████████| 408/408 [55:40<00:00,  8.19s/it]


优化完成！成功率：98.3%





Unnamed: 0,sample_id,status,original_performance,optimized_performance,param_changes,top_params
0,4,success,"[471.0, 570.0, 21.5]","[462.39, 564.07, 25.625]","[{'param_index': 6, 'original': 0.8, 'optimize...","[6, 2, 0, 1, 5]"
1,13,success,"[458.0, 562.0, 19.0]","[443.87, 538.24, 30.41]","[{'param_index': 6, 'original': 0.6909091, 'op...","[6, 0, 2, 1, 5]"
2,15,success,"[133.0, 275.0, 46.5]","[160.62, 324.25, 45.895]","[{'param_index': 2, 'original': 0.01, 'optimiz...","[2, 0, 5, 1, 12]"
3,27,success,"[131.0, 280.0, 47.5]","[160.75, 325.39, 45.685]","[{'param_index': 2, 'original': 0.015000002, '...","[2, 0, 5, 1, 12]"
4,38,success,"[129.0, 279.0, 46.0]","[161.59, 325.0, 45.505]","[{'param_index': 2, 'original': 0.015000002, '...","[2, 0, 5, 1, 12]"
...,...,...,...,...,...,...
403,4401,success,"[471.0, 814.0, 21.5]","[464.9, 688.85, 28.16333333333333]","[{'param_index': 2, 'original': 0.87, 'optimiz...","[2, 0, 1, 3, 5]"
404,4404,success,"[133.0, 279.0, 50.0]","[159.85, 325.57, 47.09]","[{'param_index': 2, 'original': 0.020000001, '...","[2, 0, 5, 1, 12]"
405,4405,success,"[136.0, 272.0, 50.0]","[161.3, 323.81, 48.485]","[{'param_index': 2, 'original': 0.015000002, '...","[2, 0, 5, 1, 12]"
406,4407,success,"[481.0, 579.0, 20.0]","[464.52, 556.27, 23.02]","[{'param_index': 2, 'original': 0.47, 'optimiz...","[2, 6, 0, 1, 12]"
