In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import pandas as pd
from deap import base, creator, tools
import random

input_file = 'data/TN_NH3_N2O' # 输入数据文件夹
# target = ['TN loss (%)', 'NH3-N loss (%)', 'N2O-N loss (%)']
target = 'N2O-N loss (%)' # 要预测啥就换个名字
data_path = f'{input_file}/data_for_{target}.csv'
# 以LGB举例
model_path = f'output/TN_NH3_N2O/model_{target}/lgb_pred.csv'

In [None]:
data_all_ef = pd.read_csv(data_path)
X_train = data_all_ef.iloc[:, :-1]
y_train = data_all_ef.iloc[:, -1]

In [None]:
# 数据集
X = X_train
y = y_train

# 定义优化目标（最小化问题）
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))  # 创建了一个新的适应度类对象，名为 "FitnessMin"，用于表示一个最小化的优化问题。weights=(-1.0,) 指定了适应度的权重，这里 -1.0 表示我们希望最小化优化目标，因此设置为负数。
creator.create("Individual", list, fitness=creator.FitnessMin)  # fitness=creator.FitnessMin 指定了个体的适应度评价方式，即使用前面定义的最小化的适应度类 "FitnessMin"。

# 初始化遗传算法工具箱
toolbox = base.Toolbox()

# 注册个体和种群的创建函数
toolbox.register("attr_float", random.uniform, 0, 1) # 这行代码注册了一个名为 "attr_float" 的工具，用于生成一个在区间 [0, 1] 内的随机浮点数。

# 这行代码注册了一个名为 "individual" 的工具，用于创建一个个体。tools.initRepeat 是 DEAP 库中的一个函数，用于初始化一个重复的数据结构。
# creator.Individual 指定了要创建的个体的类型，即之前定义的 "Individual" 类。
# toolbox.attr_float 是用于生成个体的基因值的函数，即上面注册的 "attr_float" 工具。
# n=5 指定了个体中基因的数量，这里是5个，对应于需要优化的5个参数。
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, n=5) 

# 这行代码注册了一个名为 "population" 的工具，用于创建一个种群。list 指定了种群的基本类型，即种群是一个列表（list）。toolbox.individual 是用于生成个体的函数，即上面注册的 "individual" 工具。种群中的每个个体都是通过该函数生成的。
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# 定义评价函数
def evaluate(individual):
    # 根据个体选择相应的特征
    features = [X[:, i] for i, bit in enumerate(individual) if bit > 0.5]
    if len(features) == 0:
        return float('inf'),  # 避免除以零错误
    # 计算随机森林模型的预测值
    '''
    这里需要修改
    '''
    # 这里用随机值代替
    prediction = np.random.rand(100)
    # 计算均方误差（MSE）
    mse = np.mean((prediction - y) ** 2)
    return mse,

# 注册评价函数
toolbox.register("evaluate", evaluate)

# 注册选择、交叉和变异算子
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# 设置遗传算法参数
pop_size = 100
num_generations = 50
cxpb = 0.7
mutpb = 0.2

# 创建初始种群
pop = toolbox.population(n=pop_size)

# 迭代进化
for gen in range(num_generations):
    # 评价种群中的个体
    fitnesses = map(toolbox.evaluate, pop)
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit

    # 选择下一代个体
    offspring = toolbox.select(pop, len(pop))
    # 克隆选中的个体
    offspring = list(map(toolbox.clone, offspring))

    # 对选中的个体进行交叉和变异
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < cxpb:
            toolbox.mate(child1, child2)
            del child1.fitness.values
            del child2.fitness.values

    for mutant in offspring:
        if random.random() < mutpb:
            toolbox.mutate(mutant)
            del mutant.fitness.values

    # 替换当前种群
    pop[:] = offspring

# 找到最优个体
best_ind = tools.selBest(pop, 1)[0]
best_features = [X[:, i] for i, bit in enumerate(best_ind) if bit > 0.5]
print("Best individual:", best_features)
print("MSE:", evaluate(best_ind))


找最优模型

In [11]:
import json
import os

# 文件路径
file_paths = [
    "output/TN_NH3_N2O/result_mse_N2O-N loss (%).json",
    "output/TN_NH3_N2O/result_mse_NH3-N loss (%).json",
    "output/TN_NH3_N2O/result_mse_TN loss (%).json"
]

min_key_set = set()
min_value = float('inf')  # 初始值为正无穷大

# 遍历每个文件
for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
        # 查找最小值的键
        for key, value in data.items():
            if value < min_value:
                min_key_set = {key}
                min_value = value
            elif value == min_value:
                min_key_set.add(key)

# 打印最小值的键
print("最小值的键:", min_key_set)


最小值的键: {'CatRegression(k)'}


加载相应的模型

In [14]:
import os
import pickle

# 模型文件路径
model_files = [
    "output\TN_NH3_N2O\model_N2O-N loss (%)\ctb_model.pkl",
    "output\TN_NH3_N2O\model_NH3-N loss (%)\ctb_model.pkl",
    "output\TN_NH3_N2O\model_TN loss (%)\ctb_model.pkl"
]

loaded_models = {}  # 用于存储加载的模型

# 遍历每个模型文件
for model_file in model_files:
    # 获取模型名称
    model_name = os.path.basename(os.path.dirname(model_file)) + "_" + os.path.splitext(os.path.basename(model_file))[0]
    
    # 加载模型
    with open(model_file, 'rb') as file:
        loaded_model = pickle.load(file)
    
    # 将加载的模型存储在字典中，以模型名称作为键
    loaded_models[model_name] = loaded_model
print(loaded_models)

{'model_N2O-N loss (%)_ctb_model': <catboost.core.CatBoostRegressor object at 0x0000023A49420450>, 'model_NH3-N loss (%)_ctb_model': <catboost.core.CatBoostRegressor object at 0x0000023A48DD72D0>, 'model_TN loss (%)_ctb_model': <catboost.core.CatBoostRegressor object at 0x0000023A4A6AFFD0>}


读取每个模型的输入和输出

In [16]:
import pandas as pd
import os

# CSV 文件路径
csv_files = [
    "data/TN_NH3_N2O/data_for_N2O-N loss (%).csv",
    "data/TN_NH3_N2O/data_for_NH3-N loss (%).csv",
    "data/TN_NH3_N2O/data_for_TN loss (%).csv"
]

# 用于存储输入特征和输出特征的列表
input_features_list = []
output_feature_list = []
model_names = []

# 遍历每个 CSV 文件
for csv_file in csv_files:
    # 从文件路径中提取模型名称
    model_name = os.path.splitext(os.path.basename(csv_file))[0]
    model_names.append(model_name)
    
    # 读取 CSV 文件
    df = pd.read_csv(csv_file)
    
    # 获取输入特征（除了最后一列）和输出特征（最后一列）的列名，并添加到列表中
    input_features = list(df.columns[:-1])
    output_feature = df.columns[-1]
    input_features_list.append(input_features)
    output_feature_list.append(output_feature)

# 打印模型名称、输入特征和输出特征的列表
for model_name, input_features, output_feature in zip(model_names, input_features_list, output_feature_list):
    print("模型名称:", model_name)
    print("输入特征:", input_features)
    print("输出特征:", output_feature)



模型名称: data_for_N2O-N loss (%)
输入特征: ['material_0', 'initial CN(%)', 'initial moisture content(%)', 'initial pH', 'material_1', 'Excipients', 'initial TN(%)', 'initial TC(%)', 'Additive Species']
输出特征: N2O-N loss (%)
模型名称: data_for_NH3-N loss (%)
输入特征: ['material_0', 'initial CN(%)', 'initial moisture content(%)', 'initial pH', 'material_1', 'Excipients', 'initial TN(%)', 'initial TC(%)', 'Additive Species']
输出特征: NH3-N loss (%)
模型名称: data_for_TN loss (%)
输入特征: ['material_0', 'initial CN(%)', 'initial moisture content(%)', 'initial pH', 'material_1', 'Excipients', 'initial TN(%)', 'initial TC(%)', 'Additive Species']
输出特征: TN loss (%)


找输入输出的取值范围

In [27]:
import pandas as pd
import os

# CSV 文件路径
csv_files = [
    "data/TN_NH3_N2O/data_for_N2O-N loss (%).csv",
    "data/TN_NH3_N2O/data_for_NH3-N loss (%).csv",
    "data/TN_NH3_N2O/data_for_TN loss (%).csv"
]

# 存储所有输入特征和标签的最小值和最大值的字典
min_max_values = {}

# 遍历每个 CSV 文件
for csv_file in csv_files:
    # 读取 CSV 文件
    df = pd.read_csv(csv_file)
    
    # 获取输入特征的列名（去除最后一列）
    input_features = list(df.columns[:-1])
    # 获取标签的列名
    label_column = df.columns[-1]
    
    # 遍历每个输入特征
    for feature in input_features:
        # 如果特征不存在，创建新的特征项
        if feature not in min_max_values:
            min_max_values[feature] = {'Minimum': None, 'Maximum': None}
        # 计算输入特征的最小值和最大值，并更新字典中的值
        if min_max_values[feature]['Minimum'] is None:
            min_max_values[feature]['Minimum'] = df[feature].min()
        else:
            min_max_values[feature]['Minimum'] = min(min_max_values[feature]['Minimum'], df[feature].min())
        if min_max_values[feature]['Maximum'] is None:
            min_max_values[feature]['Maximum'] = df[feature].max()
        else:
            min_max_values[feature]['Maximum'] = max(min_max_values[feature]['Maximum'], df[feature].max())
    
    # 如果标签不存在，创建新的标签项
    if label_column not in min_max_values:
        min_max_values[label_column] = {'Minimum': None, 'Maximum': None}
    # 计算标签的最小值和最大值，并更新字典中的值
    if min_max_values[label_column]['Minimum'] is None:
        min_max_values[label_column]['Minimum'] = df[label_column].min()
    else:
        min_max_values[label_column]['Minimum'] = min(min_max_values[label_column]['Minimum'], df[label_column].min())
    if min_max_values[label_column]['Maximum'] is None:
        min_max_values[label_column]['Maximum'] = df[label_column].max()
    else:
        min_max_values[label_column]['Maximum'] = max(min_max_values[label_column]['Maximum'], df[label_column].max())

# 打印每个输入特征和标签的最小值和最大值
for category, stats in min_max_values.items():
    print(f"{category}: Minimum={stats['Minimum']}, Maximum={stats['Maximum']}")

print(min_max_values)


material_0: Minimum=0, Maximum=6
initial CN(%): Minimum=-1.0, Maximum=53.7349397590362
initial moisture content(%): Minimum=-1.0, Maximum=89.8
initial pH: Minimum=-1.0, Maximum=10.7
material_1: Minimum=0, Maximum=5
Excipients: Minimum=0, Maximum=78
initial TN(%): Minimum=-1.0, Maximum=14.56
initial TC(%): Minimum=-1.0, Maximum=197.0
Additive Species: Minimum=0, Maximum=4
N2O-N loss (%): Minimum=-0.0031818181818181, Maximum=13.05
NH3-N loss (%): Minimum=6.09411764705882e-06, Maximum=84.5149768218341
TN loss (%): Minimum=0.2, Maximum=90.5
{'material_0': {'Minimum': 0, 'Maximum': 6}, 'initial CN(%)': {'Minimum': -1.0, 'Maximum': 53.7349397590362}, 'initial moisture content(%)': {'Minimum': -1.0, 'Maximum': 89.8}, 'initial pH': {'Minimum': -1.0, 'Maximum': 10.7}, 'material_1': {'Minimum': 0, 'Maximum': 5}, 'Excipients': {'Minimum': 0, 'Maximum': 78}, 'initial TN(%)': {'Minimum': -1.0, 'Maximum': 14.56}, 'initial TC(%)': {'Minimum': -1.0, 'Maximum': 197.0}, 'Additive Species': {'Minimum': 0

遗传算法优化

In [28]:
import random
import numpy as np
from deap import base, creator, tools



# 定义输入特征的最大最小值
input_min_max = {
    'material_0': {'Minimum': 0, 'Maximum': 6},
    'initial CN(%)': {'Minimum': -1.0, 'Maximum': 53.7349397590362},
    'initial moisture content(%)': {'Minimum': -1.0, 'Maximum': 89.8},
    'initial pH': {'Minimum': -1.0, 'Maximum': 10.7},
    'material_1': {'Minimum': 0, 'Maximum': 5},
    'Excipients': {'Minimum': 0, 'Maximum': 78},
    'initial TN(%)': {'Minimum': -1.0, 'Maximum': 14.56},
    'initial TC(%)': {'Minimum': -1.0, 'Maximum': 197.0},
    'Additive Species': {'Minimum': 0, 'Maximum': 4}
}

# 定义遗传算法的相关函数

# 创建适应度函数，计算模型预测值与目标值的差异
def evaluate(individual):
    # 将个体解码为输入参数
    inputs = {}
    for i, feature in enumerate(input_min_max.keys()):
        inputs[feature] = individual[i]
    # 计算模型预测值
    predictions = {}
    for label, model in loaded_models.items():
        predictions[label] = model.predict(inputs)
    # 计算适应度
    fitness = sum(abs(predictions[label] - target_values[label]) for label in loaded_models.keys())
    return fitness,

# 定义个体编码
def create_individual():
    return [random.uniform(feature['Minimum'], feature['Maximum']) for feature in input_min_max.values()]

# 定义交叉操作
def crossover(parent1, parent2):
    crossover_point = random.randint(1, len(parent1) - 2)
    child1 = parent1[:crossover_point] + parent2[crossover_point:]
    child2 = parent2[:crossover_point] + parent1[crossover_point:]
    return child1, child2

# 定义变异操作
def mutation(individual):
    mutation_index = random.randint(0, len(individual) - 1)
    individual[mutation_index] = random.uniform(input_min_max[mutation_index]['Minimum'], input_min_max[mutation_index]['Maximum'])
    return individual,

# 初始化 DEAP 模块
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("mate", crossover)
toolbox.register("mutate", mutation)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

# 进行模型优化
def optimize_models(models, input_min_max, num_generations, population_size=100, cxpb=0.5, mutpb=0.2):
    # 将目标值设定为要优化的标签的最小值
    target_values = {}
    for label in loaded_models.keys():
        target_values[label] = min_max_values[label]['Minimum']
    
    # 初始化种群
    pop = toolbox.population(n=population_size)
    
    # 迭代优化过程
    for gen in range(num_generations):
        # 评估种群中的个体
        fitnesses = map(toolbox.evaluate, pop)
        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit
        
        # 选择下一代个体
        offspring = toolbox.select(pop, len(pop))
        
        # 克隆选中的个体
        offspring = list(map(toolbox.clone, offspring))
        
        # 对选中的个体进行交叉和变异
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < cxpb:
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values
        
        for mutant in offspring:
            if random.random() < mutpb:
                toolbox.mutate(mutant)
                del mutant.fitness.values
        
        # 评估新生成的个体
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        
        # 更新种群
        pop[:] = offspring
    
    # 返回优化后的个体
    return pop

# 设置遗传算法的参数
num_generations = 50
population_size = 100
cxpb = 0.5
mutpb = 0.2

# 进行模型优化
optimized_individuals = optimize_models(loaded_models, input_min_max, num_generations, population_size, cxpb, mutpb)


KeyError: 'model_N2O-N loss (%)_ctb_model'