读入数据

In [3]:
import pandas as pd
import os

# NOTE(review): machine-specific absolute path; adjust it (or move it to a
# configurable data directory) before running the notebook elsewhere.
folder_path = r"D:\Han\software\Math\CSUpan\ShareCache\尹涵(数学与统计学院)\赛\一流专业人才计划\Codes\data"
file_name_label = "new_updated_data.csv"
file_path_label = os.path.join(folder_path, file_name_label)

df = pd.read_csv(file_path_label)

# Columns kept for the feature-extraction steps that follow.
columns_need = ['Length', 'Entry Name', 'Sequence', 'Mass',
       'subcellular_location', 'num_binding_sites', 'num_unique_ligands',
       'binding_pos_mean', 'binding_pos_std', 'num_substrates', 'num_products']

# Take an explicit copy: new_df has columns reassigned later on, and writing
# into a plain column selection of df raises SettingWithCopyWarning.
new_df = df[columns_need].copy()

In [None]:
# Helper that normalises one raw cell of the subcellular_location column.
def clean_labels(labels):
    """Convert a raw label cell into a clean list of label strings.

    Parameters
    ----------
    labels : str, list or anything else
        * ``str``  - a comma-separated, list-like text such as
          ``"['Nucleus', 'Cytoplasm']"``.
        * ``list`` - an already split list whose elements may still carry
          brackets, quotes or surrounding whitespace.
        * any other value (e.g. NaN from a missing CSV cell) is treated as
          "no labels".

    Returns
    -------
    list of str
        Labels with square brackets, single/double quotes and surrounding
        whitespace removed. Empty labels are dropped, so ``"[]"`` and ``""``
        give ``[]`` rather than ``['']``.
    """
    def _tidy(raw):
        # Drop brackets first, then peel quotes and blanks from both ends.
        return raw.replace('[', '').replace(']', '').strip().strip("'\" ").strip()

    if isinstance(labels, str):
        candidates = labels.split(',')
    elif isinstance(labels, list):
        # Non-string elements (e.g. NaN) carry no label text; skip them
        # instead of failing on a missing .replace().
        candidates = [item for item in labels if isinstance(item, str)]
    else:
        return []

    tidied = (_tidy(item) for item in candidates)
    return [label for label in tidied if label]

# Apply the cleaning function to the subcellular_location column.
# assign() builds a new frame instead of writing into new_df in place, so this
# cell never assigns into a slice of df (no SettingWithCopyWarning) and can be
# re-run safely: clean_labels also accepts already-cleaned lists.
new_df = new_df.assign(
    subcellular_location=new_df['subcellular_location'].apply(clean_labels)
)

step1 定义所有可优化参数

In [None]:
# Search space of every tunable hyper-parameter. Each entry gives the value
# type, the inclusive bounds and, where present, a log-scale flag.
PARAM_SPACE = dict(
    # PseAAC
    pseaac_lambda=dict(type="int", low=3, high=5),

    # EBGW
    ebgw_window_size=dict(type="int", low=5, high=9),

    # PsePSSM
    psepssm_lambda=dict(type="int", low=3, high=5),
    psepssm_w=dict(type="float", low=0.1, high=0.9),

    # Node2Vec
    n2v_dimensions=dict(type="int", low=16, high=64),
    n2v_walk_length=dict(type="int", low=20, high=100),
    n2v_num_walks=dict(type="int", low=10, high=50),

    # Model hyper-parameters (XGBoost used as the example)
    xgb_n_estimators=dict(type="int", low=100, high=500),
    xgb_max_depth=dict(type="int", low=3, high=10),
    xgb_learning_rate=dict(type="float", low=0.01, high=0.3, log=True),
)

step2 定义所有特征提取函数

step2.1 PAAC特征提取函数

In [4]:
from propy import PyPro

# lambda_value is a hyper-parameter to be tuned.
def calculate_pseaac(sequence, lambda_value=5):
    """Compute the PAAC descriptors of one protein sequence with propy.

    Parameters
    ----------
    sequence : str
        Protein sequence passed to ``PyPro.GetProDes``.
    lambda_value : int, default 5
        Forwarded to ``GetPAAC`` as its ``lamda`` argument.

    Returns
    -------
    The object returned by ``GetPAAC``, or ``None`` when anything in the
    computation raises (best-effort: the error is printed, not propagated).
    """
    try:
        protein = PyPro.GetProDes(sequence)
        return protein.GetPAAC(lamda=lambda_value)
    except Exception as e:
        # str() first: a non-string input (e.g. NaN from a missing CSV cell)
        # cannot be sliced, and slicing it here would raise a second error
        # from inside the handler instead of returning None.
        print(f"Error for sequence {str(sequence)[:10]}...: {e}")
        return None

step2.2 EBGW特征提取函数

In [None]:
import numpy as np

def calculate_ebgw_features(protein_seq, window_num=3, group_combinations=None):
    """Grouped-weight window features of a protein sequence.

    For every (group1, group2) pair the sequence is turned into a 0/1 vector
    (1 when the residue belongs to group1, 0 otherwise, including residues
    that are in neither group). That vector is cut into exactly
    ``window_num`` consecutive windows and the fraction of ones in each
    window is emitted.

    Parameters
    ----------
    protein_seq : str or sequence of str
        Residue letters.
    window_num : int, default 3
        Number of windows per group pair; must be >= 1.
    group_combinations : list of (set, set), optional
        Residue-group pairs. Only the first set of each pair is used for the
        encoding. Defaults to three pairings of the neutral non-polar,
        neutral polar, acidic and basic residue classes.

    Returns
    -------
    list of float
        Always ``len(group_combinations) * window_num`` values, so vectors of
        different sequences can be stacked. Windows have length
        ``max(1, n // window_num)``; the last window also takes the remainder,
        and windows that fall beyond the end of a short sequence give 0.0.

    Raises
    ------
    ValueError
        If ``window_num`` is smaller than 1.
    """
    if window_num < 1:
        raise ValueError("window_num must be a positive integer")

    if group_combinations is None:
        # Amino-acid classes
        neutral_non_polar = {'G', 'A', 'V', 'L', 'I', 'M', 'P', 'F', 'W'}
        neutral_polar = {'Q', 'N', 'S', 'T', 'Y', 'C'}
        acidic = {'D', 'E'}
        basic = {'H', 'K', 'R'}
        group_combinations = [(neutral_non_polar | neutral_polar, acidic | basic),
                              (neutral_non_polar | acidic, neutral_polar | basic),
                              (neutral_non_polar | basic, neutral_polar | acidic)]

    ebgw_features = []
    for group1, _group2 in group_combinations:
        binary_seq = [1 if aa in group1 else 0 for aa in protein_seq]
        n = len(binary_seq)
        if n == 0:  # empty sequence: all-zero block
            ebgw_features.extend([0.0] * window_num)
            continue

        window_size = max(1, n // window_num)  # at least one residue per window
        for k in range(window_num):
            start = k * window_size
            # The final window absorbs the remainder. Stepping through the
            # sequence in window_size strides instead would yield extra
            # windows whenever n is not a multiple of window_num.
            stop = n if k == window_num - 1 else start + window_size
            window = binary_seq[start:stop]
            ebgw_features.append(sum(window) / len(window) if window else 0.0)
    return ebgw_features

step2.3 PsePSSM特征提取函数

In [None]:
def parse_pssm(pssm_file):
    """Read the 20 score columns of an ASCII PSSM file into an array.

    The first three lines are skipped as header. Each remaining data row is
    expected to start with an integer position followed by the residue
    letter; whitespace-separated columns 2-21 are taken as the scores.

    Parameters
    ----------
    pssm_file : str or path-like
        Path of the PSSM text file.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_rows, 20)``; ``(0, 20)`` when the file has
        no data rows. Rows with fewer than 20 score columns are right-padded
        with zeros.

    Notes
    -----
    Lines whose first token is not an integer position (for example the
    "K / Lambda" statistics block at the end of a PSI-BLAST PSSM) are
    ignored; otherwise they would be zero-padded into bogus matrix rows.
    Rows containing a non-numeric score are reported and skipped.
    """
    pssm_rows = []
    with open(pssm_file, 'r') as f:
        lines = f.readlines()

    # Skip the three header lines.
    for line in lines[3:]:
        tokens = line.split()
        if not tokens:
            continue
        # Only genuine matrix rows begin with the residue position.
        if not tokens[0].isdigit():
            continue
        try:
            # Columns 2-21 hold the 20 PSSM scores.
            values = list(map(float, tokens[2:22]))
        except ValueError:
            print(f"Error converting values in line: {line}")
            continue
        # Zero-pad rows that carry fewer than 20 scores.
        if len(values) < 20:
            values.extend([0.0] * (20 - len(values)))
        pssm_rows.append(values)

    # reshape keeps the result 2-D even when no row was read.
    return np.array(pssm_rows, dtype=float).reshape(-1, 20)


def calculate_psepssm(pssm_matrix, lamda=30, w=0.05):
    """Compute the PsePSSM-style feature vector of one PSSM.

    The vector is the concatenation of
      * the mean of every PSSM column (``m`` values), and
      * ``lamda`` correlation factors, one per lag ``tau = 1..lamda``: the
        squared difference between rows ``i`` and ``i + tau`` summed over all
        columns and averaged over the ``n - tau`` row pairs. The factors are
        normalised to sum to 1 and then scaled by ``w``.

    Parameters
    ----------
    pssm_matrix : array-like, shape (n, m)
        PSSM with one row per sequence position.
    lamda : int, default 30
        Number of lags (correlation factors).
    w : float, default 0.05
        Weight applied to the normalised correlation factors.

    Returns
    -------
    numpy.ndarray, shape (m + lamda,)
        Lags with ``tau >= n`` have no row pair and contribute 0.0, so the
        length does not depend on the sequence length. If all factors are 0
        they are left at 0, and an empty matrix (``n == 0``) yields an
        all-zero vector.

    Raises
    ------
    ValueError
        If ``pssm_matrix`` is not two-dimensional.
    """
    pssm = np.asarray(pssm_matrix, dtype=float)
    if pssm.ndim != 2:
        raise ValueError("pssm_matrix must be a 2-D array (positions x scores)")

    n, m = pssm.shape
    n_lags = max(int(lamda), 0)
    if n == 0:
        return np.zeros(m + n_lags)

    # Correlation factor per lag; lags without any row pair stay at 0.
    tau_values = np.zeros(n_lags)
    for tau in range(1, min(n_lags, n - 1) + 1):
        diff = pssm[:-tau] - pssm[tau:]
        tau_values[tau - 1] = np.sum(diff ** 2) / (n - tau)

    # Normalise the factors; skip the division when they are all zero.
    total = tau_values.sum()
    norm_tau_values = tau_values / total if total > 0 else tau_values

    column_means = pssm.sum(axis=0) / n
    return np.concatenate([column_means, w * norm_tau_values])

step2.4 标签相似性特征提取

In [None]:
import networkx as nx
from node2vec import Node2Vec


def label_similarity_modeling(locations_df, df, node2vec_dimensions=16, node2vec_walk_length=30, node2vec_num_walks=100, node2vec_window=10):
    """Embed labels with Node2Vec and average them into per-sample features.

    Steps:
      1. Label co-occurrence matrix ``locations_df.T @ locations_df``.
      2. Weighted label graph; the weight of (i, j) is
         ``c[i, j] / (c[i, i] + c[j, j] - c[i, j])``. Presumably
         ``locations_df`` is a 0/1 sample-by-label indicator frame, in which
         case this is the Jaccard index - TODO confirm with the caller.
      3. Node2Vec embedding of the graph.
      4. For every row of ``df['subcellular_location']`` the mean embedding of
         its known labels (zeros when it has none).

    Parameters
    ----------
    locations_df : pandas.DataFrame
        One column per label; column names are used as graph nodes.
    df : pandas.DataFrame
        Must contain a 'subcellular_location' column holding an iterable of
        label names per row.
    node2vec_dimensions, node2vec_walk_length, node2vec_num_walks : int
        Forwarded to ``Node2Vec``.
    node2vec_window : int
        Forwarded to ``Node2Vec.fit`` as ``window``.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``label_feat_0 .. label_feat_{d-1}``, one row per row of
        ``df`` (default integer index). ``None`` if embedding fails; the
        error is printed.

    Raises
    ------
    ValueError
        If no pair of labels has a positive similarity.
    """
    # Label co-occurrence matrix
    label_cooccurrence = np.dot(locations_df.T, locations_df)

    # Build the label graph. Every label is added as a node; only pairs with a
    # positive similarity become edges. Zero-weight edges carry no transition
    # probability, and a label whose edges are all zero would leave the random
    # walk with weights that sum to 0.
    G = nx.Graph()
    label_names = locations_df.columns
    G.add_nodes_from(label_names)
    for i in range(len(label_names)):
        for j in range(i + 1, len(label_names)):
            denominator = label_cooccurrence[i, i] + label_cooccurrence[j, j] - label_cooccurrence[i, j]
            # Guard against division by zero (neither label ever occurs).
            sim = label_cooccurrence[i, j] / denominator if denominator != 0 else 0
            if sim > 0:
                G.add_edge(label_names[i], label_names[j], weight=sim)

    if G.number_of_edges() == 0:
        raise ValueError("标签图没有有效边，无法生成嵌入")

    # Node2Vec embedding
    try:
        node2vec = Node2Vec(G, dimensions=node2vec_dimensions, walk_length=node2vec_walk_length, num_walks=node2vec_num_walks)
        model = node2vec.fit(window=node2vec_window)
        embedding_by_label = {node: model.wv[node] for node in label_names}
    except Exception as e:
        print(f"发生错误: {e}")
        return None

    # Sample-level features: average the embeddings of each sample's labels.
    rows = []
    for labels in df['subcellular_location']:
        vectors = [embedding_by_label[lb] for lb in labels if lb in embedding_by_label]
        if vectors:
            rows.append(np.mean(vectors, axis=0))
        else:
            rows.append(np.zeros(node2vec_dimensions))

    # Keep the result 2-D even for an empty df.
    X_label_feat = np.array(rows) if rows else np.empty((0, node2vec_dimensions))

    # Name the feature columns
    label_feat_columns = [f'label_feat_{i}' for i in range(X_label_feat.shape[1])]
    return pd.DataFrame(X_label_feat, columns=label_feat_columns)

    

step3 定义特征动态计算管道

In [None]:
import numpy as np
from propy.PseudoAAC import GetPseudoAAC
from node2vec import Node2Vec
import networkx as nx

def generate_features(df, labels, params):
    """Generate every feature block for one hyper-parameter setting and stack them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Sequence' column; its values are the protein
        sequences the sequence-based features are computed from.
    labels : iterable of iterables
        Label collection of every sample, in the same order as ``df``.
    params : dict
        Required keys: 'pseaac_lambda', 'ebgw_window_size', 'psepssm_lambda',
        'psepssm_w', 'n2v_dimensions', 'n2v_walk_length', 'n2v_num_walks'.
        Optional keys: 'n2v_p' and 'n2v_q' (both default to 1).

    Returns
    -------
    numpy.ndarray
        Horizontal stack of the PseAAC, window-entropy, PsePSSM and
        label-embedding blocks, one row per sample.

    Notes
    -----
    NOTE(review): this function still depends on names that are not defined in
    the visible part of the notebook and must exist at call time:
    ``load_pssm_data`` and ``calculate_cooccurrence`` (marked as custom
    functions), ``label_names``, ``aa_to_index`` and ``entropy``
    (presumably ``scipy.stats.entropy`` - confirm). It also assumes
    ``load_pssm_data()`` yields one PSSM per row of ``df`` in the same order;
    otherwise the final ``hstack`` fails.
    """
    features = {}

    # The body used to iterate an undefined name `sequences` while `df` was
    # never read; take the sequences from the frame that is passed in.
    sequences = df['Sequence']

    # ---------------------------
    # 1. PseAAC features
    # ---------------------------
    pseaac_list = []
    for seq in sequences:
        pseaac = GetPseudoAAC(seq, lamda=params['pseaac_lambda'])
        pseaac_list.append(list(pseaac.values()))
    features['pseaac'] = np.array(pseaac_list)

    # ---------------------------
    # 2. Sliding-window entropy feature (stored under 'ebgw')
    # ---------------------------
    def calculate_ebgw(seq, window_size):
        # Mean entropy of the residue-index distribution over all windows.
        encoded = [aa_to_index.get(aa, 0) for aa in seq]
        ebgw = []
        for i in range(len(encoded) - window_size + 1):
            window = encoded[i:i+window_size]
            prob = np.bincount(window, minlength=20) / window_size
            ebgw.append(entropy(prob))
        # A sequence shorter than the window has no windows; averaging an
        # empty list would give NaN.
        if not ebgw:
            return 0.0
        return np.mean(ebgw)

    ebgw_values = [calculate_ebgw(seq, params['ebgw_window_size']) for seq in sequences]
    features['ebgw'] = np.array(ebgw_values).reshape(-1, 1)

    # ---------------------------
    # 3. PsePSSM features (PSSMs must be pre-computed, e.g. with PSI-BLAST)
    # ---------------------------
    pssm_data = load_pssm_data()  # custom loader
    psepssm = []
    for pssm in pssm_data:
        # Use the notebook's own calculate_psepssm; it takes the same
        # (pssm, lamda, w) arguments as the compute_psepssm call it replaces.
        psepssm_feat = calculate_psepssm(pssm, lamda=params['psepssm_lambda'], w=params['psepssm_w'])
        psepssm.append(psepssm_feat)
    features['psepssm'] = np.array(psepssm)

    # ---------------------------
    # 4. Label-similarity features (Node2Vec)
    # ---------------------------
    # Label co-occurrence graph
    label_cooccurrence = calculate_cooccurrence(labels)  # custom function
    G = nx.Graph()
    for i in range(label_cooccurrence.shape[0]):
        for j in range(i+1, label_cooccurrence.shape[0]):
            if label_cooccurrence[i,j] > 0:
                G.add_edge(label_names[i], label_names[j], weight=label_cooccurrence[i,j])

    # Node2Vec embedding. p and q are not part of PARAM_SPACE, so fall back to
    # 1 instead of raising KeyError when they are absent.
    n2v = Node2Vec(
        G,
        dimensions=params['n2v_dimensions'],
        walk_length=params['n2v_walk_length'],
        num_walks=params['n2v_num_walks'],
        p=params.get('n2v_p', 1),
        q=params.get('n2v_q', 1)
    )
    model = n2v.fit(window=10)

    # Sample-level features: mean embedding of each sample's labels.
    label_embeddings = []
    for sample_labels in labels:
        valid_labels = [lb for lb in sample_labels if lb in model.wv]
        if len(valid_labels) > 0:
            emb = np.mean([model.wv[lb] for lb in valid_labels], axis=0)
        else:
            emb = np.zeros(params['n2v_dimensions'])
        label_embeddings.append(emb)
    features['label_emb'] = np.array(label_embeddings)

    # ---------------------------
    # 5. Concatenate all feature blocks
    # ---------------------------
    combined = np.hstack([
        features['pseaac'],
        features['ebgw'],
        features['psepssm'],
        features['label_emb']
    ])

    return combined