去除无法提取 PseAAC 特征的序列

In [None]:
import os
import pandas as pd
from itertools import groupby
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# Location of the raw UniProt downloads (FASTA sequences + TSV annotations).
folder_path = r"D:\Han\software\Math\CSUpan\ShareCache\尹涵(数学与统计学院)\赛\一流专业人才计划\Codes\data"
file_name_fas = "uniprotkb_proteome_UP000005640.fasta"
file_name_tsv = "uniprotkb_proteome_UP000005640.tsv"

# Build both absolute paths in one pass; each is simply folder_path joined
# with the corresponding file name.
file_path_fas, file_path_tsv = (
    os.path.join(folder_path, name) for name in (file_name_fas, file_name_tsv)
)

# feature
# Read every record of the input FASTA file as a plain sequence string.
sequences = [str(record.seq) for record in SeqIO.parse(file_path_fas, "fasta")]

# Remove exact duplicates while keeping first-seen order.  A plain set() would
# also de-duplicate, but its iteration order for strings changes between
# interpreter runs (hash randomisation), which would make the seq_<i> ids
# assigned below non-reproducible.
unique_sequences = list(dict.fromkeys(sequences))

# Discard sequences containing an unknown residue (X) or selenocysteine (U).
valid_seqs = [seq for seq in unique_sequences if "X" not in seq and "U" not in seq]

# Wrap the surviving strings in SeqRecord objects with sequential ids so they
# can be written back out in FASTA format.
seq_records = [
    SeqRecord(Seq(seq), id=f"seq_{i}", description="")
    for i, seq in enumerate(valid_seqs)
]

file_name_fas_processed = "cleaned.fasta"
file_path_fas_processed = os.path.join(folder_path, file_name_fas_processed)
SeqIO.write(seq_records, file_path_fas_processed, "fasta")

In [None]:
# label
df = pd.read_csv(file_path_tsv, sep="\t", encoding='utf-8')

# 1. Keep only experimentally verified entries: the protein must have
#    evidence at protein level AND its subcellular-location annotation must
#    cite evidence code ECO:0000269 (rows with a missing annotation are
#    treated as non-matching via na=False).
has_protein_evidence = df["Protein existence"] == "Evidence at protein level"
has_experimental_location = df["Subcellular location [CC]"].str.contains("ECO:0000269", na=False)

# Take an explicit copy: new columns are assigned to this frame further down,
# and assigning into a filtered slice makes pandas emit a chained-assignment
# (SettingWithCopy) warning.
df = df[has_protein_evidence & has_experimental_location].copy()

# 2. Extract the subcellular-location label
def extract_location(text):
    """Return the first location name of the last SUBCELLULAR LOCATION section.

    Parameters
    ----------
    text : str or missing value
        Raw content of the "Subcellular location [CC]" column.

    Returns
    -------
    str or None
        ``None`` when ``text`` is missing; otherwise the text that follows the
        last ``"SUBCELLULAR LOCATION: "`` marker, cut at the first ``;`` and
        at the first ``{`` (start of the evidence block), with surrounding
        whitespace removed.
    """
    if pd.isna(text):
        return None
    # rpartition yields the whole text when the marker is absent, exactly like
    # taking the last element of a split on the marker.
    _, _, last_section = text.rpartition("SUBCELLULAR LOCATION: ")
    first_entry, _, _ = last_section.partition(";")
    name, _, _ = first_entry.partition("{")
    return name.strip()

def _split_location_labels(text):
    """Return every ';'-separated location of the last SUBCELLULAR LOCATION section.

    Each entry is cut at the first '{' (start of the evidence block) and
    stripped of surrounding whitespace.  ``text`` must be a string.
    """
    last_section = text.split("SUBCELLULAR LOCATION: ")[-1]
    return [entry.split("{")[0].strip() for entry in last_section.split(";")]


# 3. Multi-label handling: store the list of location names for every row.
#    (The former single-label assignment that preceded this step was
#    immediately overwritten by this one, so it has been removed.)
df["subcellular_location"] = df["Subcellular location [CC]"].apply(_split_location_labels)

# 4. Drop rows without a label
df = df.dropna(subset=["subcellular_location"])

# 5. Drop the Reviewed, Protein existence and Organism columns
df = df.drop(['Reviewed', 'Protein existence', 'Organism'], axis=1)

# Export the label table.  Note that the list-valued label column is written
# to CSV as the string representation of each list.
file_name_tsv_to_csv = "labels.csv"
file_path_tsv_to_csv = os.path.join(folder_path, file_name_tsv_to_csv)
df[["Entry","Entry Name" ,"Sequence", "subcellular_location"]].to_csv(file_path_tsv_to_csv, index=False)

In [None]:
# Class distribution check: reload the label file that was just written and
# count how often each label combination occurs.  Counting on the reloaded
# frame verifies what is actually on disk; the labels there are plain strings
# (the CSV stores each list as its string representation), so they are
# hashable and can be counted directly.
df_check = pd.read_csv(file_path_tsv_to_csv)
df_check["subcellular_location"].value_counts()

subcellular_location
[Nucleus]                                                                                                                             1572
[Cytoplasm]                                                                                                                           1309
[Secreted]                                                                                                                             328
[Cell membrane, Multi-pass membrane protein]                                                                                           316
[Mitochondrion]                                                                                                                        240
                                                                                                                                      ... 
[Nucleus matrix, Peripheral membrane protein]                                                                                            1
[Apica

In [None]:
# Tabulate the cleaned FASTA file: one row per record with its id, its
# sequence as a string, and the sequence length.
data = [
    {
        "protein_id": record.id,
        "sequence": str(record.seq),
        "length": len(record.seq),
    }
    for record in SeqIO.parse(file_path_fas_processed, "fasta")
]
pd.DataFrame(data)

下面这段代码是实际运用到的初步数据处理代码

In [None]:
import os
import re
import pandas as pd
import numpy as np


folder_path = r"D:\Han\software\Math\CSUpan\ShareCache\尹涵(数学与统计学院)\赛\一流专业人才计划\Codes\data"
file_name_tsv = "uniprotkb_reviewed_true_AND_proteome_up.tsv"
file_path_tsv = os.path.join(folder_path, file_name_tsv)

# Load the tab-separated UniProt export.
df = pd.read_csv(file_path_tsv, sep="\t", encoding='utf-8')

# Keep only experimentally verified rows, i.e. those whose subcellular-location
# annotation cites ECO:0000269 (missing annotations count as non-matching),
# then renumber the rows from 0.
cites_experiment = df["Subcellular location [CC]"].str.contains("ECO:0000269", na=False)
df = df[cites_experiment].reset_index(drop=True)

# 2. Extract the subcellular-location labels
def extract_location(text):
    """Return the location names found before the first evidence block.

    Parameters
    ----------
    text : str or missing value
        Raw content of the "Subcellular location [CC]" column.

    Returns
    -------
    list of str or None
        ``None`` when ``text`` is missing or contains no
        ``"SUBCELLULAR LOCATION: ...{"`` pattern.  Otherwise the text between
        the first marker and the following ``{`` is split on ``". "`` and the
        non-empty, stripped pieces are returned (possibly an empty list).
    """
    if pd.isna(text):
        return None

    # Only the first occurrence is used: the lazy group stops at the first
    # '{', which removes the evidence codes.
    first_hit = re.search(r"SUBCELLULAR LOCATION: (.*?)\{", text)
    if first_hit is None:
        return None

    pieces = (piece.strip() for piece in first_hit.group(1).split(". "))
    return [piece for piece in pieces if piece]

df["subcellular_location"] = df["Subcellular location [CC]"].apply(extract_location)

# 3. Multi-label handling (placeholder: nothing is implemented for this step)


# 4. Drop rows without a label.  The index is renumbered afterwards because the
#    feature frames built further down are joined column-wise with
#    pd.concat(axis=1), which aligns on index labels; gaps left by dropna
#    would otherwise pair features with the wrong rows.
df = df.dropna(subset=["subcellular_location"]).reset_index(drop=True)
df = df.drop(['Subcellular location [CC]'], axis=1)

# Data type conversion
df['Length'] = df['Length'].astype(int)
df['Mass'] = df['Mass'].astype(float)


# Binding-site column
def extract_binding_features(text):
    """Summarise a "Binding site" annotation as numeric features.

    Parameters
    ----------
    text : str or missing value
        Raw content of the "Binding site" column.

    Returns
    -------
    dict
        Empty when ``text`` is missing.  Otherwise contains:
        ``num_binding_sites`` (number of ``BINDING <pos>`` entries),
        ``num_unique_ligands`` (number of distinct ``/ligand="..."`` names),
        ``binding_pos_mean`` (mean of the positions, 0 when there are none)
        and ``binding_pos_std`` (population standard deviation of the
        positions, 0 when there are fewer than two).
    """
    if pd.isna(text):
        return {}

    # One scan gives both the number of sites and their positions (for a
    # range such as "10..17" the pattern captures the leading number).
    site_positions = [int(number) for number in re.findall(r"BINDING (\d+)", text)]
    site_count = len(site_positions)

    # Ligand names, e.g. "Fe cation"; duplicates collapse in the set.
    ligand_names = set(re.findall(r'/ligand="([^"]+)"', text))

    return {
        "num_binding_sites": site_count,
        "num_unique_ligands": len(ligand_names),
        "binding_pos_mean": np.mean(site_positions) if site_count else 0,
        "binding_pos_std": np.std(site_positions) if site_count > 1 else 0,
    }

# Apply the extractor and append its features as new columns.
binding_dicts = df["Binding site"].apply(extract_binding_features)

# Build the feature frame on df's own index.  pd.concat(axis=1) aligns rows by
# index label, so a frame with a fresh 0..n-1 index would be paired with the
# wrong rows (and add all-NaN rows) whenever df's index has gaps.
df_binding = pd.DataFrame(binding_dicts.tolist(), index=df.index)

# Replace the raw annotation column with the derived feature columns.
df = pd.concat([df.drop(columns=["Binding site"]), df_binding], axis=1)


# 2. Catalytic-activity column
def _count_reaction_terms(side):
    """Count the ' + '-separated, non-empty terms on one side of an equation."""
    return sum(1 for term in side.split(" + ") if term.strip())


def extract_catalytic_features(text):
    """Summarise a "Catalytic activity" annotation as features.

    Parameters
    ----------
    text : str or missing value
        Raw content of the "Catalytic activity" column, made of fragments such
        as ``Reaction=A + B = C + D; Xref=...; EC=1.2.3.4; Evidence={...};``.

    Returns
    -------
    dict
        Empty when ``text`` is missing.  Otherwise contains:
        ``ec_number`` (first fully numeric EC number as a string, or 0 when
        none is present), ``num_substrates`` and ``num_products`` (number of
        terms on the left / right side of the reaction equations, summed over
        all reactions in the annotation).
    """
    features = {}
    if pd.isna(text):
        return features

    # EC number, e.g. EC=1.13.11.18 (0 is kept as the "not found" marker).
    ec_match = re.search(r"EC=(\d+\.\d+\.\d+\.\d+)", text)
    features["ec_number"] = ec_match.group(1) if ec_match else 0

    # Each reaction fragment runs from "Reaction=" to the next ';'.  Only that
    # fragment is inspected, so the "Xref=", "EC=" and "Evidence=" fields that
    # follow it are not mistaken for reaction products.
    num_substrates = 0
    num_products = 0
    for equation in re.findall(r"Reaction=([^;]+)", text):
        left_side, separator, right_side = equation.partition(" = ")
        if not separator:
            # Not a "left = right" equation: nothing to count.
            continue
        num_substrates += _count_reaction_terms(left_side)
        num_products += _count_reaction_terms(right_side)

    features["num_substrates"] = num_substrates
    features["num_products"] = num_products

    return features

# Apply the extractor and append its features as new columns.
catalytic_dicts = df["Catalytic activity"].apply(extract_catalytic_features)

# Build the feature frame on df's own index.  pd.concat(axis=1) aligns rows by
# index label, so a frame with a fresh 0..n-1 index would be paired with the
# wrong rows (and add all-NaN rows) whenever df's index has gaps.
df_catalytic = pd.DataFrame(catalytic_dicts.tolist(), index=df.index)

# Replace the raw annotation column with the derived feature columns.
df = pd.concat([df.drop(columns=["Catalytic activity"]), df_catalytic], axis=1)


# df.drop('Entry Name', axis=1, inplace=True)
# df

# Write the processed table to data.csv (row index omitted).
file_name_tsv_to_csv = "data.csv"
file_path_tsv_to_csv = os.path.join(folder_path, file_name_tsv_to_csv)
df.to_csv(file_path_tsv_to_csv, index=False)