In [None]:
import csv
import os

def parse_location_info(line):
    """
    Parse one line of subcellular-location text into a dictionary.

    The line is treated as ``<header>: <body>``, e.g.
    ``"[Isoform 1]: Nucleus. Cytoplasm."``. The body is cut into fragments
    on ``". "`` and each fragment is classified as a note, an exclusion or
    a plain location.

    :param line: string holding the location information
    :return: ``{}`` when the line contains no ``": "`` separator; otherwise
        a dict with
        ``isoform`` - header text before the first ``]`` with the leading
        ``[`` removed,
        ``base_locations`` - list of the plain location fragments,
        ``dynamic_info`` - only present when a ``Note=`` fragment mentions
        ``Translocates`` (the note text without ``Note=``),
        ``excluded_info`` - only present when a fragment contains
        ``Excluded from`` (the text with that phrase removed)
    """
    info = {}
    # Split on the FIRST ": " only. Splitting on every occurrence would
    # silently drop whatever follows a later ": " inside the location text.
    header, separator, body = line.partition(': ')
    if not separator:
        return info

    info['isoform'] = header.split(']')[0].strip('[').strip()

    base_locations = []
    dynamic_info = None
    excluded_info = None
    for part in body.split('. '):
        # Splitting on ". " leaves the sentence-ending period on the last
        # fragment; strip it so all fragments are formatted the same way.
        fragment = part.strip().rstrip('.').strip()
        if not fragment:
            # Nothing left after trimming - do not record an empty location.
            continue
        if 'Note=' in fragment:
            # Notes never count as base locations; only notes that mention
            # a translocation are kept.
            if 'Translocates' in fragment:
                dynamic_info = fragment.replace('Note=', '').strip()
        elif 'Excluded from' in fragment:
            excluded_info = fragment.replace('Excluded from', '').strip().strip(';')
        else:
            base_locations.append(fragment)

    info['base_locations'] = base_locations
    if dynamic_info:
        info['dynamic_info'] = dynamic_info
    if excluded_info:
        info['excluded_info'] = excluded_info
    return info


def extract_location_info_from_tsv(file_path, encoding='utf-8'):
    """
    Extract location information from every usable row of a TSV file.

    For each row with at least three columns, the third column (index 2)
    is taken as the entry name and the last column is handed to
    ``parse_location_info``. Rows for which the parser returns an empty
    dict are skipped.

    :param file_path: path of the TSV file
    :param encoding: text encoding used to read the file. It is given
        explicitly so the result does not depend on the platform's default
        encoding; ``'utf-8'`` matches how the same file is read with pandas
        elsewhere in this notebook.
    :return: list of ``(entry_name, info)`` tuples
    """
    all_info = []
    # newline='' is what the csv module expects for files it reads itself.
    with open(file_path, 'r', newline='', encoding=encoding) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            if len(row) < 3:
                # Too short to carry an entry name in column index 2.
                continue
            info = parse_location_info(row[-1])
            if info:
                all_info.append((row[2], info))
    return all_info


def save_to_csv(data, output_file, encoding='utf-8'):
    """
    Write the extracted location information to a CSV file.

    One header row is written, followed by one row per ``(entry_name, info)``
    tuple. Keys missing from ``info`` become empty cells; the
    ``base_locations`` list is joined with ``", "`` into a single cell.

    :param data: iterable of ``(entry_name, info)`` tuples
    :param output_file: path of the CSV file to create or overwrite
    :param encoding: text encoding of the output file. It is given
        explicitly so that writing non-ASCII text does not depend on (or
        fail under) the platform's default encoding.
    """
    fieldnames = ['entry_name', 'isoform', 'base_locations', 'dynamic_info', 'excluded_info']
    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding=encoding) as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry_name, info in data:
            writer.writerow({
                'entry_name': entry_name,
                'isoform': info.get('isoform', ''),
                'base_locations': ', '.join(info.get('base_locations', [])),
                'dynamic_info': info.get('dynamic_info', ''),
                'excluded_info': info.get('excluded_info', ''),
            })


# Example usage: parse the TSV export and write the result next to it.
# NOTE(review): folder_path is an absolute path on one specific machine;
# adjust it before running elsewhere.
folder_path = r"D:\Han\software\Math\CSUpan\ShareCache\尹涵(数学与统计学院)\赛\一流专业人才计划\Codes\data"
file_name = r'uniprotkb_reviewed_true_AND_proteome_up.tsv'
file_path = os.path.join(folder_path, file_name)
# List of (entry_name, info) tuples, one per row that could be parsed.
location_info = extract_location_info_from_tsv(file_path)
# Optional inspection of the parsed result (left disabled):
# for info in location_info:
#     print(info)
# location_info
# The output CSV is written into the same folder as the input file.
output_filename = os.path.join(folder_path, 'location_info.csv')
save_to_csv(location_info, output_filename)
    

下面这段代码是蛋白质亚细胞定位提取的第二版：提取出所有定位，忽略异构体以及 Note 部分，只要有分布就算有定位。（这里处理的是 fasta 文件，不用管。）

In [18]:
import pandas as pd
import re

def extract_location(text):
    """
    Collect the distinct location names found in a subcellular-location text.

    Every ``SUBCELLULAR LOCATION:`` section of ``text`` is scanned. An
    optional ``[...]:`` prefix directly after the section label is skipped,
    and the section is read up to the next ``Note=``, the next
    ``SUBCELLULAR LOCATION:`` or the end of the text. ``{...}`` spans are
    removed, and what remains is split on ``.`` and ``;``.

    :param text: the raw text, or a missing value
    :return: ``None`` when ``text`` is missing (``pd.isna``); otherwise a
        list of unique, non-empty location strings in order of first
        appearance (an empty list when nothing matches)
    """
    if pd.isna(text):
        return None
    all_locations = []
    # Match each SUBCELLULAR LOCATION section, skipping an optional
    # "[...]:" prefix and stopping before any Note= part.
    subcellular_parts = re.findall(
        r"SUBCELLULAR LOCATION: (?:\[.*?\]:\s*)?(.*?)(?=Note=|SUBCELLULAR LOCATION:|$)",
        text,
        re.DOTALL,
    )
    for part in subcellular_parts:
        # Drop the evidence annotations written in curly braces.
        part = re.sub(r"\{.*?\}", "", part)
        # Individual locations are separated by periods or semicolons.
        for loc in re.split(r'[.;]', part):
            loc = loc.strip()
            if loc:
                all_locations.append(loc)
    # De-duplicate while keeping first-seen order. A plain set() would make
    # the label order arbitrary and not reproducible from run to run.
    return list(dict.fromkeys(all_locations))

# Read the TSV file.
# NOTE(review): `os` is not imported in this cell (only pandas and re are);
# it is available only because the first cell imported it. Running this cell
# on a fresh kernel without that cell would fail.
folder_path = r"D:\Han\software\Math\CSUpan\ShareCache\尹涵(数学与统计学院)\赛\一流专业人才计划\Codes\data"
file_name = r'uniprotkb_reviewed_true_AND_proteome_up.tsv'
file_path = os.path.join(folder_path, file_name)
df = pd.read_csv(file_path, sep="\t", encoding="utf-8")

# Keep only the experimentally validated data: rows whose location text
# contains the evidence code ECO:0000269 (missing values count as no match).
df = df[df["Subcellular location [CC]"].str.contains("ECO:0000269", na=False)]
df = df.reset_index(drop=True)

# Build the label column (a list of location names per row) and save the
# result as NewLabels4.csv in the same folder, without the index column.
df["location_label"] = df["Subcellular location [CC]"].apply(extract_location)
df.to_csv(os.path.join(folder_path, r'NewLabels4.csv'), index=False)

匹配并替换 locations 列：更改 updated_data.csv 文件后，运行下面这段代码即可。

In [1]:
import pandas as pd

def match_and_replace(updated_path=None, labels_path=None, output_path=None):
    """
    Attach location labels to the updated data set and save the matched rows.

    Both input CSV files are joined on their ``Entry`` column: each row of
    the updated data receives the ``location_label`` value of the label file
    in a ``subcellular_location`` column. Rows whose ``Entry`` has no label
    are dropped, and the result is written without the index column.

    Errors are reported with ``print`` instead of being raised, so a failed
    run leaves no output file and only prints a message.

    :param updated_path: CSV file to be labelled; defaults to
        ``updated_data.csv`` in the project data folder
    :param labels_path: CSV file providing ``Entry`` and ``location_label``;
        defaults to ``NewLabels4.csv`` in the project data folder
    :param output_path: CSV file to write; defaults to
        ``new_updated_data.csv`` in the project data folder
    :return: None
    """
    # Local import: the cell defining this function imports only pandas.
    import os

    # NOTE(review): absolute path on one specific machine; pass explicit
    # paths to run this anywhere else.
    default_dir = r"D:\Han\software\Math\CSUpan\ShareCache\尹涵(数学与统计学院)\赛\一流专业人才计划\Codes\data"
    if updated_path is None:
        updated_path = os.path.join(default_dir, "updated_data.csv")
    if labels_path is None:
        labels_path = os.path.join(default_dir, "NewLabels4.csv")
    if output_path is None:
        output_path = os.path.join(default_dir, "new_updated_data.csv")

    try:
        updated_data = pd.read_csv(updated_path)
        new_labels = pd.read_csv(labels_path)

        # Entry -> location_label lookup; for a repeated Entry the last
        # row of the label file wins.
        label_mapping = dict(zip(new_labels['Entry'], new_labels['location_label']))

        # Entries without a label become NaN here ...
        updated_data['subcellular_location'] = updated_data['Entry'].map(label_mapping)

        # ... and are removed, leaving only the matched rows.
        updated_data = updated_data.dropna(subset=['subcellular_location'])

        updated_data.to_csv(output_path, index=False)
        print(f"数据处理完成，新文件已保存为 {os.path.basename(output_path)}")
    except FileNotFoundError:
        print("错误：未找到所需的 CSV 文件，请检查文件路径和文件名。")
    except KeyError as err:
        # Name the columns this function actually reads.
        print(f"错误：文件中缺少必要的列 {err}（需要 Entry 和 location_label），请检查文件内容。")
    except Exception as e:
        print(f"发生未知错误：{e}")


# Run with the default paths when the cell / script is executed directly.
if __name__ == "__main__":
    match_and_replace()
    

数据处理完成，新文件已保存为 new_updated_data.csv
