此文件用于获取每个 GO 术语对应的蛋白质序列嵌入（embedding），并按术语分别保存正样本和负样本。

In [1]:
import pandas as pd
import torch
import numpy as np
import os

In [2]:
# Pickled GO term tables, one per aspect (P, F, C) for organism 9606.
# Each table is read below through its "terms" column.
_TERM_FILES = {
    "P": "/home/Kioedru/code/SSGO/data/finetune/9606/terms_P.pkl",
    "F": "/home/Kioedru/code/SSGO/data/finetune/9606/terms_F.pkl",
    "C": "/home/Kioedru/code/SSGO/data/finetune/9606/terms_C.pkl",
}

# Load the three term lists in P, F, C order; the generator keeps its loop
# variable out of the notebook namespace.
term_P, term_F, term_C = (
    pd.read_pickle(_TERM_FILES[_aspect])["terms"].tolist()
    for _aspect in ("P", "F", "C")
)

检查三个方面（P、F、C）的 GO 术语列表中是否存在同时出现在三个列表里的术语

In [3]:
# Disabled check (kept commented out): do the three term lists share elements?
# # Convert the lists to sets
# set1 = set(term_P)
# set2 = set(term_F)
# set3 = set(term_C)

# # Intersection of the three sets
# # NOTE(review): this only reports terms present in ALL three lists; a term
# # shared by just two of them is not detected.
# common_elements = set1 & set2 & set3

# # Print the result
# if common_elements:
#     print("三个列表中有重复的元素：", common_elements)
# else:
#     print("三个列表中没有重复的元素")

In [4]:
import sys

sys.path.append("/home/Kioedru/code/SSGO")
from codespace.utils.read_finetune_data import (
    read_seq_embed_avgpool_esm2_480_by_index_without_normalize,
    read_seq_embed_avgpool_prott5_1024_by_index_without_normalize,
    read_labels,
)


def get_embed_avgpool_by_index(feature_name, usefor, aspect, organism_num):
    """Dispatch to the embedding reader that matches ``feature_name``.

    Args:
        feature_name: Either ``"esm2-480"`` or ``"prott5-1024"``.
        usefor: Split selector forwarded to the reader (this notebook passes
            ``"train"`` and ``"valid"``).
        aspect: GO aspect forwarded to the reader (this notebook passes
            ``"P"``, ``"F"`` and ``"C"``).
        organism_num: Organism identifier forwarded to the reader (this
            notebook passes ``"9606"``).

    Returns:
        Whatever the selected reader returns for the given arguments.

    Raises:
        ValueError: If ``feature_name`` is not one of the two supported names.
    """
    if feature_name == "esm2-480":
        return read_seq_embed_avgpool_esm2_480_by_index_without_normalize(
            usefor, aspect, organism_num
        )
    if feature_name == "prott5-1024":
        return read_seq_embed_avgpool_prott5_1024_by_index_without_normalize(
            usefor, aspect, organism_num
        )
    # Name the exact accepted values and echo the rejected one, so the message
    # agrees with the comparisons above.
    raise ValueError(
        "feature_name must be 'esm2-480' or 'prott5-1024', "
        f"got {feature_name!r}"
    )


def get_seq_and_labels(feature_name, aspect, organism_num):
    """Load the train and valid splits and stack them into one data set.

    Embeddings come from ``get_embed_avgpool_by_index`` and labels from
    ``read_labels``; for each, the "train" part is placed before the "valid"
    part along axis 0. The shapes of both stacked arrays are printed.

    Args:
        feature_name: Embedding type passed to ``get_embed_avgpool_by_index``.
        aspect: GO aspect passed to both loaders.
        organism_num: Organism identifier passed to both loaders.

    Returns:
        Tuple ``(concat_seq, concat_labels)`` of the stacked embeddings and
        the stacked labels.
    """
    splits = ("train", "valid")

    seq_parts = [
        get_embed_avgpool_by_index(feature_name, split, aspect, organism_num)
        for split in splits
    ]
    concat_seq = np.concatenate(seq_parts, axis=0)

    label_parts = [read_labels(split, aspect, organism_num) for split in splits]
    concat_labels = np.concatenate(label_parts, axis=0)

    for stacked in (concat_seq, concat_labels):
        print(stacked.shape)
    return concat_seq, concat_labels

In [5]:
def get_samples(embed, labels):
    """Group embedding rows per label column by label value.

    For every column index of ``labels``, the rows of ``embed`` whose label in
    that column equals 1 are collected as positives and those whose label
    equals 0 as negatives.

    Args:
        embed: Array indexed by row; row ``i`` belongs to ``labels[i]``.
        labels: 2-D array; ``labels.shape[1]`` is the number of label columns.

    Returns:
        Tuple ``(positive_samples, negetive_samples)`` of dicts keyed by the
        label column index, each value being the selected rows of ``embed``.
    """
    column_count = labels.shape[1]

    def rows_with_label(column, value):
        # Integer row positions where this label column equals ``value``.
        row_ids = np.flatnonzero(labels[:, column] == value)
        return embed[row_ids]

    positives = {col: rows_with_label(col, 1) for col in range(column_count)}
    negatives = {col: rows_with_label(col, 0) for col in range(column_count)}
    return positives, negatives

In [6]:
def check_and_create_folder(folder_path):
    """Create ``folder_path`` (with missing parents) unless it already exists.

    Prints one message saying whether the folder was created or was already
    present.

    Args:
        folder_path: Path of the folder to check.
    """
    if os.path.exists(folder_path):
        print(f"文件夹 '{folder_path}' 已存在。")
        return
    os.makedirs(folder_path)
    print(f"文件夹 '{folder_path}' 已创建。")

In [15]:
def save_samples(
    positive_samples,
    negetive_samples,
    aspect,
    organism_num,
    feature_name,
    output_root="/home/Kioedru/code/SSGO/data/synthetic",
):
    """Pickle the positive and negative samples of every GO term to disk.

    For each key of ``positive_samples`` the matching GO term is looked up in
    the module-level term list of ``aspect``. All-zero rows are removed from
    both sample arrays, and the results are written to
    ``<output_root>/<feature_name>/<organism_num>/<aspect>/<term>/`` as
    ``<term>_Real_Training_Positive.pkl`` and
    ``<term>_Real_Training_Negative.pkl``.

    Args:
        positive_samples: Dict mapping a label index to a 2-D array of
            positive samples.
        negetive_samples: Dict with the same keys holding the negative
            samples (parameter name kept as-is for existing callers).
        aspect: One of ``"P"``, ``"F"``, ``"C"``; selects the term list.
        organism_num: Organism identifier used as a path component.
        feature_name: Embedding type used as a path component.
        output_root: Directory under which the tree is written. Defaults to
            the location that was previously hard-coded here.
    """
    terms = {"P": term_P, "F": term_F, "C": term_C}
    aspect_dir = f"{output_root}/{feature_name}/{organism_num}/{aspect}"
    for key in positive_samples.keys():
        positive_sample = _drop_all_zero_rows(positive_samples[key])
        negative_sample = _drop_all_zero_rows(negetive_samples[key])
        # The sample dict key is the position of the term in the term list.
        term = terms[aspect][key]
        term_path = os.path.join(aspect_dir, term)
        check_and_create_folder(term_path)
        pd.to_pickle(
            positive_sample,
            os.path.join(term_path, f"{term}_Real_Training_Positive.pkl"),
        )
        pd.to_pickle(
            negative_sample,
            os.path.join(term_path, f"{term}_Real_Training_Negative.pkl"),
        )


def _drop_all_zero_rows(samples):
    """Return ``samples`` without the rows whose entries are all zero."""
    return samples[~np.all(samples == 0, axis=1)]

In [16]:
# Build and save the per-term positive/negative sample files for every GO
# aspect (P, F, C) of organism "9606", using the "esm2-480" embeddings.
aspects = ["P", "F", "C"]
organism_num = "9606"
for aspect in aspects:
    # Stack the train and valid splits of this aspect (prints both shapes).
    concat_seq, concat_labels = get_seq_and_labels("esm2-480", aspect, organism_num)
    # Split the embedding rows per label column into label == 1 and label == 0.
    positive_samples, negetive_samples = get_samples(concat_seq, concat_labels)
    # Write one folder per GO term holding the two pickled sample arrays.
    save_samples(positive_samples, negetive_samples, aspect, organism_num, "esm2-480")

(3501, 480)
(3501, 45)
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0000209' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0000122' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0045944' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0016567' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0001934' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0045087' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0045893' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0010628' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0008285' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0006974' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0006508' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/9606/P/GO:0060271' 已创建。
文件夹 '/home/Kioedru/code/SSGO/data/synthetic/esm2-480/