In [1]:
from Bio import SeqIO
import numpy as np
import torch
from esm.models.esmc import ESMC
from esm.sdk.api import ESMProtein, LogitsConfig

fasta_path = "output_cdhit70.fasta"
emb_dict = {}
client = ESMC.from_pretrained("esmc_600m").to("cuda")

for record in SeqIO.parse(fasta_path, "fasta"):
    seq_id = record.id           # id after >
    description = record.description  
    seq = str(record.seq)

    protein = ESMProtein(sequence=seq)
    protein_tensor = client.encode(protein)

    logits_output = client.logits(
    protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
    )
    print(logits_output.logits, logits_output.embeddings)
    emb = logits_output.embeddings
    if emb.dim() == 3 and emb.shape[0] == 1:
        emb = emb[0]

        emb_dict[seq_id] = emb.cpu().numpy()  # (L, D)

    np.savez("esmc600m_residue_embeddings.npz", **emb_dict)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 4006.98it/s]


ForwardTrackData(sequence=tensor([[[-21.7500, -21.6250, -21.8750,  ..., -21.7500, -21.7500, -21.7500],
         [-25.1250, -25.0000, -25.0000,  ..., -25.0000, -25.1250, -25.1250],
         [-28.3750, -28.2500, -28.3750,  ..., -28.2500, -28.3750, -28.2500],
         ...,
         [-23.2500, -23.2500, -23.3750,  ..., -23.1250, -23.3750, -23.2500],
         [-19.6250, -19.7500, -19.7500,  ..., -19.6250, -19.7500, -19.7500],
         [-19.7500, -19.7500, -19.7500,  ..., -19.6250, -19.7500, -19.7500]]],
       device='cuda:0', dtype=torch.bfloat16), structure=None, secondary_structure=None, sasa=None, function=None) tensor([[[-0.0061, -0.0053, -0.0057,  ...,  0.0062,  0.0095,  0.0198],
         [-0.0192,  0.0614,  0.0042,  ..., -0.0321, -0.0227,  0.0500],
         [-0.0146,  0.0140, -0.0180,  ...,  0.0063,  0.0055,  0.0161],
         ...,
         [ 0.0046,  0.0101, -0.0282,  ...,  0.0097, -0.0108,  0.0017],
         [ 0.0160,  0.0048, -0.0180,  ..., -0.0372,  0.0147,  0.0486],
         [-0

In [1]:
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

NameError: name 'torch' is not defined

In [None]:
import torch
import gc

# 1. Delete the variables holding the model and data
# (If you don't do this, Python keeps the memory "in use", so clearing cache won't help)
try:
    del model
    del inputs
    del outputs
except NameError:
    pass # Variables didn't exist, that's fine

# 2. Run Python Garbage Collection
# This forces Python to actually let go of the deleted variables
gc.collect()

# 3. Clear the CUDA Cache
# This tells the GPU: "Anything that is not currently in use, mark it as free."
torch.cuda.empty_cache()

# 4. Check memory
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

In [7]:
import numpy as np


file_path = 'esmc600m_residue_embeddings.npz'

# 加载 .npz
data = np.load(file_path, allow_pickle=True)
#print("文件的键名：", data.files)

keys = data.files              # 获取所有键名的列表  :contentReference[oaicite:0]{index=0}
num_keys = len(keys)
print("键名总数：", num_keys)


embeddings = data['P43408|Topt=70.0|Source=Seq2']
print("embeddings.shape =", embeddings.shape)
print("数据类型 dtype =", embeddings.dtype)

print("前 5 行：/n", embeddings[:5])

# 用完之后关闭句柄（推荐）
data.close()


键名总数： 2681
embeddings.shape = (194, 960)
数据类型 dtype = float32
前 5 行：/n [[ 0.0041868   0.011073    0.00299249 ... -0.00300843  0.00256654
  -0.01597064]
 [-0.05313397  0.01745777  0.04466331 ...  0.02839143  0.04543718
   0.0307509 ]
 [-0.03387368  0.02209267  0.03826474 ...  0.05044253 -0.00937684
   0.00907512]
 [-0.01397859  0.02686415 -0.01112987 ... -0.00163873 -0.00069075
  -0.01434276]
 [-0.04089601  0.03477665 -0.01770025 ...  0.03291729  0.0419331
  -0.0273476 ]]
