In [None]:
!pip install fair-esm

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0


In [None]:
!pip install Biopython

Collecting Biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Biopython
Successfully installed Biopython-1.85


In [None]:
import os
import pickle
from pathlib import Path
import time
import torch
import esm
from Bio import SeqIO
import numpy as np
import gc
import argparse
from sklearn.decomposition import PCA

In [None]:
!wget -O /content/drive/MyDrive/CapstoneProject/ESM2/esm2_t6_8M_UR50D.pt https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t6_8M_UR50D.pt
!wget -O /content/drive/MyDrive/CapstoneProject/ESM2/esm2_t6_8M_UR50D-contact-regression.pt https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t6_8M_UR50D-contact-regression.pt

In [None]:
# Run configuration shared by the cells below.

# Allow-list argparse.Namespace for torch's safe unpickling.
# NOTE(review): presumably required because the ESM checkpoint stores its arguments as
# an argparse.Namespace — confirm against the checkpoint being loaded.
torch.serialization.add_safe_globals([argparse.Namespace])
# Input proteome in FASTA format.
FASTA_PATH = "/content/drive/MyDrive/CapstoneProject/SeqRaw/UP000005640_9606.fasta"
# Disabled: OUTPUT_PATH is defined in the embedding cell instead, and no checkpoint
# file is written at the moment.
#OUTPUT_PATH = "/content/drive/MyDrive/CapstoneProject/Processed/seq_embeddings1.pkl"
#CHECKPOINT_PATH = OUTPUT_PATH + ".ckpt"
# Number of proteins per model call.
BATCH_SIZE = 1
# Checkpoint cadence in batches; only referenced by the disabled checkpoint code.
SAVE_INTERVAL = 50 #save checkpoint each 50 batches.
# === Read FASTA ===
def read_fasta_file(fasta_path):
    """Parse a FASTA file into a list of ``(record id, sequence)`` tuples.

    Args:
        fasta_path: Location of the FASTA file, passed straight to ``SeqIO.parse``.

    Returns:
        A list with one ``(id, sequence)`` tuple per record, in file order; the
        sequence is converted to a plain ``str``.
    """
    return [
        (entry.id, str(entry.seq))
        for entry in SeqIO.parse(fasta_path, "fasta")
    ]

# Load every protein record from the FASTA file once; later cells index into this list.
print("Reading fasta...")
protein_data = read_fasta_file(FASTA_PATH)  # list of (id, sequence) tuples; original note: 20644 x 2
print(f"Nums of Proteins: {len(protein_data)}")

# Resume point: the embedding loop starts at this index into protein_data and keeps
# incrementing the counter, so re-run this cell to reset it before re-running the loop.
processed_batches = 5000
# Indices into protein_data that the embedding loop skips; the loop appends further
# indices whenever it meets a sequence above its length limit.
long_batch_index = [5265,5353]

In [None]:
# === Output location and result container ===
# Checkpoint-based resuming is currently disabled: the loop below starts at
# `processed_batches` (set in the configuration cell) and the results are written
# once, after the loop finishes.
OUTPUT_PATH = "/content/drive/MyDrive/CapstoneProject/Processed/seq_embeddings4.pkl"
embeddings_dict = {}  # protein id -> mean-pooled embedding (numpy array)

# === Load the ESM-2 model (the checkpoint file must already exist at MODEL_PATH) ===
print("Loading ESM-2 model...")
MODEL_PATH = "/content/drive/MyDrive/CapstoneProject/ESM2/esm2_t33_650M_UR50D.pt"
# Not passed to the loader; kept to document the companion file.
# NOTE(review): the loader is expected to look for "<model name>-contact-regression.pt"
# next to MODEL_PATH on its own — confirm against the installed fair-esm version.
REGRESSION_PATH = "/content/drive/MyDrive/CapstoneProject/ESM2/esm2_t33_650M_UR50D-contact-regression.pt"
model, alphabet = esm.pretrained.load_model_and_alphabet_local(MODEL_PATH)
batch_converter = alphabet.get_batch_converter()
model.eval()
print("Moedl Load Successed")

# === Select the device: GPU when available, CPU otherwise ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded successfully and moved to device:", device)

# Read representations from the model's last layer. The previous hard-coded layer 6
# is an intermediate layer of the 33-layer checkpoint loaded above.
REPR_LAYER = model.num_layers
# Sequences longer than this are skipped and recorded in long_batch_index.
MAX_SEQ_LEN = 6000

# === Embed proteins, one protein per iteration, starting at `processed_batches` ===
# The loop advances one protein index at a time, so the total is the protein count.
total_batches = len(protein_data)

for i in range(processed_batches, len(protein_data)):
    # Indices already known to be too long are skipped without touching the model.
    if i in long_batch_index:
        print(f"Skip long batch {i + 1}")
        processed_batches += 1
        continue

    protein_id, sequence = protein_data[i]
    batch = [(protein_id, sequence)]
    batch_str_len = len(sequence)
    print(f"current batch length = {batch_str_len}")
    if batch_str_len > MAX_SEQ_LEN:
        print("Long batch Skip")
        long_batch_index.append(i)
        # Keep the progress counter in step with the loop index, as the skip branch
        # above does; otherwise every later progress line is off by one per skip.
        processed_batches += 1
        continue

    batch_labels, batch_strs, batch_tokens = batch_converter(batch)
    # Number of non-padding tokens per sequence (includes the start and end tokens).
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)
    batch_tokens = batch_tokens.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[REPR_LAYER])
    token_representations = results["representations"][REPR_LAYER]
    print(f"batche_len = {batch_lens} ")

    for j, tokens_len in enumerate(batch_lens):
        # Mean over residue positions only: position 0 and the last position hold the
        # start and end tokens.
        embedding = token_representations[j, 1:tokens_len - 1].mean(0)
        # No PCA here: a PCA cannot be fitted on a single vector, and the previous
        # attempt referenced an undefined name. Reducing the embeddings to 1024
        # dimensions has to be fitted on the stacked matrix of all proteins in a
        # later step.
        embeddings_dict[batch_labels[j]] = embedding.cpu().numpy()

    processed_batches += 1
    print(f"Processed batch: {processed_batches}/{total_batches}")

    # Release GPU memory before the next (possibly much longer) sequence.
    del batch_tokens, token_representations, results
    torch.cuda.empty_cache()
    gc.collect()

# === Save the embeddings and report which indices were skipped ===
with open(OUTPUT_PATH, "wb") as f:
    pickle.dump(embeddings_dict, f)
print(f"long batch index is {long_batch_index}")

In [None]:
# Spot check: length of the sequence stored at index 5000 of protein_data.
# Note that this rebinds the global `batch`, a name the embedding loop also assigns.
batch = protein_data[5000]
len(batch[1])

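Long batch indices (positions in `protein_data` that the embedding loop skipped because the sequences were too long): [5265, 5353, 6365, 11195, 11282, 12166, 12647, 13237, 13370, 16851, 19267, 20230]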

In [None]:
# Diagnostic only: run nvidia-smi to inspect the GPU state.
!nvidia-smi

In [None]:
# Manual cleanup between runs: empty PyTorch's CUDA cache, then run Python garbage collection.
torch.cuda.empty_cache()
gc.collect()


In [None]:
# Disabled draft of a command-line version of the embedding pipeline. The whole script
# is wrapped in a string literal, so nothing in this cell executes.
# NOTE(review): this draft passes a second regression_path argument to
# load_model_and_alphabet_local, whereas the active embedding cell passes only the
# model path — confirm the loader's signature before re-enabling this code.
"""
import argparse
import os
import pickle
from pathlib import Path

import torch
import esm
from Bio import SeqIO
import numpy as np

def read_fasta_file(fasta_path):
    data = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        data.append((record.id, str(record.seq)))
    return data

def main():
    parser = argparse.ArgumentParser(
        description="get Protein seq embeddings by esm-2"
    )
    parser.add_argument("--fasta", type=str, required=True,
                        help="seq_file_path")
    parser.add_argument("--output", type=str, required=True,
                        help="output path")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="batch size, 32 default")
    args = parser.parse_args()

    #read fasta
    print("reading fasta...")
    protein_data = read_fasta_file(args.fasta)
    print(f"Nums of Proteins：{len(protein_data)}")

    # load esm-2
    print("load esm-2 650M Para version, Dim 1280 Embeddings...")
    model_path = "/content/drive/MyDrive/CapstoneProject/ESM2/esm2_t33_650M_UR50D.pt"
    regression_path = "/content/drive/MyDrive/CapstoneProject/ESM2/esm2_t33_650M_UR50D-contact-regression.pt"
    model, alphabet = esm.pretrained.load_model_and_alphabet_local(model_path, regression_path)
    batch_converter = alphabet.get_batch_converter()
    model.eval()

    embeddings_dict = {}
    batch_size = args.batch_size
    total_batches = (len(protein_data) + batch_size - 1) // batch_size
    for i in range(0, len(protein_data), batch_size):
        batch = protein_data[i: i + batch_size]
        batch_labels, batch_strs, batch_tokens = batch_converter(batch)
        # no padding tokens
        batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[33])
        # last layer reault as output
        token_representations = results["representations"][33]

        # calcu means of the residues representations as global embeddings
        for j, tokens_len in enumerate(batch_lens):
            # token0 and the last token is the start and end signs respectively
            embedding = token_representations[j, 1:tokens_len-1].mean(0)
            protein_id = batch_labels[j]
            embeddings_dict[protein_id] = embedding.cpu().numpy()

        print(f"proccessed batch:{i//batch_size + 1}/{total_batches}")

    # saving the embedding dict
    output_path = Path(args.output)
    with open(output_path, "wb") as f:
        pickle.dump(embeddings_dict, f)
    print(f"Save Successfully at：{output_path}")

if __name__ == "__main__":
    main()
"""

In [None]:
# Add the project folder on Drive to the module search path so that Python files
# stored there can be imported by later cells.
import sys
sys.path.append("/content/drive/MyDrive/CapstoneProject")

In [None]:
!pip install dgl
!pip install torchdata



0.11.0+cpu
