# Download the model if necessary

In [None]:
# Root of the GenomeOcean shared-drive folder. The same value is assigned again
# in the embedding test cell further down.
data_path="/content/drive/Shareddrives/GenomeOcean/"

In [None]:
# Print the NVIDIA driver / GPU status of the current runtime.
!nvidia-smi

Wed Dec  4 00:35:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
%%bash
# Download and install the AWS CLI from the bundled installer.
# Stop at the first failing step so the installer never runs on a missing or
# partial download.
set -euo pipefail

# -f: fail on HTTP errors instead of saving the error page as the zip.
# -L: follow redirects.
curl -fL "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "awscli-bundle.zip"
# -o: overwrite files left by a previous run instead of prompting, so the cell
# can be re-executed non-interactively.
unzip -o awscli-bundle.zip
# Installed before running the bundle installer (presumably it needs the venv
# module - TODO confirm). apt-get is the script-stable front end of apt.
apt-get install -y python3.10-venv

./awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws


Archive:  awscli-bundle.zip
  inflating: awscli-bundle/install   
  inflating: awscli-bundle/packages/virtualenv-16.7.8.tar.gz  
  inflating: awscli-bundle/packages/jmespath-0.10.0.tar.gz  
  inflating: awscli-bundle/packages/colorama-0.4.5.tar.gz  
  inflating: awscli-bundle/packages/botocore-1.35.74.tar.gz  
  inflating: awscli-bundle/packages/docutils-0.16.tar.gz  
  inflating: awscli-bundle/packages/pyyaml-6.0.2.tar.gz  
  inflating: awscli-bundle/packages/rsa-4.7.2.tar.gz  
  inflating: awscli-bundle/packages/s3transfer-0.10.4.tar.gz  
  inflating: awscli-bundle/packages/jmespath-1.0.1.tar.gz  
  inflating: awscli-bundle/packages/pyasn1-0.6.1.tar.gz  
  inflating: awscli-bundle/packages/python-dateutil-2.9.0.post0.tar.gz  
  inflating: awscli-bundle/packages/urllib3-1.26.20.tar.gz  
  inflating: awscli-bundle/packages/six-1.16.0.tar.gz  
  inflating: awscli-bundle/packages/awscli-1.36.15.tar.gz  
  inflating: awscli-bundle/packages/setup/setuptools_scm-3.3.3.tar.gz  
  inflating: 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  1 22.4M    1  304k    0     0   559k      0  0:00:41 --:--:--  0:00:41  558k100 22.4M  100 22.4M    0     0  19.3M      0  0:00:01  0:00:01 --:--:-- 19.3M




In [None]:
%%bash
# Create the AWS CLI credentials and config files for the default profile.
# Fill in the two key values below before running; never commit real keys
# together with the notebook.

# Anything created from here on is accessible to the owner only.
umask 077
mkdir -p ~/.aws

printf '%s\n' \
    "[default]" \
    "aws_access_key_id = " \
    "aws_secret_access_key = " > ~/.aws/credentials
# Also tighten the mode when the file already existed with looser permissions.
chmod 600 ~/.aws/credentials

printf '%s\n' \
    "[default]" \
    "output = json" \
    "region = us-east-1" > ~/.aws/config


In [None]:
# Sync the contents of s3://share.jgi-ga.org/genomeocean/ into the
# trained_models directory on the shared drive.
!aws s3 sync s3://share.jgi-ga.org/genomeocean/ /content/drive/Shareddrives/GenomeOcean/data/trained_models/

# Test the GenomeOcean model

## 1. Embedding

In [None]:
import os
import numpy as np
import transformers
import torch
import torch.utils.data as util_data
import torch.nn as nn
import tqdm

from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [None]:
def calculate_llm_embedding(dna_sequences,
                            model_name_or_path,
                            model_max_length=400,
                            batch_size=25,
                           ):
    """Compute one mean-pooled embedding per DNA sequence.

    The sequences are sorted by length before batching so that each batch holds
    sequences of similar length (less padding, which speeds up the computation).
    The returned rows are put back into the caller's original order.

    When more than one GPU is visible the model is wrapped in ``nn.DataParallel``
    and the effective batch size is ``batch_size`` times the number of GPUs.
    Without any GPU the model runs on the CPU.

    Args:
        dna_sequences: Sequence of DNA strings.
        model_name_or_path: Model identifier or directory handed to
            ``AutoTokenizer.from_pretrained`` / ``AutoModel.from_pretrained``.
        model_max_length: Maximum number of tokens per sequence; longer inputs
            are truncated by the tokenizer.
        batch_size: Per-device batch size. Adjust it to the available GPU memory.

    Returns:
        A float32 ``numpy.ndarray`` with one row per input sequence, in input
        order. For empty input an array of shape ``(0, 0)`` is returned without
        loading the tokenizer or the model.
    """
    dna_sequences = list(dna_sequences)
    if not dna_sequences:
        # Nothing to embed: skip the (expensive) model load. The hidden size is
        # unknown at this point, hence the zero-width second axis.
        return np.empty((0, 0), dtype=np.float32)

    # Process the sequences from shortest to longest.
    idx = np.argsort([len(seq) for seq in dna_sequences])
    dna_sequences = [dna_sequences[i] for i in idx]

    tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_name_or_path,
            cache_dir=None,
            model_max_length=model_max_length,
            padding_side="left",
            use_fast=True,
            trust_remote_code=True,
        )

    model = transformers.AutoModel.from_pretrained(
            model_name_or_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            # attn_implementation="flash_attention_2", #T4 doesn't support flash_att2
        )

    n_gpu = torch.cuda.device_count()
    device = torch.device("cuda" if n_gpu > 0 else "cpu")
    # At least one replica, so the batch size and worker count stay positive on
    # a machine without a GPU.
    n_replicas = max(n_gpu, 1)
    if n_gpu > 1:
        model = nn.DataParallel(model)

    model.to(device)

    loader = util_data.DataLoader(
        dna_sequences,
        batch_size=batch_size * n_replicas,
        shuffle=False,
        num_workers=2 * n_replicas,
        prefetch_factor=2,
    )

    pooled_batches = []
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            token_feat = tokenizer(
                    batch,
                    max_length=model_max_length,
                    return_tensors='pt',
                    padding='longest',
                    truncation=True
                )
            input_ids = token_feat['input_ids'].to(device)
            attention_mask = token_feat['attention_mask'].to(device)
            # Call the module (not .forward) so registered hooks are honoured.
            hidden = model(input_ids=input_ids, attention_mask=attention_mask)[0].cpu()

            # Mean over the non-padding positions. The masked product stays in
            # the model dtype, but the reduction accumulates in float32 so long
            # sequences do not lose precision to bfloat16 rounding.
            mask = attention_mask.unsqueeze(-1).cpu()
            summed = torch.sum(hidden * mask, dim=1, dtype=torch.float32)
            counts = torch.sum(mask, dim=1)
            pooled_batches.append(summed / counts)

    # Single concatenation instead of growing a tensor inside the loop.
    embeddings = torch.cat(pooled_batches, dim=0).numpy()

    # reorder the embeddings according to the original order
    embeddings = embeddings[np.argsort(idx)]

    return embeddings

In [None]:
# Locations of the trained model and the zymo mock sequence table on the shared drive.
data_path = "/content/drive/Shareddrives/GenomeOcean/"
model_path = os.path.join(data_path, "data/trained_models/meta_hmp_seq10240")
seq_path = os.path.join(
    data_path,
    "data/metagenome_binning/zymo_mock/embeddings/zymo_mock_seqs_10kb.csv.gz",
)
# Token limit passed to the tokenizer for this model.
model_max_length = 10240

# The sequence table is read as a gzip-compressed, tab-separated file; the
# sequences are taken from its 'seq' column.
dna = pd.read_csv(seq_path, compression="gzip", sep="\t")

embeddings = calculate_llm_embedding(
    dna["seq"].tolist(),
    model_path,
    model_max_length=model_max_length,
    batch_size=50,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  1%|          | 6/587 [01:09<2:12:59, 13.73s/it]