### Nucleotide Transformer v2

Downloading model from Hugging Face

In [3]:
import torch
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer

In [4]:
# Import the tokenizer and the model
# Single source of truth for the checkpoint so the tokenizer and the model
# can never drift apart.
MODEL_CHECKPOINT = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

# NOTE(review): trust_remote_code=True allows custom code shipped in the model
# repository to be executed locally; only keep it for checkpoints you trust.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, trust_remote_code=True)
# The masked-LM head variant is loaded; the embedding cell below reads its
# hidden states rather than its logits.
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT, trust_remote_code=True)


In [None]:
# Maximum number of tokens the tokenizer reports for this model.
# It is NOT used for padding below; it is kept for reference.
max_length = tokenizer.model_max_length     # 2048

# Length (in tokens, <cls> included) to which this demo batch is padded.
# Deliberately short so the printed tensors stay readable.
padded_length = 11

# Create a dummy dna sequence and tokenize it (6-mers if multiple of 6).
# Nucleotides left over after the last full 6-mer are emitted as
# single-nucleotide tokens (see the second decoded sequence in the output).
sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
tokens_ids = tokenizer.batch_encode_plus(
    sequences,
    return_tensors="pt",
    padding="max_length",
    max_length=padded_length,
)["input_ids"]
print("Tokens shape:", tokens_ids.shape, "\n")


print("Tokens IDs: Starts with <CLS> token and splits into 6-mers if multiple of 6, padded with <PAD> tokens")
print(tokens_ids[:, :padded_length])

# Round-trip the ids back to text, keeping <cls>/<pad> visible on purpose.
print("Decoding back to sequences:")
decoded_sequences = tokenizer.batch_decode(tokens_ids, skip_special_tokens=False)
print("Decoded sequences:", decoded_sequences)

Tokens shape: torch.Size([2, 11])
Tokens IDs: Starts with <CLS> token and splits into 6-mers if multiple of 6, padded with <PAD> tokens
tensor([[   3,  369,  369,  369,    1,    1,    1,    1,    1,    1,    1],
        [   3,  351, 2463, 2466, 3186, 1740, 4105, 4102, 4103,    1,    1]])
Decoding back to sequences:
Decoded sequences: ['<cls> ATTCCG ATTCCG ATTCCG <pad> <pad> <pad> <pad> <pad> <pad> <pad>', '<cls> ATTTCT CTCTCT CTCTGA GATCGA TCGATC G A T <pad> <pad>']


In [6]:
# Compute the embeddings
# Boolean mask: True for real tokens (including <cls>), False for <pad>.
attention_mask = tokens_ids != tokenizer.pad_token_id

# Inference only: skip building the autograd graph to save memory and time.
with torch.no_grad():
    torch_outs = model(
        tokens_ids,
        attention_mask=attention_mask,
        encoder_attention_mask=attention_mask,
        output_hidden_states=True
    )

# Compute sequences embeddings
# Keep the last-layer hidden states as a torch tensor for the pooling below;
# `embeddings` remains a NumPy array, as before, for inspection/downstream use.
last_hidden_state = torch_outs['hidden_states'][-1]
embeddings = last_hidden_state.detach().cpu().numpy()
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embeddings per token: {embeddings}")

# Add embed dimension axis
# (the mask becomes (batch, tokens, 1) so it broadcasts over the embedding axis;
# it is recomputed at the top of this cell, so re-running the cell is safe)
attention_mask = torch.unsqueeze(attention_mask, dim=-1)

# Compute mean embeddings per sequence
# Masked mean over the token axis: padded positions contribute zero to the
# numerator and are excluded from the token count in the denominator.
# Pure torch arithmetic here avoids mixing a torch mask with a NumPy array.
mean_sequence_embeddings = (
    torch.sum(attention_mask * last_hidden_state, dim=-2)
    / torch.sum(attention_mask, dim=1)
)
print(f"Mean sequence embeddings: {mean_sequence_embeddings}")

Embeddings shape: (2, 2048, 1024)
Embeddings per token: [[[ 0.506377    0.14925334  0.5267083  ... -0.6013588   0.76712173
   -0.24135433]
  [ 0.3073899   0.01497174  0.31069803 ... -0.8692521   0.7053512
   -0.04935056]
  [ 0.2753041   0.23837496  0.16276833 ... -0.9556887   0.6267955
   -0.07545134]
  ...
  [ 0.19590917 -0.07998952  0.10465808 ...  0.00264151  0.46054763
   -0.0351527 ]
  [ 0.3376015  -0.20001088  0.04658983 ...  0.02611579  0.44138515
   -0.21832661]
  [ 0.07734504 -0.34250122  0.07812366 ...  0.20738798  0.2853003
   -0.15824707]]

 [[-0.02621049 -0.5077791   0.3580327  ... -0.23096023  0.7863456
   -1.0120155 ]
  [ 0.4935458  -0.5940486   0.24656974 ... -0.3358434   0.06688204
    0.25064632]
  [ 0.01741558  0.09177573  0.43233514 ... -0.20135874  0.33414453
    0.01312017]
  ...
  [ 0.20584676 -0.76628757  0.59376454 ... -0.5139848   0.6144526
   -1.1537104 ]
  [-0.07695752 -0.36022145 -0.37434888 ... -0.67634785  0.42619026
   -1.0293765 ]
  [-0.68402076 -0.5873

  mean_sequence_embeddings = torch.sum(attention_mask*embeddings, axis=-2)/torch.sum(attention_mask, axis=1)


Mean sequence embeddings: tensor([[ 0.3634,  0.1437,  0.2810,  ..., -0.8623,  0.6422, -0.1028],
        [ 0.3041, -0.1513,  0.3624,  ..., -0.0744,  0.0820, -0.2940]])
