### Nucleotide Transformer v2

Downloading model from Hugging Face

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

In [4]:
# Import the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-500m-multi-species", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-500m-multi-species", trust_remote_code=True)


In [None]:
# length to which the input sequences are padded
max_length = tokenizer.model_max_length     # 2048, hence 12kbp context window

# Create a dummy dna sequence and tokenize it (6-mers if multiple of 6)
sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt", padding="max_length", max_length = 11)["input_ids"]
print("Tokens shape:", tokens_ids.shape, "\n")


print("Tokens IDs:")
print(tokens_ids[:,:11])

print("Decoding back to sequences:")
decoded_sequences = tokenizer.batch_decode(tokens_ids, skip_special_tokens=False)
print(decoded_sequences)

Tokens shape: torch.Size([2, 11]) 

Tokens IDs:
tensor([[   3,  369,  369,  369,    1,    1,    1,    1,    1,    1,    1],
        [   3,  351, 2463, 2466, 3186, 1740, 4105, 4102, 4103,    1,    1]])
Decoding back to sequences:
['<cls> ATTCCG ATTCCG ATTCCG <pad> <pad> <pad> <pad> <pad> <pad> <pad>', '<cls> ATTTCT CTCTCT CTCTGA GATCGA TCGATC G A T <pad> <pad>']


In [None]:
# Compute the embeddings
attention_mask = tokens_ids != tokenizer.pad_token_id
print("Attention Mask", attention_mask)


torch_outs = model(
    tokens_ids,
    attention_mask=attention_mask,  # prevents attention to padding tokens
    encoder_attention_mask=attention_mask,
    output_hidden_states=True       # to get all layer embeddings
)

# Compute sequences embeddings
embeddings = torch_outs['hidden_states'][-1].detach().numpy()   # obtains final transformer layer embeddings
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embeddings per token: {embeddings}")

# Add embed dimension axis
attention_mask = torch.unsqueeze(attention_mask, dim=-1)

# Compute mean embeddings per sequence
mean_sequence_embeddings = torch.sum(attention_mask*embeddings, axis=-2)/torch.sum(attention_mask, axis=1)
print(f"Mean sequence embeddings: {mean_sequence_embeddings}")

tensor([[ True,  True,  True,  True, False, False, False, False, False, False,
         False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True, False,
         False]])
Embeddings shape: (2, 11, 1024)
Embeddings per token: [[[ 0.50637925  0.14925307  0.526706   ... -0.6013561   0.76712173
   -0.24135263]
  [ 0.30739102  0.01497138  0.31069276 ... -0.8692496   0.7053464
   -0.04934862]
  [ 0.27530244  0.2383724   0.16276649 ... -0.95568717  0.6267945
   -0.07545093]
  ...
  [ 0.9203838  -0.04136808  0.08182072 ... -0.53490424  0.2571789
   -0.13717528]
  [ 0.9504963   0.02858299  0.07393027 ... -0.49353155  0.380477
   -0.21202193]
  [ 1.0615091  -0.08195896  0.142325   ... -0.51462924  0.4050352
   -0.29604894]]

 [[-0.02620885 -0.50777876  0.3580304  ... -0.23096171  0.7863488
   -1.0120119 ]
  [ 0.49354693 -0.59404796  0.24656996 ... -0.3358443   0.06688058
    0.2506485 ]
  [ 0.0174152   0.091777    0.43233433 ... -0.20136136  0.33414203
    0.01311848]
  ...

  mean_sequence_embeddings = torch.sum(attention_mask*embeddings, axis=-2)/torch.sum(attention_mask, axis=1)
