<a href="https://colab.research.google.com/github/agemagician/ProtTrans/blob/master/Embedding/TensorFlow/Advanced/ProtBert-BFD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3>Extracting protein sequences' features using the ProtBert-BFD pretrained model</h3>

<b>1. Load the necessary libraries, including Hugging Face transformers</b>

In [1]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer,BertConfig
import re
import numpy as np

<b>2. Load the vocabulary and ProtBert-BFD Model</b>

In [2]:
# Fetch the ProtBert-BFD vocabulary; do_lower_case=False leaves the input's casing untouched.
tokenizer = BertTokenizer.from_pretrained(
    "Rostlab/prot_bert_bfd",
    do_lower_case=False,
)

In [3]:
# Load the pretrained encoder; from_pt=True builds the TF model from the PyTorch checkpoint.
model = TFBertModel.from_pretrained(
    "Rostlab/prot_bert_bfd",
    from_pt=True,
)

Metal device set to: Apple M1 Max

systemMemory: 64.00 GB
maxCacheSize: 24.00 GB



2022-12-19 22:55:27.725644: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-19 22:55:27.725791: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorc

<b>3. Create or load sequences and map rarely occurring amino acids (U, Z, O, B) to (X)</b>

In [64]:
# Toy batch: residues are space-separated and include rare amino acids (Z, O).
sequences_Example = [
    "A E T C Z A O",
    "S K T Z P",
]

In [65]:
# Normalize each sequence for ProtBert-BFD:
#   1. drop any existing whitespace,
#   2. map the rare residues U, Z, O, B to X,
#   3. re-insert a single space between residues so every amino acid becomes
#      its own token (an unspaced string would be encoded as one unknown token).
# Input that is already space-separated yields the same result as a plain
# residue substitution, and re-running the cell is a no-op.
sequences_Example = [
    " ".join(re.sub(r"[UZOB]", "X", "".join(sequence.split())))
    for sequence in sequences_Example
]

<b>4. Tokenize and encode the sequences, and load them onto the GPU if possible</b>

In [66]:
# Encode the whole batch by calling the tokenizer directly, which is the
# documented replacement for the deprecated batch_encode_plus.
# add_special_tokens=True wraps each sequence in [CLS] ... [SEP], padding=True
# pads to the longest sequence in the batch, and return_tensors="tf" returns
# TensorFlow tensors.
ids = tokenizer(
    sequences_Example,
    add_special_tokens=True,
    padding=True,
    return_tensors="tf",
)

In [67]:
# Inspect the encoded batch returned by the tokenizer.
print(ids)

{'input_ids': <tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[2, 1, 3],
       [2, 1, 3]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[0, 0, 0],
       [0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[1, 1, 1],
       [1, 1, 1]], dtype=int32)>}


In [68]:
# Pull the token ids and the padding mask out of the encoded batch.
input_ids, attention_mask = ids['input_ids'], ids['attention_mask']

<b>5. Extract the sequences' features and load them onto the CPU if needed</b>

In [69]:
# Run the encoder and keep output [0], the per-token hidden states.
# The attention mask must be passed because the batch is padded: without it
# the model treats [PAD] positions as real tokens, so the embeddings of the
# shorter sequences in the batch are influenced by their padding.
embedding = model(input_ids, attention_mask=attention_mask)[0]

In [70]:
# Shape check: (number of sequences, tokens per padded sequence, hidden size).
print(np.shape(embedding))

(2, 3, 1024)


In [45]:
# Convert the model output to a NumPy array so it can be sliced per sequence below.
embedding = np.asarray(embedding)

In [51]:
# Convert the mask to NumPy as well; the feature-extraction loop counts the
# entries equal to 1 to get each sequence's unpadded token count.
attention_mask = np.asarray(attention_mask)
print(attention_mask)

[[1 1 1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1 0 0]]


<b>6. Remove padding ([PAD]) and the trailing [SEP] token added by the ProtBert-BFD tokenizer (the leading [CLS] embedding is kept)</b>

In [58]:
# One array per sequence, trimmed to its unpadded tokens minus the last one
# ([SEP]). The slice starts at 0, so row 0 of every array is still the [CLS]
# vector; start the slice at 1 instead to drop [CLS] as well.
features = [
    seq_embedding[: (mask_row == 1).sum() - 1]
    for seq_embedding, mask_row in zip(embedding, attention_mask)
]

# TODO: the [CLS] embedding is supposedly the sentence-level embedding -- look into that.

In [59]:
# Dump every per-sequence feature array (very long output).
print(features)

[array([[ 7.87322819e-02,  2.53026970e-02,  1.00224458e-01,
        -8.79849680e-03, -2.71326303e-03, -9.33559164e-02,
        -5.79917133e-02, -3.28080095e-02,  3.19820866e-02,
        -6.15002215e-02,  4.22647819e-02, -2.44550426e-02,
         6.38098270e-02,  1.14060044e-02, -1.61623582e-02,
         1.02349676e-01, -1.84284687e-01,  6.46276772e-02,
         1.16509825e-01,  1.39597505e-02, -1.19588412e-02,
         1.42250955e-02, -1.03048213e-01, -1.21116221e-01,
        -6.46868646e-02,  8.19728449e-02,  1.90678649e-02,
        -6.76106960e-02,  9.12772268e-02,  7.03890175e-02,
        -9.48411524e-02, -7.01773614e-02,  1.47915497e-01,
        -3.59558612e-02,  8.44752975e-03,  8.08503479e-02,
        -1.49337545e-01, -9.96001437e-02, -4.90001142e-02,
        -7.17496052e-02,  6.81453245e-03,  1.36345446e-01,
         1.28263369e-01,  5.15055843e-02, -1.33880466e-01,
        -1.46779716e-01,  7.20262453e-02,  1.58067271e-01,
         4.91161123e-02, -6.64025098e-02,  7.20493495e-

In [49]:
# Raise NumPy's summarization threshold to sys.maxsize so printed arrays are
# shown in full instead of being abbreviated with "...".
# NOTE(review): this only affects cells executed after it, yet it sits below
# the first print(features) cell -- consider moving it up next to the imports
# so a top-to-bottom run behaves the same way.
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

In [60]:
# Shape of each trimmed feature array: (kept tokens, embedding size).
print(np.shape(features[0]))
print(np.shape(features[1]))

(8, 1024)
(6, 1024)


In [63]:
# First row of the first sequence's features; because the feature slice
# starts at index 0, this is the vector at the [CLS] position.
print(features[0][0])

[ 7.87322819e-02  2.53026970e-02  1.00224458e-01 -8.79849680e-03
 -2.71326303e-03 -9.33559164e-02 -5.79917133e-02 -3.28080095e-02
  3.19820866e-02 -6.15002215e-02  4.22647819e-02 -2.44550426e-02
  6.38098270e-02  1.14060044e-02 -1.61623582e-02  1.02349676e-01
 -1.84284687e-01  6.46276772e-02  1.16509825e-01  1.39597505e-02
 -1.19588412e-02  1.42250955e-02 -1.03048213e-01 -1.21116221e-01
 -6.46868646e-02  8.19728449e-02  1.90678649e-02 -6.76106960e-02
  9.12772268e-02  7.03890175e-02 -9.48411524e-02 -7.01773614e-02
  1.47915497e-01 -3.59558612e-02  8.44752975e-03  8.08503479e-02
 -1.49337545e-01 -9.96001437e-02 -4.90001142e-02 -7.17496052e-02
  6.81453245e-03  1.36345446e-01  1.28263369e-01  5.15055843e-02
 -1.33880466e-01 -1.46779716e-01  7.20262453e-02  1.58067271e-01
  4.91161123e-02 -6.64025098e-02  7.20493495e-03  3.95427905e-02
 -5.86873014e-03  1.99546404e-02 -1.55027837e-01 -5.41929007e-02
 -3.65594998e-02  6.01957738e-02 -4.98135202e-02 -9.07738805e-02
 -1.95820611e-02  9.50060

In [71]:
# Scratch check: print a string with a space between every character, the
# same one-symbol-per-token layout used by the example sequences above.
s = "BINGO"
print(*s)

B I N G O
