<a href="https://colab.research.google.com/github/julian2001/AlphaFold3VeraBiotech/blob/main/Embedding/PyTorch/Advanced/ProtBert-BFD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3> Extracting protein sequences' features using the ProtBert-BFD pretrained model </h3>

<b>1. Load the necessary libraries, including Hugging Face transformers</b>

In [None]:
# Mount Google Drive at /content/drive; the logging/results cell at the end of
# this notebook writes to paths under that mount point.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [1]:
!pip install -q transformers

In [2]:
import torch
from transformers import BertModel, BertTokenizer
import re
import os
import requests
from tqdm.auto import tqdm

<b>2. Load the vocabulary and ProtBert-BFD Model</b>

In [3]:
# Load the ProtBert-BFD vocabulary; lower-casing is disabled so the
# upper-case residue letters used in the sequences are kept as they are.
tokenizer = BertTokenizer.from_pretrained(
    "Rostlab/prot_bert_bfd",
    do_lower_case=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]



In [4]:
# Load the pretrained ProtBert-BFD encoder from the Hugging Face Hub.
model = BertModel.from_pretrained(
    "Rostlab/prot_bert_bfd",
)

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

<b>3. Load the model onto the GPU if available and switch to inference mode</b>

In [5]:
# Use the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [6]:
# Move the weights to the selected device and switch to evaluation mode.
# Both calls return the module itself, so they can be chained.
model = model.to(device).eval()

<b>4. Create or load sequences and map rarely occurring amino acids (U, Z, O, B) to (X)</b>

In [7]:
# Toy batch of two sequences; residues are written as space-separated letters.
sequences_Example = [
    "A E T C Z A O",
    "S K T Z P",
]

In [8]:
# Replace every U, Z, O or B with the placeholder residue X in each sequence.
sequences_Example = [
    re.sub(r"[UZOB]", "X", raw_sequence)
    for raw_sequence in sequences_Example
]

<b>5. Tokenize and encode the sequences, and load them onto the GPU if possible</b>

In [9]:
# Tokenize the whole batch, add the special tokens, and pad every sequence to
# the length of the longest one in the batch. `padding=True` replaces the
# deprecated `pad_to_max_length=True`, which (with no max_length given) selects
# the same longest-in-batch padding but emits a deprecation warning.
ids = tokenizer.batch_encode_plus(sequences_Example, add_special_tokens=True, padding=True)



In [10]:
# Turn the encoded token ids and the attention mask into tensors on `device`.
input_ids, attention_mask = (
    torch.tensor(ids[field]).to(device)
    for field in ("input_ids", "attention_mask")
)

<b>6. Extract the sequences' features and move them to the CPU if needed</b>

In [11]:
# Forward pass with gradient tracking disabled; element 0 of the model output
# is kept as the per-token embedding.
with torch.no_grad():
    embedding = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )[0]

In [12]:
# Copy the embeddings to host memory as a NumPy array.
# The tensor check keeps this cell safe to re-run: once `embedding` has already
# been converted to an array it has no `.cpu()` method, so it is left untouched.
if torch.is_tensor(embedding):
    embedding = embedding.cpu().numpy()

<b>7. Remove the padding ([PAD]) and special tokens ([CLS], [SEP]) that are added by the ProtBert-BFD model</b>

In [13]:
# Count the attended (mask == 1) positions of every sequence as plain Python ints.
token_counts = (attention_mask == 1).sum(dim=1).tolist()

# For each sequence keep positions 1 .. count-2, i.e. drop the first and the
# last attended position (the special tokens) together with all padding.
features = [
    token_embeddings[1:token_count - 1]
    for token_embeddings, token_count in zip(embedding, token_counts)
]

In [14]:
# Show the list of per-sequence embedding arrays collected in `features`.
print(features)

[array([[ 0.05551013, -0.10461219, -0.0325426 , ...,  0.05091707,
         0.04319375,  0.10180862],
       [ 0.1389565 , -0.04658427,  0.02193583, ...,  0.06942758,
         0.14762945,  0.06503999],
       [ 0.14610647, -0.08092867, -0.12500264, ..., -0.03651207,
         0.02485622,  0.07977507],
       ...,
       [ 0.02349986, -0.01549877, -0.05685236, ..., -0.01342201,
         0.0170432 ,  0.06431113],
       [ 0.08130052, -0.10929585, -0.03022971, ...,  0.08717732,
         0.02061495,  0.05156804],
       [ 0.06197343, -0.06417911, -0.02039755, ..., -0.02796477,
         0.08840054,  0.07532751]], dtype=float32), array([[-0.06304389, -0.23687494, -0.07115818, ..., -0.03852159,
        -0.0032202 , -0.05244163],
       [ 0.0190558 , -0.10517225, -0.02930141, ..., -0.00238695,
        -0.09289833,  0.02722679],
       [ 0.07721861, -0.17031859, -0.1398785 , ..., -0.08390117,
         0.03587991, -0.01317149],
       [ 0.00872697, -0.17718233, -0.05856205, ..., -0.09917984,
     

In [None]:
import os
import logging

# Set up directories for logs and results.
# Both paths live under /content/drive, so Google Drive must be mounted first.
log_dir = '/content/drive/My Drive/drug_discovery_datasets/logs'
os.makedirs(log_dir, exist_ok=True)

save_dir = '/content/drive/My Drive/drug_discovery_datasets/training_results'
os.makedirs(save_dir, exist_ok=True)

# Initialize logging for errors.
# basicConfig() silently does nothing when the root logger already has
# handlers, which is common inside notebook kernels. force=True (Python 3.8+)
# removes the existing root handlers first, so errors really are written to the
# log file and re-running this cell does not leave a stale configuration.
logging.basicConfig(
    filename=os.path.join(log_dir, 'training_errors.log'),
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Helper that records an error through the root logger
def log_error(message):
    """Emit ``message`` at ERROR level on the root logger.

    Args:
        message: The text (or object) to record; it is passed to the
            ``logging`` module unchanged.
    """
    logging.log(logging.ERROR, message)

# Example paths
# NOTE(review): the "chemberta" job name and zip file name look carried over
# from a different notebook — confirm they are intended for this ProtBert-BFD run.
jobname = "chemberta_experiment"
zip_output_path = '/content/drive/My Drive/drug_discovery_datasets/chemberta_results.zip'

# Call the function to save and archive results
# NOTE(review): `save_and_archive_results` and the metric variables passed below
# (loss, accuracy, training_loss, f1, precision, recall, auc, specificity,
# conf_matrix, roc_curve_vals, pr_curve_vals, mae, mse, rmse) are not defined in
# any cell visible in this notebook; on a fresh kernel this call would
# presumably raise NameError — TODO confirm where they are supposed to come from.
save_and_archive_results(loss, accuracy, training_loss, f1, precision, recall, auc, specificity, conf_matrix, roc_curve_vals, pr_curve_vals, mae, mse, rmse, model, save_dir, jobname, zip_output_path, save_to_google_drive=True)