In [None]:
!git clone https://github.com/ibmm-unibe-ch/TemBERTure.git



In [None]:
!cd TemBERTure


In [None]:
!git filter-branch --subdirectory-filter temBERTure -- --all

In [None]:
!conda install --file requirements.txt
!pip install -r requirements.txt

In [44]:
from transformers import BertTokenizer
from adapters import BertAdapterModel
#import logging
import tqdm
import math
import numpy as np
import torch.nn as nn
#logger = logging.getLogger(__name__)

class TemBERTure:
    """
    This class initializes and utilizes a pretrained BERT-based model (model_name) with adapter layers tuned
    for classification or regression tasks. The adapter path (adapter_path) provides the pre-trained
    adapter and head for the specified model and task (regression or classification).

    Attributes:
        adapter_path (str): Path to pre-trained adapters and heads for the model.
        model_name (str, default='Rostlab/prot_bert_bfd'): Name of the BERT-based model.
        batch_size (int, default=16): Batch size for predictions.
        device (str, default='cuda'): Device for running the model ('cuda' or 'cpu').

    Methods:
        __init__: Initializes the TemBERTure class with the specified BERT-based model,
                adapter path, tokenizer, batch size, and device.
        predict: Takes input texts, tokenizes them, and predicts outputs (classification/regression)
                using the loaded model and its adapters.
    """
    def __init__(self, adapter_path, model_name='Rostlab/prot_bert_bfd',batch_size=16, device='cuda', task = 'regression'):
        self.model = BertAdapterModel.from_pretrained(model_name) 
        self.model.load_adapter(adapter_path + 'AdapterBERT_adapter', with_head=True)
        self.model.load_head(adapter_path + 'AdapterBERT_head_adapter')
        self.model.set_active_adapters(['AdapterBERT_adapter'])
        self.model.active_head == 'AdapterBERT_head_adapter'  # pretrained for cls task adapter
        self.model.train_adapter(["AdapterBERT_adapter"])
        self.model.delete_head('default')
        self.model.bert.prompt_tuning = nn.Identity()
        #logger.info(f' * USING PRE-TRAINED ADAPTERS FROM: {adapter_path}')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.batch_size = batch_size
        self.device = device
        self.task = task
    
    def predict(self, input_texts):
        self.model = self.model.to(self.device)
        if not isinstance(input_texts, list):
            input_texts = [input_texts]
        input_texts = [" ".join("".join(sample.split())) for sample in input_texts]
        #input_texts = input_texts.tolist()
        nb_batches = math.ceil(len(input_texts) / self.batch_size)
        y_preds = []

        for i in range(nb_batches):
            batch_input = input_texts[i * self.batch_size: (i+1) * self.batch_size]
            encoded = self.tokenizer(batch_input, truncation=True, padding=True, max_length=512, return_tensors="pt").to(self.device)
            y_preds += self.model(**encoded).logits.reshape(-1).tolist()

        if self.task == 'classification':
            preds = 1 / (1 + np.exp(-np.array(y_preds)))
            y_preds = (preds > 0.5).astype(int)# Trasforma le probabilità in etichette binarie
            status = ['Thermophilic' if pred == 1 else 'Non-thermophilic' for pred in y_preds]
            
            return preds
        
        if self.task == 'regression':
            return y_preds

In [45]:
import pandas as pd

In [48]:
data_sb = pd.read_csv('top_100.csv', sep=',', names=['mutations', 'sequence', 'predicted_brightness', 'tm', 'mut'], header=0)
data_sb = pd.DataFrame(data_sb)
data_sb = data_sb.drop(['predicted_brightness', 'tm', 'mut'], axis=1)
data_sb

Unnamed: 0,mutations,sequence
0,T38N:L44M:Q69M:Y145F:A154P:T203I,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDANYGKLTMKF...
1,G4S:V61A:Q69M:Y145F:A154P:T203I,MSKSEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...
2,S30R:L44M:Q69M:Y145F:A154P:T203I,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATYGKLTMKF...
3,S30R:Y39H:L44I:Q69M:Y145F:A154P,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATHGKLTIKF...
4,L44M:Q69M:Y145F:A154P:T203I,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTMKF...
...,...,...
95,S30R:V61I:Q69M:Y145F:A154P:L220V,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATYGKLTLKF...
96,S30R:Y39H:L44I:A154P:S202H:T203H,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATHGKLTIKF...
97,S30R:T38N:Y39H:Y145F:A154P:V224I,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDANHGKLTLKF...
98,S30R:Y39H:Y145F:A154P:K158R:V224I,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATHGKLTLKF...


In [49]:
model = TemBERTure(
    adapter_path='temBERTure_CLS/',  # Path to the model adapter weights
    device='cpu',                                # Device to run the model on
    batch_size=1,                                 # Batch size for inference
    task='classification'                         # Task type (e.g., classification for TemBERTureCLS)
)

predictions = []

for idx, row in tqdm.tqdm(data_sb.iterrows(), total=len(data_sb)):
    cur_prediction = model.predict(row.sequence)[0]
    predictions.append(cur_prediction)
data_sb['temb_class_score'] = predictions

There are adapters available but none are activated for the forward pass.
100%|██████████| 100/100 [01:40<00:00,  1.01s/it]


In [50]:
data_sb

Unnamed: 0,mutations,sequence,temb_class_score
0,T38N:L44M:Q69M:Y145F:A154P:T203I,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDANYGKLTMKF...,0.061777
1,G4S:V61A:Q69M:Y145F:A154P:T203I,MSKSEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0.037479
2,S30R:L44M:Q69M:Y145F:A154P:T203I,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATYGKLTMKF...,0.069855
3,S30R:Y39H:L44I:Q69M:Y145F:A154P,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATHGKLTIKF...,0.069304
4,L44M:Q69M:Y145F:A154P:T203I,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTMKF...,0.063417
...,...,...,...
95,S30R:V61I:Q69M:Y145F:A154P:L220V,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATYGKLTLKF...,0.075845
96,S30R:Y39H:L44I:A154P:S202H:T203H,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATHGKLTIKF...,0.045234
97,S30R:T38N:Y39H:Y145F:A154P:V224I,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDANHGKLTLKF...,0.045758
98,S30R:Y39H:Y145F:A154P:K158R:V224I,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATHGKLTLKF...,0.045198


In [51]:
model_replica1 = TemBERTure(
    adapter_path='./temBERTure_TM/replica1/',  # Path to the adapter for replica 1
    device='cpu',                                        # Device to run the model on
    batch_size=1,                                        # Batch size for inference
    task='regression'                                     # Task type (e.g., regression for TemBERTureTM)
)

model_replica2 = TemBERTure(
    adapter_path='./temBERTure_TM/replica2/',  # Path to the adapter for replica 2
    device='cpu',                                        # Device to run the model on
    batch_size=1,                                        # Batch size for inference
    task='regression'                                     # Task type (e.g., regression for TemBERTureTM)
)

model_replica3 = TemBERTure(
    adapter_path='./temBERTure_TM/replica3/',  # Path to the adapter for replica 3
    device='cpu',                                        # Device to run the model on
    batch_size=1,                                        # Batch size for inference
    task='regression'                                     # Task type (e.g., regression for TemBERTureTM)
)


There are adapters available but none are activated for the forward pass.
There are adapters available but none are activated for the forward pass.
There are adapters available but none are activated for the forward pass.


In [52]:
predictions_rep1 = []

for idx, row in tqdm.tqdm(data_sb.iterrows(), total=len(data_sb)):
    cur_prediction = model_replica1.predict(row.sequence)[0]
    predictions_rep1.append(cur_prediction)
data_sb['temb_reg_1'] = predictions_rep1

100%|██████████| 100/100 [01:40<00:00,  1.01s/it]


In [53]:
predictions_rep2 = []

for idx, row in tqdm.tqdm(data_sb.iterrows(), total=len(data_sb)):
    cur_prediction = model_replica2.predict(row.sequence)[0]
    predictions_rep2.append(cur_prediction)
data_sb['temb_reg_2'] = predictions_rep2

100%|██████████| 100/100 [01:40<00:00,  1.00s/it]


In [54]:
predictions_rep3 = []

for idx, row in tqdm.tqdm(data_sb.iterrows(), total=len(data_sb)):
    cur_prediction = model_replica3.predict(row.sequence)[0]
    predictions_rep3.append(cur_prediction)
data_sb['temb_reg_3'] = predictions_rep3

100%|██████████| 100/100 [01:38<00:00,  1.01it/s]


In [55]:
data_sb.to_csv('./step1_res.csv', index = False)


In [56]:
data_sb


Unnamed: 0,mutations,sequence,temb_class_score,temb_reg_1,temb_reg_2,temb_reg_3
0,T38N:L44M:Q69M:Y145F:A154P:T203I,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDANYGKLTMKF...,0.061777,42.097389,46.428192,44.688885
1,G4S:V61A:Q69M:Y145F:A154P:T203I,MSKSEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,0.037479,41.035271,45.899796,42.164890
2,S30R:L44M:Q69M:Y145F:A154P:T203I,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATYGKLTMKF...,0.069855,41.565670,46.965614,43.408283
3,S30R:Y39H:L44I:Q69M:Y145F:A154P,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATHGKLTIKF...,0.069304,40.386913,46.615738,42.859192
4,L44M:Q69M:Y145F:A154P:T203I,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTMKF...,0.063417,42.030533,46.582745,44.046989
...,...,...,...,...,...,...
95,S30R:V61I:Q69M:Y145F:A154P:L220V,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATYGKLTLKF...,0.075845,41.370853,45.905045,43.067329
96,S30R:Y39H:L44I:A154P:S202H:T203H,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATHGKLTIKF...,0.045234,41.668400,45.941299,42.814514
97,S30R:T38N:Y39H:Y145F:A154P:V224I,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDANHGKLTLKF...,0.045758,41.893963,47.287422,44.011276
98,S30R:Y39H:Y145F:A154P:K158R:V224I,MSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATHGKLTLKF...,0.045198,40.596138,46.666203,43.793724


In [57]:
with open ('fasta_mut_seq.fasta', 'w') as file :
    for i, raw in tqdm.tqdm(data_sb.iterrows(), total = len(data_sb)) :
        seq = raw['sequence']
        name = raw['mutations']
        fasta_seq = f'>{name}\n{seq}\n'
        file.write(fasta_seq)

100%|██████████| 100/100 [00:00<00:00, 8291.27it/s]
