<a href="https://colab.research.google.com/github/jmbaek/proteinBERT/blob/main/ProteinBERT_benchmarks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Check the GPU device**


In [None]:
# Print the NVIDIA driver / GPU status so it is clear which GPU (if any) is attached to this runtime.
!nvidia-smi

Wed Mar 30 12:26:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
!wget ftp://ftp.cs.huji.ac.il/users/nadavb/protein_bert/protein_benchmarks/* -P /content/protein_data/

--2022-03-31 11:00:12--  ftp://ftp.cs.huji.ac.il/users/nadavb/protein_bert/protein_benchmarks/*
           => ‘/content/protein_data/.listing’
Resolving ftp.cs.huji.ac.il (ftp.cs.huji.ac.il)... 132.65.116.15
Connecting to ftp.cs.huji.ac.il (ftp.cs.huji.ac.il)|132.65.116.15|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /users/nadavb/protein_bert/protein_benchmarks ... done.
==> PASV ... done.    ==> LIST ... done.

.listing                [ <=>                ]   2.15K  --.-KB/s    in 0.03s   

2022-03-31 11:00:15 (77.9 KB/s) - ‘/content/protein_data/.listing’ saved [2197]

Removed ‘/content/protein_data/.listing’.
--2022-03-31 11:00:15--  ftp://ftp.cs.huji.ac.il/users/nadavb/protein_bert/protein_benchmarks/PhosphositePTM.test.csv
           => ‘/content/protein_data/PhosphositePTM.test.csv’
==> CWD not required.
==> PASV ... done.    ==> RETR PhosphositePTM.test.csv ... done.
Length: 10368100 (9.9M)




## Setup

In [2]:
!git clone https://github.com/nadavbra/protein_bert.git
!git clone https://github.com/nadavbra/shared_utils.git
!cp -R protein_bert/proteinbert/ /usr/lib/python3.7/proteinbert/
!cp -R shared_utils/ /usr/lib/python3.7/proteinbert/

Cloning into 'protein_bert'...
remote: Enumerating objects: 128, done.[K
remote: Counting objects: 100% (128/128), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 128 (delta 66), reused 55 (delta 22), pack-reused 0[K
Receiving objects: 100% (128/128), 8.68 MiB | 12.72 MiB/s, done.
Resolving deltas: 100% (66/66), done.
Cloning into 'shared_utils'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 48 (delta 26), reused 37 (delta 15), pack-reused 0[K
Unpacking objects: 100% (48/48), done.


In [3]:
# Directory holding the benchmark CSV files; it must match the -P target of the wget download cell.
BENCHMARKS_DIR = '/content/protein_data/'
# Pre-create the directory for the pretrained model dump; the fine-tuning run log below reports
# /root/proteinbert_models/default.pkl as the file load_pretrained_model() looks for.
!mkdir -p /root/proteinbert_models/

## Fine-tune the model for the signal peptide benchmark

In [25]:
import os

import pandas as pd
from IPython.display import display
from tensorflow import keras
from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

BENCHMARK_NAME = 'signalP_binary'

# Output specification: one binary label per protein, i.e. a global (non-sequence) binary output.
# The "Run all benchmarks" log reports this same OutputType(False, 'binary') as "global binary".
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)


# ---- Dataset ----
# Rows with missing values and exact duplicate rows are dropped on load. Only train/test CSVs are read here,
# so 10% of the training records (stratified by label, fixed seed) are held out as the validation set.

train_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.train.csv' % BENCHMARK_NAME)
train_set = (
    pd.read_csv(train_set_file_path)
    .dropna()
    .drop_duplicates()
)
train_set, valid_set = train_test_split(
    train_set,
    test_size = 0.1,
    stratify = train_set['label'],
    random_state = 0,
)

test_set_file_path = os.path.join(BENCHMARKS_DIR, '%s.test.csv' % BENCHMARK_NAME)
test_set = (
    pd.read_csv(test_set_file_path)
    .dropna()
    .drop_duplicates()
)

print(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')


# ---- Pre-trained model + fine-tuning ----

pretrained_model_generator, input_encoder = load_pretrained_model()

# get_model_with_hidden_layers_as_outputs exposes the hidden layers to the fine-tuning head
# (in addition to the regular model output).
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function = get_model_with_hidden_layers_as_outputs,
    dropout_rate = 0.5,
)

training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(factor = 0.25, patience = 1, min_lr = 1e-05, verbose = 1),
    keras.callbacks.EarlyStopping(restore_best_weights = True, patience = 2),
]

finetune(
    model_generator,
    input_encoder,
    OUTPUT_SPEC,
    train_set['seq'],
    train_set['label'],
    valid_set['seq'],
    valid_set['label'],
    seq_len = 512,
    batch_size = 32,
    max_epochs_per_stage = 40,
    lr = 1e-04,
    begin_with_frozen_pretrained_layers = True,
    lr_with_frozen_pretrained_layers = 1e-02,
    n_final_epochs = 1,
    final_seq_len = 1024,
    final_lr = 1e-05,
    callbacks = training_callbacks,
)


# ---- Test-set evaluation ----

results, confusion_matrix = evaluate_by_len(
    model_generator,
    input_encoder,
    OUTPUT_SPEC,
    test_set['seq'],
    test_set['label'],
    start_seq_len = 512,
    start_batch_size = 32,
)

print('Test-set performance:')
display(results)

print('Confusion matrix:')
display(confusion_matrix)

14945 training set records, 1661 validation set records, 4152 test set records.
 Local model dump file /root/proteinbert_models/default.pkl doesn't exist. Will download ftp://ftp.cs.huji.ac.il/users/nadavb/protein_bert/epoch_92400_sample_23500000.pkl into /root/proteinbert_models. Please approve or reject this (to exit and potentially call the function again with different parameters).
Do you approve downloadig the file into the specified directory? Please specify "Yes" or "No":Yes
Downloaded file: /root/proteinbert_models/epoch_92400_sample_23500000.pkl
Created: /root/proteinbert_models/default.pkl
[2022_03_31-01:51:51] Training set: Filtered out 0 of 14945 (0.0%) records of lengths exceeding 510.
[2022_03_31-01:51:52] Validation set: Filtered out 0 of 1661 (0.0%) records of lengths exceeding 510.
[2022_03_31-01:51:52] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 2: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 3/40
Epoch 4/40
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 9/40
Epoch 9: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
[2022_03_31-01:55:43] Training the entire fine-tuned model...
[2022_03_31-01:55:52] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 2: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 5: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 6/40
[2022_03_31-02:02:00] Training on final epochs of sequence length 1024...
[2022_03_31-02:02:00] Training set: Filtered out 0 of 14945 (0.0%) records of lengths exceeding 1022.
[2022_03_31-02:02:01] Validation set: Filtered out 0 of 1661 (0.

Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,4152,0.996456
All,4152,0.996456


Confusion matrix:


Unnamed: 0,0,1
0,3441,37
1,32,642


## Run all benchmarks

In [26]:
import os

import pandas as pd
from IPython.display import display
from tensorflow import keras
from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len, log
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# Benchmarks to run, in execution order. Each entry is a (benchmark name, OutputType) pair.
# The name doubles as the CSV file prefix used by load_benchmark_dataset:
# '<name>.train.csv', '<name>.valid.csv' (optional) and '<name>.test.csv' under BENCHMARKS_DIR.
# NOTE(review): the first OutputType argument looks like the per-residue ("sequence") flag - the run logs print
# "global ..." for False and "... sequence" for True - and the second the label kind; confirm against
# proteinbert.OutputType.
BENCHMARKS = [
    # name, output_type
    ('signalP_binary', OutputType(False, 'binary')),
    ('fluorescence', OutputType(False, 'numeric')),
    ('remote_homology', OutputType(False, 'categorical')),
    ('stability', OutputType(False, 'numeric')),
    ('scop', OutputType(False, 'categorical')),
    ('secondary_structure', OutputType(True, 'categorical')),
    ('disorder_secondary_structure', OutputType(True, 'binary')),
    ('ProFET_NP_SP_Cleaved', OutputType(False, 'binary')),
    ('PhosphositePTM', OutputType(True, 'binary')),
]

# Hyper-parameters shared by all benchmarks; read by run_benchmark() at call time.
settings = {
    'max_dataset_size': None,  # None = use every record; an int caps each of train/valid/test at that many sampled records
    'max_epochs_per_stage': 40,  # forwarded as finetune(max_epochs_per_stage = ...)
    'seq_len': 512,  # forwarded as finetune(seq_len = ...) and evaluate_by_len(start_seq_len = ...)
    'batch_size': 32,  # forwarded as finetune(batch_size = ...) and evaluate_by_len(start_batch_size = ...)
    'final_epoch_seq_len': 1024,  # forwarded as finetune(final_seq_len = ...)
    'initial_lr_with_frozen_pretrained_layers': 1e-02,  # forwarded as finetune(lr_with_frozen_pretrained_layers = ...)
    'initial_lr_with_all_layers': 1e-04,  # forwarded as finetune(lr = ...)
    'final_epoch_lr': 1e-05,  # forwarded as finetune(final_lr = ...)
    'dropout_rate': 0.5,  # forwarded as FinetuningModelGenerator(dropout_rate = ...)
    'training_callbacks': [
        # These same callback objects are handed to every finetune() call made by run_benchmark().
        keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1),
        keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True),
    ],
}

####### Uncomment for debug mode
# (quick smoke run: at most 500 records per split and a single epoch per training stage)
# settings['max_dataset_size'] = 500
# settings['max_epochs_per_stage'] = 1

def run_benchmark(benchmark_name, pretraining_model_generator, input_encoder, pretraining_model_manipulation_function = None):
    '''
    Fine-tune the pretrained model on a single benchmark and display its performance on every split.

    Steps: look up the benchmark's output type, load its train/validation/test sets, optionally subsample them
    (settings['max_dataset_size']), cast the labels, derive the label vocabulary, fine-tune with `finetune`, and
    evaluate the training, validation and test sets with `evaluate_by_len`.

    Parameters:
        benchmark_name: name of an entry in BENCHMARKS; also the CSV file prefix under BENCHMARKS_DIR.
        pretraining_model_generator: pretrained model generator, passed to FinetuningModelGenerator.
        input_encoder: input encoder, passed to finetune and evaluate_by_len.
        pretraining_model_manipulation_function: optional function forwarded to FinetuningModelGenerator.

    Returns:
        The FinetuningModelGenerator that was fine-tuned.

    Raises:
        ValueError: if no output type is known for benchmark_name.
    '''

    log('========== %s ==========' % benchmark_name)

    output_type = get_benchmark_output_type(benchmark_name)

    if output_type is None:
        # Guard against a missing output type (e.g. a name that is not listed in BENCHMARKS); otherwise the
        # failure would surface later as an opaque AttributeError on `output_type.is_seq`.
        raise ValueError('Unknown benchmark: %r' % benchmark_name)

    log('Output type: %s' % output_type)

    train_set, valid_set, test_set = load_benchmark_dataset(benchmark_name)
    log(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')

    max_dataset_size = settings['max_dataset_size']

    if max_dataset_size is not None:
        log('Limiting the training, validation and test sets to %d records each.' % max_dataset_size)
        train_set, valid_set, test_set = [dataset.sample(min(max_dataset_size, len(dataset)), random_state = 0) for dataset in \
                (train_set, valid_set, test_set)]

    # Sequence and categorical labels are handled as strings, everything else as floats.
    # DataFrame.assign builds new frames instead of writing a column into frames that were themselves derived
    # by filtering/sampling/splitting, which is what triggers pandas' SettingWithCopyWarning.
    label_type = str if (output_type.is_seq or output_type.is_categorical) else float
    train_set, valid_set, test_set = [dataset.assign(label = dataset['label'].astype(label_type)) for dataset in \
            (train_set, valid_set, test_set)]

    if output_type.is_categorical:

        if output_type.is_seq:
            # Every character of every label string is a class. `set().union(...)` also copes with the case of
            # no records at all, where `set.union(*[])` would raise a TypeError.
            unique_labels = sorted(set().union(*(set(label) for dataset in (train_set, valid_set, test_set) \
                    for label in dataset['label'])))
        else:
            unique_labels = sorted(set(train_set['label'].unique()) | set(valid_set['label'].unique()) | set(test_set['label'].unique()))

        log('%d unique lebels.' % len(unique_labels))
    elif output_type.is_binary:
        unique_labels = [0, 1]
    else:
        unique_labels = None

    output_spec = OutputSpec(output_type, unique_labels)
    model_generator = FinetuningModelGenerator(pretraining_model_generator, output_spec, pretraining_model_manipulation_function = \
            pretraining_model_manipulation_function, dropout_rate = settings['dropout_rate'])
    finetune(model_generator, input_encoder, output_spec, train_set['seq'], train_set['label'], valid_set['seq'], valid_set['label'], \
            seq_len = settings['seq_len'], batch_size = settings['batch_size'], max_epochs_per_stage = settings['max_epochs_per_stage'], \
            lr = settings['initial_lr_with_all_layers'], begin_with_frozen_pretrained_layers = True, lr_with_frozen_pretrained_layers = \
            settings['initial_lr_with_frozen_pretrained_layers'], n_final_epochs = 1, final_seq_len = settings['final_epoch_seq_len'], \
            final_lr = settings['final_epoch_lr'], callbacks = settings['training_callbacks'])

    for dataset_name, dataset in [('Training-set', train_set), ('Validation-set', valid_set), ('Test-set', test_set)]:

        log('*** %s performance: ***' % dataset_name)
        results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, output_spec, dataset['seq'], dataset['label'], \
                start_seq_len = settings['seq_len'], start_batch_size = settings['batch_size'])

        # The results table is shown in full; the confusion matrix is truncated because it can be very large
        # (one row/column per label).
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            display(results)

        if confusion_matrix is not None:
            with pd.option_context('display.max_rows', 16, 'display.max_columns', 10):
                log('Confusion matrix:')
                display(confusion_matrix)

    return model_generator

def load_benchmark_dataset(benchmark_name):
    '''
    Load the training, validation and test sets of a benchmark from BENCHMARKS_DIR.

    The files are '<benchmark_name>.train.csv', '<benchmark_name>.valid.csv' and '<benchmark_name>.test.csv'.
    Rows with missing values and duplicate rows are dropped from every file. When the validation file does not
    exist, 10% of the training records (stratified by the 'label' column, fixed random state) are split off
    and used as the validation set instead.

    Returns a (train_set, valid_set, test_set) tuple of DataFrames.
    '''

    def _read_records(csv_path):
        return pd.read_csv(csv_path).dropna().drop_duplicates()

    train_set_file_path, valid_set_file_path, test_set_file_path = [
        os.path.join(BENCHMARKS_DIR, file_name_pattern % benchmark_name)
        for file_name_pattern in ('%s.train.csv', '%s.valid.csv', '%s.test.csv')
    ]

    train_records = _read_records(train_set_file_path)
    test_records = _read_records(test_set_file_path)

    if not os.path.exists(valid_set_file_path):
        log(f'Validation set {valid_set_file_path} missing. Splitting training set instead.')
        train_records, valid_records = train_test_split(train_records, stratify = train_records['label'], test_size = 0.1, \
                random_state = 0)
        return train_records, valid_records, test_records

    return train_records, _read_records(valid_set_file_path), test_records

def get_benchmark_output_type(benchmark_name):
    '''
    Return the output type registered for benchmark_name in BENCHMARKS.

    Raises ValueError when the name is not listed, instead of silently returning None (which would only fail
    later, when the caller accesses attributes of the missing output type).
    '''
    for name, output_type in BENCHMARKS:
        if name == benchmark_name:
            return output_type

    raise ValueError('Unknown benchmark %r; expected one of: %s' % (benchmark_name, ', '.join(name for name, _ in BENCHMARKS)))
        
# The pretrained model is loaded once and shared by all benchmark runs.
pretrained_model_generator, input_encoder = load_pretrained_model()

# Benchmarks run one after another, in the order listed in BENCHMARKS. The output type of each entry is
# ignored here because run_benchmark looks it up by name.
for benchmark_name, _ in BENCHMARKS:
    run_benchmark(
        benchmark_name,
        pretrained_model_generator,
        input_encoder,
        pretraining_model_manipulation_function = get_model_with_hidden_layers_as_outputs,
    )

log('Done.')

[2022_03_31-02:04:28] Output type: global binary
[2022_03_31-02:04:28] Validation set /content/protein_data/signalP_binary.valid.csv missing. Splitting training set instead.
[2022_03_31-02:04:28] 14945 training set records, 1661 validation set records, 4152 test set records.
[2022_03_31-02:04:28] Training set: Filtered out 0 of 14945 (0.0%) records of lengths exceeding 510.
[2022_03_31-02:04:30] Validation set: Filtered out 0 of 1661 (0.0%) records of lengths exceeding 510.
[2022_03_31-02:04:30] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 4/40
Epoch 5/40
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 6/40
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_31-02:06:58] Training the entire fine-tuned model...
[2022_03_31-02:07:07] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 4/40
Epoch 5/40
Epoch 5: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 6/40
[2022_03_31-02:13:13] Training on final epochs of sequence length 1024...
[2022_03_31-02:13:13] Training set: Filtered out 0 of 14945 (0.0%) records of lengths exceeding 1022.
[2022_03_31-02:13:14] Validation set: Filtered out 0 of 1661 (0.0%) records of lengths exceeding 1022.
[2022_03_31-02:15:32] *** Training-set performance: ***


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,14945,0.999865
All,14945,0.999865


[2022_03_31-02:15:55] Confusion matrix:


Unnamed: 0,0,1
0,12499,18
1,26,2402


[2022_03_31-02:15:55] *** Validation-set performance: ***


  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,1661,0.99288
All,1661,0.99288


[2022_03_31-02:16:01] Confusion matrix:


Unnamed: 0,0,1
0,1377,14
1,6,264


[2022_03_31-02:16:01] *** Test-set performance: ***


  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,4152,0.995746
All,4152,0.995746


[2022_03_31-02:16:10] Confusion matrix:


Unnamed: 0,0,1
0,3448,30
1,35,639


[2022_03_31-02:16:10] Output type: global numeric
[2022_03_31-02:16:10] 21446 training set records, 5362 validation set records, 27217 test set records.
[2022_03_31-02:16:10] Training set: Filtered out 0 of 21446 (0.0%) records of lengths exceeding 510.
[2022_03_31-02:16:11] Validation set: Filtered out 0 of 5362 (0.0%) records of lengths exceeding 510.
[2022_03_31-02:16:12] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 4/40
Epoch 5/40
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 6/40
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_31-02:20:04] Training the entire fine-tuned model...
[2022_03_31-02:20:13] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 2: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 5: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
[2022_03_31-02:38:06] Training on final epochs of sequence length 1024...
[2022_03_31-02:38:06] Training set: Filtered out 0 of 21446 (0.0%) records of lengths exceeding 1022.
[2022_03_31-02:38:47] Validation set: Filtered out 0 of 5362 (0.0%) records of lengths exceeding 1022.
[2

Unnamed: 0_level_0,# records,Spearman's rank correlation
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,21446,0.683609
All,21446,0.683609


[2022_03_31-02:42:38] *** Validation-set performance: ***


  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Spearman's rank correlation
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,5362,0.674749
All,5362,0.674749


[2022_03_31-02:42:49] *** Test-set performance: ***


  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Spearman's rank correlation
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,27217,0.642266
All,27217,0.642266


[2022_03_31-02:43:28] Output type: global categorical
[2022_03_31-02:43:29] 12311 training set records, 736 validation set records, 718 test set records.
[2022_03_31-02:43:29] 1195 unique lebels.
[2022_03_31-02:43:29] Training set: Filtered out 153 of 12311 (1.2%) records of lengths exceeding 510.
[2022_03_31-02:43:29] Validation set: Filtered out 13 of 736 (1.8%) records of lengths exceeding 510.
[2022_03_31-02:43:29] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 2: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
[2022_03_31-02:44:37] Training the entire fine-tuned model...
[2022_03_31-02:44:46] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 2: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 1e-05.
[2022_03_31-02:47:22] Training on final epochs of sequence length 1024...
[2022_03_31-02:47:22] Training set: Filtered out 7 of 12311 (0.1%) records of lengths exceeding 1022.
[2022_03_31-02:47:23] Validation set: Filtered out 2 of 736 (0.3%) records of lengths exceeding 1022.
[2022_03_31-02:49:17] *** Training-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Accuracy
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,12158,0.9152
1024,146,0.452055
2048,7,0.142857
All,12311,0.909268


[2022_03_31-02:49:44] Confusion matrix:


Unnamed: 0,0,1,10,100,1000,...,995,996,997,998,999
0,80,0,0,0,0,...,0,0,0,0,0
1,0,33,0,0,0,...,0,0,0,0,0
10,0,0,14,0,0,...,0,0,0,0,0
100,0,0,0,8,0,...,0,0,0,0,0
1000,0,0,0,0,2,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,...,0,0,0,0,0
996,0,0,0,0,0,...,0,0,0,0,0
997,0,0,0,0,0,...,0,0,0,0,0
998,0,0,0,0,0,...,0,0,0,0,0


[2022_03_31-02:49:44] *** Validation-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Accuracy
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,723,0.257261
1024,11,0.272727
2048,2,0.0
All,736,0.256793


[2022_03_31-02:49:55] Confusion matrix:


Unnamed: 0,0,1,10,100,1000,...,995,996,997,998,999
0,0,0,0,0,0,...,0,0,0,0,0
1,0,4,0,0,0,...,0,0,0,0,0
10,0,0,0,0,0,...,0,0,0,0,0
100,0,0,0,0,0,...,0,0,0,0,0
1000,0,0,0,0,0,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,...,0,0,0,0,0
996,0,0,0,0,0,...,0,0,0,0,0
997,0,0,0,0,0,...,0,0,0,0,0
998,0,0,0,0,0,...,0,0,0,0,0


[2022_03_31-02:49:55] *** Test-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Accuracy
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,711,0.21519
1024,7,0.142857
All,718,0.214485


[2022_03_31-02:50:03] Confusion matrix:


Unnamed: 0,0,1,10,100,1000,...,995,996,997,998,999
0,0,0,0,0,0,...,0,0,0,0,0
1,0,6,0,0,0,...,0,0,0,0,0
10,0,0,0,0,0,...,0,0,0,0,0
100,0,0,0,0,0,...,0,0,0,0,0
1000,0,0,0,0,0,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,...,0,0,0,0,0
996,0,0,0,0,0,...,0,0,0,0,0
997,0,0,0,0,0,...,0,0,0,0,0
998,0,0,0,0,0,...,0,0,0,0,0


[2022_03_31-02:50:03] Output type: global numeric
[2022_03_31-02:50:03] 53613 training set records, 2512 validation set records, 12851 test set records.
[2022_03_31-02:50:03] Training set: Filtered out 0 of 53613 (0.0%) records of lengths exceeding 510.
[2022_03_31-02:50:06] Validation set: Filtered out 0 of 2512 (0.0%) records of lengths exceeding 510.
[2022_03_31-02:50:06] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 7/40
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_31-02:59:17] Training the entire fine-tuned model...
[2022_03_31-02:59:27] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 9: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 10/40
Epoch 11/40
Epoch 11: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 12/40
[2022_03_31-03:41:13] Training on final epochs of sequence length 1024...
[2022_03_31-03:41:13] Training set: Filtered out 0 of 53613 (0.0%) records of lengths exceeding 1022.
[2022_03_31-03:42:08] Validation set: Filtered out 0 of 2512 (0.0%) records of lengths exceed

Unnamed: 0_level_0,# records,Spearman's rank correlation
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,53613,0.806009
All,53613,0.806009


[2022_03_31-03:50:43] *** Validation-set performance: ***


  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Spearman's rank correlation
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,2512,0.757541
All,2512,0.757541


[2022_03_31-03:50:50] *** Test-set performance: ***


  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Spearman's rank correlation
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,12851,0.733131
All,12851,0.733131


[2022_03_31-03:51:10] Output type: global categorical
[2022_03_31-03:51:10] Validation set /content/protein_data/scop.valid.csv missing. Splitting training set instead.
[2022_03_31-03:51:10] 14112 training set records, 1568 validation set records, 3921 test set records.
[2022_03_31-03:51:10] 7 unique lebels.
[2022_03_31-03:51:10] Training set: Filtered out 6 of 14112 (0.0%) records of lengths exceeding 510.
[2022_03_31-03:51:11] Validation set: Filtered out 1 of 1568 (0.1%) records of lengths exceeding 510.
[2022_03_31-03:51:11] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 2: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 3/40
Epoch 4/40
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 5/40
Epoch 6/40
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 7/40
Epoch 7: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
[2022_03_31-03:53:54] Training the entire fine-tuned model...
[2022_03_31-03:54:03] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 2: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 1e-05.
[2022_03_31-03:57:02] Training on final epochs of sequence length 1024...
[2022_03_31-03:57:02] Training set: Filtered out 0 of 14112 (0.0%) records of lengths exceeding 1022.
[2022_03_31-03:57:03] Validation set: Filtered out 0 of 1568 (0.0%) records of lengths exceeding 1022.
[2022_03_31-03:5

  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Accuracy
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,14106,0.937332
1024,6,1.0
All,14112,0.937358


[2022_03_31-03:59:38] Confusion matrix:


Unnamed: 0,a,b,c,d,e,f,g
a,2253,5,28,65,1,0,2
b,7,2780,29,137,3,1,4
c,12,11,4110,58,6,0,0
d,57,169,196,2905,29,3,2
e,4,3,8,3,243,0,0
f,1,0,1,0,0,194,0
g,9,12,2,16,0,0,743


[2022_03_31-03:59:38] *** Validation-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Accuracy
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,1567,0.910657
1024,1,1.0
All,1568,0.910714


[2022_03_31-03:59:46] Confusion matrix:


Unnamed: 0,a,b,c,d,e,f,g
a,249,0,4,8,0,1,0
b,1,307,4,14,2,1,0
c,2,1,451,10,2,0,0
d,5,26,32,306,2,2,0
e,2,0,6,2,19,0,0
f,3,3,0,0,0,16,0
g,5,1,0,1,0,0,80


[2022_03_31-03:59:46] *** Test-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Accuracy
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,3919,0.879561
1024,2,0.5
All,3921,0.879368


[2022_03_31-03:59:58] Confusion matrix:


Unnamed: 0,a,b,c,d,e,f,g
a,594,2,14,36,5,0,3
b,1,721,22,70,5,2,2
c,2,8,1121,32,3,0,0
d,23,86,95,713,10,5,1
e,4,2,5,5,56,1,0
f,3,2,1,1,0,47,1
g,6,5,0,10,0,0,196


[2022_03_31-03:59:58] Output type: categorical sequence
[2022_03_31-03:59:58] 8678 training set records, 2170 validation set records, 434 test set records.
[2022_03_31-03:59:58] 3 unique lebels.
[2022_03_31-03:59:58] Training set: Filtered out 570 of 8678 (6.6%) records of lengths exceeding 510.
[2022_03_31-03:59:59] Validation set: Filtered out 138 of 2170 (6.4%) records of lengths exceeding 510.
[2022_03_31-03:59:59] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 2: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 3/40
Epoch 4/40
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 5/40
Epoch 6/40
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 7/40
Epoch 7: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 8/40
Epoch 8: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
[2022_03_31-04:03:28] Training the entire fine-tuned model...
[2022_03_31-04:03:37] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 12: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 13/40
Epoch 13: ReduceLROnPlateau reducing learning rate to 1e-05.
[2022_03_31-04:10:58] Training on fin

  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Accuracy
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,1834580,0.768242
1024,351854,0.731426
2048,35073,0.70895
All,2221507,0.761475


[2022_03_31-04:12:58] Confusion matrix:


Unnamed: 0,0,1,2
0,631678,23372,144036
1,30127,303123,142206
2,110809,79336,756820


[2022_03_31-04:12:58] *** Validation-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Accuracy
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,464337,0.761505
1024,84878,0.733099
2048,7716,0.717081
All,556931,0.756561


[2022_03_31-04:13:12] Confusion matrix:


Unnamed: 0,0,1,2
0,163044,6544,37649
1,8513,71828,34577
2,29426,18870,186480


[2022_03_31-04:13:12] *** Test-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,Accuracy
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,85550,0.746277
1024,21222,0.71407
All,106772,0.739876


[2022_03_31-04:13:18] Confusion matrix:


Unnamed: 0,0,1,2
0,27581,1337,7473
1,1582,14043,7691
2,5177,4514,37374


[2022_03_31-04:13:18] Output type: binary sequence
[2022_03_31-04:13:19] 8678 training set records, 2170 validation set records, 434 test set records.
[2022_03_31-04:13:19] Training set: Filtered out 570 of 8678 (6.6%) records of lengths exceeding 510.
[2022_03_31-04:13:19] Validation set: Filtered out 138 of 2170 (6.4%) records of lengths exceeding 510.
[2022_03_31-04:13:20] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 4/40
Epoch 5/40
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 6/40
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 7/40
Epoch 7: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 8/40
Epoch 8: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 9/40
[2022_03_31-04:15:38] Training the entire fine-tuned model...
[2022_03_31-04:15:46] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 4: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 5/40
Epoch 5: ReduceLROnPlateau reducing learning rate to 1e-05.
[2022_03_31-04:18:41] Training on final epochs of sequence length 1024...
[2022_03_31-04:18:41] Training set: Filtered out 29 of 8678 (0.3%) records of lengths exceeding 1022.
[2022_03_31-04:

  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,1834580,0.934639
1024,351854,0.90784
2048,35073,0.802276
All,2221507,0.928966


[2022_03_31-04:20:36] Confusion matrix:


Unnamed: 0,0,1
0,74877,69796
1,22190,2054644


[2022_03_31-04:20:36] *** Validation-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,464337,0.920565
1024,84878,0.907677
2048,7716,0.938385
All,556931,0.920006


[2022_03_31-04:20:50] Confusion matrix:


Unnamed: 0,0,1
0,17609,17309
1,5972,516041


[2022_03_31-04:20:50] *** Test-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,85550,0.888659
1024,21222,0.797247
All,106772,0.869728


[2022_03_31-04:20:56] Confusion matrix:


Unnamed: 0,0,1
0,1144,1661
1,954,103013


[2022_03_31-04:20:56] Output type: global binary
[2022_03_31-04:20:57] 2727 training set records, 303 validation set records, 337 test set records.
[2022_03_31-04:20:57] Training set: Filtered out 520 of 2727 (19.1%) records of lengths exceeding 510.
[2022_03_31-04:20:57] Validation set: Filtered out 58 of 303 (19.1%) records of lengths exceeding 510.
[2022_03_31-04:20:57] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 8/40
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_31-04:21:36] Training the entire fine-tuned model...
[2022_03_31-04:21:46] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 3: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 4/40
Epoch 4: ReduceLROnPlateau reducing learning rate to 1e-05.
[2022_03_31-04:22:29] Training on final epochs of sequence length 1024...
[2022_03_31-04:22:29] Training set: Filtered out 169 of 2727 (6.2%) records of lengths exceeding 1022.
[2022_03_31-04:22:30] Validation set: Filtered out 17 of 303 (5.6%) records of lengths exceeding 1022.
[2022_03_31-04:23:09] *** Training-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,2207,0.998631
1024,351,0.893254
2048,136,0.844444
4096,26,
8192,7,
All,2727,0.99621


[2022_03_31-04:23:30] Confusion matrix:


Unnamed: 0,0,1
0,841,29
1,15,1842


[2022_03_31-04:23:30] *** Validation-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)




Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,245,0.979922
1024,41,0.858974
2048,13,
4096,3,
8192,1,
All,303,0.978853


[2022_03_31-04:23:48] Confusion matrix:


Unnamed: 0,0,1
0,67,10
1,10,216


[2022_03_31-04:23:48] *** Test-set performance: ***


  super(Adam, self).__init__(name, **kwargs)




  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,267,0.977267
1024,45,1.0
2048,18,
4096,6,
8192,1,
All,337,0.982184


[2022_03_31-04:24:05] Confusion matrix:


Unnamed: 0,0,1
0,93,12
1,6,226


[2022_03_31-04:24:05] Output type: binary sequence
[2022_03_31-04:24:06] 43356 training set records, 4825 validation set records, 8508 test set records.
[2022_03_31-04:24:06] Training set: Filtered out 18426 of 43356 (42.5%) records of lengths exceeding 510.
[2022_03_31-04:24:09] Validation set: Filtered out 2062 of 4825 (42.7%) records of lengths exceeding 510.
[2022_03_31-04:24:09] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 5/40
Epoch 6/40
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 7/40
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_31-04:28:52] Training the entire fine-tuned model...
[2022_03_31-04:29:01] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 5: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 6/40
Epoch 7/40
Epoch 7: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
[2022_03_31-04:48:42] Training on final epochs of sequence length 1024...
[2022_03_31-04:48:42] Training set: Filtered out 5814 of 43356 (13.4%) records of lengths exceeding 1022.
[2022_03_31-04:48:50] Validation set: Filtered out 623 of 4825 (12.9%) records of lengths 

  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,7439719,0.954433
1024,8931566,0.947519
2048,6457352,0.944159
4096,2399373,0.940191
8192,862534,0.938123
16384,57495,0.855059
32768,26926,
65536,137282,0.910883
All,26312247,0.947869


[2022_03_31-04:56:52] Confusion matrix:


Unnamed: 0,0,1
0,25874332,9788
1,413724,14403


[2022_03_31-04:56:52] *** Validation-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,823459,0.953357
1024,1029074,0.946452
2048,677731,0.943063
4096,285021,0.941351
8192,77959,0.927205
16384,8749,0.969611
All,2901993,0.947072


[2022_03_31-04:57:25] Confusion matrix:


Unnamed: 0,0,1
0,2853789,1024
1,45642,1538


[2022_03_31-04:57:25] *** Test-set performance: ***


  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)
  super(Adam, self).__init__(name, **kwargs)


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,1476347,0.951163
1024,1697696,0.94682
2048,1222679,0.944734
4096,545823,0.940524
8192,222772,0.93013
16384,8886,0.957155
All,5174203,0.946605


[2022_03_31-04:58:12] Confusion matrix:


Unnamed: 0,0,1
0,5089705,2031
1,79895,2572


[2022_03_31-04:58:12] Done.



## Visualizing the attention layers

Run this section only after you have fine-tuned the model on a benchmark (e.g. signal peptide) and obtained `model_generator`; the code below uses it to build the fine-tuned model.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import display
from tensorflow import keras
from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len, log
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

# Human-readable benchmark name; used in the title of the fine-tuning panel.
BENCHMARK_DISPLAY_NAME = 'Signal peptide'

# CSV file from which the sequence to visualize is picked.
# NOTE(review): the constant is named TEST_SET_* but the path points at the
# *train* split - confirm that this is intended.
TEST_SET_FILE_PATH = 'protein_data/signalP_binary.train.csv'
# Target sequence length: the record whose length is closest to this value is visualized.
IDEAL_LEN = 80

def calculate_attentions(model, input_encoder, seq, seq_len = None):
    """
    Evaluate every global-attention layer of `model` on a single sequence.

    Parameters:
        model: a Keras-style model exposing `.layers`. Layers whose type name
            contains 'InputLayer' supply the model inputs; layers whose type
            name contains 'GlobalAttention' must provide `calculate_attention`.
        input_encoder: object providing `encode_X(seqs, seq_len)`.
        seq: the sequence to encode.
        seq_len: encoding length passed to the encoder; defaults to
            `len(seq) + 2` (presumably room for the added start/end tokens -
            confirm against the encoder).

    Returns:
        A tuple `(attention_values, seq_tokens, attention_labels)`:
        * attention_values - numpy array with one row per (attention layer, head),
          in layer-then-head order (presumably one value per sequence position).
        * seq_tokens - tokens of the encoded sequence, looked up in `index_to_token`.
        * attention_labels - one 'Attention <layer> - head <head>' label per row.
    """

    from tensorflow.keras import backend as K
    from proteinbert.tokenization import index_to_token

    effective_len = len(seq) + 2 if seq_len is None else seq_len

    encoded_inputs = input_encoder.encode_X([seq], effective_len)
    # The first encoded input holds the token indices of the (single) sequence.
    (token_indices,), _ = encoded_inputs
    tokens = [index_to_token.get(token_index) for token_index in token_indices]

    # One pass over the layers: gather the model's input tensors and the
    # symbolic attention outputs of every global-attention layer.
    input_tensors = []
    attention_tensors = []

    for layer in model.layers:

        layer_type_name = str(type(layer))

        if 'InputLayer' in layer_type_name:
            input_tensors.append(layer.input)

        if 'GlobalAttention' in layer_type_name:
            attention_tensors.append(layer.calculate_attention(layer.input))

    # NOTE(review): the inputs are fed in reverse layer order, presumably to
    # match the order of the arrays returned by encode_X - confirm.
    input_tensors.reverse()

    run_attentions = K.function(input_tensors, attention_tensors)
    per_layer_values = run_attentions(encoded_inputs)

    # Flatten (layer, head) into a single axis, labelling each head on the way.
    labels = []
    head_rows = []

    for layer_number, layer_values in enumerate(per_layer_values, start = 1):
        layer_heads = list(layer_values)
        head_rows.extend(layer_heads)
        labels.extend('Attention %d - head %d' % (layer_number, head_number) \
                for head_number in range(1, len(layer_heads) + 1))

    return np.array(head_rows), tokens, labels

def plot_attention(attention_values, seq_tokens, attention_labels, ax, cmap = 'Reds', vmin = 0, vmax = None, text_value_threshold = 0.1):
    """
    Draw attention values as an annotated heatmap on `ax`.

    The heatmap has one column per row of `attention_values` (labelled with
    `attention_labels` on the x axis) and one row per entry within it
    (labelled with `seq_tokens` on the y axis).

    Parameters:
        attention_values: 2D numpy array; the first axis matches
            `attention_labels`, the second matches `seq_tokens`.
        seq_tokens: y tick labels.
        attention_labels: x tick labels.
        ax: matplotlib axes to draw on.
        cmap, vmin, vmax: forwarded to `ax.pcolor`.
        text_value_threshold: cells whose absolute value is at least this
            threshold are annotated with their value as a percentage
            (truncated toward zero by the '%d' format).

    Returns:
        None; the function only draws on `ax`.
    """

    # pcolor puts the array's rows on the y axis, hence the transpose.
    ax.pcolor(attention_values.transpose(), cmap = cmap, vmin = vmin, vmax = vmax)

    ax.set_xticks(np.arange(len(attention_labels)) + 0.5)
    ax.set_xticklabels(attention_labels, rotation = 45, ha = 'right', fontsize = 12)
    ax.set_yticks(np.arange(len(seq_tokens)) + 0.5)
    ax.set_yticklabels(seq_tokens, fontsize = 12)

    # Positive values get an explicit '+' only when the array also contains
    # negative values (i.e. it holds differences). The minimum is computed once
    # here rather than for every annotated cell; the size guard keeps empty
    # arrays from raising.
    has_negative_values = attention_values.size > 0 and attention_values.min() < 0

    for i, row in enumerate(attention_values):
        for j, value in enumerate(row):
            if abs(value) >= text_value_threshold:
                plus_sign = '+' if has_negative_values and value > 0 else ''
                ax.text(i + 0.5, j + 0.5, plus_sign + '%d%%' % (100 * value), color = 'white', ha = 'center', va = 'center',
                        fontsize = 9, fontweight = 'bold', fontstretch = 'condensed')
                
# Pick the benchmark record whose sequence length is closest to IDEAL_LEN.
test_set = pd.read_csv(TEST_SET_FILE_PATH)
chosen_index = ((test_set['seq'].str.len() - IDEAL_LEN).abs()).sort_values().index[0]
seq = test_set.loc[chosen_index, 'seq']
label = test_set.loc[chosen_index, 'label']

# Same default length that calculate_attentions uses when seq_len is omitted.
seq_len = len(seq) + 2

# Attention of the pre-trained model (before any fine-tuning).
pretrained_model_generator, input_encoder = load_pretrained_model()
model = pretrained_model_generator.create_model(seq_len)
pretrained_attention_values, pretrained_seq_tokens, pretrained_attention_labels = calculate_attentions(model, input_encoder, seq,
        seq_len = seq_len)

# Attention of the fine-tuned model. `model_generator` is not defined in this
# cell: it must already exist from a previous fine-tuning run.
model = model_generator.create_model(seq_len)
finetuned_attention_values, finetuned_seq_tokens, finetuned_attention_labels = calculate_attentions(model, input_encoder, seq,
        seq_len = seq_len)
assert finetuned_seq_tokens == pretrained_seq_tokens
assert finetuned_attention_labels == pretrained_attention_labels[:len(finetuned_attention_labels)]

# The assertion above allows the fine-tuned model to expose fewer attention
# heads than the pre-trained one, so the difference is taken only over the
# shared heads. It is computed once and used by both the bar chart and the
# heatmap, which keeps the two panels consistent and avoids a shape mismatch
# when the head counts differ.
attention_diff = finetuned_attention_values - pretrained_attention_values[:len(finetuned_attention_labels), :]

fig, axes = plt.subplots(ncols = 4, figsize = (20, 0.2 * seq_len), gridspec_kw = dict(width_ratios = [1, 5, 1, 5]))
fig.subplots_adjust(wspace = 0.3)

# Panel 1: total pre-trained attention per sequence position.
axes[0].barh(np.arange(seq_len), 100 * pretrained_attention_values.sum(axis = 0), color = '#EC7063')
axes[0].set_ylim((-0.5, seq_len - 0.5))
axes[0].set_yticks([])
axes[0].invert_xaxis()
axes[0].set_xlabel('Total atten. %', fontsize = 14)

# Panel 2: per-head pre-trained attention.
vmax = pretrained_attention_values.max()
plot_attention(pretrained_attention_values, pretrained_seq_tokens, pretrained_attention_labels, axes[1], cmap = 'Reds', vmax = vmax,
        text_value_threshold = 0.05)
axes[1].set_title('Only pre-training', fontsize = 16)

# Panel 3: total change in attention per sequence position after fine-tuning.
axes[2].barh(np.arange(seq_len), 100 * attention_diff.sum(axis = 0), color = '#28B463')
axes[2].set_ylim((-0.5, seq_len - 0.5))
axes[2].set_yticks([])
axes[2].invert_xaxis()
axes[2].set_xlabel('Total atten. % diff', fontsize = 14)

# Panel 4: per-head change in attention, on a color scale symmetric around zero.
vmax = np.abs(attention_diff).max()
plot_attention(attention_diff, finetuned_seq_tokens, finetuned_attention_labels, axes[3], cmap = 'PiYG', vmin = -vmax, vmax = vmax,
        text_value_threshold = 0.03)
axes[3].set_title('%s fine-tuning' % BENCHMARK_DISPLAY_NAME, fontsize = 16)

print(seq, label)

 Local model dump file /root/proteinbert_models/default.pkl doesn't exist. Will download ftp://ftp.cs.huji.ac.il/users/nadavb/protein_bert/epoch_92400_sample_23500000.pkl into /root/proteinbert_models. Please approve or reject this (to exit and potentially call the function again with different parameters).
Do you approve downloadig the file into the specified directory? Please specify "Yes" or "No":Yes
Downloaded file: /root/proteinbert_models/epoch_92400_sample_23500000.pkl
Created: /root/proteinbert_models/default.pkl


  super(Adam, self).__init__(name, **kwargs)
