In [1]:
import pandas as pd
from tensorflow import keras

from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs
from os import path
import pickle

In [2]:
import wandb
from wandb.keras import WandbCallback


In [3]:
DATA_DIR = "../../data/"

In [4]:
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

In [5]:
pretrained_model_generator, input_encoder = load_pretrained_model("../../data/protein_bert/", "epoch_92400_sample_23500000.pkl")

In [5]:
#pretrained_model_generator, input_encoder = load_pretrained_model("../../data/protein_bert/", "checkpoint_2022_01_19.pkl")

In [7]:
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

In [30]:
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True),
    #WandbCallback()
]

In [6]:
train_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train_data.csv"), index_col=0)
valid_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_valid_data.csv"), index_col=0)
test_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_test_data.csv"), index_col=0)
train_data.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
2073,6aod,EVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIVMTKSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...,0
1517,4yny,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,EFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2025,5xcv,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,QFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2070,6and,EVQLVESGGGLVQPGGSLRLSCAASGYEFSRSWMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSIVHSVGNTFLEWYQQKPG...,1
666,2xqy,QVQLQQPGAELVKPGASVKMSCKASGYSFTSYWMNWVKQRPGRGLE...,DIVLTQSPASLALSLGQRATISCRASKSVSTSGYSYMYWYQQKPGQ...,0


In [7]:
train_data["seq"] = train_data["heavy"] + train_data["light"]
valid_data["seq"] = valid_data["heavy"] + valid_data["light"]
test_data["seq"] = test_data["heavy"] + test_data["light"]

In [23]:
wandb.config = {
  "learning_rate": 1e-04,
  "epochs": 65,
  "batch_size": 32
}

In [31]:
finetune(model_generator, input_encoder, OUTPUT_SPEC, train_data['seq'], train_data['Y'], valid_data['seq'], valid_data['Y'], \
        seq_len = 512, batch_size = 128, max_epochs_per_stage = 40, lr = 1e-04, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = 1e-05, callbacks = training_callbacks)

[2022_01_25-16:32:40] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_01_25-16:32:40] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_01_25-16:32:40] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40

KeyboardInterrupt: 

In [25]:
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
        start_seq_len = 512, start_batch_size = 32)

In [26]:
print('Test-set performance:')
display(results)

print('Confusion matrix:')
display(confusion_matrix)

Test-set performance:


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,119,0.936368
All,119,0.936368


Confusion matrix:


Unnamed: 0,0,1
0,96,0
1,13,10


In [27]:
f1 = 10 / (10 + 0.5* (13 + 0))
# TN / (TN + 0.5 * (FP + FN))
f1

0.6060606060606061

In [16]:
mod = model_generator.create_model(seq_len = 512)

In [17]:
mod.save(path.join(DATA_DIR, "protein_bert/batch_32_lr_1e-4_2022_01_25.pkl"))

2022-01-25 15:36:30.188640: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/batch_32_lr_1e-4_2022_01_25.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/batch_32_lr_1e-4_2022_01_25.pkl/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


In [8]:
def fine_tune(i, lr, epochs, batch_size):
    wandb.init(project=f"ProteinBERT_{i}", entity="kvetab")
    model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)
    training_callbacks = [
        keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-05, verbose = 1),
        keras.callbacks.EarlyStopping(patience = 2, restore_best_weights = True),
        WandbCallback()
    ]
    
    wandb.config = {
      "learning_rate": lr,
      "epochs": epochs * 2,
      "batch_size": batch_size
    }
    
    finetune(model_generator, input_encoder, OUTPUT_SPEC, train_data['seq'], train_data['Y'], valid_data['seq'], valid_data['Y'], \
        seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epochs, lr = lr, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 512, final_lr = lr / 10, callbacks = training_callbacks)
    results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
        start_seq_len = 512, start_batch_size = batch_size)
    print(f"Model number {i} trained with learning rate {lr}:")
    print('Test-set performance:')
    display(results)

    print('Confusion matrix:')
    display(confusion_matrix)
    mod = model_generator.create_model(seq_len = 512)
    mod.save(path.join(DATA_DIR, f"protein_bert/{i}_batch_{batch_size}_lr_{lr}_2022_01_25.pkl"))

In [None]:
i = 0
batch_size = 64
epoch_num = 50
for learning_rate in [1e-5, 5e-5, 1e-4, 5e-4]:
    fine_tune(f"{i:02d}", learning_rate, epoch_num, batch_size)
    i += 1

[2022_01_26-17:23:10] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_01_26-17:23:11] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_01_26-17:23:11] Training with frozen pretrained layers...


2022-01-26 17:23:11.044361: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
  super(Adam, self).__init__(name, **kwargs)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 14/50
Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_01_26-17:45:08] Training the entire fine-tuned model...
[2022_01_26-17:45:19] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
[2022_01_26-19:40:35] Training on final epochs of sequence length 512...
[2022_01_26-19:40:35] Training set: Filtered out 0 of 1338 (0.0%) records of length

Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,119,0.939538
All,119,0.939538


Confusion matrix:


Unnamed: 0,0,1
0,96,0
1,12,11


  super(Adam, self).__init__(name, **kwargs)
2022-01-26 19:46:14.901117: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/00_batch_64_lr_1e-05_2022_01_25.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/00_batch_64_lr_1e-05_2022_01_25.pkl/assets


[2022_01_26-19:46:27] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


[2022_01_26-19:46:27] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_01_26-19:46:27] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 00017: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 18/50
Epoch 19/50
Epoch 00019: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[2022_01_26-21:06:01] T

Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,119,0.938179
All,119,0.938179


Confusion matrix:


Unnamed: 0,0,1
0,95,1
1,13,10


  super(Adam, self).__init__(name, **kwargs)


INFO:tensorflow:Assets written to: ../../data/protein_bert/01_batch_64_lr_5e-05_2022_01_25.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/01_batch_64_lr_5e-05_2022_01_25.pkl/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


[2022_01_26-23:22:52] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_01_26-23:22:52] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_01_26-23:22:52] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 5/50
Epoch 6/50
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 00018: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 19/50
Epoch 00019: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
[2022_01_27-00:39:44] Training the entire fine-

Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,119,0.922781
All,119,0.922781


Confusion matrix:


Unnamed: 0,0,1
0,96,0
1,14,9


  super(Adam, self).__init__(name, **kwargs)


INFO:tensorflow:Assets written to: ../../data/protein_bert/02_batch_64_lr_0.0001_2022_01_25.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/02_batch_64_lr_0.0001_2022_01_25.pkl/assets


[2022_01_27-02:04:20] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


[2022_01_27-02:04:20] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_01_27-02:04:20] Training with frozen pretrained layers...


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/50
Epoch 2/50
Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 11/50
Epoch 12/50
Epoch 00012: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 13/50
Epoch 14/50
Epoch 00014: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[2022_01_27-03:25:03] T

Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,119,0.925951
All,119,0.925951


Confusion matrix:


Unnamed: 0,0,1
0,96,0
1,15,8


  super(Adam, self).__init__(name, **kwargs)


INFO:tensorflow:Assets written to: ../../data/protein_bert/03_batch_64_lr_0.0005_2022_01_25.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/03_batch_64_lr_0.0005_2022_01_25.pkl/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


In [12]:
f1 = 8 / (8 + 0.5* (15 + 0))
# TN / (TN + 0.5 * (FP + FN))
f1

0.5161290322580645

In [9]:
i = 4
epoch_num = 60
batch_size = 64
learning_rate = 1e-5
fine_tune(f"{i:02d}", learning_rate, epoch_num, batch_size)

[34m[1mwandb[0m: Currently logged in as: [33mkvetab[0m (use `wandb login --relogin` to force relogin)


[2022_01_27-10:06:31] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_01_27-10:06:31] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_01_27-10:06:31] Training with frozen pretrained layers...


2022-01-27 10:06:31.462191: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
  super(Adam, self).__init__(name, **kwargs)


Epoch 1/60

  layer_config = serialize_layer_fn(layer)
[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: 
[34m[1mwandb[0m: [32m[41mERROR[0m Layer GlobalAttention has arguments ['self', 'n_heads', 'd_key', 'd_value']
[34m[1mwandb[0m: [32m[41mERROR[0m in `__init__` and therefore must override `get_config()`.
[34m[1mwandb[0m: [32m[41mERROR[0m 
[34m[1mwandb[0m: [32m[41mERROR[0m Example:
[34m[1mwandb[0m: [32m[41mERROR[0m 
[34m[1mwandb[0m: [32m[41mERROR[0m class CustomLayer(keras.layers.Layer):
[34m[1mwandb[0m: [32m[41mERROR[0m     def __init__(self, arg1, arg2):
[34m[1mwandb[0m: [32m[41mERROR[0m         super().__init__()
[34m[1mwandb[0m: [32m[41mERROR[0m         self.arg1 = arg1
[34m[1mwandb[0m: [32m[41mERROR[0m         self.arg2 = arg2
[34m[1mwandb[0m: [32m[41mERROR[0m 
[34m[1mwandb[0m: [32m[41mERROR[0m     def get_config(self):
[34m[1mwandb[0m: [32m[41mERROR[0m         config = super().get_conf

Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 15/60
Epoch 16/60
Epoch 00016: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 17/60
Epoch 00017: ReduceLROnPlateau reducing learning rate to 1e-05.
[2022_01_27-10:32:58] Training the entire fine-tuned model...
[2022_01_27-10:33:09] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
[2022_01_27-11:53:38] Training on final epochs

Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,119,0.922781
All,119,0.922781


Confusion matrix:


Unnamed: 0,0,1
0,96,0
1,15,8


  super(Adam, self).__init__(name, **kwargs)
2022-01-27 11:58:35.240804: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/04_batch_64_lr_1e-05_2022_01_25.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/04_batch_64_lr_1e-05_2022_01_25.pkl/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)
