In [1]:
import pandas as pd
from tensorflow import keras
from os import path
import pickle


In [2]:
from proteinbert.finetuning import encode_train_and_valid_sets, encode_dataset
from proteinbert import OutputType, OutputSpec, evaluate_by_len, load_pretrained_model

In [3]:
from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, \
finetune, evaluate_by_len

from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [4]:
import wandb
from wandb.keras import WandbCallback

In [5]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [6]:
DATA_DIR = "../../data/"

In [7]:
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

In [8]:
pretrained_model_generator, input_encoder = load_pretrained_model("../../data/protein_bert/", "epoch_92400_sample_23500000.pkl")

In [9]:
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

In [13]:
wandb.init(project=f"March_finetune", entity="kvetab")

[34m[1mwandb[0m: Currently logged in as: [33mkvetab[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [10]:
train_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train_data.csv"), index_col=0)
valid_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_valid_data.csv"), index_col=0)
test_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_test_data.csv"), index_col=0)
train_data.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
2073,6aod,EVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIVMTKSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...,0
1517,4yny,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,EFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2025,5xcv,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,QFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2070,6and,EVQLVESGGGLVQPGGSLRLSCAASGYEFSRSWMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSIVHSVGNTFLEWYQQKPG...,1
666,2xqy,QVQLQQPGAELVKPGASVKMSCKASGYSFTSYWMNWVKQRPGRGLE...,DIVLTQSPASLALSLGQRATISCRASKSVSTSGYSYMYWYQQKPGQ...,0


In [11]:
train_data["seq"] = train_data["heavy"] + train_data["light"]
valid_data["seq"] = valid_data["heavy"] + valid_data["light"]
test_data["seq"] = test_data["heavy"] + test_data["light"]

In [14]:
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 3, factor = 0.25, min_lr = 1e-07, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 6, restore_best_weights = True),
    WandbCallback()
]

In [12]:
epoch_num = 50
batch_size = 128
learning_rate = 1e-4

In [13]:
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num * 2,
      "batch_size": batch_size
    }

In [14]:
finetune(model_generator, input_encoder, OUTPUT_SPEC, train_data['seq'], train_data['Y'], valid_data['seq'], valid_data['Y'], \
        seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = learning_rate / 10, callbacks = training_callbacks)

[2022_03_07-12:38:15] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_03_07-12:38:15] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_03_07-12:38:15] Training with frozen pretrained layers...


2022-03-07 12:38:16.543610: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-07 12:38:17.659971: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9656 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:41:00.0, compute capability: 7.5
  "The `lr` argument is deprecated, use `learning_rate` instead.")
2022-03-07 12:38:19.665660: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50


2022-03-07 12:38:28.305649: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 7605




[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
[2022_03_07-12:38:54] Training the entire fine-tuned model...
[2022_03_07-12:39:15] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50

Epoch 00013: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
[2022_03_07-12:40:00] Training on final epochs of sequence length 1024...
[2022_03_07-12:40:00] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 1022.
[2022_03_07-12:40:02] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 1022.


In [20]:
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
        start_seq_len = 512, start_batch_size = 32)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [21]:
print('Test-set performance:')
display(results)

print('Confusion matrix:')
display(confusion_matrix)

Test-set performance:


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,119,0.946784
All,119,0.946784


Confusion matrix:


Unnamed: 0,0,1
0,89,7
1,8,15


In [23]:
fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
f1

0.6666666666666666

In [19]:
mod = model_generator.create_model(seq_len = 512)
mod.save(path.join(DATA_DIR, "protein_bert/2022_03_07__01.pkl"))

  "The `lr` argument is deprecated, use `learning_rate` instead.")
2022-03-07 12:45:22.012863: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__01.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__01.pkl/assets


# 2

In [1]:
import tensorflow as tf
print(tf. __version__)

2.6.2


In [27]:
epoch_num = 50
batch_size = 128
learning_rate = 1e-4
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num * 2,
      "batch_size": batch_size
    }
finetune(model_generator, input_encoder, OUTPUT_SPEC, train_data['seq'], train_data['Y'], valid_data['seq'], valid_data['Y'], \
        seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = learning_rate / 10, callbacks = training_callbacks)

[2022_03_07-12:50:19] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_03_07-12:50:19] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_03_07-12:50:19] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50

Epoch 00025: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50

Epoch 00029: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 30/50
Epoch 31/50
Epoch 32/50

Epoch 00032: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_07-12:51:09] Training the entire fine-tuned model...
[2022_03_07-12:51:58] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50

Epoch 00013: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 14/50
Epoch 15/50
Epoch 16/50

Epoch 00016: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_07-12:52:51] Training on final epochs of sequence length 1024...
[2022_03_07-12:52:51] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 1022.
[2022_03_07-12:52:51] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 1022.






In [28]:
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
        start_seq_len = 512, start_batch_size = 32)
print('Confusion matrix:')
display(confusion_matrix)
fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
f1

Confusion matrix:


Unnamed: 0,0,1
0,85,11
1,5,18


0.6923076923076923

In [29]:
mod = model_generator.create_model(seq_len = 512)
mod.save(path.join(DATA_DIR, "protein_bert/2022_03_07__02.pkl"))

  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__02.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__02.pkl/assets


# 5, 6

In [46]:
wandb.init(project=f"March_finetune", entity="kvetab")

0,1
epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇██▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▁
loss,█▄▃▃▃▃▃▃▃▂▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
lr,████████████▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▅▄▄▄▃▅▃▄▃▄▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▂▃▁▁▁

0,1
best_epoch,17.0
best_val_loss,0.36364
epoch,0.0
loss,0.31084
lr,0.0
val_loss,0.36835


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [15]:
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

In [16]:
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 4, factor = 0.25, min_lr = 1e-07, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 8, restore_best_weights = True),
    WandbCallback()
]

In [17]:
epoch_num = 100
batch_size = 128
learning_rate = 5e-5
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num * 2,
      "batch_size": batch_size
    }
finetune(model_generator, input_encoder, OUTPUT_SPEC, train_data['seq'], train_data['Y'], valid_data['seq'], valid_data['Y'], \
        seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 512, final_lr = learning_rate / 10, callbacks = training_callbacks)

[2022_03_30-14:56:34] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_03_30-14:56:34] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_03_30-14:56:34] Training with frozen pretrained layers...


2022-03-30 14:56:37.927515: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-30 14:56:41.171695: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9656 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:21:00.0, compute capability: 7.5
  "The `lr` argument is deprecated, use `learning_rate` instead.")
2022-03-30 14:56:43.472165: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100


2022-03-30 14:56:54.924980: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 7605




[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100

Epoch 00021: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100

Epoch 00031: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100

Epoch 00044: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100

Epoch 00048: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
[2022_03_30-14:57:56] Training the entire fine-tuned model...
[2022_03_

In [18]:
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
        start_seq_len = 512, start_batch_size = 32)
print('Confusion matrix:')
display(confusion_matrix)
fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
f1

Confusion matrix:


Unnamed: 0,0,1
0,94,2
1,11,12


0.6486486486486487

In [19]:
mod = model_generator.create_model(seq_len = 512)
mod.save(path.join(DATA_DIR, "protein_bert/2022_03_30__05.pkl"))

  "The `lr` argument is deprecated, use `learning_rate` instead.")
2022-03-30 15:01:21.837698: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_30__05.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_30__05.pkl/assets


# 7, 8

In [58]:
wandb.init(project=f"March_finetune", entity="kvetab")

0,1
epoch,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇██▁▁▂▂▂▃▃▃▃▄▁
loss,█▄▃▃▄▄▃▃▃▃▃▃▃▂▂▂▂▃▂▂▂▂▂▂▅▃▂▂▂▁▁▁▁▁▁
lr,███████████████████▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,▇▅▅▆█▄▄▃▄▆▃▃▂▃▃▂▂▂▂▂▂▂▂▂▆▄▂▂▁▁▁▁▄▂▁

0,1
best_epoch,5.0
best_val_loss,0.37816
epoch,0.0
loss,0.34296
lr,5e-05
val_loss,0.37942


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [59]:
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

In [60]:
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 3, factor = 0.25, min_lr = 1e-07, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 6, restore_best_weights = True),
    WandbCallback()
]

In [61]:
epoch_num = 100
batch_size = 128
learning_rate = 1e-4
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num * 2,
      "batch_size": batch_size
    }
finetune(model_generator, input_encoder, OUTPUT_SPEC, train_data['seq'], train_data['Y'], valid_data['seq'], valid_data['Y'], \
        seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = learning_rate / 10, callbacks = training_callbacks)

[2022_03_07-13:31:05] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_03_07-13:31:05] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_03_07-13:31:06] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100

Epoch 00024: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100

Epoch 00033: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 34/100
Epoch 35/100
Epoch 36/100

Epoch 00036: ReduceLROnPlateau reducing learning rate to 9.765624781721272e-06.
[2022_03_07-13:32:00] Training the entire fine-tuned model...
[2022_03_07-13:32:09] Incompatible number of optimizer weights - will not initialize



Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

Epoch 00016: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 17/100
Epoch 18/100
Epoch 19/100

Epoch 00019: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_07-13:33:11] Training on final epochs of sequence length 1024...
[2022_03_07-13:33:11] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 1022.
[2022_03_07-13:33:37] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 1022.






In [62]:
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
        start_seq_len = 512, start_batch_size = 32)
print('Confusion matrix:')
display(confusion_matrix)
fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
f1

Confusion matrix:


Unnamed: 0,0,1
0,88,8
1,8,15


0.6521739130434783

In [63]:
mod = model_generator.create_model(seq_len = 512)
mod.save(path.join(DATA_DIR, "protein_bert/2022_03_07__08.pkl"))

  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__08.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__08.pkl/assets


# Oversampling
# 3

In [30]:
wandb.init(project=f"March_finetune", entity="kvetab")

0,1
epoch,▁▁▂▂▃▃▄▄▅▅▁▂▂▃▃▄▁▁▂▂▃▃▄▄▅▅▆▆▇▇██▁▂▂▃▃▃▄▁
loss,█▄▄▃▃▃▃▃▃▃▃▂▂▂▂▁█▅▄▄▃▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▂
lr,██████████▁▁▁▁▁▁████████████▃▃▁▁▁▁▁▁▁▁▁▁
val_loss,▆▆▄▄▄▃▃▃▃▄▂▂▂▂▃▁█▆▅▄▄▃▃▃▃▄▂▃▂▃▂▂▂▃▂▁▁▂▁▁

0,1
best_epoch,9.0
best_val_loss,0.36406
epoch,0.0
loss,0.32212
lr,1e-05
val_loss,0.37174


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [31]:
sampler = RandomOverSampler(random_state=42)
x_train, y_train = sampler.fit_resample(train_data.drop("Y", axis=1), train_data['Y'])
x_valid, y_valid = sampler.fit_resample(valid_data.drop("Y", axis=1), valid_data['Y'])

In [32]:
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)



In [33]:
epoch_num = 100
batch_size = 128
learning_rate = 1e-4
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num * 2,
      "batch_size": batch_size
    }
finetune(model_generator, input_encoder, OUTPUT_SPEC, x_train["seq"], y_train, x_valid['seq'], y_valid, \
        seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = learning_rate / 10, callbacks = training_callbacks)

[2022_03_07-12:59:26] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 510.
[2022_03_07-12:59:26] Validation set: Filtered out 0 of 188 (0.0%) records of lengths exceeding 510.
[2022_03_07-12:59:26] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

Epoch 00023: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 24/100
Epoch 25/100
Epoch 26/100

Epoch 00026: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_07-13:00:26] Training the entire fine-tuned model...
[2022_03_07-13:00:35] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

Epoch 00006: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_07-13:01:23] Training on final epochs of sequence length 1024...
[2022_03_07-13:01:23] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 1022.
[2022_03_07-13:02:03] Validation set: Filtered out 0 of 188 (0.0%) records of lengths exceeding 1022.






In [34]:
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
        start_seq_len = 512, start_batch_size = 32)
print('Confusion matrix:')
display(confusion_matrix)
fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
f1

Confusion matrix:


Unnamed: 0,0,1
0,75,21
1,1,22


0.6666666666666666

In [35]:
mod = model_generator.create_model(seq_len = 512)
mod.save(path.join(DATA_DIR, "protein_bert/2022_03_07__03.pkl"))

  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__03.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__03.pkl/assets


# 9

In [64]:
wandb.init(project=f"March_finetune", entity="kvetab")

0,1
epoch,▁▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██▁▁▂▂▂▂▃▃▃▄▄▄▄▁
loss,█▅▄▄▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂
lr,█████████████▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▃▃▃▂▂▂▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁

0,1
best_epoch,12.0
best_val_loss,0.36563
epoch,0.0
loss,0.32678
lr,1e-05
val_loss,0.37264


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [65]:
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

In [66]:
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 3, factor = 0.25, min_lr = 1e-07, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 6, restore_best_weights = True),
    WandbCallback()
]

In [67]:
epoch_num = 100
batch_size = 128
learning_rate = 1e-4
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num * 2,
      "batch_size": batch_size
    }
finetune(model_generator, input_encoder, OUTPUT_SPEC, x_train["seq"], y_train, x_valid['seq'], y_valid, \
        seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = learning_rate / 10, callbacks = training_callbacks)

[2022_03_07-13:37:58] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 510.
[2022_03_07-13:37:58] Validation set: Filtered out 0 of 188 (0.0%) records of lengths exceeding 510.
[2022_03_07-13:37:58] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

Epoch 00023: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 24/100
Epoch 25/100
Epoch 26/100

Epoch 00026: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_07-13:38:59] Training the entire fine-tuned model...
[2022_03_07-13:39:37] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 9/100
Epoch 10/100
Epoch 11/100

Epoch 00011: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_07-13:40:34] Training on final epochs of sequence length 1024...
[2022_03_07-13:40:34] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 1022.
[2022_03_07-13:40:34] Validation set: Filtered out 0 of 188 (0.0%) records of lengths exceeding 1022.






In [68]:
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
        start_seq_len = 512, start_batch_size = 32)
print('Confusion matrix:')
display(confusion_matrix)
fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
f1

Confusion matrix:


Unnamed: 0,0,1
0,78,18
1,1,22


0.6984126984126984

In [69]:
mod = model_generator.create_model(seq_len = 512)
mod.save(path.join(DATA_DIR, "protein_bert/2022_03_07__09.pkl"))

  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__09.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__09.pkl/assets


# 4

In [36]:
sampler = RandomUnderSampler(random_state=42)
x_train_u, y_train_u = sampler.fit_resample(train_data.drop("Y", axis=1), train_data['Y'])

In [37]:
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

wandb.init(project=f"March_finetune", entity="kvetab")


0,1
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇██▁▁▂▂▂▂▃▃▃▁
loss,█▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▃▃▃▂▂▂▁▁▁▁▂
lr,███████████████▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▄▄▄▄▃▄▂▃▅▂▂▂▂▂▂▃▂▂▂▂▂▂▂▂▂▃▁▂▂▁▁▁▁▁

0,1
epoch,0.0
loss,0.4216
lr,1e-05
val_loss,0.50154


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [38]:
epoch_num = 100
batch_size = 128
learning_rate = 5e-4
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num * 2,
      "batch_size": batch_size
    }
finetune(model_generator, input_encoder, OUTPUT_SPEC, x_train_u["seq"], y_train_u, valid_data['seq'], valid_data['Y'], \
        seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = learning_rate / 10, callbacks = training_callbacks)

[2022_03_07-13:07:54] Training set: Filtered out 0 of 562 (0.0%) records of lengths exceeding 510.
[2022_03_07-13:07:54] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_03_07-13:07:54] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_07-13:08:12] Training the entire fine-tuned model...
[2022_03_07-13:08:21] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100
Epoch 2/100




Epoch 3/100
Epoch 4/100

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
[2022_03_07-13:08:38] Training on final epochs of sequence length 1024...
[2022_03_07-13:08:38] Training set: Filtered out 0 of 562 (0.0%) records of lengths exceeding 1022.
[2022_03_07-13:08:38] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 1022.






In [39]:
results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
        start_seq_len = 512, start_batch_size = 32)
print('Confusion matrix:')
display(confusion_matrix)
fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
f1

Confusion matrix:


Unnamed: 0,0,1
0,71,25
1,5,18


0.5454545454545454

In [40]:
# protein_bert/2022_03_07__04.pkl
mod = model_generator.create_model(seq_len = 512)
mod.save(path.join(DATA_DIR, "protein_bert/2022_03_07__04.pkl"))

  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__04.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_07__04.pkl/assets


# Load Model

In [20]:
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

In [21]:
seq_len = 512
encoded_test_set = encode_dataset(test_data["seq"], test_data["Y"], input_encoder, OUTPUT_SPEC, seq_len = seq_len, needs_filtering = True, \
            dataset_name = 'Test set')

[2022_03_30-15:06:19] Test set: Filtered out 0 of 119 (0.0%) records of lengths exceeding 510.


In [22]:
test_X, test_Y, test_sample_weigths = encoded_test_set

In [23]:
model_path = path.join(DATA_DIR, "protein_bert/2022_03_30__05.pkl")
model = keras.models.load_model(model_path)

In [24]:
y_pred = model.predict(test_X, batch_size=32)
y_pred_classes = (y_pred >= 0.5)
f1_score(test_Y, y_pred_classes)

0.6486486486486486

In [25]:
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/TAP_data.csv"))
tap_data.head()

Unnamed: 0,Antibody_ID,heavy,light,CDR_length,PSH,PPC,PNC,SFvCSP,Y
0,Abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,46,129.7603,0.0,0.0,16.32,1
1,Abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,45,115.9106,0.0954,0.0421,-3.1,1
2,Abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,45,109.6995,0.0,0.8965,-4.0,1
3,Actoxumab,QVQLVESGGGVVQPGRSLRLSCAASGFSFSNYGMHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQHKPGKAPKL...,49,112.629,0.0,1.1247,3.1,1
4,Adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,48,111.2512,0.0485,1.1364,-19.5,1


In [26]:
tap_data["seq"] = tap_data["heavy"] +  tap_data["light"]

In [27]:
encoded_tap_set = encode_dataset(tap_data["seq"], tap_data["Y"], input_encoder, OUTPUT_SPEC, seq_len = seq_len, needs_filtering = True, \
            dataset_name = 'TAP set')

[2022_03_30-15:06:39] TAP set: Filtered out 0 of 241 (0.0%) records of lengths exceeding 510.


In [28]:
tap_X, tap_Y, tap_sample_weigths = encoded_tap_settap_X, tap_Y, tap_sample_weigths = encoded_tap_set

In [29]:
y_pred = model.predict(tap_X, batch_size=32)
y_pred_classes = (y_pred >= 0.5)
f1_score(tap_Y, y_pred_classes)

0.34285714285714286

In [30]:
accuracy_score(tap_Y, y_pred_classes)

0.23651452282157676

In [34]:
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
seq_len = 512
encoded_test_set = encode_dataset(test_data["seq"], test_data["Y"], input_encoder, OUTPUT_SPEC, seq_len = seq_len, needs_filtering = True, \
            dataset_name = 'Test set')
test_X, test_Y, test_sample_weigths = encoded_test_set

[2022_03_10-10:20:09] Test set: Filtered out 0 of 119 (0.0%) records of lengths exceeding 510.


In [40]:
def test_model(model_name):
    model_path = path.join(DATA_DIR, f"protein_bert/{model_name}")
    model = keras.models.load_model(model_path)
    y_pred = model.predict(test_X, batch_size=32)
    y_pred_classes = (y_pred >= 0.5)
    f1 = f1_score(test_Y, y_pred_classes)
    print(f"Model {model_name}")
    print(f"Test F1: {f1}")
    
    y_pred = model.predict(tap_X, batch_size=32)
    y_pred_classes = (y_pred >= 0.5)
    f1 = f1_score(tap_Y, y_pred_classes)
    acc = accuracy_score(tap_Y, y_pred_classes)
    print(f"TAP F1: {f1}")
    print(f"TAP Acc: {acc}")

In [36]:
test_model(f"2022_03_10__09_78.pkl")

Model 2022_03_10__09_78.pkl
Test F1: 0.6666666666666666
TAP F1: 0.7774936061381073
TAP Acc: 0.6390041493775933


In [39]:
for seed in seeds:
    test_model(f"2022_03_10__09_{seed:02d}.pkl")

Model 2022_03_10__09_02.pkl
Test F1: 0.7333333333333334
TAP F1: 0.7277628032345014
TAP Acc: 0.5809128630705395


Model 2022_03_10__09_13.pkl
Test F1: 0.7301587301587301
TAP F1: 0.7374005305039788
TAP Acc: 0.5892116182572614


Model 2022_03_10__09_19.pkl
Test F1: 0.7301587301587301
TAP F1: 0.7413333333333334
TAP Acc: 0.5975103734439834


Model 2022_03_10__09_27.pkl
Test F1: 0.7457627118644068
TAP F1: 0.7119565217391305
TAP Acc: 0.5601659751037344


Model 2022_03_10__09_38.pkl
Test F1: 0.6571428571428571
TAP F1: 0.7552083333333333
TAP Acc: 0.6099585062240664


Model 2022_03_10__09_42.pkl
Test F1: 0.6999999999999998
TAP F1: 0.688888888888889
TAP Acc: 0.5352697095435685


Model 2022_03_10__09_56.pkl
Test F1: 0.6666666666666666
TAP F1: 0.805
TAP Acc: 0.6763485477178424


Model 2022_03_10__09_63.pkl
Test F1: 0.7096774193548387
TAP F1: 0.7119565217391305
TAP Acc: 0.5601659751037344


Model 2022_03_10__09_06.pkl
Test F1: 0.7931034482758621
TAP F1: 0.6704225352112676
TAP Acc: 0.5145228215767634

In [44]:
for i in range(9):
    if i == 4:
        continue
    test_model(f"2022_03_07__{i + 1:02d}")

Model 2022_03_07__01
Test F1: 0.6666666666666666
TAP F1: 0.5345911949685533
TAP Acc: 0.38589211618257263
Model 2022_03_07__02
Test F1: 0.6923076923076923
TAP F1: 0.5970149253731343
TAP Acc: 0.43983402489626555
Model 2022_03_07__03
Test F1: 0.6666666666666667
TAP F1: 0.7552083333333333
TAP Acc: 0.6099585062240664
Model 2022_03_07__04
Test F1: 0.5454545454545454
TAP F1: 0.703601108033241
TAP Acc: 0.5560165975103735
Model 2022_03_07__06
Test F1: 0.5945945945945946
TAP F1: 0.3604240282685512
TAP Acc: 0.24896265560165975
Model 2022_03_07__07
Test F1: 0.5454545454545454
TAP F1: 0.3237410071942446
TAP Acc: 0.21991701244813278
Model 2022_03_07__08
Test F1: 0.6521739130434783
TAP F1: 0.529968454258675
TAP Acc: 0.3817427385892116
Model 2022_03_07__09
Test F1: 0.6984126984126985
TAP F1: 0.7154471544715447
TAP Acc: 0.5643153526970954


wandb: Network error (ReadTimeout), entering retry loop.


# Cross-validation

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
seeds = [2, 13, 19, 27, 38, 42, 56, 63, 6, 78]

In [11]:
def read_data_for_seed(seed):
    chen_train = pd.read_csv(path.join(DATA_DIR, f"chen/deduplicated/crossval/chen_train_{seed}.csv"), index_col=0)
    chen_test = pd.read_csv(path.join(DATA_DIR, f"chen/deduplicated/crossval/chen_test_{seed}.csv"), index_col=0)
    chen_valid, chen_test = train_test_split(chen_test, test_size=0.5, random_state=3)
    return chen_train, chen_valid, chen_test

In [13]:
wandb.init(project=f"Cross-val", entity="kvetab")
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 3, factor = 0.25, min_lr = 1e-07, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 6, restore_best_weights = True),
    WandbCallback()
]

[34m[1mwandb[0m: Currently logged in as: [33mkvetab[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [14]:
epoch_num = 100
batch_size = 128
learning_rate = 1e-4
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num * 2,
      "batch_size": batch_size
    }

In [15]:
def train_on_split(seed):
    train, valid, test = read_data_for_seed(seed)
    train["seq"] = train["heavy"] + train["light"]
    valid["seq"] = valid["heavy"] + valid["light"]
    test["seq"] = test["heavy"] + test["light"]
    sampler = RandomOverSampler(random_state=42)
    x_train, y_train = sampler.fit_resample(train.drop("Y", axis=1), train['Y'])
    x_valid, y_valid = sampler.fit_resample(valid.drop("Y", axis=1), valid['Y'])
    wandb.init(project=f"Cross-val", entity="kvetab")
    model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)
    
    finetune(model_generator, input_encoder, OUTPUT_SPEC, x_train["seq"], y_train, x_valid['seq'], y_valid, \
        seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
        lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = learning_rate / 10, callbacks = training_callbacks)
    
    results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test['seq'], test['Y'], \
            start_seq_len = 512, start_batch_size = 32)
    print(f"Training split {seed}")
    print('Confusion matrix:')
    display(confusion_matrix)
    fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
    f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
    print(f1)
    
    mod = model_generator.create_model(seq_len = 512)
    mod.save(path.join(DATA_DIR, f"protein_bert/2022_03_10__09_{seed:02d}.pkl"))

In [16]:
for s in seeds:
    train_on_split(s)

[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-09:31:24] Training set: Filtered out 0 of 2002 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:31:24] Validation set: Filtered out 0 of 246 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:31:24] Training with frozen pretrained layers...


2022-03-10 09:31:25.040869: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-10 09:31:25.598857: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9656 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:21:00.0, compute capability: 7.5
  "The `lr` argument is deprecated, use `learning_rate` instead.")
2022-03-10 09:31:27.522882: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100


2022-03-10 09:31:35.358668: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 7605




[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 16/100
Epoch 17/100
Epoch 18/100

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_10-09:32:11] Training the entire fine-tuned model...
[2022_03_10-09:32:23] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-09:33:06] Training on final epochs of sequence length 1024...
[2022_03_10-09:33:06] Training set: Filtered out 0 of 2002 

Unnamed: 0,0,1
0,74,49
1,5,30


0.5263157894736842


  "The `lr` argument is deprecated, use `learning_rate` instead.")
2022-03-10 09:33:40.257984: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_02.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_02.pkl/assets


0,1
epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██▁▁▂▂▃▃▃▄▁
loss,█▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂
lr,██████████▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▅▃▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂▂▂▂▆▄▄▅▅▁

0,1
best_epoch,0.0
best_val_loss,0.64906
epoch,0.0
loss,0.43348
lr,1e-05
val_loss,0.64906


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-09:34:00] Training set: Filtered out 0 of 2048 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:34:00] Validation set: Filtered out 0 of 226 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:34:00] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100

Epoch 00026: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 27/100
Epoch 28/100
Epoch 29/100

Epoch 00029: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_10-09:34:59] Training the entire fine-tuned model...
[2022_03_10-09:35:38] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-09:36:27] Training on final epochs of sequence length 1024...
[2022_03_10-09:36:27] Training set: Filtered out 0 of 2048 (0.0%) records of lengths exceeding 1022.
[2022_03_10-09:37:05] Validation set: Filtered out 0 of 226 (0.0%) records of lengths exceeding 1022.
Training split 13
Confusion matrix:


Unnamed: 0,0,1
0,71,39
1,1,35


0.6363636363636364


  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_13.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_13.pkl/assets


0,1
epoch,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇██▁▁▁▂▂▂▃▃▃▃▁
loss,█▆▅▅▅▅▄▅▄▃▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▁▁▁▃
lr,███████████████▃▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▇▇▅▆▃▃▅▂▃▃▂▂▂▄▂▁▁▁▁▁▂▁▁▁▂▁▁▁▁▂▂▁▄▃▂▂▂▂▂

0,1
best_epoch,22.0
best_val_loss,0.49373
epoch,0.0
loss,0.41662
lr,1e-05
val_loss,0.50702


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-09:37:57] Training set: Filtered out 0 of 2088 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:37:57] Validation set: Filtered out 0 of 202 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:37:57] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 31/100
Epoch 32/100
Epoch 33/100

Epoch 00033: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_10-09:39:12] Training the entire fine-tuned model...
[2022_03_10-09:39:20] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-09:40:21] Training on final epochs of sequence length 1024...
[2022_03_10-09:40:21] Training set: Filtered out 0 of 2088 (0.0%) records of lengths exceeding 1022.
[2022_03_10-09:40:21] Validation set: Filtered out 0 of 202 (0.0%) records of lengths exceeding 1022.




Training split 19
Confusion matrix:


Unnamed: 0,0,1
0,69,33
1,5,18


0.4864864864864865


  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_19.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_19.pkl/assets


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇██▁▁▁▂▂▂▃▃▃▃▃
loss,█▅▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁
lr,███████▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▃▂▂▂▂▁▁▁▁▁▂▂▄▂▄▄

0,1
epoch,0.0
loss,0.3673
lr,1e-05
val_loss,0.52903


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-09:41:26] Training set: Filtered out 0 of 2048 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:41:26] Validation set: Filtered out 0 of 206 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:41:26] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100

Epoch 00022: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100

Epoch 00026: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100

Epoch 00031: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 32/100
Epoch 33/100
Epoch 34/100

Epoch 00034: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
[2022_03_10-09:42:33] Training the entire fine-tuned model...
[2022_03_10-09:42:41] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-09:43:38] Training on final epochs of sequence length 1024...
[2022_03_10-09:43:38] Training set: Filtered out 0 of 2048 (0.0%) records of lengths exceeding 1022.
[2022_03_10-09:44:07] Validation set: Filtered out 0 of 206 (0.0%) records of lengths exceeding 1022.
Training split 27
Confusion matrix:


Unnamed: 0,0,1
0,81,39
1,7,11


0.3235294117647059


  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_27.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_27.pkl/assets


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██▁▁▁▂▂▂▂▃▃▃▁
loss,█▅▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂
lr,███████████████████▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,▇█▅▅▄▃▃▃▃▂▂▂▂▂▂▂▂▃▂▁▁▂▁▁▁▁▁▁▁▂▂▁▁▁▁▂▂▂▂▁

0,1
best_epoch,5.0
best_val_loss,0.45876
epoch,0.0
loss,0.38477
lr,1e-05
val_loss,0.47199


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-09:45:00] Training set: Filtered out 0 of 1910 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:45:00] Validation set: Filtered out 0 of 288 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:45:00] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 9/100
Epoch 10/100
Epoch 11/100

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 12/100
Epoch 13/100
Epoch 14/100

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 15/100
[2022_03_10-09:45:38] Training the entire fine-tuned model...
[2022_03_10-09:45:58] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-09:46:39] Training on final epochs of sequence length 1024...
[2022_03_10-09:46:39] Training set: Filtered out 0 of 1910 (0.0%) records of lengths exceeding 1022.
[2022_03_10-09:46:39] Validation set: Filtered out 0 of 288 (0.0%) records of lengths exceeding 1022.




Training split 38
Confusion matrix:


Unnamed: 0,0,1
0,92,56
1,4,29


0.4915254237288136


  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_38.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_38.pkl/assets


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█▁▁▂▃▃▃▄▅▁
loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂
lr,████████▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,▆▃▄▂▁█▄▁▁▁▁▁▁▁▁▂▁▁▁▂▂▃▃▁

0,1
epoch,0.0
loss,0.47858
lr,1e-05
val_loss,0.59479


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-09:47:34] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:47:34] Validation set: Filtered out 0 of 194 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:47:34] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100

Epoch 00029: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 30/100
Epoch 31/100
Epoch 32/100

Epoch 00032: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_03_10-09:48:46] Training the entire fine-tuned model...
[2022_03_10-09:48:54] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 9/100
Epoch 10/100
Epoch 11/100

Epoch 00011: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-09:49:51] Training on final epochs of sequence length 1024...
[2022_03_10-09:49:51] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 1022.
[2022_03_10-09:49:52] Validation set: Filtered out 0 of 194 (0.0%) records of lengths exceeding 1022.




Training split 42
Confusion matrix:


Unnamed: 0,0,1
0,75,18
1,5,22


0.6567164179104478


  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_42.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_42.pkl/assets


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███▁▁▂▂▂▂▃▃▃▁
loss,█▆▅▅▄▄▄▄▄▃▄▃▃▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁▂
lr,█████████████████▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,▇▇█▅▄▄▄▄▃▃▃▃▅▂▃▃▃▂▂▃▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▂

0,1
best_epoch,4.0
best_val_loss,0.42111
epoch,0.0
loss,0.39418
lr,1e-05
val_loss,0.44756


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-09:50:56] Training set: Filtered out 0 of 2070 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:50:56] Validation set: Filtered out 0 of 206 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:50:56] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

Epoch 00025: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 31/100
Epoch 32/100
Epoch 33/100

Epoch 00033: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
[2022_03_10-09:52:09] Training the entire fine-tuned model...
[2022_03_10-09:52:17] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

Epoch 00006: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-09:53:06] Training on final epochs of sequence length 1024...
[2022_03_10-09:53:06] Training set: Filtered out 0 of 2070 (0.0%) records of lengths exceeding 1022.
[2022_03_10-09:53:46] Validation set: Filtered out 0 of 206 (0.0%) records of lengths exceeding 1022.




Training split 56
Confusion matrix:


Unnamed: 0,0,1
0,66,43
1,0,20


0.4819277108433735


  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_56.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_56.pkl/assets


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇███▁▁▁▂▂▂▂▃▁
loss,█▅▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂
lr,███████████████████▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▅▄▄▄▃▃▄▃▂▃▂▄▂▃▂▂▃▆▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▄▃▂▁

0,1
epoch,0.0
loss,0.41938
lr,1e-05
val_loss,0.52414


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-09:54:50] Training set: Filtered out 0 of 1864 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:54:50] Validation set: Filtered out 0 of 316 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:54:50] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100

Epoch 00024: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 25/100
Epoch 26/100
Epoch 27/100

Epoch 00027: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
[2022_03_10-09:55:49] Training the entire fine-tuned model...
[2022_03_10-09:55:57] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-09:56:38] Training on final epochs of sequence length 1024...
[2022_03_10-09:56:38] Training set: Filtered out 0 of 1864 (0.0%) records of lengths exceeding 1022.
[2022_03_10-09:56:45] Validation set: Filtered out 0 of 316 (0.0%) records of lengths exceeding 1022.




Training split 63
Confusion matrix:


Unnamed: 0,0,1
0,100,57
1,9,38


0.5352112676056338


  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_63.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_63.pkl/assets


0,1
epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██▁▁▂▂▂▂▃▃▁
loss,█▆▄▄▄▃▃▃▃▃▃▃▃▂▂▂▃▂▂▂▂▂▂▂▂▂▂▃▂▂▂▁▁▁▁▂
lr,████████████▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▇▄▃▃▂▃▄▂▂▂▂▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▃▂▁▁▁▁▁

0,1
epoch,0.0
loss,0.42088
lr,1e-05
val_loss,0.52247


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-09:57:40] Training set: Filtered out 0 of 2022 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:57:40] Validation set: Filtered out 0 of 248 (0.0%) records of lengths exceeding 510.
[2022_03_10-09:57:40] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 17/100
Epoch 18/100
Epoch 19/100

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
[2022_03_10-09:58:26] Training the entire fine-tuned model...
[2022_03_10-09:58:34] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-09:59:47] Training on final epochs of sequence length 1024...
[2022_03_10-09:59:47] Training set: Filtered out 0 of 2022 (0.0%) records of lengths exceeding 1022.
[2022_03_10-09:59:47] Validation set: Filtered out 0 of 248 (0.0%) records of lengths exceeding 1022.




Training split 6
Confusion matrix:


Unnamed: 0,0,1
0,79,33
1,9,20


0.4878048780487805


  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_06.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_06.pkl/assets


0,1
epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▁
loss,█▆▅▅▅▅▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁▁▃
lr,████████████████▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▆▅▄▄▄▃▃▃▃▆▂▄▃▃▃▄▃▂▂▃▂▂▂▁▁▁▂▁▆▃▃▄▃

0,1
epoch,0.0
loss,0.33906
lr,1e-05
val_loss,0.49065


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_10-10:00:41] Training set: Filtered out 0 of 1948 (0.0%) records of lengths exceeding 510.
[2022_03_10-10:00:41] Validation set: Filtered out 0 of 268 (0.0%) records of lengths exceeding 510.
[2022_03_10-10:00:41] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
[2022_03_10-10:01:10] Training the entire fine-tuned model...
[2022_03_10-10:01:18] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100

Epoch 00004: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_10-10:01:56] Training on final epochs of sequence length 1024...
[2022_03_10-10:01:56] Training set: Filtered out 0 of 1948 (0.0%) records of lengths exceeding 1022.
[2022_03_10-10:02:06] Validation set: Filtered out 0 of 268 (0.0%) records of lengths exceeding 1022.




Training split 78
Confusion matrix:


Unnamed: 0,0,1
0,75,64
1,6,25


0.4166666666666667


  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_78.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_10__09_78.pkl/assets
