In [1]:
import pandas as pd
from tensorflow import keras
from os import path
import pickle


In [2]:
from proteinbert.finetuning import encode_train_and_valid_sets, encode_dataset
from proteinbert import OutputType, OutputSpec, evaluate_by_len, load_pretrained_model

In [3]:
from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, \
finetune, evaluate_by_len

from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [4]:
import wandb
from wandb.keras import WandbCallback

In [5]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [6]:
DATA_DIR = "../../data/"

In [7]:
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

In [8]:
pretrained_model_generator, input_encoder = load_pretrained_model("../../data/protein_bert/", "epoch_92400_sample_23500000.pkl")

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_set = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train.csv"), index_col=0)
test_set = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_test.csv"), index_col=0)
train_set["seq"] = train_set["heavy"] + train_set["light"]
test_set["seq"] = test_set["heavy"] + test_set["light"]

In [11]:
len(train_set)

1291

In [12]:
len(test_set)

260

In [13]:
learning_rate = 1e-4
patience = (6, 4)

In [14]:
sizes = [0.5, 0.6, 0.7, 0.8, 0.9]

In [16]:
def train_and_save_model(train_data, valid_data, test_data, size):
    wandb.init(project=f"Dataset size exp", entity="kvetab")
    model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
            get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

    training_callbacks = [
        keras.callbacks.ReduceLROnPlateau(patience = patience[1], factor = 0.25, min_lr = 1e-07, verbose = 1),
        keras.callbacks.EarlyStopping(patience = patience[0], restore_best_weights = True),
        WandbCallback()
    ]

    epoch_num = 100
    batch_size = 128
    #learning_rate = 1e-5
    wandb.config = {
          "learning_rate": learning_rate,
          "epochs": epoch_num * 2,
          "batch_size": batch_size
        }
    finetune(model_generator, input_encoder, OUTPUT_SPEC, train_data["seq"], train_data["Y"], valid_data['seq'], valid_data["Y"], \
            seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
            lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = learning_rate / 10, callbacks = training_callbacks)
    mod = model_generator.create_model(seq_len = 512)
    mod_name = f"2022_04_22_size{size}"
    mod.save(path.join(DATA_DIR, f"protein_bert/by_data_size/{mod_name}"))

    results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
            start_seq_len = 512, start_batch_size = 32)
    fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
    f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
    return confusion_matrix, f1

In [15]:
cms = {}
f1s = {}

In [17]:
for size in sizes:
    train, valid = train_test_split(train_set, test_size=1-size, random_state=42, stratify=train_set["Y"])
    #test = pd.concat([test, test_set])
    test = test_set
    #valid, test = train_test_split(test, test_size=0.5, random_state=333, stratify=test["Y"])
    print(len(train), len(valid), len(test))
    cm, f1_score = train_and_save_model(train, valid, test, size)
    cms[size] = cm
    f1s[size] = f1_score

645 646 260


[34m[1mwandb[0m: Currently logged in as: [33mkvetab[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_24-16:50:26] Training set: Filtered out 0 of 645 (0.0%) records of lengths exceeding 510.
[2022_04_24-16:50:27] Validation set: Filtered out 0 of 646 (0.0%) records of lengths exceeding 510.
[2022_04_24-16:50:27] Training with frozen pretrained layers...


2022-04-24 16:50:27.159831: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-24 16:50:27.728399: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9656 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:41:00.0, compute capability: 7.5
  "The `lr` argument is deprecated, use `learning_rate` instead.")
2022-04-24 16:50:29.565892: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100


2022-04-24 16:50:37.412848: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 7605




[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100

Epoch 00035: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100

Epoch 00046: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 47/100
Epoch 48/100
[2022_04_24-16:51:32] Training the entire fine-tuned model...
[2022_04_24-16:51:40] Incompatible number of optimizer weights - will not initialize them.


2022-04-24 16:53:02.290053: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.5/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.5/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


774 517 260


0,1
epoch,▁▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇███▁▁▂▂▂▂▃▃▃▃▄▄▁
loss,█▇▄▃▄▅▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂
lr,██████▃▃▃▃▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▃▂▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,15.0
best_val_loss,0.39776
epoch,0.0
loss,0.34265
lr,1e-05
val_loss,0.40121


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_24-16:53:31] Training set: Filtered out 0 of 774 (0.0%) records of lengths exceeding 510.
[2022_04_24-16:53:31] Validation set: Filtered out 0 of 517 (0.0%) records of lengths exceeding 510.
[2022_04_24-16:53:31] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100

Epoch 00047: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100

Epoch 00057: ReduceLROnPlateau redu



Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

Epoch 00017: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100

Epoch 00034: ReduceLROnPlateau reducing learning rate to 1.56249996052793e-06.
Epoch 35/100
Epoch 36/100
Epoch 37/100
[2022_04_24-16:56:34] Training on final epochs of sequence length 1024...
[2022_04_24-16:56:34] Training set: Filtered out 0 of 774 (0.0%) records of lengths exceeding 1022.
[2022_04_24-16:56:34] Validation set: Filtered out 0 of 517 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.6/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.6/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


903 388 260


0,1
epoch,▁▁▁▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▇▇▇▇██▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▁
loss,█▅▃▃▃▂▃▃▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁
lr,████████▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▂▂▁▂▁▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,30.0
best_val_loss,0.37393
epoch,0.0
loss,0.3256
lr,1e-05
val_loss,0.37983


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_24-16:57:29] Training set: Filtered out 0 of 903 (0.0%) records of lengths exceeding 510.
[2022_04_24-16:57:29] Validation set: Filtered out 0 of 388 (0.0%) records of lengths exceeding 510.
[2022_04_24-16:57:29] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 20/100
Epoch 21/100
[2022_04_24-16:58:04] Training the entire fine-tuned model...
[2022_04_24-16:58:12] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100

Epoch 00021: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 22/100
Epoch 23/100
[2022_04_24-16:59:11] Training on final epochs of sequence length 1024...
[2022_04_24-16:59:11] Training set: Filtered out 0 of 903 (0.0%) records of lengths exceeding 1022.
[2022_04_24-16:59:11] Validation set: Filtered out 0 of 388 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.7/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.7/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


1032 259 260


0,1
epoch,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇█▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██▁
loss,█▆▄▃▃▃▃▃▂▃▃▂▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▂▁▁▁▁
lr,█████████████████▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,██▃▃▃▃▂▂▂▂▂▄▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁

0,1
best_epoch,16.0
best_val_loss,0.38567
epoch,0.0
loss,0.30687
lr,1e-05
val_loss,0.38759


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_24-17:00:07] Training set: Filtered out 0 of 1032 (0.0%) records of lengths exceeding 510.
[2022_04_24-17:00:07] Validation set: Filtered out 0 of 259 (0.0%) records of lengths exceeding 510.
[2022_04_24-17:00:07] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

Epoch 00025: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 31/100
Epoch 32/100
[2022_04_24-17:00:54] Training the entire fine-tuned model...
[2022_04_24-17:01:02] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100

Epoch 00026: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100

Epoch 00032: ReduceLROnPlateau reducing learning rate to 1.56249996052793e-06.
Epoch 33/100
Epoch 34/100
[2022_04_24-17:02:31] Training on final epochs of sequence length 1024...
[2022_04_24-17:02:31] Training set: Filtered out 0 of 1032 (0.0%) records of lengths exceeding 1022.
[2022_04_24-17:02:31] Validation set: Filtered out 0 of 259 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.8/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.8/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


1161 130 260


0,1
epoch,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇█▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇██▁
loss,█▆▄▃▄▄▃▃▄▄▄▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▂
lr,███████████████▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▃▄▃▂▂▂▂▇▂▂▂▂▃▂▂▂▂▂▂▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,27.0
best_val_loss,0.36927
epoch,0.0
loss,0.30799
lr,1e-05
val_loss,0.38269


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_24-17:03:27] Training set: Filtered out 0 of 1161 (0.0%) records of lengths exceeding 510.
[2022_04_24-17:03:27] Validation set: Filtered out 0 of 130 (0.0%) records of lengths exceeding 510.
[2022_04_24-17:03:27] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100

Epoch 00022: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100

Epoch 00036: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 41/100
[2022_04_24-17:04:23] Training the entire fine-tuned model...
[2022_04_24-17:04:32] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/1



Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

Epoch 00023: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 24/100
Epoch 25/100
[2022_04_24-17:05:44] Training on final epochs of sequence length 1024...
[2022_04_24-17:05:44] Training set: Filtered out 0 of 1161 (0.0%) records of lengths exceeding 1022.
[2022_04_24-17:06:01] Validation set: Filtered out 0 of 130 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.9/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size0.9/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [18]:
size = 1
train = train_set
valid, test = train_test_split(test_set, test_size=0.5, random_state=333, stratify=test_set["Y"])
print(len(train_set), len(valid), len(test))
cm, f1_score = train_and_save_model(train, valid, test, size)
cms[size] = cm
f1s[size] = f1_score

1291 130 130


0,1
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██▁▂▂▂▂▃▃▃▃▄▄▄▅▅▁
loss,█▅▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▁▁▂
lr,████████▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▄▃▃▄▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▂▂▂▁▂▁▁▁▂▂▂▁

0,1
best_epoch,18.0
best_val_loss,0.32869
epoch,0.0
loss,0.2923
lr,1e-05
val_loss,0.34477


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_24-17:06:56] Training set: Filtered out 0 of 1291 (0.0%) records of lengths exceeding 510.
[2022_04_24-17:06:56] Validation set: Filtered out 0 of 130 (0.0%) records of lengths exceeding 510.
[2022_04_24-17:06:56] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100

Epoch 00029: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100

Epoch 00035: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 36/100
Epoch 37/100
[2022_04_24-17:07:52] Training the entire fine-tuned model...
[2022_04_24-17:08:35] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 10/100
Epoch 11/100
[2022_04_24-17:09:13] Training on final epochs of sequence length 1024...
[2022_04_24-17:09:13] Training set: Filtered out 0 of 1291 (0.0%) records of lengths exceeding 1022.
[2022_04_24-17:09:14] Validation set: Filtered out 0 of 130 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size1/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/by_data_size/2022_04_22_size1/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [19]:
f1s

{0.5: 0.37894736842105264,
 0.6: 0.4864864864864865,
 0.7: 0.43010752688172044,
 0.8: 0.4807692307692308,
 0.9: 0.5094339622641509,
 1: 0.5531914893617021}

In [34]:
train_set

Unnamed: 0,Antibody_ID,heavy,light,Y,cluster,seq
535,2g60,EVQLQQSGGELAKPGASVKMSCKSSGYTFTAYAIHWAKQAAGAGLE...,DVLMTQAPLTLPVSLGDQASISCRSSQAIVHANGNTYLEWYLQKPG...,0,911,EVQLQQSGGELAKPGASVKMSCKSSGYTFTAYAIHWAKQAAGAGLE...
455,2a1w,DVKLVESGGGLVKPGGSLRLSCAASGFTFRNYGMSWVRQTPEKRLE...,DVLMTQSPLSLPVSLGDQASISCRCSQSIVKSNGHTYLEWYLQKPG...,0,723,DVKLVESGGGLVKPGGSLRLSCAASGFTFRNYGMSWVRQTPEKRLE...
459,2a77,DVKLVESGGGLVKPGGSLRLSCAASGFTFRNYGMSWVRQTPEKRLE...,DVLMTQSPLSLPVSLGDQASISCRCSQSIVKSNGHTYLEWYLQKPG...,0,723,DVKLVESGGGLVKPGGSLRLSCAASGFTFRNYGMSWVRQTPEKRLE...
1120,4ffy,QVQLLQPGAELVKPGASMKLSCKASGYTFTNWWMHWVRLRPGRGLE...,NIVLTQSPASLAVSLGQRATISCRASESVDHYGNSFIYWYQQKPGQ...,0,478,QVQLLQPGAELVKPGASMKLSCKASGYTFTNWWMHWVRLRPGRGLE...
851,3l5x,EVTLKESGPVLVKPTETLTLTCTVSGFSLSTYGMGVGWIRQPPGKA...,EIVLTQSPATLSLSPGERATLSCRASKSISKYLAWYQQKPGQAPRL...,0,433,EVTLKESGPVLVKPTETLTLTCTVSGFSLSTYGMGVGWIRQPPGKA...
...,...,...,...,...,...,...
1664,5f9w,QVQLVQSGAEVKKPGASVTVSCQASGYTFTNYYVHWVRQAPGQGLQ...,EIVLTQSPATLSVSPGERATLSCRASQSVRSNLAWYQQRPGQAPRL...,0,271,QVQLVQSGAEVKKPGASVTVSCQASGYTFTNYYVHWVRQAPGQGLQ...
2017,5x5x,QVKLQQSGAEFVKAGASVKLSCKTSGYTFNNYWIHWVKQSPGQGLE...,DIELTQSPLSLPVSLGDQASISCTSSQSLLHSNGDTYLHWYLQKPG...,0,861,QVKLQQSGAEFVKAGASVKLSCKTSGYTFNNYWIHWVKQSPGQGLE...
1400,4qww,EVQLVESGGGLVQPKGSLKLSCAASGFTFNTYAMHWVRQAPGKGLE...,QIVLTQSPAIMSASPGEKVTMTCSASSSVSYMYWYHQKPGSSPKPW...,0,436,EVQLVESGGGLVQPKGSLKLSCAASGFTFNTYAMHWVRQAPGKGLE...
59,1cgs,RVQLLESGAELMKPGASVQISCKATGYTFSEYWIEWVKERPGHGLE...,ELVMTQSPLSLPVSLGDQASISCRPSQSLVHSNGNTYLHWYLQKPG...,0,103,RVQLLESGAELMKPGASVQISCKATGYTFSEYWIEWVKERPGHGLE...


# CV on all data

In [26]:
chen_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_data_w_clusters.csv"), index_col=0)
chen_data.head()

Unnamed: 0,Antibody_ID,heavy,light,Y,cluster
0,12e8,EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLE...,DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKL...,0,677
1,15c8,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...,DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPK...,0,685
2,1a0q,EVQLQESDAELVKPGASVKISCKASGYTFTDHVIHWVKQKPEQGLE...,DIELTQSPSSLSASLGGKVTITCKASQDIKKYIGWYQHKPGKQPRL...,1,102
3,1a14,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,0,442
4,1a2y,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,0,59


In [43]:
chen_data["cluster"].value_counts()

18     59
24     35
28     28
8      25
7      21
       ..
588     1
562     1
786     1
722     1
329     1
Name: cluster, Length: 932, dtype: int64

In [56]:
def split_into_k_sets(k, data):
    total = len(data)
    size = total // k + 1
    clusters_by_size = data["cluster"].value_counts().index
    cluster_sizes = data["cluster"].value_counts()
    groups = { i: [] for i in range(k) }
    group_nums = { i: [] for i in range(k) }
    group = 0
    for clust in clusters_by_size:
        start_group = group
        if len(groups[group]) + cluster_sizes[clust] > size:
            group += 1
            group = group % k
        while len(groups[group]) + cluster_sizes[clust] > size and group != start_group:
            group += 1
            group = group % k
        if len(groups[group]) < size:
            groups[group] += list(data[data["cluster"] == clust].index)
            group_nums[group].append(clust)
        else:
            print("error")
    return groups, group_nums

In [57]:
indices, clusters = split_into_k_sets(10, chen_data)
for key, gr in indices.items():
    print(len(gr))

152
154
155
154
156
156
156
156
156
156


In [46]:
chen_data["seq"] = chen_data["heavy"] + chen_data["light"]

In [20]:
def train_and_save_named_model(train_data, valid_data, test_data, name, project_name):
    wandb.init(project=project_name, entity="kvetab")
    model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
            get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

    training_callbacks = [
        keras.callbacks.ReduceLROnPlateau(patience = patience[1], factor = 0.25, min_lr = 1e-07, verbose = 1),
        keras.callbacks.EarlyStopping(patience = patience[0], restore_best_weights = True),
        WandbCallback()
    ]

    epoch_num = 100
    batch_size = 128
    #learning_rate = 1e-5
    wandb.config = {
          "learning_rate": learning_rate,
          "epochs": epoch_num * 2,
          "batch_size": batch_size
        }
    finetune(model_generator, input_encoder, OUTPUT_SPEC, train_data["seq"], train_data["Y"], valid_data['seq'], valid_data["Y"], \
            seq_len = 512, batch_size = batch_size, max_epochs_per_stage = epoch_num, lr = learning_rate, begin_with_frozen_pretrained_layers = True, \
            lr_with_frozen_pretrained_layers = 1e-02, n_final_epochs = 1, final_seq_len = 1024, final_lr = learning_rate / 10, callbacks = training_callbacks)
    mod = model_generator.create_model(seq_len = 512)
    mod.save(path.join(DATA_DIR, f"protein_bert/{name}"))

    results, confusion_matrix = evaluate_by_len(model_generator, input_encoder, OUTPUT_SPEC, test_data['seq'], test_data['Y'], \
            start_seq_len = 512, start_batch_size = 32)
    fn_fp = confusion_matrix.loc["0"][1] + confusion_matrix.loc["1"][0]
    f1 = confusion_matrix.loc["1"][1] / (confusion_matrix.loc["1"][1] + 0.5 * fn_fp)
    return confusion_matrix, f1

In [71]:
cms = {}
f1s = {}

In [75]:
for i in range(10):
    test = chen_data.loc[indices[i]]
    remaining = [idx for idx in list(chen_data.index) if idx not in indices[i]]
    train = chen_data.loc[remaining]
    train, valid = train_test_split(train, test_size=0.1, random_state=333)
    cm, f1 = train_and_save_named_model(train, valid, test, f"10-fold-cv/2022_04_24_split_{i}", "10_fold_cv")
    cms[i] = cm
    f1s[i] = f1

0,1
epoch,▁▂▂▃▃▄▅▅▆▆▇▇█▁▂▂▃▃▄▅▅▆▁
loss,▃▇█▆▂▃▄▁▁▃▃▂▂▁▁▁▁▁▁▁▁▁▁
lr,███████████▃▃▁▁▁▁▁▁▁▁▁▁
val_loss,▇█▆▂▂▃▁▂▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,2.0
best_val_loss,0.5482
epoch,0.0
loss,0.47977
lr,1e-05
val_loss,0.55076


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-08:53:30] Training set: Filtered out 0 of 1259 (0.0%) records of lengths exceeding 510.
[2022_04_25-08:53:30] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-08:53:30] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

Epoch 00023: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 24/100
Epoch 25/100
[2022_04_25-08:54:08] Training the entire fine-tuned model...
[2022_04_25-08:54:28] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 11/100
Epoch 12/100
[2022_04_25-08:55:08] Training on final epochs of sequence length 1024...
[2022_04_25-08:55:08] Training set: Filtered out 0 of 1259 (0.0%) records of lengths exceeding 1022.
[2022_04_25-08:55:15] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_0/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_0/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


0,1
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██▁▁▂▂▂▂▃▃▃▄▄▄▁
loss,█▅▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▂
lr,███████████████████████▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▄▄▃▂▂▂▂▂▁▁▂▁▂▁▁▂▁▁▂▁▂▁▁▂▁▁▁▁▁▁▁▂▁▁▂▁

0,1
best_epoch,0.0
best_val_loss,0.40695
epoch,0.0
loss,0.32394
lr,1e-05
val_loss,0.40695


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-08:56:09] Training set: Filtered out 0 of 1257 (0.0%) records of lengths exceeding 510.
[2022_04_25-08:56:09] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-08:56:09] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100

Epoch 00041: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100

Epoch 00045: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100

Epoch 00049: ReduceLROnPlateau reducing learning rate to 



Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

Epoch 00014: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 15/100
Epoch 16/100
[2022_04_25-08:58:15] Training on final epochs of sequence length 1024...
[2022_04_25-08:58:15] Training set: Filtered out 0 of 1257 (0.0%) records of lengths exceeding 1022.
[2022_04_25-08:58:15] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_1/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_1/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███▁▁▂▂▂▂▃▃▃▁
loss,█▅▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁
lr,████████████▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▅▃▃▃▂▂▂▂▂▂▃▂▂▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,9.0
best_val_loss,0.35093
epoch,0.0
loss,0.31087
lr,1e-05
val_loss,0.35749


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-08:59:11] Training set: Filtered out 0 of 1256 (0.0%) records of lengths exceeding 510.
[2022_04_25-08:59:11] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-08:59:11] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100

Epoch 00041: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100

Epoch 00054: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100

Epoch 00058: ReduceLRO



Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

Epoch 00011: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

Epoch 00016: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
Epoch 17/100
Epoch 18/100
[2022_04_25-09:02:46] Training on final epochs of sequence length 1024...
[2022_04_25-09:02:46] Training set: Filtered out 0 of 1256 (0.0%) records of lengths exceeding 1022.
[2022_04_25-09:03:17] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_2/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_2/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇███▁▁▂▂▂▂▃▁
loss,█▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂
lr,███████▃▃▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▅▄▃▃▃▄▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁

0,1
best_epoch,11.0
best_val_loss,0.3279
epoch,0.0
loss,0.3096
lr,1e-05
val_loss,0.33878


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-09:04:16] Training set: Filtered out 0 of 1257 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:04:16] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:04:16] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100

Epoch 00024: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100

Epoch 00037: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 38/100
Epoch 39/100
[2022_04_25-09:05:11] Training the entire fine-tuned model...
[2022_04_25-09:05:19] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

Epoch 00011: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 12/100
Epoch 13/100
[2022_04_25-09:06:02] Training on final epochs of sequence length 1024...
[2022_04_25-09:06:02] Training set: Filtered out 0 of 1257 (0.0%) records of lengths exceeding 1022.
[2022_04_25-09:06:06] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_3/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_3/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███▁▁▂▂▂▂▃▃▃▁
loss,█▅▅▃▃▃▃▃▃▃▃▂▂▃▃▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂
lr,██████████████████▃▃▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▅▄▃▃▃▂▂▂▂▂▂▂▂▃▂▃▃▂▁▂▁▂▁▁▂▂▂▁▂▁▁▁▁▁▁▃▂▁▁

0,1
best_epoch,6.0
best_val_loss,0.36359
epoch,0.0
loss,0.32595
lr,1e-05
val_loss,0.37324


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-09:07:03] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:07:03] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:07:03] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100

Epoch 00029: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 41/100
Epoch 42/100
[2022_04_25-09:08:02] Training the entire fine-tuned model...
[2022_04_25-09:08:42] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
Epoch 21/100
Epoch 22/100
[2022_04_25-09:09:50] Training on final epochs of sequence length 1024...
[2022_04_25-09:09:50] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 1022.
[2022_04_25-09:10:10] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_4/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_4/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


0,1
epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇██▁▁▂▂▂▂▃▃▃▄▄▄▄▁
loss,█▄▄▄▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▂
lr,██████████████████▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▄▃▃▂▃▂▄▂▂▂▂▂▂▁▂▂▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,15.0
best_val_loss,0.31023
epoch,0.0
loss,0.32862
lr,1e-05
val_loss,0.31268


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-09:11:07] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:11:07] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:11:07] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100

Epoch 00036: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 37/100
Epoch 38/100
[2022_04_25-09:12:01] Training the entire fine-tuned model...
[2022_04_25-09:12:09] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

Epoch 00017: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100

Epoch 00022: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
Epoch 23/100
Epoch 24/100
[2022_04_25-09:13:21] Training on final epochs of sequence length 1024...
[2022_04_25-09:13:21] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 1022.
[2022_04_25-09:13:21] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_5/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_5/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


0,1
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇██▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▅
loss,█▅▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁
lr,███████████████████████▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▃▃▄▃▃▂▂▃▂▂▄▂▃▃▂▂▂▂▁▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,0.0
best_val_loss,0.35775
epoch,0.0
loss,0.28746
lr,1e-05
val_loss,0.35775


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-09:14:19] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:14:19] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:14:19] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 31/100
Epoch 32/100
[2022_04_25-09:15:06] Training the entire fine-tuned model...
[2022_04_25-09:15:17] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 11/100
Epoch 12/100
[2022_04_25-09:15:58] Training on final epochs of sequence length 1024...
[2022_04_25-09:15:58] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 1022.
[2022_04_25-09:15:59] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_6/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_6/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇███▁▁▁▂▂▂▃▃▃▃▁
loss,█▄▃▃▃▃▃▃▃▂▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
lr,███████████████████████████▃▃▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▃▃▃▂▂▂▁▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▂▁▁▁▁▁▂▁▁▁

0,1
best_epoch,5.0
best_val_loss,0.40072
epoch,0.0
loss,0.32937
lr,1e-05
val_loss,0.40744


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-09:16:54] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:16:54] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:16:54] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100

Epoch 00026: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100

Epoch 00036: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 41/100
Epoch 42/100
[2022_04_25-09:17:52] Training the entire fine-tuned model...
[2022_04_25-09:18:31] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

Epoch 00006: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 7/100
Epoch 8/100
[2022_04_25-09:19:01] Training on final epochs of sequence length 1024...
[2022_04_25-09:19:01] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 1022.
[2022_04_25-09:19:01] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_7/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_7/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██▁▁▁▂▂▂▁
loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁
lr,█████████████████████▃▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▃▃▂▂▂▂▂▂▂▂▃▁▁▁▂▁▁▁▁▂▁▁▁▁▁▂▁▁▁▁▁▁▂▁▃▁▂▁▁

0,1
best_epoch,0.0
best_val_loss,0.38623
epoch,0.0
loss,0.33967
lr,1e-05
val_loss,0.38623


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-09:19:57] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:19:57] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:19:57] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100

Epoch 00028: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100

Epoch 00037: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100

Epoch 00041: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
[2022_04_25-09:20:54] Training the entire fine-tuned model...
[2022_04_25-09:21:37] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
Epoch 13/100
Epoch 14/100
[2022_04_25-09:22:22] Training on final epochs of sequence length 1024...
[2022_04_25-09:22:22] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 1022.
[2022_04_25-09:22:22] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_8/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_8/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███▁▁▂▂▂▂▃▃▃▁
loss,█▅▄▃▃▃▃▂▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁
lr,████████████████████▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▅▃▂▂▂▂▂▂▃▁▂▂▂▁▁▂▁▁▂▁▁▁▁▁▁▂▁▁▁▁▁▂▂▂▁▁▁▁▁

0,1
best_epoch,7.0
best_val_loss,0.37241
epoch,0.0
loss,0.3258
lr,1e-05
val_loss,0.38233


[34m[1mwandb[0m: wandb version 0.12.15 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_04_25-09:23:21] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:23:21] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 510.
[2022_04_25-09:23:21] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100


[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100

Epoch 00029: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 30/100
Epoch 31/100
[2022_04_25-09:24:08] Training the entire fine-tuned model...
[2022_04_25-09:24:16] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

Epoch 00014: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 15/100
Epoch 16/100
[2022_04_25-09:25:08] Training on final epochs of sequence length 1024...
[2022_04_25-09:25:08] Training set: Filtered out 0 of 1255 (0.0%) records of lengths exceeding 1022.
[2022_04_25-09:25:44] Validation set: Filtered out 0 of 140 (0.0%) records of lengths exceeding 1022.








INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_9/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/10-fold-cv/2022_04_24_split_9/assets
  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [76]:
f1s

{0: 0.44155844155844154,
 1: 0.5806451612903226,
 2: 0.5714285714285714,
 3: 0.4897959183673469,
 4: 0.41509433962264153,
 5: 0.5714285714285714,
 6: 0.5666666666666667,
 7: 0.42857142857142855,
 8: 0.38596491228070173,
 9: 0.4642857142857143}

In [77]:
from statistics import mean

In [82]:
f1_scores = [value for key, value in f1s.items()]
mean(f1_scores)

0.4915439725500407

In [81]:
from statistics import stdev

In [84]:
stdev(f1_scores)

0.07494233825267892

wandb: Network error (ReadTimeout), entering retry loop.


In [96]:
seq_len = 512

In [100]:
for i in range(10):
    test = chen_data.loc[indices[i]]
    encoded_test_set = encode_dataset(test["seq"], test["Y"], input_encoder, OUTPUT_SPEC, seq_len = seq_len, needs_filtering = True, \
        dataset_name = 'Tap set')
    test_X, test_Y, test_sample_weigths = encoded_test_set
    filename = path.join(DATA_DIR, f"evaluations/protein_bert/10-fold-cv/2022_04_24_split_{i}.csv")
    model_path = path.join(DATA_DIR, f"protein_bert/10-fold-cv/2022_04_24_split_{i}")
    model = keras.models.load_model(model_path)
    y_pred = model.predict(test_X, batch_size=32)
    y_pred_classes = (y_pred >= 0.5)
    str_preds = [str(int(pred)) for pred in y_pred_classes]
    with open(filename, "wt") as f:
        f.write(",".join(str_preds) + "\n")
    metric_dict = {
        "f1": float(metrics.f1_score(test_Y, y_pred_classes)),
        "acc": float(metrics.accuracy_score(test_Y, y_pred_classes)),
        "mcc": float(metrics.matthews_corrcoef(test_Y, y_pred_classes)),
        "auc": float(metrics.roc_auc_score(test_Y, y_pred_classes)),
        "precision": float(metrics.precision_score(test_Y, y_pred_classes)),
        "recall": float(metrics.recall_score(test_Y, y_pred_classes))
    }
    filename_sum = path.join(DATA_DIR, f"evaluations/protein_bert/10-fold-cv/all.csv")
    line = [f"10-fold-cv/2022_04_24_split_{i}", metric_dict["f1"], metric_dict["mcc"], metric_dict["acc"],metric_dict["precision"],metric_dict["recall"],metric_dict["auc"]]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

[2022_04_25-11:05:07] Tap set: Filtered out 0 of 152 (0.0%) records of lengths exceeding 510.
[2022_04_25-11:05:15] Tap set: Filtered out 0 of 154 (0.0%) records of lengths exceeding 510.
[2022_04_25-11:05:23] Tap set: Filtered out 0 of 155 (0.0%) records of lengths exceeding 510.
[2022_04_25-11:05:32] Tap set: Filtered out 0 of 154 (0.0%) records of lengths exceeding 510.
[2022_04_25-11:05:40] Tap set: Filtered out 0 of 156 (0.0%) records of lengths exceeding 510.
[2022_04_25-11:05:49] Tap set: Filtered out 0 of 156 (0.0%) records of lengths exceeding 510.
[2022_04_25-11:05:57] Tap set: Filtered out 0 of 156 (0.0%) records of lengths exceeding 510.
[2022_04_25-11:06:06] Tap set: Filtered out 0 of 156 (0.0%) records of lengths exceeding 510.
[2022_04_25-11:06:14] Tap set: Filtered out 0 of 156 (0.0%) records of lengths exceeding 510.
[2022_04_25-11:06:23] Tap set: Filtered out 0 of 156 (0.0%) records of lengths exceeding 510.


In [102]:
test = chen_data.loc[indices[0]]
test.to_csv(path.join(DATA_DIR, "evaluations/comparison/y_true_0.csv"))

# Comparison with best ML

In [92]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pickle
import json
import csv

In [86]:
def logistic_regression(preprocessing, data_name, hp_dir):
    filename = path.join(hp_dir, f"logistic_regression_{data_name}_{preprocessing}.json")
    parameters = json.load(open(filename))
    #C = float(parameters["C"])
    lr = LogisticRegression(
        class_weight='balanced', max_iter=1000, random_state=42,
        C=float(parameters["C"]), penalty=parameters["penalty"], solver=parameters["solver"]
    )
    return lr, parameters, "logistic_regression"

In [87]:
def pybiomed(train_df, test_df, tap_df):
    x_chen = pd.read_feather(path.join(DATA_DIR, "chen/pybiomed/X_data.ftr"))
    x_chen_train = x_chen.merge(train_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    x_chen_test = x_chen.merge(test_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    x_tap = pd.read_feather(path.join(DATA_DIR, "tap/pybiomed/X_TAP_data.ftr"))
    x_tap = x_tap.loc[tap_df.index]
    return x_chen_train, x_chen_test, x_tap

def bert(train_df, test_df, tap_df):
    x_chen = pd.read_feather(path.join(DATA_DIR, "chen/embeddings/bert/bert_chen_embeddings.ftr"))
    x_chen_train = x_chen.merge(train_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    x_chen_test = x_chen.merge(test_df[["Antibody_ID", "Y"]].reset_index(), left_on="Ab_ID", right_on="Antibody_ID").set_index('index').drop("Antibody_ID", axis=1)
    x_tap = pd.read_feather(path.join(DATA_DIR, "tap/embeddings/bert/bert_tap_embeddings.ftr"))
    x_tap = x_tap.drop("Ab_ID", axis=1)
    x_tap = x_tap.loc[tap_df.index]
    return x_chen_train, x_chen_test, x_tap

In [88]:
def scaling(train_df, test_df, tap_df):
    scaler = StandardScaler()
    scaler.fit(train_df.drop(["Ab_ID", "Y"], axis=1))
    x_train_tr = scaler.transform(train_df.drop(["Ab_ID", "Y"], axis=1))
    x_train_df = pd.DataFrame(data=train_df,  index=train_df.index, columns=train_df.drop(["Ab_ID", "Y"], axis=1).columns)
    x_train_df["Ab_ID"] = train_df["Ab_ID"]
    
    x_test_tr = scaler.transform(test_df.drop(["Ab_ID", "Y"], axis=1))
    x_test_df = pd.DataFrame(data=test_df,  index=test_df.index, columns=test_df.drop(["Ab_ID", "Y"], axis=1).columns)
    x_test_df["Y"] = test_df["Y"]
    x_test_df["Ab_ID"] = test_df["Ab_ID"]
    
    x_tap_tr = scaler.transform(tap_df)
    x_tap_df = pd.DataFrame(data=tap_df,  index=tap_df.index, columns=tap_df.columns)

    return x_train_df, train_df["Y"], x_test_df, x_tap_df

In [90]:
def train_and_eval(model_name, classifier, X_train, y_train, X_valid, y_valid):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_valid)
    filename = path.join(DATA_DIR, "evaluations/comparison", "models", f"{model_name}.pkl")
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)
    filename = path.join(DATA_DIR, "evaluations/comparison", f"{model_name}.csv")
    str_preds = [str(int(pred)) for pred in y_pred]
    with open(filename, "wt") as f:
        f.write(",".join(str_preds) + "\n")
    metric_dict = {
        "f1": float(metrics.f1_score(y_valid, y_pred)),
        "acc": float(metrics.accuracy_score(y_valid, y_pred)),
        "mcc": float(metrics.matthews_corrcoef(y_valid, y_pred)),
        "auc": float(metrics.roc_auc_score(y_valid, y_pred)),
        "precision": float(metrics.precision_score(y_valid, y_pred)),
        "recall": float(metrics.recall_score(y_valid, y_pred))
    }
    filename_sum = os.path.join(DATA_DIR, f"evaluations/comparison/all.csv")
    line = [model_name, metric_dict["f1"], metric_dict["mcc"], metric_dict["acc"],metric_dict["precision"],metric_dict["recall"],metric_dict["auc"]]
    with open(filename_sum, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter='\t')
        csvwriter.writerow(line)

In [93]:
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/tap_not_in_chen.csv"))
for i in range(10):
    test = chen_data.loc[indices[i]]
    remaining = [idx for idx in list(chen_data.index) if idx not in indices[i]]
    train = chen_data.loc[remaining]
    x_train, x_test, x_tap = pybiomed(train, test, tap_data)
    x_train_tr, y_train_tr, x_test_tr, tap_tr = scaling(x_train, x_test, x_tap)
    classifier, params, model_label = logistic_regression("scaling", "pybiomed", path.join(DATA_DIR, "evaluations/hyperparameters"))
    train_and_eval(f"logistic_regression_pybiomed_{i}", classifier, x_train_tr.drop(["Ab_ID"], axis=1), 
                    y_train_tr, x_test_tr.drop(["Ab_ID", "Y"], axis=1), x_test_tr["Y"])

In [94]:
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/tap_not_in_chen.csv"))
for i in range(10):
    test = chen_data.loc[indices[i]]
    remaining = [idx for idx in list(chen_data.index) if idx not in indices[i]]
    train = chen_data.loc[remaining]
    x_train, x_test, x_tap = bert(train, test, tap_data)
    x_train_tr, y_train_tr, x_test_tr, tap_tr = scaling(x_train, x_test, x_tap)
    classifier, params, model_label = logistic_regression("scaling", "bert", path.join(DATA_DIR, "evaluations/hyperparameters"))
    train_and_eval(f"logistic_regression_bert_{i}", classifier, x_train_tr.drop(["Ab_ID"], axis=1), 
                    y_train_tr, x_test_tr.drop(["Ab_ID", "Y"], axis=1), x_test_tr["Y"])