In [1]:
import pandas as pd
from tensorflow import keras

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, \
finetune, evaluate_by_len
from proteinbert.finetuning import encode_train_and_valid_sets, encode_dataset
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs
from os import path
import pickle

In [2]:
import wandb
from wandb.keras import WandbCallback

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [4]:
DATA_DIR = "../../data/"

In [5]:
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

In [6]:
pretrained_model_generator, input_encoder = load_pretrained_model("../../data/protein_bert/", "epoch_92400_sample_23500000.pkl")

In [7]:
model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)

In [8]:
wandb.init(project=f"ManualTraining1", entity="kvetab")

[34m[1mwandb[0m: Currently logged in as: [33mkvetab[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [9]:
train_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_train_data.csv"), index_col=0)
valid_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_valid_data.csv"), index_col=0)
test_data = pd.read_csv(path.join(DATA_DIR, "chen/deduplicated/chen_test_data.csv"), index_col=0)
train_data.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
2073,6aod,EVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIVMTKSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...,0
1517,4yny,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,EFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2025,5xcv,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,QFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2070,6and,EVQLVESGGGLVQPGGSLRLSCAASGYEFSRSWMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSIVHSVGNTFLEWYQQKPG...,1
666,2xqy,QVQLQQPGAELVKPGASVKMSCKASGYSFTSYWMNWVKQRPGRGLE...,DIVLTQSPASLALSLGQRATISCRASKSVSTSGYSYMYWYQQKPGQ...,0


In [10]:
train_data["seq"] = train_data["heavy"] + train_data["light"]
valid_data["seq"] = valid_data["heavy"] + valid_data["light"]
test_data["seq"] = test_data["heavy"] + test_data["light"]

In [11]:
training_callbacks = [
    keras.callbacks.ReduceLROnPlateau(patience = 1, factor = 0.25, min_lr = 1e-07, verbose = 1),
    keras.callbacks.EarlyStopping(patience = 3, restore_best_weights = True),
    WandbCallback()
]
seq_len = 512

In [12]:
encoded_train_set, encoded_valid_set = encode_train_and_valid_sets(
    train_data['seq'], 
    train_data['Y'], 
    valid_data['seq'], 
    valid_data['Y'], 
    input_encoder, 
    OUTPUT_SPEC, 
    seq_len
)

[2022_02_02-18:01:15] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_02_02-18:01:15] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.


In [13]:
train_X, train_Y, train_sample_weigths = encoded_train_set

In [16]:
model = model_generator.create_model(seq_len=512, freeze_pretrained_layers=True)

2022-02-02 18:01:32.480937: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-02 18:01:33.190773: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9656 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:41:00.0, compute capability: 7.5
  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [17]:
epoch_num = 50
batch_size = 128
learning_rate = 1e-2

In [18]:
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num,
      "batch_size": batch_size
    }

In [19]:
model.optimizer.lr = learning_rate

In [20]:
model.fit(
    x=train_X,
    y=train_Y,
    batch_size=batch_size,
    epochs=epoch_num,
    callbacks=training_callbacks,
    validation_data=encoded_valid_set
)


2022-02-02 18:01:46.045244: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50


2022-02-02 18:01:55.040045: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 7605




[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/50
Epoch 3/50
Epoch 4/50

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 5/50
Epoch 6/50
Epoch 7/50

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 8/50
Epoch 9/50
Epoch 10/50

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 11/50
Epoch 12/50

Epoch 00012: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 13/50
Epoch 14/50
Epoch 15/50

Epoch 00015: ReduceLROnPlateau reducing learning rate to 9.765624781721272e-06.
Epoch 16/50
Epoch 17/50

Epoch 00017: ReduceLROnPlateau reducing learning rate to 2.441406195430318e-06.
Epoch 18/50

Epoch 00018: ReduceLROnPlateau reducing learning rate to 6.103515488575795e-07.
Epoch 19/50

Epoch 00019: ReduceLROnPlateau reducing learning rate to 1.5258788721439487e-07.
Epoch 20/50

Epoch 00020: ReduceLROnPlateau reducing learning rate to 1e-07.
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoc

<keras.callbacks.History at 0x7f1d90084890>

In [21]:
model_generator.update_state(model)

In [15]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs:", len(physical_devices))

Num GPUs: 1


In [26]:
import tensorflow as tf
print(tf. __version__)

2.6.2


In [22]:
def slice_arrays(arrays, slicing):
    if isinstance(arrays, list) or isinstance(arrays, tuple):
        return [array[slicing] for array in arrays]
    else:
        return arrays[slicing]

In [27]:
model_generator.dummy_epoch = (slice_arrays(train_X, slice(0, 1)), slice_arrays(train_Y, slice(0, 1)))
model = model_generator.create_model(seq_len=512, freeze_pretrained_layers=False)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


[2022_02_02-18:58:48] Incompatible number of optimizer weights - will not initialize them.


In [28]:
wandb.init(project=f"ManualTraining1", entity="kvetab")
epoch_num = 50
batch_size = 128
learning_rate = 5e-5
wandb.config = {
      "learning_rate": learning_rate,
      "epochs": epoch_num,
      "batch_size": batch_size
    }
model.optimizer.lr = learning_rate

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
loss,█▆▅▅▄▄▃▂▂▁▂▃▁▂▃▁▃▂▁▂
lr,█████▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,██▅▄▅▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,19.0
best_val_loss,0.43132
epoch,19.0
loss,0.40285
lr,0.0
val_loss,0.43132


[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [29]:
model.fit(
    x=train_X,
    y=train_Y,
    batch_size=batch_size,
    epochs=epoch_num,
    callbacks=training_callbacks,
    validation_data=encoded_valid_set
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

Epoch 00008: ReduceLROnPlateau reducing learning rate to 1.249999968422344e-05.
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50

Epoch 00014: ReduceLROnPlateau reducing learning rate to 3.12499992105586e-06.
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50

Epoch 00021: ReduceLROnPlateau reducing learning rate to 7.81249980263965e-07.
Epoch 22/50
Epoch 23/50

Epoch 00023: ReduceLROnPlateau reducing learning rate to 1.9531249506599124e-07.
Epoch 24/50

Epoch 00024: ReduceLROnPlateau reducing learning rate to 1e-07.
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f1a592295d0>

In [30]:
model.save(path.join(DATA_DIR, "protein_bert/manual_training_2_2022_02_02.pkl"))

2022-02-02 19:03:55.781999: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/manual_training_2_2022_02_02.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/manual_training_2_2022_02_02.pkl/assets


In [31]:
encoded_test_set = encode_dataset(test_data["seq"], test_data["Y"], input_encoder, OUTPUT_SPEC, seq_len = seq_len, needs_filtering = True, \
            dataset_name = 'Test set')

[2022_02_02-19:04:06] Test set: Filtered out 0 of 119 (0.0%) records of lengths exceeding 510.


In [32]:
from sklearn.metrics import f1_score, confusion_matrix
test_X, test_Y, test_sample_weigths = encoded_test_set

In [33]:
y_pred = model.predict(test_X, batch_size=32)
y_pred_classes = (y_pred >= 0.5)
f1_score(test_Y, y_pred_classes)

0.5161290322580645

In [34]:
pd.DataFrame(confusion_matrix(test_Y, y_pred_classes, labels=[0, 1]), index=[0, 1], columns=[0, 1])

Unnamed: 0,0,1
0,96,0
1,15,8
