In [1]:
import pandas as pd
from tensorflow import keras
from os import path
import pickle

In [2]:
from proteinbert.finetuning import encode_train_and_valid_sets, encode_dataset
from proteinbert import OutputType, OutputSpec, evaluate_by_len, load_pretrained_model

In [3]:
from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, finetune
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [4]:
import wandb
from wandb.keras import WandbCallback

In [5]:
import os

# Restrict this process to the first GPU; must be set before the GPU device is created.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [6]:
# Root directory (relative to this notebook) under which the datasets and saved models are read and written.
DATA_DIR = "../../data/"

In [7]:
# Description of the prediction task that is handed to every ProteinBERT helper below.
UNIQUE_LABELS = [0, 1]
# NOTE(review): the first positional argument presumably means "not a per-residue output" — confirm against OutputType.
OUTPUT_TYPE = OutputType(False, 'binary')
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

In [8]:
# Load the pretrained ProteinBERT model generator and its input encoder.
# The dump directory is derived from DATA_DIR (like every other path in this notebook)
# so that relocating the data folder only requires changing one constant.
# NOTE(review): the .pkl extension suggests the dump is unpickled — only load files from a trusted source.
pretrained_model_generator, input_encoder = load_pretrained_model(
    path.join(DATA_DIR, "protein_bert/"),
    "epoch_92400_sample_23500000.pkl",
)

In [9]:
# Wrap the pretrained generator in a fine-tuning generator for the binary task.
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function = get_model_with_hidden_layers_as_outputs,
    dropout_rate = 0.5,
)

In [10]:
# Start the Weights & Biases run that the training callbacks log to.
wandb.init(
    project="Heavy",
    entity="kvetab",
)

[34m[1mwandb[0m: Currently logged in as: [33mkvetab[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [11]:
# Read the three deduplicated Chen splits; the first CSV column is used as the index.
train_data, valid_data, test_data = (
    pd.read_csv(path.join(DATA_DIR, f"chen/deduplicated/chen_{split}_data.csv"), index_col=0)
    for split in ("train", "valid", "test")
)
train_data.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
2073,6aod,EVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIVMTKSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...,0
1517,4yny,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,EFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2025,5xcv,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,QFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2070,6and,EVQLVESGGGLVQPGGSLRLSCAASGYEFSRSWMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSIVHSVGNTFLEWYQQKPG...,1
666,2xqy,QVQLQQPGAELVKPGASVKMSCKASGYSFTSYWMNWVKQRPGRGLE...,DIVLTQSPASLALSLGQRATISCRASKSVSTSGYSYMYWYQQKPGQ...,0


In [12]:
# Callbacks shared by every finetune() call in this notebook.
lr_schedule = keras.callbacks.ReduceLROnPlateau(patience = 3, factor = 0.25, min_lr = 1e-07, verbose = 1)
early_stopping = keras.callbacks.EarlyStopping(patience = 6, restore_best_weights = True)
wandb_logger = WandbCallback()

training_callbacks = [lr_schedule, early_stopping, wandb_logger]

In [13]:
# Training hyperparameters: max epochs per stage, mini-batch size, base learning rate.
epoch_num, batch_size, learning_rate = 100, 128, 1e-4

In [14]:
# Record the hyperparameters on the active W&B run.
# Assigning a dict to `wandb.config` only rebinds the module attribute and leaves the
# run's config object untouched, so the values are updated in place instead.
# `allow_val_change=True` keeps the cell re-runnable after a hyperparameter is edited.
wandb.config.update(
    {
        "learning_rate": learning_rate,
        # presumably doubled because finetune() applies max_epochs_per_stage to two stages — confirm
        "epochs": epoch_num * 2,
        "batch_size": batch_size,
    },
    allow_val_change=True,
)

# Separate models
## Heavy

In [15]:
# Fine-tune on the heavy-chain sequences, validating on the heavy chains of the validation split.
finetune(
    model_generator, input_encoder, OUTPUT_SPEC,
    train_data['heavy'], train_data['Y'],
    valid_data['heavy'], valid_data['Y'],
    seq_len = 512,
    batch_size = batch_size,
    max_epochs_per_stage = epoch_num,
    lr = learning_rate,
    begin_with_frozen_pretrained_layers = True,
    lr_with_frozen_pretrained_layers = 1e-02,
    n_final_epochs = 1,
    final_seq_len = 1024,
    final_lr = learning_rate / 10,
    callbacks = training_callbacks,
)

[2022_03_09-16:08:28] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_03_09-16:08:28] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_03_09-16:08:28] Training with frozen pretrained layers...


2022-03-09 16:08:28.487074: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-09 16:08:29.046937: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9656 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:21:00.0, compute capability: 7.5
  "The `lr` argument is deprecated, use `learning_rate` instead.")
2022-03-09 16:08:30.813657: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100


2022-03-09 16:08:38.600077: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 7605




[34m[1mwandb[0m: [32m[41mERROR[0m Can't save model, h5py returned error: Layer GlobalAttention has arguments in `__init__` and therefore must override `get_config`.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

Epoch 00025: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 26/100
Epoch 27/100
Epoch 28/100

Epoch 00028: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 29/100
[2022_03_09-16:09:16] Training the entire fine-tuned model...
[2022_03_09-16:09:26] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to

In [16]:
# Evaluate the fine-tuned heavy-chain model on the held-out test split.
# The table is bound to `conf_matrix` (not `confusion_matrix`) so it cannot clash with
# sklearn's `confusion_matrix` function that the Test section imports and calls.
results, conf_matrix = evaluate_by_len(
    model_generator, input_encoder, OUTPUT_SPEC,
    test_data['heavy'], test_data['Y'],
    start_seq_len = 512, start_batch_size = 32,
)
print('Confusion matrix:')
display(conf_matrix)

# Rows are picked by label and columns by explicit position (.iloc) instead of relying on
# the deprecated positional fallback of integer keys on a Series.
# F1 is symmetric in the two off-diagonal cells, so the row/column orientation does not matter.
row_neg = conf_matrix.loc["0"]
row_pos = conf_matrix.loc["1"]
fn_fp = row_neg.iloc[1] + row_pos.iloc[0]
f1 = row_pos.iloc[1] / (row_pos.iloc[1] + 0.5 * fn_fp)
f1

Confusion matrix:


Unnamed: 0,0,1
0,83,13
1,10,13


0.5306122448979592

In [17]:
# Materialise the fine-tuned heavy-chain model at sequence length 512 and write it to disk.
heavy_model_dir = path.join(DATA_DIR, "protein_bert/2022_03_09__01vh")
mod = model_generator.create_model(seq_len = 512)
mod.save(heavy_model_dir)

  "The `lr` argument is deprecated, use `learning_rate` instead.")
2022-03-09 16:11:51.222381: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_09__01vh/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_09__01vh/assets


## Light

In [18]:
# Build a new fine-tuning generator from the pretrained one for the light-chain model.
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function = get_model_with_hidden_layers_as_outputs,
    dropout_rate = 0.5,
)

# NOTE(review): the light-chain run is still logged to the "Heavy" project — confirm this is intended.
wandb.init(project="Heavy", entity="kvetab")

# Fine-tune on the light-chain sequences with the same schedule as the heavy-chain model.
finetune(
    model_generator, input_encoder, OUTPUT_SPEC,
    train_data['light'], train_data['Y'],
    valid_data['light'], valid_data['Y'],
    seq_len = 512,
    batch_size = batch_size,
    max_epochs_per_stage = epoch_num,
    lr = learning_rate,
    begin_with_frozen_pretrained_layers = True,
    lr_with_frozen_pretrained_layers = 1e-02,
    n_final_epochs = 1,
    final_seq_len = 1024,
    final_lr = learning_rate / 10,
    callbacks = training_callbacks,
)

0,1
epoch,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇██▁▁▁▂▂▂▃▃▃▃▃▁
loss,█▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▂
lr,█████████▃▃▃▃▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▂▂▂▂▂▃▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▂▁▃▂▂▁

0,1
best_epoch,0.0
best_val_loss,0.46027
epoch,0.0
loss,0.4238
lr,1e-05
val_loss,0.46027


[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2022_03_09-16:15:22] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_03_09-16:15:22] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_03_09-16:15:22] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
[2022_03_09-16:15:44] Training the entire fine-tuned model...
[2022_03_09-16:15:52] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

Epoch 00014: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
Epoch 15/100
Epoch 16/100
Epoch 17/100

Epoch 00017: ReduceLROnPlateau reducing learning rate to 1.56249996052793e-06.
[2022_03_09-16:16:47] Training on final epochs of sequence length 1024...
[2022_03_09-16:16:47] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 1022.
[2022_03_09-16:16:48] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 1022.






In [21]:
# Evaluate the fine-tuned light-chain model on the held-out test split.
# The table is bound to `conf_matrix` (not `confusion_matrix`) so it cannot clash with
# sklearn's `confusion_matrix` function that the Test section imports and calls.
results, conf_matrix = evaluate_by_len(
    model_generator, input_encoder, OUTPUT_SPEC,
    test_data['light'], test_data['Y'],
    start_seq_len = 512, start_batch_size = 32,
)
print('Confusion matrix:')
display(conf_matrix)

# Rows are picked by label and columns by explicit position (.iloc) instead of relying on
# the deprecated positional fallback of integer keys on a Series.
# F1 is symmetric in the two off-diagonal cells, so the row/column orientation does not matter.
row_neg = conf_matrix.loc["0"]
row_pos = conf_matrix.loc["1"]
fn_fp = row_neg.iloc[1] + row_pos.iloc[0]
f1 = row_pos.iloc[1] / (row_pos.iloc[1] + 0.5 * fn_fp)
f1

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Confusion matrix:


Unnamed: 0,0,1
0,90,6
1,12,11


0.55

In [20]:
# Materialise the fine-tuned light-chain model at sequence length 512 and write it to disk.
light_model_dir = path.join(DATA_DIR, "protein_bert/2022_03_09__01vl")
mod = model_generator.create_model(seq_len = 512)
mod.save(light_model_dir)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_09__01vl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_09__01vl/assets


# Over-sampling

In [26]:
# Randomly over-sample with a fixed seed; the label column "Y" is the resampling target.
# NOTE(review): the validation split is resampled as well, so validation loss is measured
# on a class distribution that differs from the test split — confirm this is intended.
sampler = RandomOverSampler(random_state=42)

train_features = train_data.drop(columns="Y")
x_train, y_train = sampler.fit_resample(train_features, train_data['Y'])

valid_features = valid_data.drop(columns="Y")
x_valid, y_valid = sampler.fit_resample(valid_features, valid_data['Y'])

## Heavy

In [29]:
# Build a new fine-tuning generator from the pretrained one for the over-sampled heavy-chain model.
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function = get_model_with_hidden_layers_as_outputs,
    dropout_rate = 0.5,
)

# Fine-tune on the over-sampled heavy-chain sequences.
finetune(
    model_generator, input_encoder, OUTPUT_SPEC,
    x_train['heavy'], y_train,
    x_valid['heavy'], y_valid,
    seq_len = 512,
    batch_size = batch_size,
    max_epochs_per_stage = epoch_num,
    lr = learning_rate,
    begin_with_frozen_pretrained_layers = True,
    lr_with_frozen_pretrained_layers = 1e-02,
    n_final_epochs = 1,
    final_seq_len = 1024,
    final_lr = learning_rate / 10,
    callbacks = training_callbacks,
)

[2022_03_09-16:30:35] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 510.
[2022_03_09-16:30:35] Validation set: Filtered out 0 of 188 (0.0%) records of lengths exceeding 510.
[2022_03_09-16:30:35] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 16/100
Epoch 17/100
Epoch 18/100

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
[2022_03_09-16:31:20] Training the entire fine-tuned model...
[2022_03_09-16:31:28] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_09-16:32:12] Training on final epochs of sequence length 1024...
[2022_03_09-16:32:12] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 1022.
[2022_03_09-16:32:15] Validation set: Filtered out 0 of 188 (0.0%) records of lengths exceeding 1022.






In [31]:
# Evaluate the heavy-chain model trained on over-sampled data against the original test split.
# The table is bound to `conf_matrix` (not `confusion_matrix`) so it cannot clash with
# sklearn's `confusion_matrix` function that the Test section imports and calls.
results, conf_matrix = evaluate_by_len(
    model_generator, input_encoder, OUTPUT_SPEC,
    test_data['heavy'], test_data['Y'],
    start_seq_len = 512, start_batch_size = 32,
)
print('Confusion matrix:')
display(conf_matrix)

# Rows are picked by label and columns by explicit position (.iloc) instead of relying on
# the deprecated positional fallback of integer keys on a Series.
# F1 is symmetric in the two off-diagonal cells, so the row/column orientation does not matter.
row_neg = conf_matrix.loc["0"]
row_pos = conf_matrix.loc["1"]
fn_fp = row_neg.iloc[1] + row_pos.iloc[0]
f1 = row_pos.iloc[1] / (row_pos.iloc[1] + 0.5 * fn_fp)
f1

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Confusion matrix:


Unnamed: 0,0,1
0,51,45
1,4,19


0.4367816091954023

In [32]:
# Materialise the over-sampled heavy-chain model at sequence length 512 and write it to disk.
heavy_model_dir = path.join(DATA_DIR, "protein_bert/2022_03_09__02vh")
mod = model_generator.create_model(seq_len = 512)
mod.save(heavy_model_dir)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_09__02vh/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_09__02vh/assets


## Light

In [33]:
# Build a new fine-tuning generator from the pretrained one for the over-sampled light-chain model.
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function = get_model_with_hidden_layers_as_outputs,
    dropout_rate = 0.5,
)

# Fine-tune on the over-sampled light-chain sequences.
finetune(
    model_generator, input_encoder, OUTPUT_SPEC,
    x_train['light'], y_train,
    x_valid['light'], y_valid,
    seq_len = 512,
    batch_size = batch_size,
    max_epochs_per_stage = epoch_num,
    lr = learning_rate,
    begin_with_frozen_pretrained_layers = True,
    lr_with_frozen_pretrained_layers = 1e-02,
    n_final_epochs = 1,
    final_seq_len = 1024,
    final_lr = learning_rate / 10,
    callbacks = training_callbacks,
)

[2022_03_09-16:34:47] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 510.
[2022_03_09-16:34:47] Validation set: Filtered out 0 of 188 (0.0%) records of lengths exceeding 510.
[2022_03_09-16:34:47] Training with frozen pretrained layers...


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
[2022_03_09-16:35:20] Training the entire fine-tuned model...
[2022_03_09-16:35:45] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100

Epoch 00004: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 6.24999984211172e-06.
[2022_03_09-16:36:24] Training on final epochs of sequence length 1024...
[2022_03_09-16:36:24] Training set: Filtered out 0 of 2114 (0.0%) records of lengths exceeding 1022.
[2022_03_09-16:36:32] Validation set: Filtered out 0 of 188 (0.0%) records of lengths exceeding 1022.






In [34]:
# Evaluate the light-chain model trained on over-sampled data against the original test split.
# The table is bound to `conf_matrix` (not `confusion_matrix`) so it cannot clash with
# sklearn's `confusion_matrix` function that the Test section imports and calls.
results, conf_matrix = evaluate_by_len(
    model_generator, input_encoder, OUTPUT_SPEC,
    test_data['light'], test_data['Y'],
    start_seq_len = 512, start_batch_size = 32,
)
print('Confusion matrix:')
display(conf_matrix)

# Rows are picked by label and columns by explicit position (.iloc) instead of relying on
# the deprecated positional fallback of integer keys on a Series.
# F1 is symmetric in the two off-diagonal cells, so the row/column orientation does not matter.
row_neg = conf_matrix.loc["0"]
row_pos = conf_matrix.loc["1"]
fn_fp = row_neg.iloc[1] + row_pos.iloc[0]
f1 = row_pos.iloc[1] / (row_pos.iloc[1] + 0.5 * fn_fp)
f1

Confusion matrix:


Unnamed: 0,0,1
0,69,27
1,3,20


0.5714285714285714

In [35]:
# Materialise the over-sampled light-chain model at sequence length 512 and write it to disk.
light_model_dir = path.join(DATA_DIR, "protein_bert/2022_03_09__02vl")
mod = model_generator.create_model(seq_len = 512)
mod.save(light_model_dir)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_09__02vl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/2022_03_09__02vl/assets


# Test

In [24]:
# Locations of the "02" heavy- and light-chain models saved earlier in this notebook.
model_path_vh = path.join(DATA_DIR, "protein_bert/2022_03_09__02vh")
model_path_vl = path.join(DATA_DIR, "protein_bert/2022_03_09__02vl")

# Reload both models from disk so the ensemble below does not depend on in-memory training state.
model_vh = keras.models.load_model(model_path_vh)
model_vl = keras.models.load_model(model_path_vl)

In [12]:
# Encode the heavy and light chains of the test split at a common sequence length.
seq_len = 512
encoded_heavy_set = encode_dataset(
    test_data["heavy"], test_data["Y"], input_encoder, OUTPUT_SPEC,
    seq_len = seq_len, needs_filtering = True, dataset_name = 'Heavy set',
)
encoded_light_set = encode_dataset(
    test_data["light"], test_data["Y"], input_encoder, OUTPUT_SPEC,
    seq_len = seq_len, needs_filtering = True, dataset_name = 'Light set',
)

[2022_03_10-10:38:55] Heavy set: Filtered out 0 of 119 (0.0%) records of lengths exceeding 510.
[2022_03_10-10:38:55] Light set: Filtered out 0 of 119 (0.0%) records of lengths exceeding 510.


In [13]:
from sklearn.metrics import (
    f1_score,
    confusion_matrix,
    accuracy_score,
)

# Each encoded set unpacks into three parts: model inputs, labels and sample weights.
(test_X_h, test_Y_h, test_sample_weigths_h) = encoded_heavy_set
(test_X_l, test_Y_l, test_sample_weigths_l) = encoded_light_set

In [25]:
# Score the test split with both chain-specific models.
y_pred_h = model_vh.predict(test_X_h, batch_size=32)
y_pred_l = model_vl.predict(test_X_l, batch_size=32)

# Ensemble rule: predict the positive class when the two scores sum to at least 1.0.
combined_score = y_pred_h + y_pred_l
y_pred_classes = (combined_score >= 1.0)
f1_score(y_true=test_Y_h, y_pred=y_pred_classes)

0.547945205479452

In [15]:
# Confusion matrix of the ensemble predictions against the test labels (sklearn function).
confusion_matrix(y_true=test_Y_h, y_pred=y_pred_classes)

array([[92,  4],
       [10, 13]])

In [16]:
# Load the TAP dataset and encode its heavy and light chains at the same sequence
# length (`seq_len`) that was used for the test split.
tap_data = pd.read_csv(path.join(DATA_DIR, "tap/TAP_data.csv"))

# Distinct dataset names keep the two filtering log lines distinguishable.
encoded_tap_heavy = encode_dataset(
    tap_data["heavy"], tap_data["Y"], input_encoder, OUTPUT_SPEC,
    seq_len = seq_len, needs_filtering = True, dataset_name = 'TAP heavy set',
)
encoded_tap_light = encode_dataset(
    tap_data["light"], tap_data["Y"], input_encoder, OUTPUT_SPEC,
    seq_len = seq_len, needs_filtering = True, dataset_name = 'TAP light set',
)

# Separate names per chain (mirroring test_sample_weigths_h / _l) so the heavy-chain
# sample weights are no longer overwritten by the light-chain ones.
tap_X_h, tap_Y_h, tap_sample_weigths_h = encoded_tap_heavy
tap_X_l, tap_Y_l, tap_sample_weigths_l = encoded_tap_light

[2022_03_10-10:39:20] TAP set: Filtered out 0 of 241 (0.0%) records of lengths exceeding 510.
[2022_03_10-10:39:20] TAP set: Filtered out 0 of 241 (0.0%) records of lengths exceeding 510.


In [26]:
# Score the TAP set with both chain-specific models.
y_pred_h = model_vh.predict(tap_X_h, batch_size=32)
y_pred_l = model_vl.predict(tap_X_l, batch_size=32)

# Same ensemble rule as for the test split: positive when the two scores sum to at least 1.0.
combined_score = y_pred_h + y_pred_l
y_pred_classes = (combined_score >= 1.0)
f1_score(y_true=tap_Y_h, y_pred=y_pred_classes)

0.8020050125313284

In [27]:
# Accuracy of the ensemble predictions on the TAP set.
accuracy_score(y_true=tap_Y_h, y_pred=y_pred_classes)

0.6721991701244814