In [1]:
import pandas as pd
from tensorflow import keras

from sklearn.model_selection import train_test_split

from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs
from os import path
import pickle

In [16]:
# Experiment tracking with Weights & Biases.
# NOTE(review): these imports live outside the notebook's first import cell, and
# WandbCallback is only referenced in a commented-out entry of training_callbacks.
import wandb
from wandb.keras import WandbCallback
# Start a W&B run in the given project under the given entity.
wandb.init(project="my-test-project", entity="kvetab")

In [2]:
# Root folder for the input CSVs and the saved model artifacts used in this
# notebook; relative path, so it depends on the kernel's working directory.
DATA_DIR = "../../data/"

In [3]:
# Output specification shared by the fine-tuning generator, finetune() and
# evaluate_by_len().
# NOTE(review): the first positional argument (False) is presumably the
# per-residue/sequence-output flag — confirm against proteinbert.OutputType.
OUTPUT_TYPE = OutputType(False, 'binary')
# The two class labels of the binary target (column "Y" of the data frames).
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)

In [4]:
# Load the pretrained ProteinBERT dump and its matching input encoder from the
# protein_bert/ folder. The folder is built from DATA_DIR (it resolves to the
# same "../../data/protein_bert/" location) so that moving the data root only
# requires changing DATA_DIR.
pretrained_model_generator, input_encoder = load_pretrained_model(
    path.join(DATA_DIR, "protein_bert/"),
    "epoch_92400_sample_23500000.pkl",
)

In [5]:
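# Disabled alternative: start from the checkpoint that a later cell pickles to
# protein_bert/checkpoint_2022_01_19.pkl instead of the original pretrained dump.
#pretrained_model_generator, input_encoder = load_pretrained_model("../../data/protein_bert/", "checkpoint_2022_01_19.pkl")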

In [5]:
# Wrap the pretrained generator in a fine-tuning generator for the binary
# output spec, using get_model_with_hidden_layers_as_outputs as the
# pretraining-model manipulation function and a dropout rate of 0.5.
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
    dropout_rate=0.5,
)

In [6]:
# Multiply the learning rate by 0.25 after one epoch without improvement
# (never going below 1e-05) and report every reduction.
lr_scheduler = keras.callbacks.ReduceLROnPlateau(
    patience=1,
    factor=0.25,
    min_lr=1e-05,
    verbose=1,
)

# Stop after two epochs without improvement and restore the best weights seen.
early_stopper = keras.callbacks.EarlyStopping(
    patience=2,
    restore_best_weights=True,
)

# WandbCallback() stays disabled, exactly as in the original list.
training_callbacks = [lr_scheduler, early_stopper]

In [7]:
# Read the deduplicated Chen train / validation / test splits; the first CSV
# column is used as the index of each frame.
train_data, valid_data, test_data = (
    pd.read_csv(
        path.join(DATA_DIR, f"chen/deduplicated/chen_{split}_data.csv"),
        index_col=0,
    )
    for split in ("train", "valid", "test")
)
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,Antibody_ID,heavy,light,Y
2073,6aod,EVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIVMTKSPSSLSASVGDRVTITCRASQGIRNDLGWYQQKPGKAPKR...,0
1517,4yny,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,EFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2025,5xcv,EVQLVESGGGLVQPGRSLKLSCAASGFTFSNYGMAWVRQTPTKGLE...,QFVLTQPNSVSTNLGSTVKLSCKRSTGNIGSNYVNWYQQHEGRSPT...,1
2070,6and,EVQLVESGGGLVQPGGSLRLSCAASGYEFSRSWMNWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRSSQSIVHSVGNTFLEWYQQKPG...,1
666,2xqy,QVQLQQPGAELVKPGASVKMSCKASGYSFTSYWMNWVKQRPGRGLE...,DIVLTQSPASLALSLGQRATISCRASKSVSTSGYSYMYWYQQKPGQ...,0


In [8]:
# Build the model input for every split by appending the light-chain sequence
# directly to the heavy-chain sequence (no separator) in a new "seq" column.
for split_frame in (train_data, valid_data, test_data):
    split_frame["seq"] = split_frame["heavy"] + split_frame["light"]

In [10]:
# Record the run's hyperparameters on the active W&B run.
# wandb.config.update() adds the values to the run's config object; a plain
# `wandb.config = {...}` assignment only rebinds the module attribute to a
# dict and leaves the run's config untouched.
# NOTE(review): "epochs" (80) and "batch_size" (64) differ from the values
# passed to finetune() in this notebook (max_epochs_per_stage = 40,
# batch_size = 32) — confirm which values should be logged.
wandb.config.update({
    "learning_rate": 1e-04,
    "epochs": 80,
    "batch_size": 64,
})

In [9]:
# Fine-tune on the concatenated heavy+light sequences against the binary
# label "Y", validating on the validation split. Training begins with the
# pretrained layers frozen (its own learning rate of 1e-02), continues with
# lr = 1e-04, and ends with one final epoch at sequence length 1024 and
# learning rate 1e-05. The shared callbacks are passed through.
finetune(
    model_generator,
    input_encoder,
    OUTPUT_SPEC,
    train_data['seq'],
    train_data['Y'],
    valid_data['seq'],
    valid_data['Y'],
    seq_len=512,
    batch_size=32,
    max_epochs_per_stage=40,
    lr=1e-04,
    begin_with_frozen_pretrained_layers=True,
    lr_with_frozen_pretrained_layers=1e-02,
    n_final_epochs=1,
    final_seq_len=1024,
    final_lr=1e-05,
    callbacks=training_callbacks,
)

[2022_01_25-11:35:43] Training set: Filtered out 0 of 1338 (0.0%) records of lengths exceeding 510.
[2022_01_25-11:35:43] Validation set: Filtered out 0 of 120 (0.0%) records of lengths exceeding 510.
[2022_01_25-11:35:43] Training with frozen pretrained layers...


2022-01-25 11:35:43.604370: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
  super(Adam, self).__init__(name, **kwargs)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 4/40
Epoch 5/40
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00015624999650754035.
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 00012: ReduceLROnPlateau reducing learning rate to 3.9062499126885086e-05.
Epoch 13/40
Epoch 00013: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
[2022_01_25-12:07:22] Training the entire fine-tuned model...
[2022_01_25-12:07:32] Incompatible number of optimizer weights - will not initialize them.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 00003: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 00006: ReduceLROnP

In [10]:
# Evaluate the fine-tuned generator on the held-out test split, starting at
# sequence length 512 with batch size 32; yields a results table and a
# confusion matrix.
results, confusion_matrix = evaluate_by_len(
    model_generator,
    input_encoder,
    OUTPUT_SPEC,
    test_data['seq'],
    test_data['Y'],
    start_seq_len=512,
    start_batch_size=32,
)

In [None]:
# Variant for a loaded fine-tuned model: evaluates pretrained_model_generator
# directly instead of model_generator.
# NOTE(review): presumably meant for the case where pretrained_model_generator
# was loaded from a fine-tuned checkpoint — confirm. Running this cell
# overwrites `results` and `confusion_matrix` from the previous evaluation.
results, confusion_matrix = evaluate_by_len(
    pretrained_model_generator,
    input_encoder,
    OUTPUT_SPEC,
    test_data['seq'],
    test_data['Y'],
    start_seq_len=512,
    start_batch_size=32,
)

In [11]:
# Show each evaluation table under its caption, in the original order.
for caption, table in (
    ('Test-set performance:', results),
    ('Confusion matrix:', confusion_matrix),
):
    print(caption)
    display(table)

Test-set performance:


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
512,119,0.936821
All,119,0.936821


Confusion matrix:


Unnamed: 0,0,1
0,85,11
1,8,15


In [32]:
# Sanity check: displays True when the underlying pretraining generator holds
# no optimizer weights.
model_generator.pretraining_model_generator.optimizer_weights is None

True

In [33]:
# Collect the pieces that make up a checkpoint: the annotation count of the
# pretraining generator plus the fine-tuning generator's model and optimizer
# weights.
# NOTE(review): the preceding check inspects
# pretraining_model_generator.optimizer_weights, whereas this stores
# model_generator.optimizer_weights — confirm that is the intended attribute.
to_save = dict(
    n_annotations=model_generator.pretraining_model_generator.n_annotations,
    model_weights=model_generator.model_weights,
    optimizer_weights=model_generator.optimizer_weights,
)

In [36]:
# Pickle the checkpoint dictionary into the protein_bert/ folder under
# DATA_DIR; an existing file with the same name is overwritten ('wb').
checkpoint_path = path.join(DATA_DIR, "protein_bert/checkpoint_2022_01_19.pkl")
with open(checkpoint_path, 'wb') as checkpoint_file:
    pickle.dump(to_save, checkpoint_file)

In [12]:
# F1 score for the positive class (label 1), computed by hand from the
# confusion matrix displayed above:
#     F1 = TP / (TP + 0.5 * (FP + FN))
# TP is the (1, 1) cell; the two off-diagonal cells are the errors. F1 is
# symmetric in FP and FN, so the matrix orientation does not affect the result
# (rows are presumably the true labels — confirm).
true_positives = 15
false_positives = 11
false_negatives = 8
f1 = true_positives / (true_positives + 0.5 * (false_positives + false_negatives))
f1

0.6122448979591837

In [16]:
# Build a concrete model for input sequence length 512 from the fine-tuning
# generator; the next cell saves it to disk.
mod = model_generator.create_model(seq_len = 512)

In [17]:
# Save the model built above into the protein_bert/ folder under DATA_DIR.
# NOTE(review): despite the ".pkl" suffix this is not a pickle file — the
# logged output reports assets written to "<path>/assets", i.e. the target is
# created as a directory. Consider a clearer name once the path can be changed
# without breaking whatever loads it.
mod.save(path.join(DATA_DIR, "protein_bert/batch_32_lr_1e-4_2022_01_25.pkl"))


2022-01-25 15:36:30.188640: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../../data/protein_bert/batch_32_lr_1e-4_2022_01_25.pkl/assets


INFO:tensorflow:Assets written to: ../../data/protein_bert/batch_32_lr_1e-4_2022_01_25.pkl/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


In [None]:
def fine_tune():
    model_generator = FinetuningModelGenerator(pretrained_model_generator, OUTPUT_SPEC, pretraining_model_manipulation_function = \
        get_model_with_hidden_layers_as_outputs, dropout_rate = 0.5)