In [None]:

from plants_sm.data_standardization.truncation import Truncator
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.featurization.proteins.bio_embeddings.prot_bert import ProtBert
from plants_sm.featurization.proteins.bio_embeddings.esm import ESMEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

# Preprocessing / featurization steps, listed in the order they are fitted below.
transformers = [
    ProteinStandardizer(),
    Truncator(max_length=884),
    ProtBert(device="cpu"),
]

# Only the first 10 rows of the CSV are read (nrows=10).
dataset = SingleInputDataset.from_csv(
    "data/aminotransferase_binary.csv",
    representation_field="SEQ",
    instances_ids_field="ids",
    nrows=10,
)

# Fit every step on the same dataset object.
for i, transformer in enumerate(transformers):
    transformer.fit(dataset)

In [3]:
# Show the `fitted` flag of the last step. Indexing with -1 selects the same
# object the fit loop ended on, without depending on the loop index still
# being alive in the kernel namespace.
transformers[-1].fitted

True

In [4]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset
from plants_sm.pipeline.pipeline import Pipeline


# Wrap the steps in a pipeline; no models are attached at this point.
pipeline = Pipeline(
    steps=transformers,
    models=[],
)

# Reload the same 10-row sample used when fitting the steps.
dataset = SingleInputDataset.from_csv(
    "data/aminotransferase_binary.csv",
    representation_field="SEQ",
    instances_ids_field="ids",
    nrows=10,
)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/jcapela/miniconda3/envs/deeplants/lib/python3.11/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# The pipeline was built with `models=[]`, so there is no model to predict with.
# Run only the steps over the dataset (as the ESM-1b pipeline cell does) and
# then save the pipeline under "ProtBERT_pipeline".
# NOTE(review): this cell has no recorded execution — confirm it runs end to end.
pipeline.transform(dataset)
pipeline.save("ProtBERT_pipeline")

In [1]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset
from plants_sm.pipeline.pipeline import Pipeline


# 10-row sample of the aminotransferase CSV used for the predictions below.
dataset = SingleInputDataset.from_csv(
    "data/aminotransferase_binary.csv",
    representation_field="SEQ",
    instances_ids_field="ids",
    nrows=10,
)

# Restore the pipeline previously saved under "ProtBERT_pipeline".
pipeline = Pipeline.load("ProtBERT_pipeline")

  from .autonotebook import tqdm as notebook_tqdm
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/jcapela/miniconda3/envs/deeplants/lib/python3.11/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [2]:
import torch
from plants_sm.models.fc.fc import DNN
from plants_sm.models.pytorch_model import PyTorchModel
from torch import nn

# Network for the "ec_number" model.
# Presumably the positional arguments are (input size, hidden layer sizes,
# output size) — TODO confirm against the DNN signature.
ec_number_network = DNN(1024, [2560], 5743, batch_norm=True, last_sigmoid=True)

# map_location="cpu" lets the weights load on machines that lack the device
# they were saved from; load_state_dict then copies them into the network.
ec_number_state = torch.load("ProtBERT_pipeline/prot_bert.pt", map_location="cpu")
ec_number_network.load_state_dict(ec_number_state)

# NOTE(review): the loss is passed as the class `nn.BCELoss`, not an instance —
# confirm this is what PyTorchModel expects.
model = PyTorchModel(model=ec_number_network, loss_function=nn.BCELoss, model_name="ec_number")

In [3]:
# Attach `model` to the loaded pipeline; at this point `model` is expected to be
# the PyTorchModel created with model_name="ec_number".
pipeline.add_models(model)

In [4]:
# Predict on the 10-row dataset without naming a model; the returned array is
# displayed as the cell output.
pipeline.predict(dataset)

ProteinStandardizer: 100%|██████████| 10/10 [00:00<00:00, 177.74it/s]
Truncator: 100%|██████████| 10/10 [00:00<00:00, 567.03it/s]
ProtBert: 100%|██████████| 10/10 [00:07<00:00,  1.38it/s]


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [6]:
from plants_sm.models.lightning_model import InternalLightningModel
from plants_sm.pathway_prediction._fine_tune_ec_number_prediction_model import FineTuneModelECNumber

# Settings for the `enzyme_discrimination` model built on ProtBERT features.
input_dim = 1024
batch_size = 64
additional_layers = [1280, 640]

# Not passed to any call in this cell; they only appear in the commented-out
# constructor kept below for reference.
learning_rate = 0.0035555738943412697
base_layers = [2560]

# module = FineTuneModelECNumber(input_dim=input_dim, additional_layers=additional_layers, classification_neurons=1,
#     path_to_model="ProtBERT_pipeline/prot_bert.pt", learning_rate=learning_rate, base_layers=base_layers,
#     layers_to_freeze=len(base_layers), scheduler=False)

# Restore the module from its checkpoint, passing the constructor arguments explicitly.
module = FineTuneModelECNumber.load_from_checkpoint(
    "prot_bert_enzyme_discrimination/pytorch_model_weights.ckpt",
    input_dim=input_dim,
    additional_layers=additional_layers,
    classification_neurons=1,
    path_to_model="ProtBERT_pipeline/prot_bert.pt",
)

# Wrap the module for use in the pipeline; it is configured for GPU index 2.
model = InternalLightningModel(
    module=module,
    batch_size=batch_size,
    devices=[2],
    accelerator="gpu",
    model_name="enzyme_discrimination",
)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# Attach `model` to the pipeline; at this point `model` is expected to be the
# InternalLightningModel created with model_name="enzyme_discrimination".
pipeline.add_models(model)

In [11]:
# Predict with the model registered under "enzyme_discrimination".
enzymes_non_enzymes = pipeline.predict(dataset, "enzyme_discrimination")

# Flatten to a 1-D array with one entry per row of the prediction output.
n_predictions = enzymes_non_enzymes.shape[0]
enzymes_non_enzymes = enzymes_non_enzymes.reshape((n_predictions,))

# Display only the entries equal to 1.
enzymes_non_enzymes[enzymes_non_enzymes == 1]

ProteinStandardizer: 100%|██████████| 10/10 [00:00<00:00, 418.41it/s]
Truncator: 100%|██████████| 10/10 [00:00<00:00, 568.96it/s]
ProtBert: 100%|██████████| 10/10 [00:06<00:00,  1.44it/s]
/home/jcapela/miniconda3/envs/deeplants/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=63` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 161.92it/s]


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [1]:
from plants_sm.data_structures.dataset.single_input_dataset import SingleInputDataset

from plants_sm.data_standardization.truncation import Truncator
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.featurization.proteins.bio_embeddings.prot_bert import ProtBert
from plants_sm.featurization.proteins.bio_embeddings.esm import ESMEncoder

# Preprocessing / featurization steps, listed in the order they are fitted below.
transformers = [
    ProteinStandardizer(),
    Truncator(max_length=884),
    ESMEncoder(esm_function="esm1b_t33_650M_UR50S", device="cpu"),
]

# Only the first 10 rows of the CSV are read (nrows=10).
dataset = SingleInputDataset.from_csv(
    "data/aminotransferase_binary.csv",
    representation_field="SEQ",
    instances_ids_field="ids",
    nrows=10,
)

# Fit every step on the same dataset object.
for i, transformer in enumerate(transformers):
    transformer.fit(dataset)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from plants_sm.pipeline.pipeline import Pipeline


# Wrap the steps in a pipeline with no models attached.
pipeline = Pipeline(
    steps=transformers,
    models=[],
)

# Run the steps once over the dataset, then save the pipeline under "ESM1b_pipeline_2".
pipeline.transform(dataset)
pipeline.save("ESM1b_pipeline_2")

ProteinStandardizer: 100%|██████████| 10/10 [00:00<00:00, 293.63it/s]
Truncator: 100%|██████████| 10/10 [00:00<00:00, 275.60it/s]
ESM: 100%|██████████| 10/10 [00:09<00:00,  1.03it/s]


In [None]:
import torch
from plants_sm.models.fc.fc import DNN
from plants_sm.models.pytorch_model import PyTorchModel
from torch import nn

# Network for the "ec_number" model on ESM-1b features.
# Presumably the positional arguments are (input size, hidden layer sizes,
# output size) — TODO confirm against the DNN signature.
ec_number_network = DNN(1280, [2560, 5120], 5743, batch_norm=True, last_sigmoid=True)

# map_location="cpu" lets the weights load on machines that lack the device
# they were saved from; load_state_dict then copies them into the network.
ec_number_state = torch.load("ESM1b_pipeline/esm1b.pt", map_location="cpu")
ec_number_network.load_state_dict(ec_number_state)

# NOTE(review): the loss is passed as the class `nn.BCELoss`, not an instance —
# confirm this is what PyTorchModel expects.
model = PyTorchModel(model=ec_number_network, loss_function=nn.BCELoss, model_name="ec_number")
pipeline.add_models(model)

In [3]:
# Predict on the 10-row dataset without naming a model; the returned array is
# displayed as the cell output.
pipeline.predict(dataset)

ProteinStandardizer: 100%|██████████| 10/10 [00:00<00:00, 380.18it/s]
Truncator: 100%|██████████| 10/10 [00:00<00:00, 466.44it/s]
ESM: 100%|██████████| 10/10 [00:09<00:00,  1.06it/s]


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [4]:
from plants_sm.models.lightning_model import InternalLightningModel
from plants_sm.pathway_prediction._fine_tune_ec_number_prediction_model import FineTuneModelECNumber

# Settings for the `enzyme_discrimination` model built on ESM-1b features.
additional_layers = [1280]
base_layers = [2560, 5120]
batch_size = 16
input_dim = 1280

# Not passed to any call below; it only appears in the commented-out
# constructor kept for reference.
learning_rate = 0.0010584969012120454

# module = FineTuneModelECNumber(input_dim=input_dim, additional_layers=additional_layers, classification_neurons=1,
#     path_to_model="ESM1b_pipeline/esm1b.pt", learning_rate=learning_rate, base_layers=base_layers,
#     layers_to_freeze=len(base_layers), scheduler=False)

# Restore the module from its checkpoint, passing the constructor arguments explicitly.
module = FineTuneModelECNumber.load_from_checkpoint(
    "esm1b_enzyme_discrimination/pytorch_model_weights.ckpt",
    input_dim=input_dim,
    additional_layers=additional_layers,
    classification_neurons=1,
    base_layers=base_layers,
    path_to_model="ESM1b_pipeline/esm1b.pt",
)

# Wrap the module for use in the pipeline; it is configured for GPU index 2.
model = InternalLightningModel(
    module=module,
    batch_size=batch_size,
    devices=[2],
    accelerator="gpu",
    model_name="enzyme_discrimination",
)

# Smoke-test the model on the pipeline's transform output before registering it.
# The result gets its own name so `dataset` is not rebound; later
# `pipeline.predict(dataset, ...)` calls run the steps themselves.
# NOTE(review): if `transform` mutates `dataset` in place, the two names refer
# to the same object — confirm.
dataset_transformed = pipeline.transform(dataset)
model.predict(dataset_transformed)

pipeline.add_models(model)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
ProteinStandardizer: 100%|██████████| 10/10 [00:00<00:00, 370.50it/s]
Truncator: 100%|██████████| 10/10 [00:00<00:00, 506.78it/s]
ESM: 100%|██████████| 10/10 [00:09<00:00,  1.06it/s]
/home/jcapela/miniconda3/envs/deeplants/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=63` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 128.62it/s]


In [5]:
# Predict with the model registered under "enzyme_discrimination".
enzymes_non_enzymes = pipeline.predict(dataset, "enzyme_discrimination")

# Flatten to a 1-D array with one entry per row of the prediction output.
n_predictions = enzymes_non_enzymes.shape[0]
enzymes_non_enzymes = enzymes_non_enzymes.reshape((n_predictions,))

# Display only the entries equal to 1.
enzymes_non_enzymes[enzymes_non_enzymes == 1]

ProteinStandardizer: 100%|██████████| 10/10 [00:00<00:00, 403.72it/s]
Truncator: 100%|██████████| 10/10 [00:00<00:00, 497.03it/s]
ESM: 100%|██████████| 10/10 [00:09<00:00,  1.08it/s]


Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 108.81it/s]


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [6]:
# Save the pipeline, now with models attached, under the same name used for the
# earlier model-less save.
# NOTE(review): in the recorded run this call raised
# "AttributeError: Saving a checkpoint is only possible if a model is attached to the Trainer."
# Presumably this comes from saving the Lightning-backed model — TODO confirm
# the cause and fix it before relying on the saved pipeline.
pipeline.save("ESM1b_pipeline_2")

AttributeError: Saving a checkpoint is only possible if a model is attached to the Trainer. Did you call `Trainer.save_checkpoint()` before calling `Trainer.{fit,validate,test,predict}`?