# Package Example

## Imports

In [1]:
from plants_sm.data_structures.dataset.multi_input_dataset import MultiInputDataset
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.data_standardization.compounds.deepmol_standardizers import DeepMolStandardizer
from plants_sm.featurization.compounds.deepmol_descriptors import DeepMolDescriptors
from plants_sm.featurization.proteins.propythia.propythia import PropythiaWrapper
from torch import nn
from sklearn.metrics import f1_score
from torch.optim import Adam
from plants_sm.models.constants import BINARY
from plants_sm.models.pytorch_model import PyTorchModel
from plants_sm.models.interaction.baseline_model import BaselineModel
from plants_sm.tokenisation.compounds.smilespe import AtomLevelTokenizer
import pandas as pd


# Turn off RDKit log output for every logger matching 'rdApp.*' so that
# compound processing further down does not flood the cell outputs.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

[14:39:17] Initializing Normalizer
2024-06-19 14:39:19.505085: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-19 14:39:19.508822: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-19 14:39:19.602059: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-19 14:39:19.602401: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-19 14:39:19.602550: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register

In [2]:
# Path of the interaction table (one row per protein/substrate pair).
multi_input_dataset_csv = "./data/aminotransferase_binary.csv"
# Show the first rows of the raw CSV to see the available columns
# (ids, SEQ, SUBSTRATES, LogSpActivity) before building the dataset.
pd.read_csv(multi_input_dataset_csv).head()

Unnamed: 0,ids,SEQ,SUBSTRATES,LogSpActivity
0,0,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,C(C(=O)O)N,0
1,1,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,C[C@@H](C(=O)O)N,1
2,2,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,CC(C)[C@@H](C(=O)O)N,0
3,3,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,CC(C)C[C@@H](C(=O)O)N,1
4,4,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,CC[C@H](C)[C@@H](C(=O)O)N,0


## Read datasets

In [2]:
multi_input_dataset_csv = "./data/aminotransferase_binary.csv"

# Which CSV column holds the raw representation of each instance type.
representation_columns = {"proteins": "SEQ", "ligands": "SUBSTRATES"}
# Which CSV column identifies each protein/ligand interaction row.
interaction_id_columns = {"interaction": "ids"}

# Build the multi-input dataset; "LogSpActivity" is the label column.
multi_input_dataset = MultiInputDataset.from_csv(
    file_path=multi_input_dataset_csv,
    representation_field=representation_columns,
    instances_ids_field=interaction_id_columns,
    labels_field="LogSpActivity",
)

The `MultiInputDataset` stores each instance type (proteins, ligands) in a dictionary keyed by identifier, so that every unique sequence or compound is processed only once instead of once per interaction.

In [6]:
# Unique protein sequences, as a dict of identifier -> sequence string.
multi_input_dataset.instances["proteins"]

{8: 'MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVLTSVKKAEQYLLENETTKNYLGIDGIPEFGRCTQELLFGKGSALINDKRARTAQTPGGTGALRVAADFLAKNTSVKRVWVSNPSWPNHKSVFNSAGLEVREYAYYDAENHTLDFDALINSLNEAQAGDVVLFHGCCHNPTGIDPTLEQWQTLAQLSVEKGWLPLFDFAYQGFARGLEEDAEGLRAFAAMHKELIVASSYSKNFGLYNERVGACTLVAADSETVDRAFSQMKAAIRANYSNPPAHGASVVATILSNDALRAIWEQELTDMRQRIQRMRQLFVNTLQEKGANRDFSFIIKQNGMFSFSGLTKEQVLRLREEFGVYAVASGRVNVAGMTPDNMAPLCEAIVAVL',
 5: 'MDYVTLASHAVRQYAPDQIFTASQRAKADAAALGEDAVINATLGECLDDDGKLMVLPTVERMIRQMPVEDICSYAPIAGIKGFNEAVQISLFGKCLDRFYVESVATPGGCGALRHAIWNFLNFGDALLTTNWIWGPYKNICEEHGRRMVTFDMFNRENTFNLEGMDRAIGEILAVQEQLLMILNTPANNPTGYSMTKQEMEQTVAILKKHAAANPDKNLTFCLDVSYIDFAGSFEESREIFDAIFDMPANTMTLLIFSMSKSYTMCGMRCGALVCLGSTAESAAVFKQAMSYSSRSTWSNAIHMAQKILVDINLNPEIRERVSQERAVFRNTITNRGRTFCAAAKEASLEICPYQYGYFVAIPCKNPVETARILMDQHIYVVPQAQGLRFSPCTVTTEKCRKAPAFIKAAMEQTQ',
 10: 'MFQKVDAYAGDPILTLMERFKEDPRSDKVNLSIGLYYNEDGIIPQLQAVAEAEARLNAQPHGASLYLPMEGLNCYRHAIAPLLFGADHPVLKQQRVATIQTLGGSGALKVGADFLKRYFPESGVWVSDPTWENHVAIFAGAGFEVSTYPWYDEATNGVRFNDLLATL

In [7]:
# Unique ligands, as a dict of identifier -> SMILES string.
multi_input_dataset.instances["ligands"]

{1: 'C(C(=O)O)N',
 17: 'C[C@@H](C(=O)O)N',
 14: 'CC(C)[C@@H](C(=O)O)N',
 13: 'CC(C)C[C@@H](C(=O)O)N',
 15: 'CC[C@H](C)[C@@H](C(=O)O)N',
 11: 'C1=CC=C(C=C1)C[C@@H](C(=O)O)N',
 10: 'C1=CC(=CC=C1C[C@@H](C(=O)O)N)O',
 12: 'C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N',
 7: 'C([C@@H](C(=O)O)N)O',
 18: 'C[C@H]([C@@H](C(=O)O)N)O',
 8: 'C([C@@H](C(=O)O)N)S',
 16: 'CSCC[C@@H](C(=O)O)N',
 5: 'C([C@@H](C(=O)O)N)C(=O)N',
 2: 'C(CC(=O)N)[C@@H](C(=O)O)N',
 6: 'C([C@@H](C(=O)O)N)C(=O)O',
 3: 'C(CCN)C[C@@H](C(=O)O)N',
 4: 'C(C[C@@H](C(=O)O)N)CN=C(N)N',
 9: 'C1=C(NC=N1)C[C@@H](C(=O)O)N'}

## Standardize sequences

In [3]:
# Standardize the protein sequences (50 parallel jobs) and keep the
# transformed dataset under the same name for the following cells.
protein_standardizer = ProteinStandardizer(n_jobs=50)
multi_input_dataset = protein_standardizer.fit_transform(
    multi_input_dataset,
    instance_type="proteins",
)

ProteinStandardizer:   0%|          | 0/25 [00:00<?, ?it/s]

ProteinStandardizer: 100%|██████████| 25/25 [00:00<00:00, 79.67it/s]


In [9]:
# Protein sequences again, this time after the standardization step.
multi_input_dataset.instances["proteins"]

{8: 'MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVLTSVKKAEQYLLENETTKNYLGIDGIPEFGRCTQELLFGKGSALINDKRARTAQTPGGTGALRVAADFLAKNTSVKRVWVSNPSWPNHKSVFNSAGLEVREYAYYDAENHTLDFDALINSLNEAQAGDVVLFHGCCHNPTGIDPTLEQWQTLAQLSVEKGWLPLFDFAYQGFARGLEEDAEGLRAFAAMHKELIVASSYSKNFGLYNERVGACTLVAADSETVDRAFSQMKAAIRANYSNPPAHGASVVATILSNDALRAIWEQELTDMRQRIQRMRQLFVNTLQEKGANRDFSFIIKQNGMFSFSGLTKEQVLRLREEFGVYAVASGRVNVAGMTPDNMAPLCEAIVAVL',
 5: 'MDYVTLASHAVRQYAPDQIFTASQRAKADAAALGEDAVINATLGECLDDDGKLMVLPTVERMIRQMPVEDICSYAPIAGIKGFNEAVQISLFGKCLDRFYVESVATPGGCGALRHAIWNFLNFGDALLTTNWIWGPYKNICEEHGRRMVTFDMFNRENTFNLEGMDRAIGEILAVQEQLLMILNTPANNPTGYSMTKQEMEQTVAILKKHAAANPDKNLTFCLDVSYIDFAGSFEESREIFDAIFDMPANTMTLLIFSMSKSYTMCGMRCGALVCLGSTAESAAVFKQAMSYSSRSTWSNAIHMAQKILVDINLNPEIRERVSQERAVFRNTITNRGRTFCAAAKEASLEICPYQYGYFVAIPCKNPVETARILMDQHIYVVPQAQGLRFSPCTVTTEKCRKAPAFIKAAMEQTQ',
 10: 'MFQKVDAYAGDPILTLMERFKEDPRSDKVNLSIGLYYNEDGIIPQLQAVAEAEARLNAQPHGASLYLPMEGLNCYRHAIAPLLFGADHPVLKQQRVATIQTLGGSGALKVGADFLKRYFPESGVWVSDPTWENHVAIFAGAGFEVSTYPWYDEATNGVRFNDLLATL

### Padding

In [5]:
from plants_sm.data_standardization.proteins.padding import SequencePadder
from plants_sm.data_standardization.compounds.padding import SMILESPadder

# Pad the protein sequences; the inspected example in the next cell shows
# trailing '-' characters appended to the sequence.
sequence_padder = SequencePadder(n_jobs=50)
multi_input_dataset = sequence_padder.fit_transform(
    multi_input_dataset,
    instance_type="proteins",
)

Featurizing: 100%|██████████| 25/25 [00:00<00:00, 4528.31it/s]


In [8]:
# Inspect the protein with identifier 1 after padding; the run of '-' at the
# end of the displayed string is presumably the padding symbol (TODO confirm).
multi_input_dataset.instances["proteins"][1]

'MADTRPERRFTRIDRLPPYVFNITAELKMAARRRGEDIIDFSMGNPDGATPPHIVEKLCTVAQRPDTHGYSTSRGIPRLRRAISRWYQDRYDVEIDPESEAIVTIGSKEGLAHLMLATLDHGDTVLVPNPSYPIHIYGAVIAGAQVRSVPLVEGVDFFNELERAIRESYPKPKMMILGFPSNPTAQCVELEFFEKVVALAKRYDVLVVHDLAYADIVYDGWKAPSIMQVPGARDVAVEFFTLSKSYNMAGWRIGFMVGNKTLVSALARIKSYHDYGTFTPLQVAAIAALEGDQQCVRDIAEQYKRRRDVLVKGLHEAGWMVEMPKASMYVWAKIPEPYAAMGSLEFAKKLLNEAKVCVSPGIGFGDYGDTHVRFALIENRDRIRQAIRGIKAMFRADGLLPASSKHIHENAE----------------------------------------------------------------'

In [6]:
# Standardize the ligand SMILES with the DeepMol-based standardizer.
ligand_standardizer = DeepMolStandardizer(n_jobs=50)
multi_input_dataset = ligand_standardizer.fit_transform(multi_input_dataset, "ligands")

Featurizing: 100%|██████████| 18/18 [00:00<00:00, 1528.75it/s]


In [7]:
# Pad the ligand SMILES strings. The result is bound back to
# `multi_input_dataset`, like every other transformer cell in this notebook,
# so the cell does not rely on in-place mutation and does not echo the
# dataset object's repr as its output.
multi_input_dataset = SMILESPadder().fit_transform(multi_input_dataset, "ligands")

Featurizing: 100%|██████████| 18/18 [00:00<00:00, 32711.21it/s]


<plants_sm.data_structures.dataset.multi_input_dataset.MultiInputDataset at 0x7f7a2e4d9fa0>

In [11]:
# Ligands after standardization and padding: the displayed SMILES carry
# explicit hydrogens and a trailing run of 'G' characters (presumably the
# padding symbol — TODO confirm).
multi_input_dataset.instances["ligands"]

{1: '[H]OC(=O)C([H])([H])N([H])[H]GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG',
 17: '[H]OC(=O)C([H])(N([H])[H])C([H])([H])[H]GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG',
 14: '[H]OC(=O)C([H])(N([H])[H])C([H])(C([H])([H])[H])C([H])([H])[H]GGGGGGGGGGGGGGGGGGGG',
 13: '[H]OC(=O)C([H])(N([H])[H])C([H])([H])C([H])(C([H])([H])[H])C([H])([H])[H]GGGGGGGGG',
 15: '[H]OC(=O)C([H])(N([H])[H])C([H])(C([H])([H])[H])C([H])([H])C([H])([H])[H]GGGGGGGGG',
 11: '[H]OC(=O)C([H])(N([H])[H])C([H])([H])c1c([H])c([H])c([H])c([H])c1[H]GGGGGGGGGGGGGG',
 10: '[H]OC(=O)C([H])(N([H])[H])C([H])([H])c1c([H])c([H])c(O[H])c([H])c1[H]GGGGGGGGGGGGG',
 12: '[H]OC(=O)C([H])(N([H])[H])C([H])([H])c1c([H])n([H])c2c([H])c([H])c([H])c([H])c12GG',
 7: '[H]OC(=O)C([H])(N([H])[H])C([H])([H])O[H]GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG',
 18: '[H]OC(=O)C([H])(N([H])[H])C([H])(O[H])C([H])([H])[H]GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG',
 8: '[H]OC(=O)C([H])(N([H])[H])C([H])([H])S[H]GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG',
 1

## Generating features and encodings

### Propythia protein descriptors

In [4]:
# Fit the protein featurizer on the dataset, then apply it to generate the
# protein features.
# NOTE(review): despite its name, `proteins_word2vec` holds a PropythiaWrapper,
# not a Word2Vec model; the name is kept in case other cells refer to it.
proteins_word2vec = PropythiaWrapper(n_jobs=30).fit(multi_input_dataset, "proteins")
multi_input_dataset = proteins_word2vec.transform(multi_input_dataset, "proteins")

PropythiaWrapper: 100%|██████████| 25/25 [00:02<00:00,  9.54it/s]


### DeepMol descriptors

In [5]:
# Fit the DeepMol descriptor generator on the ligands (8 parallel jobs), then
# apply it to generate the ligand features.
deepmol_wrapper = DeepMolDescriptors(n_jobs=8).fit(multi_input_dataset, instance_type="ligands")
multi_input_dataset = deepmol_wrapper.transform(multi_input_dataset, instance_type="ligands")

DeepMolDescriptors: 100%|██████████| 18/18 [00:01<00:00, 17.85it/s]


In [9]:
# Interaction table: each row links a protein identifier and a ligand
# identifier to its label (LogSpActivity).
multi_input_dataset.dataframe

Unnamed: 0,ids,LogSpActivity,proteins_identifiers,ligands_identifiers
0,0,0,8,1
1,1,1,8,17
2,2,0,8,14
3,3,1,8,13
4,4,0,8,15
...,...,...,...,...
445,445,1,2,2
446,446,1,2,6
447,447,0,2,3
448,448,0,2,4


In [12]:
# Feature matrix generated for the ligands.
# NOTE(review): the saved output of this cell is
# "ValueError: Features for proteins are not defined", which suggests it was
# last executed before the protein features existed. Re-run it after both
# featurization cells above — TODO confirm it then succeeds.
multi_input_dataset.X["ligands"]

ValueError: Features for proteins are not defined

In [15]:
# Label array of the dataset (displayed as one column of 0/1 values).
multi_input_dataset.y

array([[0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1]])

## Tokenizers

In [16]:
# Tokenize one SMILES string at atom level: bracket atoms such as '[C@H]'
# stay together as single tokens, while bonds and parentheses become tokens
# of their own (see the list displayed below).
AtomLevelTokenizer().tokenize("CC[C@H](C)[C@@H](C(=O)O)N")

['C',
 'C',
 '[C@H]',
 '(',
 'C',
 ')',
 '[C@@H]',
 '(',
 'C',
 '(',
 '=',
 'O',
 ')',
 'O',
 ')',
 'N']

### One hot encoding

In [12]:


from plants_sm.featurization.encoding.one_hot_encoder import OneHotEncoder

# One-hot encode the ligand SMILES using atom-level tokens. The result is
# bound back to `multi_input_dataset`, consistent with the other transformer
# cells, instead of relying on `fit_transform` mutating the dataset in place.
# NOTE(review): the array displayed below is 3-D, so this step appears to
# replace the ligand features generated earlier — confirm before training a
# model that expects the DeepMol descriptors.
multi_input_dataset = OneHotEncoder(tokenizer=AtomLevelTokenizer()).fit_transform(multi_input_dataset, "ligands")
multi_input_dataset.X["ligands"]

OneHotEncoder: 100%|██████████| 18/18 [00:00<00:00, 26259.99it/s]


array([[[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 

## Running models

In [6]:
# Width of each input branch = number of feature columns per instance type.
input_size_proteins = multi_input_dataset.X["proteins"].shape[1]
input_size_compounds = multi_input_dataset.X["ligands"].shape[1]

# Two-branch baseline network: one stack per input plus an interaction stack.
model = BaselineModel(
    input_size_proteins,
    input_size_compounds,
    hidden_layers_proteins=[500, 250],
    hidden_layers_compounds=[500, 500],
    hidden_layers_interaction=[500, 500],
)

# Binary classification: BCE loss, F1 as validation metric, Adam optimizer.
optimizer = Adam(model.parameters(), lr=0.0001)
wrapper = PyTorchModel(
    model=model,
    loss_function=nn.BCELoss(),
    validation_metric=f1_score,
    problem_type=BINARY,
    batch_size=50,
    epochs=100,
    optimizer=optimizer,
    progress=50,
    logger_path="small_dataset.log",
)
wrapper.fit(multi_input_dataset)

2023-03-05 19:37:14.356642: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-05 19:37:15.429206: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-05 19:37:15.429335: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
INFO:plants_sm.models.pytorch_model:starting to fit the data...
INFO:plants_sm.models.pytorch_model:[1/100, 0/1] loss: 0.69325203
INFO:plants_sm.models.pytorch_model:

BaselineModel(
  (dense_proteins1): Linear(in_features=512, out_features=1024, bias=True)
  (fc_proteins0): Linear(in_features=1024, out_features=500, bias=True)
  (fc_proteins1): Linear(in_features=500, out_features=250, bias=True)
  (dense_compounds1): Linear(in_features=2048, out_features=4096, bias=True)
  (fc_compounds0): Linear(in_features=4096, out_features=500, bias=True)
  (fc_compounds1): Linear(in_features=500, out_features=500, bias=True)
  (dense_interaction_layer1): Linear(in_features=750, out_features=4096, bias=True)
  (fc_interaction0): Linear(in_features=4096, out_features=500, bias=True)
  (fc_interaction1): Linear(in_features=500, out_features=500, bias=True)
  (final_layer): Linear(in_features=500, out_features=1, bias=True)
)

In [10]:
# `Word2Vec` is used in the pipeline steps below but was never imported in
# this notebook, so the cell raised NameError on a fresh kernel.
from plants_sm.featurization.proteins.bio_embeddings.word2vec import Word2Vec
from plants_sm.pipeline.pipeline import Pipeline

# Size the model inputs from the features currently stored in the dataset.
# NOTE(review): these sizes must match what the pipeline steps below produce
# (Word2Vec for proteins, DeepMolDescriptors for ligands) — confirm that the
# dataset was featurized the same way before this cell runs.
input_size_proteins = multi_input_dataset.X["proteins"].shape[1]
input_size_compounds = multi_input_dataset.X["ligands"].shape[1]
model = BaselineModel(input_size_proteins, input_size_compounds, hidden_layers_proteins=[500, 250],
                      hidden_layers_compounds=[500, 500], hidden_layers_interaction=[500, 500])

# Same training setup as the previous model cell, but only 4 epochs.
wrapper = PyTorchModel(model=model, loss_function=nn.BCELoss(),
                               validation_metric=f1_score,
                               problem_type=BINARY, batch_size=50, epochs=4,
                               optimizer=Adam(model.parameters(), lr=0.0001), progress=50,
                               logger_path="small_dataset.log")

# Per-instance-type preprocessing: standardize first, then featurize.
steps = {"proteins": [ProteinStandardizer(), Word2Vec()],
            "ligands": [DeepMolStandardizer(), DeepMolDescriptors()]}
pipeline = Pipeline(steps, models=[wrapper])

# Runs the preprocessing steps and then trains the wrapped model.
pipeline.fit(multi_input_dataset)

Featurizing: 100%|██████████| 25/25 [00:00<00:00, 35593.21it/s]
Word2Vec: 100%|██████████| 25/25 [00:00<00:00, 1057.39it/s]
Featurizing: 100%|██████████| 18/18 [00:00<00:00, 2236.96it/s]
DeepMolDescriptors: 100%|██████████| 18/18 [00:00<00:00, 1039.09it/s]
INFO:plants_sm.models.pytorch_model:starting to fit the data...
INFO:plants_sm.models.pytorch_model:[1/4, 0/9] loss: 0.69175535
INFO:plants_sm.models.pytorch_model:[1/4, 0/9] metric result: 0.0
INFO:plants_sm.models.pytorch_model:[1/4, 8/9] loss: 0.68787682
INFO:plants_sm.models.pytorch_model:[1/4, 8/9] metric result: 0.0
INFO:plants_sm.models.pytorch_model:Training loss: 0.68877766;  Metric result: 0.0
INFO:plants_sm.models.pytorch_model:[2/4, 0/9] loss: 0.68024886
INFO:plants_sm.models.pytorch_model:[2/4, 0/9] metric result: 0.0
INFO:plants_sm.models.pytorch_model:[2/4, 8/9] loss: 0.69293207
INFO:plants_sm.models.pytorch_model:[2/4, 8/9] metric result: 0.0
INFO:plants_sm.models.pytorch_model:Training loss: 0.68086646;  Metric resul

<plants_sm.pipeline.pipeline.Pipeline at 0x7f5882394340>

In [2]:
from torch import nn

# Hyper-parameters for the example network below. Only 'features' is read by
# NeuralNetwork; 'lr' and 'momentum' are not used by the class itself.
params = {
    'features': 512,
    'lr': 0.001,
    'momentum': 0,
}


class NeuralNetwork(nn.Module):
    """Small fully connected network: flatten -> 2 hidden ReLU layers -> logits.

    Parameters
    ----------
    in_features : int, optional
        Number of values per sample after flattening (default ``28 * 28``).
    hidden_features : int or None, optional
        Width of both hidden layers. ``None`` (default) falls back to
        ``params['features']``, read when the instance is created.
    out_features : int, optional
        Number of output logits (default ``10``).
    """

    def __init__(self, in_features=28 * 28, hidden_features=None, out_features=10):
        super().__init__()
        if hidden_features is None:
            # Default keeps the original behaviour of reading the global config.
            hidden_features = params['features']
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Flatten each sample of ``x`` and return the raw (unnormalized) logits."""
        x = self.flatten(x)
        return self.linear_relu_stack(x)

In [None]:
# Re-create the training wrapper around the current `model` for a full
# 100-epoch run (binary problem, BCE loss, F1 validation metric).
optimizer = Adam(model.parameters(), lr=0.0001)
wrapper = PyTorchModel(
    model=model,
    loss_function=nn.BCELoss(),
    validation_metric=f1_score,
    problem_type=BINARY,
    batch_size=50,
    epochs=100,
    optimizer=optimizer,
    progress=50,
    logger_path="small_dataset.log",
)
