# Package Example

## Imports

In [5]:
from plants_sm.data_structures.dataset.multi_input_dataset import MultiInputDataset
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.data_standardization.compounds.deepmol_standardizers import DeepMolStandardizer
from plants_sm.featurization.proteins.bio_embeddings.word2vec import Word2Vec
from plants_sm.featurization.compounds.deepmol_descriptors import DeepMolDescriptors
from torch import nn
from sklearn.metrics import f1_score
from torch.optim import Adam
from plants_sm.models.constants import BINARY
from plants_sm.models.pytorch_model import PyTorchModel
from plants_sm.models.interaction.baseline_model import BaselineModel
from plants_sm.tokenisation.compounds.smilespe import AtomLevelTokenizer
import pandas as pd


from rdkit import RDLogger
# Disable RDKit logging for every logger matching the 'rdApp.*' pattern.
RDLogger.DisableLog('rdApp.*')

In [4]:
# Path of the interaction table used throughout this notebook.
multi_input_dataset_csv = "./data/aminotransferase_binary.csv"

# Load the raw table with pandas and show its first rows.
raw_interactions = pd.read_csv(multi_input_dataset_csv)
raw_interactions.head()

Unnamed: 0,ids,SEQ,SUBSTRATES,LogSpActivity
0,0,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,C(C(=O)O)N,0
1,1,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,C[C@@H](C(=O)O)N,1
2,2,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,CC(C)[C@@H](C(=O)O)N,0
3,3,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,CC(C)C[C@@H](C(=O)O)N,1
4,4,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,CC[C@H](C)[C@@H](C(=O)O)N,0


## Read datasets

In [6]:
multi_input_dataset_csv = "./data/aminotransferase_binary.csv"

# CSV column that holds the representation of each entity type.
representation_fields = {"proteins": "SEQ", "ligands": "SUBSTRATES"}
# CSV column that identifies each interaction row.
instances_ids_field = {"interaction": "ids"}

multi_input_dataset = MultiInputDataset.from_csv(
    file_path=multi_input_dataset_csv,
    representation_fields=representation_fields,
    instances_ids_field=instances_ids_field,
    labels_field="LogSpActivity",
)

`MultiInputDataset` converts everything into dictionaries so that each unique sequence is processed only once, rather than the same sequence being processed multiple times.

In [7]:
# Show the protein instances held by the dataset: a dictionary mapping an identifier to a sequence.
multi_input_dataset.instances["proteins"]

{8: 'MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVLTSVKKAEQYLLENETTKNYLGIDGIPEFGRCTQELLFGKGSALINDKRARTAQTPGGTGALRVAADFLAKNTSVKRVWVSNPSWPNHKSVFNSAGLEVREYAYYDAENHTLDFDALINSLNEAQAGDVVLFHGCCHNPTGIDPTLEQWQTLAQLSVEKGWLPLFDFAYQGFARGLEEDAEGLRAFAAMHKELIVASSYSKNFGLYNERVGACTLVAADSETVDRAFSQMKAAIRANYSNPPAHGASVVATILSNDALRAIWEQELTDMRQRIQRMRQLFVNTLQEKGANRDFSFIIKQNGMFSFSGLTKEQVLRLREEFGVYAVASGRVNVAGMTPDNMAPLCEAIVAVL',
 5: 'MDYVTLASHAVRQYAPDQIFTASQRAKADAAALGEDAVINATLGECLDDDGKLMVLPTVERMIRQMPVEDICSYAPIAGIKGFNEAVQISLFGKCLDRFYVESVATPGGCGALRHAIWNFLNFGDALLTTNWIWGPYKNICEEHGRRMVTFDMFNRENTFNLEGMDRAIGEILAVQEQLLMILNTPANNPTGYSMTKQEMEQTVAILKKHAAANPDKNLTFCLDVSYIDFAGSFEESREIFDAIFDMPANTMTLLIFSMSKSYTMCGMRCGALVCLGSTAESAAVFKQAMSYSSRSTWSNAIHMAQKILVDINLNPEIRERVSQERAVFRNTITNRGRTFCAAAKEASLEICPYQYGYFVAIPCKNPVETARILMDQHIYVVPQAQGLRFSPCTVTTEKCRKAPAFIKAAMEQTQ',
 10: 'MFQKVDAYAGDPILTLMERFKEDPRSDKVNLSIGLYYNEDGIIPQLQAVAEAEARLNAQPHGASLYLPMEGLNCYRHAIAPLLFGADHPVLKQQRVATIQTLGGSGALKVGADFLKRYFPESGVWVSDPTWENHVAIFAGAGFEVSTYPWYDEATNGVRFNDLLATL

In [8]:
# Show the ligand instances held by the dataset: a dictionary mapping an identifier to a SMILES string.
multi_input_dataset.instances["ligands"]

{1: 'C(C(=O)O)N',
 17: 'C[C@@H](C(=O)O)N',
 14: 'CC(C)[C@@H](C(=O)O)N',
 13: 'CC(C)C[C@@H](C(=O)O)N',
 15: 'CC[C@H](C)[C@@H](C(=O)O)N',
 11: 'C1=CC=C(C=C1)C[C@@H](C(=O)O)N',
 10: 'C1=CC(=CC=C1C[C@@H](C(=O)O)N)O',
 12: 'C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N',
 7: 'C([C@@H](C(=O)O)N)O',
 18: 'C[C@H]([C@@H](C(=O)O)N)O',
 8: 'C([C@@H](C(=O)O)N)S',
 16: 'CSCC[C@@H](C(=O)O)N',
 5: 'C([C@@H](C(=O)O)N)C(=O)N',
 2: 'C(CC(=O)N)[C@@H](C(=O)O)N',
 6: 'C([C@@H](C(=O)O)N)C(=O)O',
 3: 'C(CCN)C[C@@H](C(=O)O)N',
 4: 'C(C[C@@H](C(=O)O)N)CN=C(N)N',
 9: 'C1=C(NC=N1)C[C@@H](C(=O)O)N'}

## Standardize sequences

In [9]:
# Build the standardizer first, then fit and apply it to the protein instances only.
protein_standardizer = ProteinStandardizer(n_jobs=50)
multi_input_dataset = protein_standardizer.fit_transform(
    multi_input_dataset,
    instance_type="proteins",
)

Featurizing: 100%|██████████| 25/25 [00:00<00:00, 221.80it/s]


In [None]:
# Inspect the protein instances again; this cell follows the ProteinStandardizer step.
multi_input_dataset.instances["proteins"]

In [10]:
# Build the standardizer first, then fit and apply it to the ligand instances only.
ligand_standardizer = DeepMolStandardizer(n_jobs=50)
multi_input_dataset = ligand_standardizer.fit_transform(
    multi_input_dataset,
    "ligands",
)

Featurizing: 100%|██████████| 18/18 [00:00<00:00, 165.75it/s]


In [None]:
# Inspect the ligand instances again; this cell follows the DeepMolStandardizer step.
multi_input_dataset.instances["ligands"]

## Generating features and encodings

In [11]:
# Fit a Word2Vec featurizer on the "proteins" instances, then apply it to the dataset.
# NOTE(review): despite its name, `proteins_one_hot` is produced by Word2Vec, not by a
# one-hot encoder — consider renaming once all usages of the variable are known.
proteins_one_hot = Word2Vec().fit(multi_input_dataset, "proteins")
multi_input_dataset = proteins_one_hot.transform(multi_input_dataset, "proteins")

Featurizing: 100%|██████████| 25/25 [00:00<00:00, 691.22it/s]


In [12]:
# Fit the DeepMolDescriptors featurizer on the "ligands" instances (n_jobs=8), then apply
# it to the dataset. The dataset variable is rebound to the transformed result.
deepmol_wrapper = DeepMolDescriptors(n_jobs=8).fit(multi_input_dataset, instance_type="ligands")
multi_input_dataset = deepmol_wrapper.transform(multi_input_dataset, instance_type="ligands")

Featurizing: 100%|██████████| 18/18 [00:00<00:00, 43.48it/s]


In [None]:
# Inspect the "proteins" entry of the dataset's X attribute
# (presumably the features generated by the Word2Vec step above — TODO confirm).
multi_input_dataset.X["proteins"]

In [None]:
# Inspect the "ligands" entry of the dataset's X attribute
# (presumably the features generated by the DeepMolDescriptors step above — TODO confirm).
multi_input_dataset.X["ligands"]

In [None]:
# Inspect the dataset's y attribute
# (presumably the labels read from the "LogSpActivity" column — TODO confirm).
multi_input_dataset.y

## Tokenizers

In [None]:
# Tokenize one SMILES string with AtomLevelTokenizer; the input is the same string
# listed for ligand 15 in the ligand instances shown earlier.
AtomLevelTokenizer().tokenize("CC[C@H](C)[C@@H](C(=O)O)N")

## Running models

In [None]:
# Input widths come from the second dimension of each entity's X entry.
input_size_proteins = multi_input_dataset.X["proteins"].shape[1]
input_size_compounds = multi_input_dataset.X["ligands"].shape[1]

model = BaselineModel(
    input_size_proteins,
    input_size_compounds,
    hidden_layers_proteins=[500, 250],
    hidden_layers_compounds=[500, 500],
    hidden_layers_interaction=[500, 500],
)

# Gather the training configuration in one place before handing it to the wrapper.
training_kwargs = dict(
    model=model,
    loss_function=nn.BCELoss(),
    validation_metric=f1_score,
    problem_type=BINARY,
    batch_size=50,
    epochs=100,
    optimizer=Adam(model.parameters(), lr=0.0001),
    progress=50,
    logger_path="small_dataset.log",
)
wrapper = PyTorchModel(**training_kwargs)
wrapper.fit(multi_input_dataset)