# Package Example

In [1]:
from pydantic import BaseModel
from typing import Dict, List, Tuple

class MyModel(BaseModel):
    data: Dict[str, List[Tuple[str, float]]]

# Example usage
my_model = MyModel(data={"key1": [("apple", 1.5), ("banana", 2.3)]})

## Imports

In [2]:
from plants_sm.data_structures.dataset.multi_input_dataset import MultiInputDataset
from plants_sm.data_standardization.proteins.standardization import ProteinStandardizer
from plants_sm.data_standardization.compounds.deepmol_standardizers import DeepMolStandardizer
from plants_sm.featurization.compounds.deepmol_descriptors import DeepMolDescriptors
from plants_sm.featurization.proteins.propythia.propythia import PropythiaWrapper
from torch import nn
from sklearn.metrics import f1_score
from torch.optim import Adam
from plants_sm.models.constants import BINARY
from plants_sm.models.pytorch_model import PyTorchModel
from plants_sm.models.interaction.baseline_model import BaselineModel
from plants_sm.tokenisation.compounds.smilespe import AtomLevelTokenizer
import pandas as pd


from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

[07:22:30] Initializing Normalizer
2024-07-30 07:22:35.697188: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-30 07:22:36.185246: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-30 07:22:37.085573: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 07:22:37.085611: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 07:22:37.085647: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register

In [3]:
multi_input_dataset_csv = "./data/aminotransferase_binary.csv"
pd.read_csv(multi_input_dataset_csv).head()

Unnamed: 0,ids,SEQ,SUBSTRATES,LogSpActivity
0,0,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,C(C(=O)O)N,0
1,1,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,C[C@@H](C(=O)O)N,1
2,2,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,CC(C)[C@@H](C(=O)O)N,0
3,3,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,CC(C)C[C@@H](C(=O)O)N,1
4,4,MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVL...,CC[C@H](C)[C@@H](C(=O)O)N,0


## Read datasets

In [4]:
multi_input_dataset_csv = "./data/aminotransferase_binary.csv"

multi_input_dataset = MultiInputDataset.from_csv(file_path=multi_input_dataset_csv,
                                                 representation_field={"proteins": "SEQ",
                                                                       "ligands": "SUBSTRATES"},
                                                 instances_ids_field={"interaction": "ids"},
                                                 labels_field="LogSpActivity")

The MultiInputDataset converts everything into dictionaries to ensure that we are always processing unique sequences and not the same multiple times

In [5]:
multi_input_dataset.instances["proteins"]

{8: 'MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVLTSVKKAEQYLLENETTKNYLGIDGIPEFGRCTQELLFGKGSALINDKRARTAQTPGGTGALRVAADFLAKNTSVKRVWVSNPSWPNHKSVFNSAGLEVREYAYYDAENHTLDFDALINSLNEAQAGDVVLFHGCCHNPTGIDPTLEQWQTLAQLSVEKGWLPLFDFAYQGFARGLEEDAEGLRAFAAMHKELIVASSYSKNFGLYNERVGACTLVAADSETVDRAFSQMKAAIRANYSNPPAHGASVVATILSNDALRAIWEQELTDMRQRIQRMRQLFVNTLQEKGANRDFSFIIKQNGMFSFSGLTKEQVLRLREEFGVYAVASGRVNVAGMTPDNMAPLCEAIVAVL',
 5: 'MDYVTLASHAVRQYAPDQIFTASQRAKADAAALGEDAVINATLGECLDDDGKLMVLPTVERMIRQMPVEDICSYAPIAGIKGFNEAVQISLFGKCLDRFYVESVATPGGCGALRHAIWNFLNFGDALLTTNWIWGPYKNICEEHGRRMVTFDMFNRENTFNLEGMDRAIGEILAVQEQLLMILNTPANNPTGYSMTKQEMEQTVAILKKHAAANPDKNLTFCLDVSYIDFAGSFEESREIFDAIFDMPANTMTLLIFSMSKSYTMCGMRCGALVCLGSTAESAAVFKQAMSYSSRSTWSNAIHMAQKILVDINLNPEIRERVSQERAVFRNTITNRGRTFCAAAKEASLEICPYQYGYFVAIPCKNPVETARILMDQHIYVVPQAQGLRFSPCTVTTEKCRKAPAFIKAAMEQTQ',
 10: 'MFQKVDAYAGDPILTLMERFKEDPRSDKVNLSIGLYYNEDGIIPQLQAVAEAEARLNAQPHGASLYLPMEGLNCYRHAIAPLLFGADHPVLKQQRVATIQTLGGSGALKVGADFLKRYFPESGVWVSDPTWENHVAIFAGAGFEVSTYPWYDEATNGVRFNDLLATL

In [6]:
multi_input_dataset.instances["ligands"]

{1: 'C(C(=O)O)N',
 17: 'C[C@@H](C(=O)O)N',
 14: 'CC(C)[C@@H](C(=O)O)N',
 13: 'CC(C)C[C@@H](C(=O)O)N',
 15: 'CC[C@H](C)[C@@H](C(=O)O)N',
 11: 'C1=CC=C(C=C1)C[C@@H](C(=O)O)N',
 10: 'C1=CC(=CC=C1C[C@@H](C(=O)O)N)O',
 12: 'C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N',
 7: 'C([C@@H](C(=O)O)N)O',
 18: 'C[C@H]([C@@H](C(=O)O)N)O',
 8: 'C([C@@H](C(=O)O)N)S',
 16: 'CSCC[C@@H](C(=O)O)N',
 5: 'C([C@@H](C(=O)O)N)C(=O)N',
 2: 'C(CC(=O)N)[C@@H](C(=O)O)N',
 6: 'C([C@@H](C(=O)O)N)C(=O)O',
 3: 'C(CCN)C[C@@H](C(=O)O)N',
 4: 'C(C[C@@H](C(=O)O)N)CN=C(N)N',
 9: 'C1=C(NC=N1)C[C@@H](C(=O)O)N'}

## Standardize sequences

In [7]:
multi_input_dataset = ProteinStandardizer(n_jobs=50).fit_transform(multi_input_dataset, instance_type="proteins")

ProteinStandardizer: 100%|██████████| 25/25 [00:00<00:00, 1890.38it/s]


In [8]:
multi_input_dataset.instances["proteins"]

{8: 'MFENITAAPADPILGLADLFRADERPGKINLGIGVYKDETGKTPVLTSVKKAEQYLLENETTKNYLGIDGIPEFGRCTQELLFGKGSALINDKRARTAQTPGGTGALRVAADFLAKNTSVKRVWVSNPSWPNHKSVFNSAGLEVREYAYYDAENHTLDFDALINSLNEAQAGDVVLFHGCCHNPTGIDPTLEQWQTLAQLSVEKGWLPLFDFAYQGFARGLEEDAEGLRAFAAMHKELIVASSYSKNFGLYNERVGACTLVAADSETVDRAFSQMKAAIRANYSNPPAHGASVVATILSNDALRAIWEQELTDMRQRIQRMRQLFVNTLQEKGANRDFSFIIKQNGMFSFSGLTKEQVLRLREEFGVYAVASGRVNVAGMTPDNMAPLCEAIVAVL',
 5: 'MDYVTLASHAVRQYAPDQIFTASQRAKADAAALGEDAVINATLGECLDDDGKLMVLPTVERMIRQMPVEDICSYAPIAGIKGFNEAVQISLFGKCLDRFYVESVATPGGCGALRHAIWNFLNFGDALLTTNWIWGPYKNICEEHGRRMVTFDMFNRENTFNLEGMDRAIGEILAVQEQLLMILNTPANNPTGYSMTKQEMEQTVAILKKHAAANPDKNLTFCLDVSYIDFAGSFEESREIFDAIFDMPANTMTLLIFSMSKSYTMCGMRCGALVCLGSTAESAAVFKQAMSYSSRSTWSNAIHMAQKILVDINLNPEIRERVSQERAVFRNTITNRGRTFCAAAKEASLEICPYQYGYFVAIPCKNPVETARILMDQHIYVVPQAQGLRFSPCTVTTEKCRKAPAFIKAAMEQTQ',
 10: 'MFQKVDAYAGDPILTLMERFKEDPRSDKVNLSIGLYYNEDGIIPQLQAVAEAEARLNAQPHGASLYLPMEGLNCYRHAIAPLLFGADHPVLKQQRVATIQTLGGSGALKVGADFLKRYFPESGVWVSDPTWENHVAIFAGAGFEVSTYPWYDEATNGVRFNDLLATL

### Padding

In [9]:
from plants_sm.data_standardization.proteins.padding import SequencePadder
from plants_sm.data_standardization.compounds.padding import SMILESPadder

multi_input_dataset = SequencePadder(n_jobs=50).fit_transform(multi_input_dataset, instance_type="proteins")

SequencePadder: 100%|██████████| 25/25 [00:00<00:00, 2100.77it/s]


In [10]:
multi_input_dataset.instances["proteins"][1]

'MADTRPERRFTRIDRLPPYVFNITAELKMAARRRGEDIIDFSMGNPDGATPPHIVEKLCTVAQRPDTHGYSTSRGIPRLRRAISRWYQDRYDVEIDPESEAIVTIGSKEGLAHLMLATLDHGDTVLVPNPSYPIHIYGAVIAGAQVRSVPLVEGVDFFNELERAIRESYPKPKMMILGFPSNPTAQCVELEFFEKVVALAKRYDVLVVHDLAYADIVYDGWKAPSIMQVPGARDVAVEFFTLSKSYNMAGWRIGFMVGNKTLVSALARIKSYHDYGTFTPLQVAAIAALEGDQQCVRDIAEQYKRRRDVLVKGLHEAGWMVEMPKASMYVWAKIPEPYAAMGSLEFAKKLLNEAKVCVSPGIGFGDYGDTHVRFALIENRDRIRQAIRGIKAMFRADGLLPASSKHIHENAE----------------------------------------------------------------'

In [11]:
multi_input_dataset = DeepMolStandardizer(n_jobs=50).fit_transform(multi_input_dataset, "ligands")

2024-07-30 07:24:34,378 — INFO — Standardizer CustomStandardizer initialized with -1 jobs.


custom_standardizer: 100%|██████████| 18/18 [00:00<00:00, 791.64it/s]


In [12]:
SMILESPadder().fit_transform(multi_input_dataset, "ligands")

SMILESPadder: 100%|██████████| 18/18 [00:00<00:00, 1307.90it/s]


<plants_sm.data_structures.dataset.multi_input_dataset.MultiInputDataset at 0x7f80899263b0>

In [13]:
multi_input_dataset.instances["ligands"]

{1: 'NCC(=O)OGGGGGGGGGGGGGGGGGGG',
 17: 'CC(N)C(=O)OGGGGGGGGGGGGGGGG',
 14: 'CC(C)C(N)C(=O)OGGGGGGGGGGGG',
 13: 'CC(C)CC(N)C(=O)OGGGGGGGGGGG',
 15: 'CCC(C)C(N)C(=O)OGGGGGGGGGGG',
 11: 'NC(Cc1ccccc1)C(=O)OGGGGGGGG',
 10: 'NC(Cc1ccc(O)cc1)C(=O)OGGGGG',
 12: 'NC(Cc1c[nH]c2ccccc12)C(=O)O',
 7: 'NC(CO)C(=O)OGGGGGGGGGGGGGGG',
 18: 'CC(O)C(N)C(=O)OGGGGGGGGGGGG',
 8: 'NC(CS)C(=O)OGGGGGGGGGGGGGGG',
 16: 'CSCCC(N)C(=O)OGGGGGGGGGGGGG',
 5: 'NC(=O)CC(N)C(=O)OGGGGGGGGGG',
 2: 'NC(=O)CCC(N)C(=O)OGGGGGGGGG',
 6: 'NC(CC(=O)O)C(=O)OGGGGGGGGGG',
 3: 'NCCCCC(N)C(=O)OGGGGGGGGGGGG',
 4: 'NC(N)=NCCCC(N)C(=O)OGGGGGGG',
 9: 'NC(Cc1cnc[nH]1)C(=O)OGGGGGG'}

## Generating features and encodings

### Word2Vec

In [14]:
proteins_word2vec = PropythiaWrapper(n_jobs=30).fit(multi_input_dataset, "proteins")
multi_input_dataset = proteins_word2vec.transform(multi_input_dataset, "proteins")

PropythiaWrapper: 100%|██████████| 25/25 [00:00<00:00, 26.12it/s]


### DeepMol descriptors

In [15]:
deepmol_wrapper = DeepMolDescriptors(n_jobs=8).fit(multi_input_dataset, instance_type="ligands")
multi_input_dataset = deepmol_wrapper.transform(multi_input_dataset, instance_type="ligands")

DeepMolDescriptors: 100%|██████████| 18/18 [00:00<00:00, 47.18it/s]


In [16]:
multi_input_dataset.dataframe

Unnamed: 0,ids,LogSpActivity,proteins_identifiers,ligands_identifiers
0,0,0,8,1
1,1,1,8,17
2,2,0,8,14
3,3,1,8,13
4,4,0,8,15
...,...,...,...,...
445,445,1,2,2
446,446,1,2,6
447,447,0,2,3
448,448,0,2,4


In [17]:
multi_input_dataset.X["ligands"]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
multi_input_dataset.y

array([[0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

## Tokenizers

In [19]:
AtomLevelTokenizer().tokenize("CC[C@H](C)[C@@H](C(=O)O)N")

['C',
 'C',
 '[C@H]',
 '(',
 'C',
 ')',
 '[C@@H]',
 '(',
 'C',
 '(',
 '=',
 'O',
 ')',
 'O',
 ')',
 'N']

### One hot encoding

In [20]:


from plants_sm.featurization.encoding.one_hot_encoder import OneHotEncoder

OneHotEncoder(tokenizer=AtomLevelTokenizer()).fit_transform(multi_input_dataset, "ligands")
multi_input_dataset.X["ligands"]

OneHotEncoder: 100%|██████████| 18/18 [00:00<00:00, 15436.00it/s]


array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 

## Running models

In [21]:
input_size_proteins = multi_input_dataset.X["proteins"].shape[1]
input_size_compounds = multi_input_dataset.X["ligands"].shape[1]
model = BaselineModel(input_size_proteins, input_size_compounds, hidden_layers_proteins=[500, 250],
                      hidden_layers_compounds=[500, 500], hidden_layers_interaction=[500, 500])

wrapper = PyTorchModel(model=model, loss_function=nn.BCELoss(),
                               validation_metric=f1_score,
                               problem_type=BINARY, batch_size=50, epochs=100,
                               optimizer=Adam(model.parameters(), lr=0.0001), progress=50,
                               logger_path="small_dataset.log")
wrapper.fit(multi_input_dataset)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1200x12 and 24x48)

In [10]:
from plants_sm.pipeline.pipeline import Pipeline

input_size_proteins = multi_input_dataset.X["proteins"].shape[1]
input_size_compounds = multi_input_dataset.X["ligands"].shape[1]
model = BaselineModel(input_size_proteins, input_size_compounds, hidden_layers_proteins=[500, 250],
                      hidden_layers_compounds=[500, 500], hidden_layers_interaction=[500, 500])

wrapper = PyTorchModel(model=model, loss_function=nn.BCELoss(),
                               validation_metric=f1_score,
                               problem_type=BINARY, batch_size=50, epochs=4,
                               optimizer=Adam(model.parameters(), lr=0.0001), progress=50,
                               logger_path="small_dataset.log")

steps = {"proteins": [ProteinStandardizer(), Word2Vec()],
            "ligands": [DeepMolStandardizer(), DeepMolDescriptors()]}
pipeline = Pipeline(steps, models=[wrapper])

pipeline.fit(multi_input_dataset)

Featurizing: 100%|██████████| 25/25 [00:00<00:00, 35593.21it/s]
Word2Vec: 100%|██████████| 25/25 [00:00<00:00, 1057.39it/s]
Featurizing: 100%|██████████| 18/18 [00:00<00:00, 2236.96it/s]
DeepMolDescriptors: 100%|██████████| 18/18 [00:00<00:00, 1039.09it/s]
INFO:plants_sm.models.pytorch_model:starting to fit the data...
INFO:plants_sm.models.pytorch_model:[1/4, 0/9] loss: 0.69175535
INFO:plants_sm.models.pytorch_model:[1/4, 0/9] metric result: 0.0
INFO:plants_sm.models.pytorch_model:[1/4, 8/9] loss: 0.68787682
INFO:plants_sm.models.pytorch_model:[1/4, 8/9] metric result: 0.0
INFO:plants_sm.models.pytorch_model:Training loss: 0.68877766;  Metric result: 0.0
INFO:plants_sm.models.pytorch_model:[2/4, 0/9] loss: 0.68024886
INFO:plants_sm.models.pytorch_model:[2/4, 0/9] metric result: 0.0
INFO:plants_sm.models.pytorch_model:[2/4, 8/9] loss: 0.69293207
INFO:plants_sm.models.pytorch_model:[2/4, 8/9] metric result: 0.0
INFO:plants_sm.models.pytorch_model:Training loss: 0.68086646;  Metric resul

<plants_sm.pipeline.pipeline.Pipeline at 0x7f5882394340>

In [2]:
from torch import nn

params = {
    'features': 512,
    'lr': 0.001,
    'momentum': 0,
}

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, params['features']),
            nn.ReLU(),
            nn.Linear(params['features'], params['features']),
            nn.ReLU(),
            nn.Linear(params['features'], 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [None]:
wrapper = PyTorchModel(model=model, loss_function=nn.BCELoss(),
                               validation_metric=f1_score,
                               problem_type=BINARY, batch_size=50, epochs=100,
                               optimizer=Adam(model.parameters(), lr=0.0001), progress=50,
                               logger_path="small_dataset.log")


In [1]:
from rdkit.Chem import Mol, AllChem, MolFromSmiles, Descriptors, rdMolDescriptors

[x[0] for x in Descriptors._descList]

['MaxAbsEStateIndex',
 'MaxEStateIndex',
 'MinAbsEStateIndex',
 'MinEStateIndex',
 'qed',
 'SPS',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'NumValenceElectrons',
 'NumRadicalElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'FpDensityMorgan1',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW',
 'AvgIpc',
 'BalabanJ',
 'BertzCT',
 'Chi0',
 'Chi0n',
 'Chi0v',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3n',
 'Chi3v',
 'Chi4n',
 'Chi4v',
 'HallKierAlpha',
 'Ipc',
 'Kappa1',
 'Kappa2',
 'Kappa3',
 'LabuteASA',
 'PEOE_VSA1',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA6',


In [19]:
descriptors = """
BalabanJ
BertzCT
Chi0
Chi0n
Chi0v
Chi1
Chi1n
Chi1v
Chi2n
Chi2v
Chi3n
Chi3v
Chi4n
Chi4v
EState VSA1
EState VSA10
EState VSA11
EState VSA2
EState VSA3
EState VSA4
EState VSA5
EState VSA6
EState VSA7
EState VSA8
EState VSA9
ExactMolWt
FpDensityMorgan1
FpDensityMorgan2
FpDensityMorgan3
FractionCSP3
HallKierAlpha
HeavyAtomCount
HeavyAtomMolWt
Ipc
Kappa1
Kappa2
Kappa3
LabuteASA
MaxAbsEStateIndex
MaxAbsPartialCharge
MaxEStateIndex
MaxPartialCharge
MinAbsEStateIndex
MinAbsPartialCharge
MinEStateIndex
MinPartialCharge
MolLogP
MolMR
MolWt
NHOHCount
NOCount
NumAliphaticCarbocycles 
NumAliphaticHeterocycles
NumAliphaticRings
NumAromaticCarbocycles
NumAromaticHeterocycles
NumAromaticRings
NumHAcceptors
NumHDonors
NumHeteroatoms
NumRadicalElectrons
NumRotatableBonds 
NumSaturatedCarbocycles 
NumSaturatedHeterocycles
NumSaturatedRings
NumValenceElectrons
PEOE VSA1
PEOE VSA10
PEOE VSA11
PEOE VSA12
PEOE VSA13
PEOE VSA14
PEOE VSA2
PEOE VSA3
PEOE VSA4
PEOE VSA5
PEOE VSA6
PEOE VSA7
PEOE VSA8
PEOE VSA9
RingCount
SMR VSA1
SMR VSA10
SMR VSA2
SMR VSA3
SMR VSA4
SMR VSA5
SMR VSA6
SMR VSA7
SMR VSA8
SMR VSA9
SlogP VSA1
SlogP VSA10
SlogP VSA11
SlogP VSA12
SlogP VSA2
SlogP VSA3
SlogP VSA4
SlogP VSA5
SlogP VSA6
SlogP VSA7
SlogP VSA8
SlogP VSA9
TPSA
VSA EState1
VSA EState10
VSA EState2
VSA EState3
VSA EState4
VSA EState5
VSA EState6
VSA EState7
VSA EState8
VSA EState9
fr Al COO
fr Al OH
fr Al OH noTert
fr ArN
fr Ar COO
fr Ar N
fr Ar NH
fr Ar OH
fr COO
fr COO2
fr C O
fr C O noCOO
fr C S
fr HOCCN
fr Imine
fr NH0
fr NH1
fr NH2
fr N O
fr Ndealkylation1
fr Ndealkylation2
fr Nhpyrrole
fr SH
fr aldehyde
fr alkyl carbamate
fr alkyl halide
fr allylic oxid
fr amide
fr amidine
fr aniline
fr aryl methyl
fr azide
fr azo
fr barbitur
fr benzene
fr benzodiazepine
fr bicyclic
fr diazo
fr dihydropyridine
fr epoxide
fr ester
fr ether
fr furan
fr guanido
fr halogen
fr hdrzine
fr hdrzone
fr imidazole
fr imide
fr isocyan
fr isothiocyan
fr ketone
fr ketone Topliss
fr lactam
fr lactone
fr methoxy
fr morpholine
fr nitrile
fr nitro
fr nitro arom
fr nitro arom nonortho
fr nitroso
fr oxazole
fr oxime
fr para hydroxylation
fr phenol
fr phenol noOrthoHbond
fr phos acid
fr phos ester
fr piperdine
fr piperzine
fr priamide
fr prisulfonamd
fr pyridine
fr quatN
fr sulfide
fr sulfonamd
fr sulfone
fr term acetylene
fr tetrazole
fr thiazole
fr thiocyan
fr thiophene
fr unbrch alkane
fr urea
qed
"""

In [20]:
descriptors = descriptors.split("\n")
descriptors = [x.replace(" ", "_") for x in descriptors if x]
len(descriptors)

200

In [21]:
from rdkit.ML.Descriptors import MoleculeDescriptors
mol = MolFromSmiles("CCO")
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
descriptors = list(calc.CalcDescriptors(mol))

In [22]:
len(descriptors)

200