In [21]:
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys

# A. Continuous descriptor

### * MOE 2D descriptor

The MOE 2D descriptors are generated with the MOE software (https://www.chemcomp.com/index.htm); Rutgers has a school license for this software.

### * Dragon 2D descriptor

The Dragon 2D descriptors are generated with the Dragon software (https://chm.kode-solutions.net/index.php).

### * rdkit descriptor

RDKit provides several continuous descriptors; the details can be found at http://rdkit.org/docs/source/rdkit.Chem.Descriptors.html

In [1]:
# Load every compound of the example dataset as an RDKit molecule object and
# collect them, in dataset row order, in the list `molecules`.

from rdkit import Chem
import pandas as pd
import os
currentDirectory = os.getcwd()
# The CSV is looked up under ./Datasets relative to the current working directory.
d = os.path.join(currentDirectory, "Datasets", "example.csv")
# The first CSV column becomes the DataFrame index; the descriptor tables built
# in later cells reuse it as their row index.
dataset = pd.read_csv(d, index_col = 0)
molecules = [Chem.MolFromSmiles(smiles) for smiles in dataset["SMILES"]]

# Chem.MolFromSmiles returns None (it does not raise) when a SMILES string
# cannot be parsed. Stop here with the offending row labels instead of letting
# None entries reach the descriptor / fingerprint cells.
unparsed = [label for label, mol in zip(dataset.index, molecules) if mol is None]
if unparsed:
    raise ValueError("Could not parse SMILES for dataset rows: {}".format(unparsed))

In [14]:
# Build a calculator covering every descriptor listed in Descriptors.descList
# (the first element of each entry is the descriptor name).
descriptor_names = [entry[0] for entry in Descriptors.descList]
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# One row of descriptor values per molecule, in the same order as `molecules`.
descriptor_rows = []
for mol in molecules:
    descriptor_rows.append(list(calculator.CalcDescriptors(mol)))

# Rows are labelled with the dataset index, columns with the descriptor names
# reported by the calculator.
X = pd.DataFrame(
    descriptor_rows,
    index=dataset.index,
    columns=list(calculator.GetDescriptorNames()),
)

In [15]:
# Total number of missing (NaN) entries in the descriptor table: count them per
# column first, then add the column counts together.
missing_per_column = X.isna().sum()
print(missing_per_column.sum())
# If the count is non-zero, one option is mean imputation per column:
# X_ = X.fillna(X.mean())

0


# B. Binary descriptor (fingerprint)

### * MACCS keys

MACCS keys can be generated using RDKit (http://rdkit.org/docs/source/rdkit.Chem.MACCSkeys.html).

In [16]:
# MACCS keys: convert each molecule's key bit vector into a plain list of ints.
data = [
    [int(bit) for bit in MACCSkeys.GenMACCSKeys(mol)]
    for mol in molecules
]

# One row per molecule (labelled with the dataset index), one column per bit.
X = pd.DataFrame(data, index=dataset.index)

### * ECFP/FCFP fingerprints

ECFP/FCFP fingerprints can be generated using RDKit's Morgan (circular) fingerprint functions (https://www.rdkit.org/docs/GettingStartedInPython.html).

In [19]:
# ECFP4: Morgan (circular) fingerprint with radius 2, as a 1024-bit vector.

data = []

for mol in molecules:
    # Radius 2 is the ECFP4 setting; nBits is passed by keyword so the
    # fingerprint length is explicit at the call site.
    ecfp4 = [int(bit) for bit in AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)]
    data.append(ecfp4)

# One row per molecule (labelled with the dataset index), one column per bit.
X = pd.DataFrame(data, index=dataset.index)

In [17]:
# ECFP6: Morgan (circular) fingerprint with radius 3 and 1024 bits, expanded
# into a list of ints for every molecule.

data = [
    [int(bit) for bit in AllChem.GetMorganFingerprintAsBitVect(mol, 3, 1024)]
    for mol in molecules
]

# One row per molecule (labelled with the dataset index), one column per bit.
X = pd.DataFrame(data, index=dataset.index)

In [18]:
# FCFP6: feature-based Morgan fingerprint (useFeatures=True) with radius 3 and
# 1024 bits.
data = []

for molecule in molecules:
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, 3, 1024, useFeatures=True)
    # Expand the bit vector into a plain list of ints.
    data.append(list(map(int, bit_vector)))

# One row per molecule (labelled with the dataset index), one column per bit.
X = pd.DataFrame(data, index=dataset.index)

In [20]:
# FCFP4: feature-based Morgan fingerprint (useFeatures=True) with radius 2, as
# a 1024-bit vector.
data = []

for mol in molecules:
    # Radius 2 is the FCFP4 setting; nBits is passed by keyword so the
    # fingerprint length is explicit at the call site.
    fcfp4 = [int(bit) for bit in AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useFeatures=True)]
    data.append(fcfp4)

# One row per molecule (labelled with the dataset index), one column per bit.
X = pd.DataFrame(data, index=dataset.index)