In [1]:
import sys

import rdkit # required for cheminformatics functionality (small molecules)
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

import numpy as np
# show NumPy arrays in full: with the summarisation threshold raised to
# sys.maxsize, printed arrays are never abbreviated with "..."
np.set_printoptions(threshold=sys.maxsize)

In [2]:
fp_length = 2048 # number of bits in the fingerprint bit vector that represents each molecule

Create two lists from the data file: one with the molecules and the other with the classes of the molecules. The classes are integers in the range 0–10, representing the 11 classes. Keep in mind that this is a multi-label problem, i.e. each molecule (metabolite) may belong to more than one class.

In [3]:
# Load the molecules and their class labels.
# Every non-blank line of the data file is split on whitespace and the rows
# are transposed: the first column (SMILES strings) goes to mols_str and the
# second to classes. Both end up as tuples of strings.
# Blank lines (e.g. a trailing empty line at the end of the file) are skipped;
# otherwise an empty row would make zip() yield nothing and the two-way
# unpacking would fail with a ValueError.
with open("../data/kegg_classes.txt") as f:
    mols_str, classes = zip(*[ line.split() for line in f if line.strip() ])

In [4]:
# Build the descriptors for ML: one Morgan fingerprint bit vector per molecule.
# Each SMILES string is parsed into an RDKit molecule, which is then
# fingerprinted with radius 2 and folded to fp_length bits.
# NOTE(review): MolFromSmiles returns None for a SMILES it cannot parse, which
# would make the fingerprint call fail -- assumes every entry in mols_str is valid.
fp_descriptors = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=fp_length)
    for mol in map(Chem.MolFromSmiles, mols_str)
]



In [5]:
# ConvertToNumpyArray writes into the array it is handed and returns nothing,
# so the conversion is done with an explicit allocate-then-fill loop instead of
# a list comprehension
ecfp = []
for d in fp_descriptors:
    # start from an empty int8 array; the converter resizes it to fit the fingerprint
    np_arr = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(d, np_arr)
    ecfp += [np_arr]

# display the first converted fingerprint to see what a descriptor looks like
ecfp[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,