In [9]:
import pandas as pd
import numpy as np

In [64]:
class DataWrangler:
    """
    Generalized wrapper class to simplify the process
    of feature engineering.

    On construction, ``compute_features`` is called once for every
    element of ``X`` (in iteration order) and the resulting list is
    passed through ``regularize_features``; the result is kept in
    ``self.F``. Subclasses must override ``compute_features`` and may
    override ``regularize_features``.

    Attributes:
        X: the input samples handed to the constructor.
        Y: the targets handed to the constructor.
        F: the regularized features, as returned by
            ``regularize_features``.
    """
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

        F_unregularized = [
            self.compute_features(x)
            for x in self.X
        ]

        self.F = self.regularize_features(F_unregularized)

    def compute_features(self, x):
        """
        Compute the feature vector for a single sample ``x``.

        Raises:
            NotImplementedError: always; subclasses must override this.
        """
        # ``NotImplemented`` is the binary-operator sentinel, not an exception
        # class; raising it yields an unrelated TypeError on Python 3.
        raise NotImplementedError(
            "{} must implement compute_features".format(type(self).__name__)
        )

    def regularize_features(self, F):
        """
        Hook for post-processing the list of per-sample features.

        The base implementation returns ``F`` unchanged.
        """
        return F

    def save_features(self, fname):
        """
        Saves the features calculated for the data
        to a file.

        The features in ``self.F`` and the targets in ``self.Y`` are
        stacked side by side, with ``Y`` as the last column, and written
        to ``fname`` as CSV using pandas' default ``to_csv`` settings.
        """
        # Y is reshaped to a single column so it can be appended to the
        # 2-D feature block.
        data = np.hstack((self.F, np.reshape(self.Y, (len(self.Y), 1))))
        pd.DataFrame(data).to_csv(fname)


class PaddedDataWrangler(DataWrangler):
    """
    Automatically pads features with 0s
    in order to ensure all vectors are of
    the same size
    """
    def regularize_features(self, F):
        """
        Pad the feature vectors to ensure they
        are all the same size.

        Args:
            F: sequence of 1-D feature vectors, possibly of
                different lengths.

        Returns:
            A list of numpy arrays, each right-padded with zeros to
            the length of the longest vector in ``F``. An empty ``F``
            yields an empty list.
        """
        # max() of an empty sequence raises ValueError; with nothing to pad
        # the answer is simply an empty list.
        if len(F) == 0:
            return []

        biggest_feature_vec_size = max(len(v) for v in F)
        return [
            np.pad(v, (0, biggest_feature_vec_size - len(v)), mode="constant")
            for v in F
        ]

In [66]:
class AdjacencyEigenWrangler(PaddedDataWrangler):
    """
    Computes the eigenspectrum for the adjacency matrix
    of a molecule
    """
    def compute_features(self, mol):
        """
        Return the adjacency-matrix eigenvalues of ``mol``, largest first.

        Args:
            mol: molecule object accepted by
                ``Chem.rdmolops.GetAdjacencyMatrix``.

        Returns:
            1-D real numpy array of eigenvalues in descending order.
        """
        matrix = Chem.rdmolops.GetAdjacencyMatrix(mol)
        # A bond adjacency matrix is symmetric, so the symmetric solver
        # applies: it returns real eigenvalues in ascending order and never
        # builds the eigenvectors that were previously computed and discarded.
        eigenvalues = np.linalg.eigvalsh(matrix)
        return eigenvalues[::-1]

In [67]:
from deepchem.feat.coulomb_matrices import CoulombMatrix, CoulombMatrixEig
from rdkit import Chem
# Imported explicitly: ``Chem.AllChem`` is only reachable as an attribute if
# some other module has already imported rdkit.Chem.AllChem.
from rdkit.Chem import AllChem

class CoulombEigenWrangler(PaddedDataWrangler):
    """
    Computes the eigenspectrum for a molecule's coulomb
    matrix
    """
    # Featurizer shared by all instances; only its ``coulomb_matrix`` method
    # is used below.
    coulomb_featureizer = CoulombMatrixEig(remove_hydrogens=True, max_atoms=200)

    def compute_features(self, mol):
        """
        Return the coulomb-matrix eigenvalues of ``mol``, largest first.

        Explicit hydrogens are added, one conformer is embedded and
        UFF-optimized (at most 1000 iterations), and the coulomb matrix
        is built with the class-level featurizer. The eigenvalues of the
        first matrix returned by the featurizer are used.

        Args:
            mol: RDKit molecule; the caller's object is not modified
                because ``Chem.AddHs`` returns a new molecule.

        Returns:
            1-D real numpy array of eigenvalues in descending order.
        """
        mol = Chem.AddHs(mol)
        # NOTE(review): no random seed is passed, so the embedded conformer
        # (and therefore the features) may differ between runs - confirm
        # whether reproducibility matters here.
        AllChem.EmbedMultipleConfs(mol, 1)
        AllChem.UFFOptimizeMoleculeConfs(mol, maxIters=1000)
        matrix = self.coulomb_featureizer.coulomb_matrix(mol)
        # Coulomb matrices are built from symmetric pairwise terms, so the
        # symmetric solver applies to the whole stack: real, ascending
        # eigenvalues with no eigenvector computation.
        eigenvalues = np.linalg.eigvalsh(matrix)
        # Index 0 selects the first matrix in the stack, as before.
        return eigenvalues[0][::-1]

In [68]:
# load the training set from the CSV in the working directory
df_train = pd.read_csv(filepath_or_buffer="train.csv")

In [69]:
# build the model inputs (parsed molecules) and the matching targets
from rdkit import Chem

DEBUG = True
# in debug mode only the first 10000 rows are used
if DEBUG:
    TEST_SIZE = 10000
else:
    TEST_SIZE = len(df_train)
X_train = list(map(Chem.MolFromSmiles, df_train.smiles.values[:TEST_SIZE]))
Y_train = df_train.gap.values[:TEST_SIZE]

In [70]:
# featurize the training molecules by adjacency spectrum and write them out
adjacency_eigen = AdjacencyEigenWrangler(X=X_train, Y=Y_train)
adjacency_eigen.save_features(fname="data/adjacency_features.csv")

In [71]:
# featurize the training molecules by coulomb spectrum and write them out
coulomb_eigen = CoulombEigenWrangler(X=X_train, Y=Y_train)
coulomb_eigen.save_features(fname="data/coulomb_features.csv")