In [1]:
from dscribe.descriptors import SOAP

In [2]:
from glob import glob
import numpy as np

In [3]:
from rdkit import Chem

In [4]:
def read_sdf(sdf):
    with open(sdf, "r") as f:
        txt = f.read().rstrip()
    return txt

In [5]:
def get_ncharges_coords(sdf):
    mol = Chem.MolFromMolBlock(sdf)
   #mol = Chem.AddHs(mol)
    # rdkit molobj
    ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    elements = [atom.GetSymbol() for atom in mol.GetAtoms()]
    conf = mol.GetConformer()
    coords = np.asarray(conf.GetPositions())
    return ncharges, elements, coords

In [6]:
import ase

In [7]:
def get_soap(sdf, elements=[6,7,8,16]):
    ncharges, atomtypes, coords = get_ncharges_coords(sdf)
    atomsobj = ase.Atoms(symbols=atomtypes, positions=coords)
    soap = SOAP(
             species=elements,
             rcut=5.0,
             nmax=8,
             lmax=8,
             sigma=0.2,
             periodic=False,
             crossover=True,
             sparse=False,
         )
    return soap.create(atomsobj)

In [9]:
target_files = sorted(glob("../targets/*.sdf"), reverse=True)
target_files

['../targets/qm9.sdf', '../targets/penicillin.sdf']

In [10]:
target_sdfs = [read_sdf(x) for x in target_files]

In [11]:
conf_data = [get_ncharges_coords(x) for x in target_sdfs]

In [12]:
ncharges_list, _, _ = zip(*conf_data)

In [13]:
elements_list = [np.unique(x) for x in ncharges_list]
elements_list

[array([6, 7, 8]), array([ 6,  7,  8, 16])]

In [14]:
sizes = [len(x) for x in ncharges_list]
sizes

[9, 23]

In [15]:
target_reps = [get_soap(target_sdfs[i]) for i in range(len(target_sdfs))]

In [16]:
target_reps[0].shape

(9, 4752)

In [17]:
target_labels = [t.split("/")[-1].split(".xyz")[0] for t in target_sdfs]

In [18]:
np.savez("target_SOAP_data.npz", 
         target_labels=target_labels, 
         target_reps=target_reps, 
         target_ncharges=ncharges_list,)

  return array(a, dtype, copy=False, order=order, subok=True)
