In [1]:
from dscribe.descriptors import SOAP

In [2]:
import os
from glob import glob

import numpy as np

In [3]:
from rdkit import Chem

In [4]:
def read_sdf(sdf):
    """Return the full text of the file at path ``sdf`` with trailing whitespace stripped."""
    with open(sdf, mode="r") as handle:
        return handle.read().rstrip()

In [5]:
def get_ncharges_coords(sdf):
    """Parse a MOL block and return atomic numbers, element symbols and coordinates.

    Parameters
    ----------
    sdf : str
        Text of a MOL block (e.g. the string returned by ``read_sdf``), not a
        file path.

    Returns
    -------
    ncharges : list
        ``GetAtomicNum()`` of every atom in the parsed molecule.
    elements : list
        ``GetSymbol()`` of every atom, in the same order as ``ncharges``.
    coords : numpy.ndarray
        Atom positions of the molecule's conformer (``GetPositions()``).

    Raises
    ------
    ValueError
        If RDKit cannot parse the block (``MolFromMolBlock`` returns ``None``).

    Notes
    -----
    ``MolFromMolBlock`` is called with its defaults.
    NOTE(review): RDKit's default ``removeHs=True`` drops explicit hydrogens,
    so the returned lists presumably contain heavy atoms only -- confirm that
    this is intended.
    """
    mol = Chem.MolFromMolBlock(sdf)
    if mol is None:
        # RDKit signals a parse failure by returning None rather than raising;
        # fail here with a clear message instead of an AttributeError below.
        raise ValueError("RDKit could not parse the supplied MOL block")
    ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    elements = [atom.GetSymbol() for atom in mol.GetAtoms()]
    coords = np.asarray(mol.GetConformer().GetPositions())
    return ncharges, elements, coords

In [6]:
import ase

In [7]:
def get_soap(sdf, elements=None):
    """Compute the SOAP descriptor of the molecule described by a MOL block.

    Parameters
    ----------
    sdf : str
        Text of a MOL block, forwarded to ``get_ncharges_coords``.
    elements : sequence of int, optional
        Species passed to the SOAP descriptor. Defaults to ``[6, 7, 8, 16]``.
        ``None`` is used as the sentinel so that no mutable list is shared
        between calls.

    Returns
    -------
    The result of ``SOAP.create`` for the molecule, built with ``sparse=False``.
    Callers in this notebook average it over axis 0 to obtain one vector per
    molecule.
    """
    if elements is None:
        elements = [6, 7, 8, 16]
    # Only the element symbols and the coordinates are needed to build the
    # ASE structure; the atomic numbers are not used here.
    _, atomtypes, coords = get_ncharges_coords(sdf)
    atomsobj = ase.Atoms(symbols=atomtypes, positions=coords)
    soap = SOAP(
        species=elements,
        rcut=5.0,
        nmax=8,
        lmax=8,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )
    return soap.create(atomsobj)

In [12]:
# Paths of the target SDF files, sorted lexicographically and displayed.
target_sdfs = glob("../targets/*.sdf")
target_sdfs.sort()
target_sdfs

['../targets/qm9.sdf', '../targets/vitc.sdf', '../targets/vitd.sdf']

In [13]:
# Paths of the QM9 amon SDF files, sorted lexicographically.
qm9_amons_files = glob("../amons-qm9/*.sdf")
qm9_amons_files.sort()

In [14]:
# Raw text of every QM9 amon file, in the same order as qm9_amons_files.
qm9_amons_sdfs = list(map(read_sdf, qm9_amons_files))

In [15]:
# (ncharges, elements, coords) triple for every QM9 amon.
conf_data = list(map(get_ncharges_coords, qm9_amons_sdfs))

In [16]:
# Keep only the nuclear charges (first item of each triple in conf_data).
# Indexing instead of `a, _, _ = zip(*conf_data)` avoids rebinding `_`, which
# IPython uses for the last cell output, and does not fail with an unpacking
# error when conf_data is empty.
ncharges_list = tuple(entry[0] for entry in conf_data)

In [17]:
# Keep the QM9 nuclear charges under their own name: `ncharges_list` is rebound
# by the later vitc/vitd cells, while `qm9_ncharges` is what the save cell stores.
qm9_ncharges = ncharges_list

In [21]:
# One vector per QM9 amon: the array returned by get_soap averaged over axis 0.
qm9_reps = [get_soap(sdf_block).mean(axis=0) for sdf_block in qm9_amons_sdfs]

In [22]:
# Stack the per-molecule vectors into a single NumPy array.
qm9_reps = np.asarray(qm9_reps)

In [23]:
# Shape of the first QM9 representation vector.
np.shape(qm9_reps[0])

(4752,)

In [24]:
# Label each QM9 amon by its file name without directory or extension.
# os.path handles the platform's path separator, and splitext removes only the
# final extension, so a name that merely contains ".sdf" is not truncated.
qm9_amons_labels = [os.path.splitext(os.path.basename(t))[0] for t in qm9_amons_files]

In [30]:
# Paths of the vitc amon SDF files, sorted lexicographically.
vitc_amons_files = glob("../amons-vitc/*.sdf")
vitc_amons_files.sort()

In [31]:
# Raw text of every vitc amon file, in the same order as vitc_amons_files.
vitc_amons_sdfs = list(map(read_sdf, vitc_amons_files))

In [32]:
# (ncharges, elements, coords) triple for every vitc amon.
conf_data = list(map(get_ncharges_coords, vitc_amons_sdfs))

In [33]:
# Keep only the nuclear charges (first item of each triple in conf_data).
# Indexing instead of `a, _, _ = zip(*conf_data)` avoids rebinding `_`, which
# IPython uses for the last cell output, and does not fail with an unpacking
# error when conf_data is empty.
ncharges_list = tuple(entry[0] for entry in conf_data)

In [34]:
# Keep the vitc nuclear charges under their own name: `ncharges_list` is rebound
# by the later vitd cell, while `vitc_ncharges` is what the save cell stores.
vitc_ncharges = ncharges_list

In [36]:
# One vector per vitc amon: the array returned by get_soap averaged over axis 0.
vitc_reps = [get_soap(sdf_block).mean(axis=0) for sdf_block in vitc_amons_sdfs]

In [37]:
# Stack the per-molecule vectors into a single NumPy array.
vitc_reps = np.asarray(vitc_reps)

In [38]:
# Label each vitc amon by its file name without directory or extension.
# os.path handles the platform's path separator, and splitext removes only the
# final extension, so a name that merely contains ".sdf" is not truncated.
vitc_amons_labels = [os.path.splitext(os.path.basename(t))[0] for t in vitc_amons_files]

In [43]:
# Paths of the vitd amon SDF files, sorted lexicographically.
vitd_amons_files = glob("../amons-vitd/*.sdf")
vitd_amons_files.sort()

In [44]:
# Raw text of every vitd amon file, in the same order as vitd_amons_files.
vitd_amons_sdfs = list(map(read_sdf, vitd_amons_files))

In [45]:
# (ncharges, elements, coords) triple for every vitd amon.
conf_data = list(map(get_ncharges_coords, vitd_amons_sdfs))

In [46]:
# Keep only the nuclear charges (first item of each triple in conf_data).
# Indexing instead of `a, _, _ = zip(*conf_data)` avoids rebinding `_`, which
# IPython uses for the last cell output, and does not fail with an unpacking
# error when conf_data is empty.
ncharges_list = tuple(entry[0] for entry in conf_data)

In [47]:
# Keep the vitd nuclear charges under a dataset-specific name; `vitd_ncharges`
# is the name the save cell stores.
vitd_ncharges = ncharges_list

In [49]:
# One vector per vitd amon: the array returned by get_soap averaged over axis 0.
vitd_reps = [get_soap(sdf_block).mean(axis=0) for sdf_block in vitd_amons_sdfs]

In [50]:
# Stack the per-molecule vectors into a single NumPy array.
vitd_reps = np.asarray(vitd_reps)

In [51]:
# Label each vitd amon by its file name without directory or extension.
# os.path handles the platform's path separator, and splitext removes only the
# final extension, so a name that merely contains ".sdf" is not truncated.
vitd_amons_labels = [os.path.splitext(os.path.basename(t))[0] for t in vitd_amons_files]

In [35]:
# Save the labels, nuclear charges and SOAP representations of all three amon sets to one .npz file

In [52]:
def _as_object_array(seqs):
    """Pack variable-length sequences into a 1-D object array (one entry per molecule)."""
    packed = np.empty(len(seqs), dtype=object)
    for i, seq in enumerate(seqs):
        packed[i] = list(seq)
    return packed


# The per-molecule nuclear-charge lists differ in length. Letting np.savez
# convert such ragged sequences implicitly emits a VisibleDeprecationWarning on
# older NumPy and raises ValueError on NumPy >= 1.24, so they are stored
# explicitly as object arrays. Object arrays are pickled inside the archive:
# read them back with np.load(..., allow_pickle=True).
np.savez("../representations/amons_SOAP_global_data.npz",
         vitd_amons_labels=vitd_amons_labels,
         vitc_amons_labels=vitc_amons_labels,
         qm9_amons_labels=qm9_amons_labels,
         vitd_amons_ncharges=_as_object_array(vitd_ncharges),
         vitc_amons_ncharges=_as_object_array(vitc_ncharges),
         qm9_amons_ncharges=_as_object_array(qm9_ncharges),
         vitd_amons_reps=vitd_reps,
         vitc_amons_reps=vitc_reps,
         qm9_amons_reps=qm9_reps)

  return array(a, dtype, copy=False, order=order, subok=True)


In [53]:
# Shape of the first vitd representation vector.
np.shape(vitd_reps[0])

(4752,)