In [1]:
import qml 

In [2]:
from glob import glob
import numpy as np

In [3]:
from rdkit import Chem

In [4]:
def read_sdf(sdf):
    with open(sdf, "r") as f:
        txt = f.read().rstrip()
    return txt

In [5]:
def get_CM(sdf):
    mol = Chem.MolFromMolBlock(sdf)
    # rdkit molobj
    ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    conf = mol.GetConformer()
    coords = np.asarray(conf.GetPositions())
    CM = np.zeros((len(coords), len(coords)))
    for i in range(len(coords)):
        for j in range(len(coords)):
            if i==j:
                CM[i,j] = 0.5 * ncharges[i]**2.4
            else:
                CM[i,j] = ncharges[i] * ncharges[j] / np.linalg.norm(coords[j] - coords[i])
                
    return ncharges, CM

In [6]:
target_sdfs = sorted(glob("targets/*.sdf"))
target_sdfs

['targets/qm9.sdf', 'targets/vitc.sdf', 'targets/vitd.sdf']

In [7]:
qm9_amons_files = sorted(glob("amons-qm9/*.sdf"))

In [8]:
qm9_amons_sdfs = [read_sdf(x) for x in qm9_amons_files]

In [9]:
qm9_amons_ncharges = []
qm9_amons_CMs = []
for sdf in qm9_amons_sdfs:
    ncharge, CM = get_CM(sdf)
    qm9_amons_ncharges.append(ncharge)
    qm9_amons_CMs.append(CM)

In [10]:
qm9_amons_labels = [t.split("/")[-1].split(".sdf")[0] for t in qm9_amons_files]

In [11]:
vitc_amons_files = sorted(glob("amons-vitc/*.sdf"))

In [12]:
vitc_amons_sdfs = [read_sdf(x) for x in vitc_amons_files]

In [13]:
vitc_amons_ncharges = []
vitc_amons_CMs = []
for sdf in vitc_amons_sdfs:
    ncharge, CM = get_CM(sdf)
    vitc_amons_ncharges.append(ncharge)
    vitc_amons_CMs.append(CM)

In [14]:
vitc_amons_labels = [t.split("/")[-1].split(".sdf")[0] for t in vitc_amons_files]

In [15]:
vitd_amons_files = sorted(glob("amons-vitd/*.sdf"))

In [16]:
vitd_amons_sdfs = [read_sdf(x) for x in vitd_amons_files]

In [17]:
vitd_amons_ncharges = []
vitd_amons_CMs = []
for sdf in vitd_amons_sdfs:
    ncharge, CM = get_CM(sdf)
    vitd_amons_ncharges.append(ncharge)
    vitd_amons_CMs.append(CM)

In [18]:
vitd_amons_labels = [t.split("/")[-1].split(".sdf")[0] for t in vitd_amons_files]

In [19]:
# np save 

In [20]:
np.savez("amons_data.npz", 
         vitd_amons_labels=vitd_amons_labels,
         vitc_amons_labels=vitc_amons_labels,
         qm9_amons_labels=qm9_amons_labels,
         vitd_amons_ncharges=vitd_amons_ncharges,
         vitc_amons_ncharges=vitc_amons_ncharges,
         qm9_amons_ncharges=qm9_amons_ncharges,
         vitd_amons_CMs=vitd_amons_CMs,
         vitc_amons_CMs=vitc_amons_CMs,
         qm9_amons_CMs=qm9_amons_CMs)

  return array(a, dtype, copy=False, order=order, subok=True)


In [21]:
x = np.load("amons_data.npz", allow_pickle=True)

In [22]:
x.files

['vitd_amons_labels',
 'vitc_amons_labels',
 'qm9_amons_labels',
 'vitd_amons_ncharges',
 'vitc_amons_ncharges',
 'qm9_amons_ncharges',
 'vitd_amons_CMs',
 'vitc_amons_CMs',
 'qm9_amons_CMs']