In [1]:
from glob import glob
import numpy as np

In [2]:
from rdkit import Chem

In [3]:
# Paths of the XYZ files in ../targets, put into lexicographic order so the
# listing does not depend on filesystem enumeration order.
target_xyzs = glob("../targets/*.xyz")
target_xyzs.sort()

In [4]:
def read_sdf(sdf):
    """Return the full text of the file at path ``sdf`` with trailing whitespace removed.

    Leading whitespace and interior blank lines are kept; only whitespace
    (including newlines) at the very end of the file is stripped.
    """
    with open(sdf, "r") as handle:
        return handle.read().rstrip()

In [5]:
def get_ncharges_coords(sdf):
    """Extract atomic numbers and 3D coordinates from mol-block text.

    Parameters
    ----------
    sdf : str
        Mol-block text, i.e. the *contents* of an SDF/MOL file rather than a
        path to it.

    Returns
    -------
    ncharges : list of int
        Atomic number of each atom of the parsed molecule, in RDKit atom order.
    coords : numpy.ndarray
        Atom positions taken from the molecule's default conformer.

    Raises
    ------
    ValueError
        If RDKit cannot parse the text (``Chem.MolFromMolBlock`` returns
        ``None`` in that case).

    Notes
    -----
    The text is parsed with RDKit's default options.
    NOTE(review): the outputs shown in this notebook contain no hydrogens, so
    explicit H atoms appear to be dropped on parsing -- confirm this is intended.
    """
    mol = Chem.MolFromMolBlock(sdf)
    if mol is None:
        # Fail loudly here instead of with an opaque AttributeError on None below.
        raise ValueError("RDKit could not parse the supplied mol block")

    ncharges = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    coords = np.asarray(mol.GetConformer().GetPositions())
    return ncharges, coords

In [6]:
# Paths of the SDF target structures, in lexicographic order so every
# per-target list built from them keeps a stable ordering.
target_files = glob("../targets/*.sdf")
target_files.sort()
target_files

['../targets/qm9.sdf', '../targets/vitc.sdf', '../targets/vitd.sdf']

In [7]:
# Raw text of each target file, in the same order as target_files.
target_sdfs = list(map(read_sdf, target_files))

In [8]:
# One (atomic numbers, coordinates) pair per target, parsed from its text.
conf_data = list(map(get_ncharges_coords, target_sdfs))

In [9]:
# Transpose the list of (ncharges, coords) pairs into two parallel tuples,
# each holding one entry per target.
ncharges_list, coords_list = zip(*conf_data)

In [10]:
# Quick look: atomic numbers parsed for the first target.
ncharges_list[0]

[8, 6, 6, 7, 6, 8, 8, 7, 6]

In [11]:
# Quick look: coordinate array parsed for the first target.
coords_list[0]

array([[-0.8878,  1.2014,  0.1382],
       [ 0.2284,  0.8375, -0.1253],
       [ 1.4153,  1.7609, -0.033 ],
       [ 2.569 ,  1.2735, -0.3387],
       [ 3.6767,  2.1255, -0.2555],
       [ 4.7965,  1.8088, -0.5068],
       [ 3.4544,  3.4527,  0.1558],
       [ 2.2146,  3.9513,  0.4772],
       [ 1.2283,  3.13  ,  0.385 ]])

In [12]:
# Number of parsed atoms in each target; passed below as the `size` argument
# when generating the Coulomb-matrix representations.
sizes = list(map(len, ncharges_list))
sizes

[9, 12, 28]

In [13]:
import qml

In [19]:
# Coulomb-matrix representation of every target, each generated with `size`
# equal to that molecule's own atom count.
_cm_vectors = [
    np.asarray(
        qml.representations.generate_coulomb_matrix(
            np.asarray(charges), np.asarray(xyz), size=n_atoms
        )
    )
    for charges, xyz, n_atoms in zip(ncharges_list, coords_list, sizes)
]

# The targets have different atom counts, so the vectors differ in length.
# Building a plain ndarray from such ragged input is deprecated (and an error
# in recent NumPy), so store them explicitly in a 1-D object array with one
# vector per molecule.
target_reps = np.empty(len(_cm_vectors), dtype=object)
for _idx, _vec in enumerate(_cm_vectors):
    target_reps[_idx] = _vec

  after removing the cwd from sys.path.


In [20]:
# Label each target by its file stem (e.g. "../targets/qm9.sdf" -> "qm9").
# The labels must come from the file *paths* (target_files), not from the file
# contents (target_sdfs), and the extension to strip is ".sdf".
target_labels = [t.split("/")[-1].split(".sdf")[0] for t in target_files]

In [21]:
# Persist labels, representations and atomic numbers in a single archive.
# The targets have different atom counts, so the atomic-number lists are
# ragged; pack them into an explicit 1-D object array (one list per molecule)
# because recent NumPy refuses to build an array from ragged sequences
# implicitly. Object arrays are pickled inside the archive, so reading them
# back requires np.load(..., allow_pickle=True).
_ncharges_packed = np.empty(len(ncharges_list), dtype=object)
for _idx, _charges in enumerate(ncharges_list):
    _ncharges_packed[_idx] = list(_charges)

np.savez(
    "../representations/target_CM_global_data.npz",
    target_labels=target_labels,
    target_reps=target_reps,
    target_ncharges=_ncharges_packed,
)

In [22]:
# Sanity check: shape of the representation stored for the first target.
target_reps[0].shape

(45,)