In [18]:
from glob import glob
from pathlib import Path

import numpy as np

In [19]:
# Element symbol -> nuclear charge (atomic number) for the elements handled here.
NUCLEAR_CHARGE = dict(H=1, C=6, O=8, N=7, F=9, Cl=17, S=16)

In [20]:
def read_xyz(filename):
    """Read the non-hydrogen atoms of an XYZ file.

    The first line is parsed as the atom count; atom records are taken from
    the third line onwards (at most that many records). Hydrogen records are
    skipped, so the returned lists describe heavy atoms only.

    Parameters
    ----------
    filename : str
        Path of the XYZ file.

    Returns
    -------
    tuple(list of int, list of list of float)
        Nuclear charges (looked up in ``NUCLEAR_CHARGE``) and the matching
        ``[x, y, z]`` coordinates, in file order.

    Raises
    ------
    KeyError
        If a non-hydrogen element symbol is missing from ``NUCLEAR_CHARGE``.
    """
    with open(filename, "r") as handle:
        content = handle.readlines()

    n_atoms = int(content[0])
    heavy_charges, heavy_coords = [], []

    for record in content[2:2 + n_atoms]:
        fields = record.split()

        # A record with fewer than four fields ends parsing of this file.
        if len(fields) < 4:
            break

        symbol, xyz = fields[0], fields[1:4]
        if symbol == 'H':
            continue

        heavy_charges.append(NUCLEAR_CHARGE[symbol])
        heavy_coords.append(list(map(float, xyz)))

    return heavy_charges, heavy_coords

In [21]:
def cutoff_func(R_ij, central_cutoff=4.8, central_decay=1):
    """Smooth cosine cutoff applied to an interatomic distance.

    The weight is 1 up to ``central_cutoff - central_decay``, falls smoothly
    to 0 at ``central_cutoff`` following
    ``0.5 * (1 + cos(pi * (R - central_cutoff + central_decay) / central_decay))``
    and is 0 beyond ``central_cutoff``.

    Parameters
    ----------
    R_ij : float
        Distance between two atoms.
    central_cutoff : float
        Distance at which the weight reaches zero.
    central_decay : float
        Width of the region over which the weight decays from 1 to 0.
        With ``central_decay=0`` the function is a hard step at
        ``central_cutoff``.

    Returns
    -------
    float
        Weight in ``[0, 1]``.
    """
    inner_radius = central_cutoff - central_decay
    if R_ij <= inner_radius:
        return 1.
    if R_ij <= central_cutoff:
        # pi must multiply the whole offset, not just R_ij: the phase runs from
        # 0 at the inner radius to pi at the cutoff, so the weight is continuous
        # at both ends of the decay region.
        phase = np.pi * (R_ij - inner_radius) / central_decay
        return 0.5 * (1. + np.cos(phase))
    return 0.

In [22]:
def get_atomic_CM(ncharges, coords, max_natoms, central_cutoff=4.8, central_decay=1):
    """Build the sorted, zero-padded atomic Coulomb matrix vector of every atom.

    For each central atom ``k`` a matrix ``M`` is assembled with

        M[i, i] = 0.5 * Z_i**2.4 * f_ik**2
        M[i, j] = (Z_i * Z_j / R_ij) * f_ik * f_jk * f_ij        (i != j)

    where ``f_ab = cutoff_func(R_ab, central_cutoff, central_decay)``. The
    upper triangle of ``M`` (diagonal included) is flattened, sorted in
    descending order and right-padded with zeros.

    Parameters
    ----------
    ncharges : sequence of numbers
        Nuclear charge of each atom.
    coords : sequence of 3-vectors
        Cartesian coordinates, one entry per atom.
    max_natoms : int
        Atom count used to size the output; must be at least ``len(ncharges)``.
    central_cutoff, central_decay : float
        Forwarded to ``cutoff_func``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(ncharges), max_natoms * (max_natoms + 1) / 2)``;
        row ``k`` is the vector for central atom ``k``.

    Raises
    ------
    ValueError
        If the molecule has more atoms than ``max_natoms``.
    """
    ncharges = np.asarray(ncharges)
    coords = np.asarray(coords, dtype=float)
    n_atoms = len(ncharges)

    # Fail early with a readable message instead of a "negative dimensions"
    # error raised while padding.
    if n_atoms > max_natoms:
        raise ValueError(
            "molecule has %d atoms but max_natoms=%d" % (n_atoms, max_natoms)
        )

    size = int(max_natoms * (max_natoms + 1) // 2)
    rep = np.zeros((n_atoms, size))

    # Pair distances and their cutoff weights do not depend on the central
    # atom, so compute them once instead of once per central atom.
    dist = np.zeros((n_atoms, n_atoms))
    fcut = np.zeros((n_atoms, n_atoms))
    for a in range(n_atoms):
        for b in range(a, n_atoms):
            dist[a, b] = dist[b, a] = np.linalg.norm(coords[a] - coords[b])
            fcut[a, b] = fcut[b, a] = cutoff_func(dist[a, b],
                                                  central_cutoff=central_cutoff,
                                                  central_decay=central_decay)

    upper_idx = np.triu_indices(n_atoms)

    # central atom loop
    for k in range(n_atoms):
        f_k = fcut[k]  # f_k[i] is the cutoff weight between atom i and atom k
        M = np.zeros((n_atoms, n_atoms))
        for i in range(n_atoms):
            M[i, i] = 0.5 * ncharges[i] ** 2.4 * f_k[i] ** 2
            # Only the upper triangle is read below, so the mirror entries
            # are not filled in.
            for j in range(i + 1, n_atoms):
                M[i, j] = (ncharges[i] * ncharges[j] / dist[i, j]) * f_k[i] * f_k[j] * fcut[i, j]

        # Flatten diagonal + upper triangle, largest entries first; the rest
        # of the row keeps its zero padding.
        upper = M[upper_idx]
        rep[k, :upper.size] = np.sort(upper)[::-1]

    return rep

In [23]:
# Collect the QM7 geometry files in a deterministic (lexicographic) order.
qm7_files = glob("../qm7-xyz/*.xyz")
qm7_files.sort()

In [24]:
# Parse every file into a (nuclear_charges, coordinates) pair.
conf_data = list(map(read_xyz, qm7_files))

In [25]:
# Transpose [(charges, coords), ...] into two parallel tuples.
ncharges_list, coords_list = tuple(zip(*conf_data))

In [27]:
# One representation array per molecule, padded for up to 23 atoms.
qm7_reps = [
    np.array(get_atomic_CM(np.array(mol_charges), np.array(mol_coords), max_natoms=23))
    for mol_charges, mol_coords in zip(ncharges_list, coords_list)
]

In [28]:
# The per-molecule arrays have different numbers of rows, so they cannot form a
# rectangular array. Implicit ragged conversion via np.array(...) only warns on
# older NumPy and raises ValueError on NumPy >= 1.24, so build the 1-D object
# array explicitly, one element per molecule.
_reps_per_molecule = list(qm7_reps)
qm7_reps = np.empty(len(_reps_per_molecule), dtype=object)
for _mol_idx, _mol_rep in enumerate(_reps_per_molecule):
    qm7_reps[_mol_idx] = _mol_rep

  """Entry point for launching an IPython kernel.


In [29]:
# Shape of the first molecule's representation: (atoms kept, padded vector length).
np.shape(qm7_reps[0])

(1, 276)

In [16]:
# Molecule label = file name without directory or extension. The inputs are
# *.xyz files, so stripping ".sdf" left the ".xyz" suffix on every label.
qm7_labels = [Path(xyz_path).stem for xyz_path in qm7_files]

In [35]:
# Save the labels, nuclear charges and representations to a NumPy .npz archive.

In [30]:
# np.savez converts each value to an array. The per-molecule charge lists can
# have different lengths, and NumPy >= 1.24 refuses to convert such a ragged
# sequence implicitly. Build the 1-D object array explicitly, keeping one
# Python list per molecule.
qm7_ncharges = np.empty(len(ncharges_list), dtype=object)
for _mol_idx, _mol_charges in enumerate(ncharges_list):
    qm7_ncharges[_mol_idx] = list(_mol_charges)

# Object arrays are stored pickled: read them back with
# np.load(..., allow_pickle=True).
np.savez("../representations/qm7_aCM_data.npz",
         qm7_labels=qm7_labels,
         qm7_ncharges=qm7_ncharges,
         qm7_reps=qm7_reps)