In [23]:
from dscribe.descriptors import SOAP

In [24]:
import os
from glob import glob

import numpy as np

In [25]:
def read_xyz(filename, exclude=("H",)):
    """Read atom labels and Cartesian coordinates from an XYZ file.

    The first line must hold the atom count, the second line is skipped,
    and the following ``natoms`` lines are parsed as ``label x y z``.
    Parsing stops silently at the first record that has fewer than four
    whitespace-separated tokens.

    Parameters
    ----------
    filename : str or path-like
        Path of the XYZ file to read.
    exclude : str or iterable of str, optional
        Atom labels to leave out of the result.  Defaults to ``("H",)``,
        i.e. hydrogens are dropped.  Pass ``()`` to keep every atom.

    Returns
    -------
    nuclear_charges : list of str
        First-column labels of the retained atoms, exactly as written in
        the file (they are not converted to numbers).
    coordinates : list of list of float
        ``[x, y, z]`` for each retained atom, in file order.

    Raises
    ------
    IndexError
        If the file is empty.
    ValueError
        If the first line is not an integer or a coordinate is not a float.
    """
    with open(filename, "r") as f:
        lines = f.readlines()

    natoms = int(lines[0])

    # A bare string would otherwise be treated as an iterable of characters.
    if isinstance(exclude, str):
        exclude = (exclude,)
    excluded = frozenset(exclude)

    nuclear_charges = []
    coordinates = []

    # Line 0 is the atom count and line 1 is skipped; atom records follow.
    for line in lines[2:natoms + 2]:
        tokens = line.split()

        # Stop at the first short/blank record instead of raising.
        if len(tokens) < 4:
            break

        if tokens[0] in excluded:
            continue

        nuclear_charges.append(tokens[0])
        coordinates.append([float(token) for token in tokens[1:4]])

    return nuclear_charges, coordinates

In [26]:
import ase

In [27]:
def get_soap(atomtypes, coords, elements=(6, 7, 8, 16)):
    """Compute SOAP descriptors for one structure.

    Parameters
    ----------
    atomtypes : sequence
        Atom symbols, forwarded to ``ase.Atoms(symbols=...)``.
    coords : sequence
        Atom positions, forwarded to ``ase.Atoms(positions=...)``.
    elements : iterable, optional
        Species the descriptor is built for.  The default covers atomic
        numbers 6, 7, 8 and 16 (C, N, O, S).

    Returns
    -------
    The value returned by ``SOAP.create`` for the structure.  The caller in
    this notebook averages it along axis 0 to get one vector per molecule.
    """
    atomsobj = ase.Atoms(symbols=atomtypes, positions=coords)
    soap = SOAP(
        # Hand the descriptor its own list so the (immutable) default and any
        # caller-owned sequence can never be modified through it.
        species=list(elements),
        # NOTE(review): these keyword names (rcut/nmax/lmax/crossover) look
        # like the pre-2.0 DScribe API -- confirm against the installed version.
        rcut=5.0,
        nmax=8,
        lmax=8,
        sigma=0.2,
        periodic=False,
        crossover=True,
        sparse=False,
    )
    return soap.create(atomsobj)

In [28]:
# Gather every XYZ file in the QM7 directory; sorting fixes the processing
# order so labels and representations line up between runs.
qm7_files = glob("../qm7-xyz/*.xyz")
qm7_files.sort()

In [29]:
# Parse each file into a (labels, coordinates) pair, keeping the order of qm7_files.
conf_data = list(map(read_xyz, qm7_files))

In [30]:
# Transpose the list of (labels, coordinates) pairs into two parallel tuples
# with one entry per file in qm7_files.
ncharges_list, coordinates_list = zip(*conf_data)

In [31]:
# One representation per molecule: the SOAP output averaged along axis 0.
qm7_reps = [
    np.mean(get_soap(symbols, positions), axis=0)
    for symbols, positions in zip(ncharges_list, coordinates_list)
]

In [32]:
# Convert the list of per-molecule vectors into a single ndarray.
# Note: this rebinds qm7_reps from a list to an array.
qm7_reps = np.array(qm7_reps)

In [33]:
# Sanity check: shape of the first molecule's averaged representation.
qm7_reps[0].shape

(4752,)

In [38]:
# Label each molecule by its file name without directory or extension.
# os.path handles the platform's path separator (a literal "/" split does not)
# and strips only the final extension.
qm7_labels = [os.path.splitext(os.path.basename(t))[0] for t in qm7_files]

In [39]:
# Save the labels, per-molecule atom lists and SOAP representations to an .npz archive

In [40]:
# Each entry of ncharges_list has one label per retained atom, so entries can
# differ in length.  NumPy >= 1.24 refuses to convert such a ragged sequence
# implicitly, so build the object array explicitly.  Object arrays are stored
# pickled: read the archive back with np.load(..., allow_pickle=True).
np.savez("../representations/qm7_SOAP_global_data.npz",
         qm7_labels=qm7_labels,
         qm7_ncharges=np.array(ncharges_list, dtype=object),
         qm7_reps=qm7_reps)