In [2]:
import qml 

In [3]:
import os
from glob import glob

import numpy as np

In [4]:
# Lookup table: element symbol -> nuclear charge used when parsing XYZ files.
NUCLEAR_CHARGE = dict(H=1, C=6, O=8, N=7, F=9, Cl=17, S=16)

In [5]:
def read_xyz(filename, skip_hydrogens=True):
    """Read nuclear charges and Cartesian coordinates from an XYZ file.

    The first line of the file must contain the atom count and the second
    line is ignored. Each of the following ``natoms`` lines is parsed as
    ``<element symbol> <x> <y> <z>``; any tokens after the fourth are ignored.
    Parsing stops at the first line that has fewer than four
    whitespace-separated tokens.

    Parameters
    ----------
    filename : str
        Path of the XYZ file to read.
    skip_hydrogens : bool, optional
        When True (the default, matching the original behaviour) atoms whose
        symbol is ``"H"`` are left out of both returned lists.

    Returns
    -------
    nuclear_charges : list of int
        Nuclear charge of every retained atom, looked up in ``NUCLEAR_CHARGE``.
    coordinates : list of list of float
        ``[x, y, z]`` for every retained atom, in file order.

    Raises
    ------
    KeyError
        If a retained atom has a symbol that is missing from ``NUCLEAR_CHARGE``.
    ValueError
        If the atom count or a coordinate cannot be converted to a number.
    """
    with open(filename, "r") as f:
        lines = f.readlines()

    natoms = int(lines[0])
    nuclear_charges = []
    coordinates = []

    # Atom records start on the third line (index 2), after count and comment.
    for line in lines[2:natoms + 2]:
        tokens = line.split()

        # A short line ends the usable atom records; stop reading there.
        if len(tokens) < 4:
            break

        symbol = tokens[0]
        if skip_hydrogens and symbol == "H":
            continue

        nuclear_charges.append(NUCLEAR_CHARGE[symbol])
        coordinates.append([float(token) for token in tokens[1:4]])

    return nuclear_charges, coordinates

In [6]:
# Collect every QM7 .xyz file and order the paths lexicographically.
qm7_files = glob("../qm7/*.xyz")
qm7_files.sort()

In [7]:
# Parse every file into a (nuclear_charges, coordinates) pair, in file order.
conf_data = list(map(read_xyz, qm7_files))

In [8]:
# Transpose the list of (charges, coordinates) pairs into two parallel tuples,
# one entry per molecule. Unpacking fails if conf_data is empty.
ncharges_list, coords_list = zip(*conf_data)

In [9]:
# Sanity check: number of molecules for which nuclear charges were parsed.
len(ncharges_list)

7165

In [10]:
# Sanity check: number of coordinate sets; should match len(ncharges_list).
len(coords_list)

7165

In [11]:
# Nuclear charges passed to the SLATM many-body type generation: C, N, O and S
# per NUCLEAR_CHARGE. Hydrogen (1) is absent, consistent with read_xyz
# dropping "H" atoms.
qm7_ncharges = [6, 7, 8, 16]

In [12]:
# Ask QML for the SLATM many-body types built from these nuclear charges.
# NOTE(review): the charge list is wrapped in an outer list; presumably the
# function expects one charge list per compound — confirm against the QML docs.
mbtypes = qml.representations.get_slatm_mbtypes([qm7_ncharges])

In [13]:
# Display the generated one-, two- and three-element many-body types.
mbtypes

[[8],
 [16],
 [6],
 [7],
 [8, 8],
 [16, 16],
 [6, 6],
 [7, 7],
 (8, 16),
 (8, 6),
 (8, 7),
 (16, 6),
 (16, 7),
 (6, 7),
 [8, 16, 6],
 [8, 6, 16],
 [16, 8, 6],
 [8, 16, 7],
 [8, 7, 16],
 [16, 8, 7],
 [8, 6, 7],
 [8, 7, 6],
 [6, 8, 7],
 [16, 6, 7],
 [16, 7, 6],
 [6, 16, 7]]

In [25]:
# Build one SLATM vector per molecule (local=False), pairing each coordinate
# set with the nuclear charges parsed from the same file.
qm7_reps = [
    np.array(
        qml.representations.generate_slatm(coords, charges, mbtypes, local=False)
    )
    for coords, charges in zip(coords_list, ncharges_list)
]

In [26]:
# Convert the list of per-molecule vectors into a single NumPy array.
# Note that this rebinds the name qm7_reps from a list to an array.
qm7_reps = np.array(qm7_reps)

In [27]:
# Inspect the shape of the representation of the first molecule.
qm7_reps[0].shape

(3110,)

In [14]:
# Label each molecule with its file name minus directory and extension
# (e.g. "../qm7/qm7_0.xyz" -> "qm7_0"). os.path is used instead of splitting
# on "/" so that platform-specific path separators are handled as well.
qm7_labels = [os.path.splitext(os.path.basename(t))[0] for t in qm7_files]

In [15]:
# Display the labels; they follow the lexicographic order of qm7_files
# (qm7_0, qm7_1, qm7_10, ...), not numeric order.
# NOTE(review): this prints the whole list; consider qm7_labels[:5] instead.
qm7_labels

['qm7_0',
 'qm7_1',
 'qm7_10',
 'qm7_100',
 'qm7_1000',
 'qm7_1001',
 'qm7_1002',
 'qm7_1003',
 'qm7_1004',
 'qm7_1005',
 'qm7_1006',
 'qm7_1007',
 'qm7_1008',
 'qm7_1009',
 'qm7_101',
 'qm7_1010',
 'qm7_1011',
 'qm7_1012',
 'qm7_1013',
 'qm7_1014',
 'qm7_1015',
 'qm7_1016',
 'qm7_1017',
 'qm7_1018',
 'qm7_1019',
 'qm7_102',
 'qm7_1020',
 'qm7_1021',
 'qm7_1022',
 'qm7_1023',
 'qm7_1024',
 'qm7_1025',
 'qm7_1026',
 'qm7_1027',
 'qm7_1028',
 'qm7_1029',
 'qm7_103',
 'qm7_1030',
 'qm7_1031',
 'qm7_1032',
 'qm7_1033',
 'qm7_1034',
 'qm7_1035',
 'qm7_1036',
 'qm7_1037',
 'qm7_1038',
 'qm7_1039',
 'qm7_104',
 'qm7_1040',
 'qm7_1041',
 'qm7_1042',
 'qm7_1043',
 'qm7_1044',
 'qm7_1045',
 'qm7_1046',
 'qm7_1047',
 'qm7_1048',
 'qm7_1049',
 'qm7_105',
 'qm7_1050',
 'qm7_1051',
 'qm7_1052',
 'qm7_1053',
 'qm7_1054',
 'qm7_1055',
 'qm7_1056',
 'qm7_1057',
 'qm7_1058',
 'qm7_1059',
 'qm7_106',
 'qm7_1060',
 'qm7_1061',
 'qm7_1062',
 'qm7_1063',
 'qm7_1064',
 'qm7_1065',
 'qm7_1066',
 'qm7_1067',
 

In [29]:
# Save the labels, nuclear charges and SLATM representations to an .npz archive

In [30]:
# Persist labels, per-molecule nuclear charges and SLATM vectors in one archive.
# The per-molecule charge lists can differ in length, so they are converted to
# an explicit object array: recent NumPy releases raise an error instead of
# implicitly building an array from ragged nested sequences. Reading this entry
# back requires np.load(..., allow_pickle=True).
np.savez("../representations/qm7_SLATM_global_data.npz",
         qm7_labels=qm7_labels,
         qm7_ncharges=np.array(ncharges_list, dtype=object),
         qm7_reps=qm7_reps
        )