In [1]:
import os

import numpy as np
import pandas as pd
import qml

In [2]:
from glob import glob

In [3]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort so the
# molecule order (and the row order of every array built from it) is
# reproducible across machines and runs.
qm7_xyz = sorted(glob('../qm7/*.xyz'))

In [4]:
# Build one qml.Compound per .xyz path, keeping the order of qm7_xyz so that
# molecule i always corresponds to qm7_xyz[i].
qm7_mols = [qml.Compound(xyz_path) for xyz_path in qm7_xyz]

In [29]:
# Sorted, de-duplicated nuclear charges occurring anywhere in the dataset:
# pool every molecule's nuclear_charges into one array, then keep the unique
# values.
possible_ncharges = np.unique(
    np.concatenate([compound.nuclear_charges for compound in qm7_mols])
)

In [30]:
# Turn the array of unique charges into a plain list so that list.index()
# can be used for column lookups.
possible_ncharges = list(possible_ncharges)

In [31]:
# Composition matrix: entry [row, col] counts the atoms in molecule
# qm7_mols[row] whose nuclear charge equals possible_ncharges[col].
nuclear_charges_list = np.zeros((len(qm7_mols), len(possible_ncharges)))
for row, compound in enumerate(qm7_mols):
    for charge in compound.nuclear_charges:
        # Column position of this element within possible_ncharges.
        nuclear_charges_list[row, possible_ncharges.index(charge)] += 1

In [22]:
# Load the energies table; the first CSV column becomes the row index.
# The 'file' and 'energy / Ha' columns of this table are read further down.
qm7_props = pd.read_csv(
    "../qm7/energies_qm7.csv",
    index_col=0,
)

In [23]:
# Molecule label = file name without its directory or extension. os.path
# honours the platform's path separator and strips only the final suffix,
# unlike splitting on a literal '/' and on the first '.xyz' substring.
qm7_labels = [os.path.splitext(os.path.basename(path))[0] for path in qm7_xyz]

In [24]:
# Fetch each molecule's energy with one indexed lookup, in qm7_labels order.
# This replaces float(<one-element Series>), which pandas has deprecated, and
# avoids re-filtering the whole table once per label.
# A label missing from the 'file' column raises KeyError.
qm7_energy = (
    qm7_props.set_index('file')['energy / Ha']
    .loc[qm7_labels]
    .to_numpy(dtype=float)
)
# A duplicated 'file' entry would yield extra rows; fail loudly rather than
# silently misaligning energies with molecules.
if qm7_energy.shape[0] != len(qm7_labels):
    raise ValueError(
        "energies table has duplicate 'file' entries for the loaded molecules"
    )

In [25]:
from sklearn.linear_model import LinearRegression

In [32]:
# Ordinary least squares with default settings: per-molecule element counts
# are the features and the molecular energies are the target.
reg = LinearRegression().fit(X=nuclear_charges_list, y=qm7_energy)

In [33]:
# Fitted coefficients, one per column of nuclear_charges_list
# (i.e. one per entry of possible_ncharges, in the same order).
reg.coef_

array([  -0.61002842,  -38.01906467,  -54.64773278,  -75.09076869,
       -397.94238855])

In [34]:
# Score evaluated on the same data the model was fitted to (no held-out
# split), so this is a goodness-of-fit check, not a generalization estimate.
reg.score(nuclear_charges_list, qm7_energy)

0.9999998316921461

In [35]:
# Nuclear charges in column order of nuclear_charges_list.
possible_ncharges

[1, 6, 7, 8, 16]

In [36]:
# Map nuclear charge -> fitted coefficient using built-in int/float instead of
# NumPy scalars, so the mapping (and the pickle written from it) can be read
# without NumPy being importable.
# NOTE(review): LinearRegression fits an intercept by default and
# reg.intercept_ is not stored here; confirm downstream users don't need it.
elementwise_E_N = {
    int(charge): float(coeff)
    for charge, coeff in zip(possible_ncharges, reg.coef_)
}

In [37]:
# Inspect the {nuclear charge: coefficient} mapping before saving it.
elementwise_E_N

{1: -0.6100284233535662,
 6: -38.01906466911168,
 7: -54.647732784759285,
 8: -75.0907686869464,
 16: -397.9423885459585}

In [38]:
import pickle

In [41]:
# Save the {nuclear charge: coefficient} mapping one directory up so other
# code can reuse it; the context manager closes the file after writing.
with open('../atom_energy_coeffs.pickle', 'wb') as handle:
    pickle.dump(elementwise_E_N, handle)