## ElemNet: A formation energy prediction tool from elemental composition

### A formation energy prediction tool that uses a 17-layer deep neural network and achieves a mean absolute error (MAE) of 0.042 on the Open Quantum Materials Database (OQMD).
### Input: Takes a 2D numpy array in which each row represents a compound and each column holds the fractional composition of one of the 86 elements in the set ['H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu']. The elemental composition must not contain any element from ['He', 'Ne', 'Ar', 'Po', 'At', 'Rn', 'Fr', 'Ra'].
### Output: Returns a 1D numpy array with the predicted formation energy

In [1]:
import tensorflow as tf
import numpy as np
import time, os, re
from collections import OrderedDict, defaultdict

In [2]:
# Element symbols accepted as model input (86 entries). The position of a
# symbol in this list is the column it occupies in the composition matrix:
# the matrix-building cell looks columns up with elements.index(symbol).
elements = ['H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Sc', 'Ti', 'V', 
            'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 
            'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 
            'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 
            'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu']

In [3]:
# One formula token: an element-like symbol (a capital letter followed by any
# lowercase letters) and an optional run of digits as its subscript.
formulare = re.compile(r'([A-Z][a-z]*)(\d*)')
def parse_formula(formula):
    """Parse a flat chemical formula string into element -> count.

    Only plain ``SymbolCount`` sequences are understood (e.g. ``'H2SO4'``);
    parentheses, decimal subscripts, charges and leading coefficients are
    rejected. Symbols are not checked against any element table here.

    Args:
        formula: formula string such as ``'H2O'``.

    Returns:
        A ``defaultdict(int)`` mapping each symbol to its total count.
        A repeated symbol is accumulated (``'HOH'`` gives H -> 2). An explicit
        subscript contributes a float, an implicit one contributes the int 1.

    Raises:
        AssertionError: if any character of ``formula`` is not covered by
            the symbol/subscript tokens.
    """
    pairs = formulare.findall(formula)
    matched = sum(len(symbol) + len(count) for symbol, count in pairs)
    if matched != len(formula):
        # Raised explicitly rather than via `assert` so the check is still
        # performed under `python -O`; the exception type is kept unchanged.
        raise AssertionError('cannot parse formula %r' % (formula,))
    formula_dict = defaultdict(int)
    for el, sub in pairs:
        formula_dict[el] += float(sub) if sub else 1
    return formula_dict

In [4]:
# Example formula strings used to demonstrate building the model input.
formulas = ['H2O','NaCl', 'H2SO4']

In [5]:
# Replace every formula string by its parsed element -> count mapping.
# NOTE: `formulas` is rebound from strings to mappings here, so this cell
# cannot be re-run without first re-running the cell that defines the strings.
formulas = list(map(parse_formula, formulas))
print(formulas)

[defaultdict(<class 'int'>, {'H': 2.0, 'O': 1}), defaultdict(<class 'int'>, {'Na': 1, 'Cl': 1}), defaultdict(<class 'int'>, {'H': 2.0, 'S': 1, 'O': 4.0})]


In [6]:
# Build the model input: one row per parsed formula, one column per entry of
# `elements`, each cell holding that element's fraction of the formula's total
# atom count. The width is taken from `elements` rather than hard-coded so the
# matrix cannot drift out of step with the element list.
samp_input = np.zeros(shape=(len(formulas), len(elements)), dtype=np.float32)
for i, formula in enumerate(formulas):
    total = float(sum(formula.values()))
    for k, count in formula.items():
        # elements.index raises ValueError for a symbol outside the list.
        samp_input[i][elements.index(k)] = count/total
samp_input[0]

array([0.6666667 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.33333334, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [14]:
from data_utils import load_csv
# Job settings. Only the data paths, input_types and label are consumed in
# this cell; the remaining entries are not read here.
config = {
    "loss_type": "mae",
    "log_file": "sample.log",
    "config_file": "sample/job.config",
    "use_valid": False,
    "test_metric": "mae",
    "test_data_path": "../training-data/test_set.csv",
    "label": "delta_e",
    "project": "ElemNet",
    "input_types": ["elements_tl"],
    "train_data_path": "../training-data/train_set.csv",
    "architecture": "1024x4D-512x3D-256x3D-128x3D-64x2-32x1-1",
    "save_path": "sample/sample_model",
    "dataset": "OQMD",
}

# load_csv (project helper from data_utils) hands back six objects, unpacked
# here as train / validation / test inputs and labels.
splits = load_csv(config['train_data_path'],
                  test_data_path=config['test_data_path'],
                  input_types=config['input_types'],
                  label=config['label'])
X_train, y_train, X_valid, y_valid, X_test, y_test = splits

# Cast every split to float32.
X_train, y_train = X_train.astype('float32'), y_train.astype('float32')
X_valid, y_valid = X_valid.astype('float32'), y_valid.astype('float32')
X_test, y_test = X_test.astype('float32'), y_test.astype('float32')

train data path is  ../training-data/train_set.csv
input attribute sets are:  ['elements_tl']
test data path is  ../training-data/test_set.csv
input attributes are:  ['H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu']
label: delta_e
           energy_pa      volume_pa      magmom_pa        bandgap  \
count  307305.000000  307305.000000  208214.000000  307305.000000   
mean       -5.498033      22.045599       0.419167       0.141143   
std         1.940040       7.951182       0.604534       0.676448   
min       -13.575205       4.149110     

In [8]:
# Network specification consumed by elem_model: '-'-separated segments of the
# form '<units>x<count>' (that many Dense layers of that width); a trailing 'D'
# requests one Dropout layer after the group.
architecture = '1024x4D-512x3D-256x3D-128x3D-64x2-32x1-1'
# Activation used for the hidden Dense layers.
activation = 'relu'
# One value per architecture segment, looked up by segment position.
# NOTE(review): these are passed straight to tf.keras.layers.Dropout, whose
# argument is the fraction of units to DROP. Values of 0.7-0.9 look like keep
# probabilities -- confirm whether 1 - p was intended.
dropouts = [0.8, 0.9, 0.7, 0.8]

In [9]:
def elem_model(architecture=architecture, activation='relu', dropouts=dropouts,inp_shape=(86,)):
    """Build a fully connected Keras regression model from a spec string.

    The spec is split on '-' into segments. A segment containing 'x' has the
    form '<units>x<count>' and adds <count> Dense layers of <units> units; an
    uppercase 'D' after the count (e.g. '1024x4D') adds one Dropout layer
    after that group. Segments without 'x' add no hidden layer. One linear
    Dense(1) output layer is always appended last.

    Args:
        architecture: spec string; must contain at least one '-'.
        activation: activation passed to every hidden Dense layer.
        dropouts: dropout values indexed by segment position; a 'D' segment
            whose position is past the end of this sequence gets no dropout.
        inp_shape: shape of a single input sample.

    Returns:
        The (uncompiled) ``tf.keras.Model`` mapping the input to one output.

    Raises:
        AssertionError: if ``architecture`` contains no '-'.
    """
    if '-' not in architecture:
        # Explicit raise (not `assert`) so the check survives `python -O`;
        # the exception type is unchanged for existing callers.
        raise AssertionError("architecture must contain '-': %r" % (architecture,))
    archs = architecture.strip().split('-')
    inp = x = tf.keras.Input(shape=inp_shape)
    for i, arch in enumerate(archs):
        if 'x' not in arch:
            continue
        parts = arch.split('x')
        num_outputs = int(re.findall(r'\d+', parts[0])[0])
        layers = int(re.findall(r'\d+', parts[1])[0])
        for _ in range(layers):
            print('adding fully connected layers with %d outputs' % num_outputs)
            x = tf.keras.layers.Dense(num_outputs, activation=activation)(x)
        aux_layers_sub = re.findall(r'[A-Z]', parts[1])
        if 'D' in aux_layers_sub and len(dropouts) > i:
            print('adding dropout', dropouts[i])
            # training=True keeps dropout active at inference time as well.
            # NOTE(review): confirm this is intended; it makes predictions
            # stochastic.
            x = tf.keras.layers.Dropout(dropouts[i])(x, training=True)
    # split() never returns an empty list, so the output layer is always added.
    print('adding ouput layer')
    y = tf.keras.layers.Dense(1, activation=None)(x)
    model = tf.keras.Model(inp, y)
    return model


In [10]:
# Default training hyper-parameters.
# The original literal listed 'patience' twice (200, then 1); in a dict
# literal the last occurrence wins, so the effective value was already 1.
# The dead first entry is removed to make that explicit.
# NOTE(review): confirm whether 200 was the intended patience.
hparams = {'batch_size':32, 'num_epochs':4000, 'EVAL_FREQUENCY':1000, 'learning_rate':1e-4,
           'momentum':0.9, 'lr_drop_rate':0.5, 'epoch_step':500, 'nesterov':True, 'reg_W':0.,
           'reg_type':'L2', 'patience':1, 'verbose':1}

def train_test_model(model,train_data,val_data,hparams=hparams):
    """Compile and fit ``model``, then score it on the validation data.

    Args:
        model: Keras model to train; it is compiled here with Adam and MAE.
        train_data: pair ``(inputs, targets)`` used for fitting.
        val_data: pair ``(inputs, targets)`` used for validation during
            fitting and for the final evaluation.
        hparams: mapping of settings; only 'learning_rate', 'num_epochs' and
            'verbose' are read by this function.

    Returns:
        ``(model, accuracy)``: the fitted model and the second value returned
        by ``model.evaluate``. The only compiled metric is 'mae', so despite
        its name ``accuracy`` is the validation mean absolute error.
    """
    opt = tf.keras.optimizers.Adam(hparams['learning_rate'])
    model.compile(optimizer=opt,
                  loss='mae',metrics=['mae'])

    model.fit(train_data[0],train_data[1],validation_data=val_data,epochs=hparams['num_epochs'],verbose=hparams['verbose'])
    # Evaluate on the validation pair that was passed in. The previous code
    # referenced an undefined name (`val_ds`) and raised NameError after
    # training had finished.
    _, accuracy = model.evaluate(val_data[0], val_data[1])
    return model,accuracy
        

In [11]:
# Build the network using elem_model's default architecture, activation,
# dropout values and input shape.
model = elem_model()

adding fully connected layers with 1024 outputs
adding fully connected layers with 1024 outputs
adding fully connected layers with 1024 outputs
adding fully connected layers with 1024 outputs
adding dropout 0.8
adding fully connected layers with 512 outputs
adding fully connected layers with 512 outputs
adding fully connected layers with 512 outputs
adding dropout 0.9
adding fully connected layers with 256 outputs
adding fully connected layers with 256 outputs
adding fully connected layers with 256 outputs
adding dropout 0.7
adding fully connected layers with 128 outputs
adding fully connected layers with 128 outputs
adding fully connected layers with 128 outputs
adding dropout 0.8
adding fully connected layers with 64 outputs
adding fully connected layers with 64 outputs
adding fully connected layers with 32 outputs
adding ouput layer


In [13]:
# Fit on the training split, validating on the validation split; `accuracy`
# receives the second element of train_test_model's return value.
# NOTE(review): config sets "use_valid": False -- confirm that load_csv still
# returns usable X_valid / y_valid in that case.
model,accuracy = train_test_model(model,(X_train,y_train),(X_valid,y_valid))