# Example of training the model using the data set from this paper https://doi.org/10.1073/pnas.2120333119.

## Training on energies (eV) and gradients (eV/Å)

## Here, energies are E_total - E_isolated_atoms

In [1]:
# ---------------------------- #
# to save output in log file #
# ---------------------------- #
# Notebook-only cell: it relies on `get_ipython()` and an IPython magic, so it
# cannot be run as a plain Python script.
##############################################
import sys
import logging

# Opened in "w+" mode, so an existing train1.log is truncated every time this
# cell runs. The handle is deliberately left open: it is attached to the
# streams below for the rest of the session.
nblog = open("train1.log", "w+")
# NOTE(review): assumes the kernel's stdout/stderr objects honour an `echo`
# attribute (ipykernel output streams) and copy what they write to it --
# confirm for the ipykernel version in use.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Point the first handler of the IPython application logger at the same file
# and log at INFO level.
get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

# Notebook magic: autosave the notebook every 20 seconds.
%autosave 20
##############################################


# ---------------- #
# Imported Modules #
# ---------------- #
import os
import sys
### path to PYSEQM ###
sys.path.insert(1, "/home/maxim/Projects/git2/PYSEQM_dev/")
#sys.path.insert(1, '/home/maxim/Projects/pyseqm_d/My_d_combined/PYSEQM_dev/')

### path to HIPNN ###
sys.path.append('/home/maxim/Projects/hipnn/hippynn')

import numpy as np
import torch
from hippynn.interfaces.pyseqm_interface.seqm_nodes import *
from hippynn.interfaces.pyseqm_interface.callback import update_scf_eps, save_and_stop_after
import hippynn.interfaces.pyseqm_interface
import hippynn
from hippynn.graphs import inputs, networks, targets, physics
from hippynn.graphs import loss
from hippynn import plotting
from hippynn.databases import DirectoryDatabase
from hippynn.experiment.assembly import assemble_for_training
from hippynn.experiment.controllers import RaiseBatchSizeOnPlateau,PatienceController
from hippynn.experiment import setup_training
from hippynn.experiment import train_model
import seqm
from seqm.basics import parameterlist

from hippynn.graphs.nodes.base.algebra import ValueNode

# Module-level switches set on the imported packages before any model is built.

### keeps SCF loops silent ###
seqm.seqm_functions.scf_loop.debug = False

### turns on the debug flag of the hippynn <-> PYSEQM interface `check` module ###
# NOTE(review): presumably this is what prints the "did not converge" /
# "not converged" messages during training -- confirm in the interface code.
hippynn.interfaces.pyseqm_interface.check.debug = True

### maximum allowed SCF iterations ###
seqm.seqm_functions.scf_loop.MAX_ITER = 100

# torch.cuda.set_device(0) # Don't try this if you want CPU training!

# Select the non-interactive "agg" backend right after importing matplotlib;
# the training run only saves plots to .png files.
import matplotlib
matplotlib.use("agg")

'''
it is fine to see the following message below:

Javascript Error: IPython is not defined

Autosaving every 20 seconds

.../lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Decorating your function! <function KSA_XL_BOMD.one_step at 0x7fb9febfbe50>

'''

Autosaving every 20 seconds


  from .autonotebook import tqdm as notebook_tqdm




In [2]:
### PULL DATA SET. Don't run this cell if the data is already downloaded ###
# Downloads the complete figshare archive plus the four individual .npy arrays
# (energies, gradients, coordinates, atomic numbers) into ./training_set/.
# All destination paths are relative to the current working directory.

import os
from urllib.request import urlretrieve

# urlretrieve does not create missing directories, so make sure the target
# folder exists before the first download.
os.makedirs('training_set', exist_ok=True)

url = 'https://figshare.com/ndownloader/articles/19640052/versions/1'
dst = 'training_set/data.zip'
urlretrieve(url, dst)

# destination file -> download URL
dlDict = {"training_set/EtEi.npy":"https://figshare.com/ndownloader/files/35845121",
          "training_set/Gradient_ev.npy":"https://figshare.com/ndownloader/files/35845133",
          "training_set/R.npy":"https://figshare.com/ndownloader/files/35845145",
          "training_set/Z.npy":"https://figshare.com/ndownloader/files/35845163"}
for file, file_url in dlDict.items():
    urlretrieve(file_url, file)

In [None]:

########
# Make the example directory the working directory; main() below creates its
# output folder relative to it.
# NOTE(review): machine-specific absolute path -- adjust it to your own
# checkout, following the template on the next line.
#current_dir = '.../PYSEQM_dev/examples/hipnn_training/'
current_dir = '/home/maxim/Projects/git2/PYSEQM_dev/examples/hipnn_training/'
os.chdir(current_dir)

def _make_controller(model, init_lr, max_epochs, stopping_key):
    """Create the optimizer, batch-size scheduler and patience controller.

    Both the fresh-start and the restart path of ``main`` use the same
    settings and differ only in learning rate and epoch budget, so the
    construction lives in one place.

    Args:
        model: torch module whose ``parameters()`` are optimized.
        init_lr: learning rate handed to ``torch.optim.Adam``.
        max_epochs: ``max_epochs`` handed to ``PatienceController``.
        stopping_key: name of the validation loss used as ``stopping_key``.

    Returns:
        The ``PatienceController``; the scheduler it wraps has been linked
        back to it with ``set_controller``.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)

    scheduler = RaiseBatchSizeOnPlateau(optimizer=optimizer,
                                        max_batch_size=128,
                                        patience=5,
                                        factor=0.5)

    controller = PatienceController(optimizer=optimizer,
                                    scheduler=scheduler,
                                    batch_size=32,
                                    eval_batch_size=64,
                                    max_epochs=max_epochs,
                                    termination_patience=20,
                                    fraction_train_eval=0.1,
                                    stopping_key=stopping_key,
                                    )

    # Link the scheduler back to its controller on every path (the restart
    # path used to skip this call).
    scheduler.set_controller(controller)
    return controller


def main():
    """Train HIPNN-predicted SEQM parameters on energies and gradients.

    Builds the hippynn graph (HIPNN network -> per-atom corrections to the
    semiempirical parameters listed in ``learned`` -> SEQM energy and its
    gradient with respect to positions), defines the losses and plots, and
    then either sets up a fresh training run (``TAG == 0``) or reloads the
    last checkpoint from the output directory (``TAG > 0``) before calling
    ``train_model``.

    All settings (paths, ``TAG``, hyperparameters) are hard-coded below.
    The function changes the working directory to the output folder
    ``netname`` and writes its log, plots and checkpoints there.

    Raises:
        ValueError: if ``TAG`` is negative.
    """

    ### directory csv with semiempirical parameters ###
    parameter_file_dir = "/home/maxim/Projects/git2/PYSEQM_dev//seqm/params"

    ### directory with training set ###
    dataset_path = "/home/maxim/Projects/git2/PYSEQM_dev//examples/hipnn_training/training_set/"

    ### Prefix for arrays in folder ###
    dataset_name = ''

    ### folder with models and plots ###
    netname = 'TEST1'
    dirname = netname
    # An existing directory is reused on purpose: a restart (TAG > 0) expects
    # to find the checkpoint files of the previous run in it.
    os.makedirs(dirname, exist_ok=True)
    os.chdir(dirname)

    TAG = 1 #False (0): first run, True(n): continue

    dtype=torch.float64
    torch.set_default_dtype(dtype)
    DEVICE = 'cuda'


    ### list of parameters to be learned ###
    learned = ['U_ss', 'U_pp',
               'zeta_s', 'zeta_p',
               #'beta_s',
               'beta_p',
               #'g_ss',
               'g_sp', 'g_pp', 'g_p2', 'h_sp',
               #'alpha',
           # 'Gaussian1_K', 'Gaussian2_K', #'Gaussian3_K','Gaussian4_K',
           # 'Gaussian1_L', 'Gaussian2_L', #'Gaussian3_L','Gaussian4_L',
           # 'Gaussian1_M', 'Gaussian2_M', #'Gaussian3_M','Gaussian4_M',
          ]


    ### SEQM parameters ###
    seqm_parameters = {
                "method": "PM6_SP",  # AM1, MNDO, PM3#
                "scf_eps": 5.0e-5,  # unit eV, change of electric energy, as nuclear energy doesn't change during SCF
                   'scf_converger' : [0,0.5], # converger used for scf loop
                                           # [0, 0.1], [0, alpha] constant mixing, P = alpha*P + (1.0-alpha)*Pnew
                                           # [1], adaptive mixing
                                           # [1, K, L, M] # advanced adaptive mixing.
                                           # First, it does linear mixing for M steps. Mixing coeff is K the first 5 SCF steps.
                                           # Then it incrementally goes to L and becomes L at M-5 SCF step. From M-5 to M it's equal to L. After M'th SCF step, adaptive mixing begins.
                                           #
                                           # [2], adaptive mixing, then pulay
                "sp2": [False, 1.0e-5],  # whether to use sp2 algorithm in scf loop,[True, eps] or [False], eps for SP2 convergence criteria
                "elements": [0, 1, 6, 7, 8],
                "learned": learned,  # parameterlist[method], #['U_ss'], # learned parameters name list, e.g ['U_ss']
                "parameter_file_dir": parameter_file_dir + "/",  # file directory for other required parameters
                "pair_outer_cutoff": 1.0e10,  # consistent with the unit on coordinates
                "scf_backward": 2, # 0: Hellmann–Feynman theorem, 1: recursive formula, 2: backpropagate through SCF (needed for training)
                'UHF' : False, # use unrestricted HF
                }

    # Log the terminal output to `training_log_tag_<TAG>.txt`.
    # NOTE(review): this context ends before train_model() is called at the
    # bottom of the function, so only the set-up output is captured by it.
    # Confirm whether the training call was meant to sit inside this block.
    with hippynn.tools.log_terminal("training_log_tag_%d.txt" % TAG,'wt'):# and torch.autograd.set_detect_anomaly(True):

        ### Hyperparameters for the network ###
        network_params = {
            "possible_species": [0,1,6,7,8],   # Z values of the elements
            'n_features': 50,                     # Number of neurons at each layer
            "n_sensitivities": 18,                # Number of sensitivity functions in an interaction layer
            "dist_soft_min": 0.6,  # qm7 1.7  qm9 .85  AL100 .85
            "dist_soft_max": 4.0,  # qm7 10.  qm9 5.   AL100 5.
            "dist_hard_max": 5.0,  # qm7 15.  qm9 7.5  AL100 7.5
            "n_interaction_layers": 2,            # Number of interaction blocks
            "n_atom_layers": 3,                   # Number of atom layers in an interaction block
        }


        ### Define a model ###

        species = inputs.SpeciesNode(db_name="Z")

        positions = inputs.PositionsNode(db_name="R")

        network = networks.Hipnn("HIPNN_seqm", (species, positions), module_kwargs = network_params)

        # one network output per atom for every learned parameter
        n_target_peratom = len(seqm_parameters["learned"])

        # Shrink the freshly initialised output layers so that the predicted
        # per-atom values start out small.
        decay_factor = 1.0e-4
        par_atom = HChargeNode("SEQM_Atom_Params",network,module_kwargs=dict(n_target=n_target_peratom,first_is_interacting=True))
        with torch.no_grad():
            for layer in par_atom.torch_module.layers:
                layer.weight.data *= decay_factor
                layer.bias.data *= decay_factor

        seqm_par = par_atom.atom_charges

        lenergy = SEQM_AllNode("SEQM_Energy",(par_atom, positions, species),seqm_parameters, decay_factor = decay_factor)

        molecule_energy = lenergy.Etot_m_Eiso

        # sign=+1: the node is the plain gradient dE/dR, matching the
        # 'Gradient_ev' array it is trained against.
        gradient  = physics.GradientNode("gradients", (molecule_energy, positions), sign=+1)

        notconverged = lenergy.notconverged
        scale = ScaleNode("Scale", (notconverged,))

        gradient.db_name='Gradient_ev'
        molecule_energy.db_name="EtEi"


        # Masks built from the SCF not-converged flags and from the species
        # array are applied to predictions and targets alike before any
        # loss is evaluated.
        mol_mask = SEQM_MolMaskNode("SEQM_MolMask", notconverged)
        atom_mask = AtomMaskNode("Atom_Mask", species)
        gradient_pred = SEQM_MaskOnMolAtomNode("SEQM_MaskMolAtom_Pred", (gradient, mol_mask, atom_mask)).pred
        gradient_true = SEQM_MaskOnMolAtomNode("SEQM_MaskMolAtom_True", (gradient.true, mol_mask.pred, atom_mask.pred))

        molecule_energy_pred = SEQM_MaskOnMolNode("SEQM_MaskMol_Pred", (molecule_energy, mol_mask)).pred
        molecule_energy_true = SEQM_MaskOnMolNode("SEQM_MaskMol_True", (molecule_energy.true, mol_mask.pred))


        ### define loss quantities ###

        rmse_gradient = loss.MSELoss(gradient_pred, gradient_true) ** (1./2.)
        rmse_mol_energy = loss.MSELoss(molecule_energy_pred, molecule_energy_true) ** (1. / 2.)


        mae_gradient = loss.MAELoss(gradient_pred, gradient_true)
        mae_mol_energy = loss.MAELoss(molecule_energy_pred, molecule_energy_true)

        rsq_gradient = loss.Rsq(gradient_pred, gradient_true)
        rsq_mol_energy = loss.Rsq(molecule_energy_pred, molecule_energy_true)

        ### SLIGHTLY MORE ADVANCED USAGE

        pred_per_atom1 = physics.PerAtom("PeratomPredicted",(molecule_energy,species))
        true_per_atom1 = physics.PerAtom("PeratomTrue",(molecule_energy.true,species.true))
        pred_per_atom = SEQM_MaskOnMolNode("SEQM_PerAtom_Pred", (pred_per_atom1, mol_mask)).pred
        true_per_atom = SEQM_MaskOnMolNode("SEQM_PerAtom_True", (true_per_atom1.pred, mol_mask.pred))
        mae_per_atom = loss.MAELoss(pred_per_atom,true_per_atom)
        rmse_per_atom = loss.MSELoss(pred_per_atom,true_per_atom) ** (1. / 2.)

        # penalty on the size of the predicted per-atom parameter values
        rmse_par = loss.MeanSq(seqm_par.pred)

        ### END SLIGHTLY MORE ADVANCED USAGE

        # weight of the energy terms relative to the gradient terms
        ratio = ValueNode(0.8)
        ratio.name ="EnCoeff"


        loss_error = rmse_gradient + mae_gradient + ratio*(rmse_mol_energy + mae_mol_energy) + 8.0*rmse_par #+ 0.2*(mae_mol_energy) #+ 0.5*rmse_par


        #rbar = loss.Mean.of_node(hierarchicality)
        l2_reg = loss.l2reg(network)
        loss_regularization = 1.0e-6 * loss.Mean(l2_reg) #+ rbar    # L2 regularization and hierarchicality regularization

        train_loss = loss_error*scale.pred + loss_regularization

        # Validation losses are what we check on the data between epochs -- we can only train to
        # a single loss, but we can check other metrics too to better understand how the model is training.
        # There will also be plots of these things over time when training completes.
        validation_losses = {
            "TperAtom RMSE": rmse_per_atom,
            "TperAtom MAE" : mae_per_atom,
            "Force-RMSE"   : rmse_gradient,
            "Force-MAE"    : mae_gradient,
            "Force-RSQ"    : rsq_gradient,
            "MolEn-RMSE"   : rmse_mol_energy,
            "MolEn-MAE"    : mae_mol_energy,
            "MolEn-RSQ"    : rsq_mol_energy,
            "L2Reg"        : l2_reg,
            "Loss-Err"     : loss_error,
            "Loss-Reg"     : loss_regularization,
            "Loss"         : train_loss,
        }
        early_stopping_key = "Loss-Err"



        plot_maker = plotting.PlotMaker(
            # Simple plots which compare the network to the database

            #plotting.Hist2D.compare(molecule_energy, saved=True),
            plotting.Hist2D(molecule_energy_true, molecule_energy_pred,
                            xlabel="True EtEi",ylabel="Predicted EtEi",
                            saved="EtEi.png"),
            plotting.Hist2D(gradient_true, gradient_pred,
                            xlabel="True Force",ylabel="Predicted Force",
                            saved="grad.png"),

            #Slightly more advanced control of plotting!
            plotting.Hist2D(true_per_atom,pred_per_atom,
                            xlabel="True Energy/Atom",ylabel="Predicted Energy/Atom",
                            saved="PerAtomEn.png"),

            #plotting.HierarchicalityPlot(hierarchicality.pred,
            #                             molecule_energy.pred - molecule_energy.true,
            #                             saved="HierPlot.pdf"),
            plot_every=1,   # How often to make plots -- here, every epoch
        )

        if TAG==0: #TRAINING FROM SCRATCH


            training_modules, db_info = \
                assemble_for_training(train_loss,validation_losses,plot_maker=plot_maker)
            training_modules[0].print_structure()

            # ----------------- #
            # Step 3: RUN MODEL #
            # ----------------- #

            database_params = {
                'name': dataset_name,                            # Prefix for arrays in folder
                'directory': dataset_path,
                'quiet': False,                           # Quiet==True: suppress info about loading database
                'seed': 1,                       # Random seed for data splitting
                #'test_size': 0.1,                # Fraction of data used for testing
                #'valid_size':0.1,
                **db_info                 # Adds the inputs and targets names from the model as things to load
            }


            database = DirectoryDatabase(**database_params)

            ### a fraction of the data set to ignore (i.e., 0.9 means to ignore 90% of the data set and use 10% for train/test/validation) ###
            database.make_random_split("ignore",0.9)
            del database.splits['ignore']
            database.make_trainvalidtest_split(test_size=0.1,valid_size=0.1)

            #from hippynn.pretraining import set_e0_values
            #set_e0_values(henergy,database,energy_name="T_transpose",trainable_after=False)

            controller = _make_controller(training_modules.model,
                                          init_lr=0.5e-4,
                                          max_epochs=200,
                                          stopping_key=early_stopping_key)

            experiment_params = hippynn.experiment.SetupParams(
                controller = controller,
                device=DEVICE,
            )
            print(experiment_params)

            # Parameters describing the training procedure.

            training_modules, controller, metric_tracker  = setup_training(training_modules=training_modules,
                                                            setup_params=experiment_params)

        elif TAG>0: #CONTINUE INTERRUPTED TRAINING
            from hippynn.experiment.serialization import load_checkpoint_from_cwd, load_checkpoint

            #load best model
            #structure = load_checkpoint_from_cwd()

            #load last model
            structure = load_checkpoint("experiment_structure.pt", "last_checkpoint.pt")

            training_modules = structure["training_modules"]

            database = structure["database"]

            ### a fraction of the data set to ignore (i.e., 0.9 means to ignore 90% of the data set and use 10% for train/test/validation) ###
            database.make_random_split("ignore",0.9)
            del database.splits['ignore']
            database.make_trainvalidtest_split(test_size=0.1,valid_size=0.1)

            # The optimizer and controller are rebuilt instead of being taken
            # from the checkpoint, so the restart runs with its own learning
            # rate and epoch budget.
            #controller = structure["controller"]
            controller = _make_controller(training_modules.model,
                                          init_lr=3.0e-4,
                                          max_epochs=100,
                                          stopping_key=early_stopping_key)

            metric_tracker = structure["metric_tracker"]

        else:
            # Previously a negative TAG skipped both branches and failed
            # further down with a NameError.
            raise ValueError("TAG must be 0 (first run) or a positive integer (continue), got {}".format(TAG))

    store_all_better=True
    store_best=True

    # Unwrap DataParallel (if used) to reach the graph module that owns the
    # SEQM energy node.
    model = training_modules[0]
    if isinstance(model, torch.nn.DataParallel):
        model = model.module
    seqm_module = model.node_from_name('SEQM_Energy').torch_module

    # update_scf_eps: the training log shows it multiplying the SCF eps by the
    # factor given here (0.92).
    # NOTE(review): the meaning of [10,0,0,0] for save_and_stop_after is not
    # visible in this file -- presumably a run-time limit; confirm in the callback.
    callbacks = [update_scf_eps(seqm_module, 0.92),
                    save_and_stop_after(training_modules, controller, metric_tracker, store_all_better, store_best, [10,0,0,0])]

    train_model(training_modules=training_modules,
                database=database,
                controller=controller,
                metric_tracker=metric_tracker,
                callbacks=callbacks,batch_callbacks=None,
                store_all_better=store_all_better,
                store_best=store_best)

# Entry point: start the training run only when this file is executed directly.
if __name__ == "__main__":
    main()


restarting <class 'hippynn.databases.ondisk.DirectoryDatabase'>
Arrays found:  {'R': 'R.npy', 'Z': 'Z.npy', 'EtEi': 'EtEi.npy', 'Gradient_ev': 'Gradient_ev.npy'}
Data types:
{'R': dtype('float64'), 'Z': dtype('int64'), 'EtEi': dtype('float64'), 'Gradient_ev': dtype('float64')}
All arrays:
--------------------------------------------------------------------------------------
| Name               | dtype              | shape                                    |
--------------------------------------------------------------------------------------
| R                  | dtype('float64')   | (618409, 18, 3)                          |
| Z                  | dtype('int64')     | (618409, 18)                             |
| EtEi               | dtype('float64')   | (618409,)                                |
| Gradient_ev        | dtype('float64')   | (618409, 18, 3)                          |
--------------------------------------------------------------------------------------
Database: Usin

Training Batches:  21%|██        | 326/1547 [35:15<2:14:00,  6.59s/batch]

did not converge 1
not converged:  tensor(30, device='cuda:0')


Training Batches:  30%|██▉       | 459/1547 [49:55<2:01:01,  6.67s/batch]

did not converge 1
not converged:  tensor(25, device='cuda:0')


Training Batches:  68%|██████▊   | 1053/1547 [1:55:23<55:24,  6.73s/batch]  

did not converge 1
not converged:  tensor(26, device='cuda:0')


Training Batches:  73%|███████▎  | 1136/1547 [2:04:37<44:53,  6.55s/batch]  

did not converge 1
not converged:  tensor(23, device='cuda:0')


Training Batches:  83%|████████▎ | 1285/1547 [2:21:14<29:58,  6.86s/batch]

did not converge 1
not converged:  tensor(11, device='cuda:0')


Training Batches:  84%|████████▍ | 1302/1547 [2:23:11<28:07,  6.89s/batch]

did not converge 1
not converged:  tensor(23, device='cuda:0')


Training Batches:  93%|█████████▎| 1433/1547 [2:37:44<12:39,  6.66s/batch]

did not converge 1
not converged:  tensor(11, device='cuda:0')


                                                                          

Training time:  10218.34 s
Validating...


Evaluating train:  35%|███▍      | 27/78 [03:04<05:50,  6.87s/batch]

did not converge 1
not converged:  tensor(63, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch21/train
Saving plot at plots/epochs/epoch21/train/EtEi.png
Saving plot at plots/epochs/epoch21/train/grad.png
Saving plot at plots/epochs/epoch21/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:33<01:26,  7.18s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch21/valid
Saving plot at plots/epochs/epoch21/valid/EtEi.png
Saving plot at plots/epochs/epoch21/valid/grad.png
Saving plot at plots/epochs/epoch21/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:    *  0.019708   *  0.019889
Force-RMSE  :        0.33826        0.3314
Force-MAE   :        0.22633       0.22474
Force-RSQ   :        0.97175       0.97273
MolEn-RMSE  :    *   0.31731   *   0.31175
MolEn-MAE   :    *   0.24225   *   0.24138
MolEn-RSQ   :        0.99943       0.99944
L2Reg       :         684.29        684.29
Loss-Err    :    *    1.0379   *    1.0244
Loss-Reg    :     0.00068429    0.00068429
Loss        :    *    1.0386   *    1.0251
Best Loss-Err so far:   1.0244
Epochs since last best: 0
Current max epochs: 41
Total epoch time:  11412.98 s
from callback update_scf_eps
SCF eps is updated:  3.893440000000001e-05 ==> 3.581964800000001e-05
**** NEW BEST MODE

Training Batches:  17%|█▋        | 266/1547 [29:06<2:23:51,  6.74s/batch]

did not converge 1
not converged:  tensor(2, device='cuda:0')


Training Batches:  40%|███▉      | 616/1547 [1:07:26<1:41:33,  6.54s/batch]

did not converge 1
not converged:  tensor(17, device='cuda:0')


Training Batches:  43%|████▎     | 670/1547 [1:13:21<1:35:21,  6.52s/batch]

did not converge 1
not converged:  tensor(6, device='cuda:0')


Training Batches:  51%|█████     | 783/1547 [1:25:51<1:23:25,  6.55s/batch]

did not converge 1
not converged:  tensor(19, device='cuda:0')


Training Batches:  83%|████████▎ | 1283/1547 [2:21:05<28:47,  6.54s/batch]  

did not converge 1
not converged:  tensor(29, device='cuda:0')


Training Batches:  94%|█████████▍| 1453/1547 [2:39:52<10:16,  6.55s/batch]

did not converge 1
not converged:  tensor(3, device='cuda:0')


                                                                          

Training time:  10209.09 s
Validating...


                                                                    

Making plots. Saved location: plots/epochs/epoch22/train
Saving plot at plots/epochs/epoch22/train/EtEi.png
Saving plot at plots/epochs/epoch22/train/grad.png
Saving plot at plots/epochs/epoch22/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:32<01:27,  7.26s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch22/valid
Saving plot at plots/epochs/epoch22/valid/EtEi.png
Saving plot at plots/epochs/epoch22/valid/grad.png
Saving plot at plots/epochs/epoch22/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.025294      0.025434
Force-RMSE  :    *   0.31514   *   0.32029
Force-MAE   :    *   0.21298   *   0.21163
Force-RSQ   :        0.97489       0.97453
MolEn-RMSE  :        0.39832       0.39087
MolEn-MAE   :        0.30562       0.30206
MolEn-RSQ   :        0.99905       0.99912
L2Reg       :         697.53        697.53
Loss-Err    :         1.1128        1.1078
Loss-Reg    :     0.00069753    0.00069753
Loss        :         1.1135        1.1085
Best Loss-Err so far:   1.0244
Epochs since last best: 1
Current max epochs: 41
Total epoch time:  11404.67 s
__________________________________________________
Epoch 23:
Learning rate:    0.0003


Training Batches:  11%|█▏        | 175/1547 [19:09<2:30:55,  6.60s/batch]

did not converge 1
not converged:  tensor(27, device='cuda:0')


Training Batches:  18%|█▊        | 279/1547 [30:35<2:17:09,  6.49s/batch]

did not converge 1
not converged:  tensor(1, device='cuda:0')


Training Batches:  44%|████▍     | 687/1547 [1:15:48<1:33:20,  6.51s/batch]

did not converge 1
not converged:  tensor(4, device='cuda:0')


Training Batches:  51%|█████     | 790/1547 [1:27:15<1:21:36,  6.47s/batch]

did not converge 1
not converged:  tensor(26, device='cuda:0')


Training Batches:  74%|███████▍  | 1148/1547 [2:06:33<43:42,  6.57s/batch] 

did not converge 1
not converged:  tensor(6, device='cuda:0')


Training Batches:  78%|███████▊  | 1210/1547 [2:13:20<36:00,  6.41s/batch]

did not converge 1
not converged:  tensor(25, device='cuda:0')


                                                                          

Training time:  10207.65 s
Validating...


Evaluating train:  69%|██████▉   | 54/78 [06:09<02:44,  6.84s/batch]

did not converge 1
not converged:  tensor(23, device='cuda:0')


Evaluating train:  82%|████████▏ | 64/78 [07:18<01:35,  6.80s/batch]

did not converge 1
not converged:  tensor(24, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch23/train
Saving plot at plots/epochs/epoch23/train/EtEi.png
Saving plot at plots/epochs/epoch23/train/grad.png
Saving plot at plots/epochs/epoch23/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:29<01:26,  7.24s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch23/valid
Saving plot at plots/epochs/epoch23/valid/EtEi.png
Saving plot at plots/epochs/epoch23/valid/grad.png
Saving plot at plots/epochs/epoch23/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:    *  0.016104   *  0.016567
Force-RMSE  :        0.31888   *   0.31085
Force-MAE   :    *    0.2104   *   0.20852
Force-RSQ   :        0.97512       0.97601
MolEn-RMSE  :    *   0.25653   *   0.25414
MolEn-MAE   :    *   0.19612   *    0.1985
MolEn-RSQ   :        0.99962       0.99963
L2Reg       :          711.7         711.7
Loss-Err    :    *   0.91578   *   0.90595
Loss-Reg    :      0.0007117     0.0007117
Loss        :    *   0.91649   *   0.90667
Best Loss-Err so far:  0.90595
Patience for training restored.
Epochs since last best: 0
Current max epochs: 43
Total epoch time:  11398.35 s
from callback update_scf_eps
SCF eps is updated:  3.581964800000001e-05 ==> 3.295407

Training Batches:  14%|█▍        | 213/1547 [23:26<2:25:43,  6.55s/batch]

did not converge 1
not converged:  tensor(15, device='cuda:0')


Training Batches:  37%|███▋      | 580/1547 [1:04:07<1:47:31,  6.67s/batch]

did not converge 1
not converged:  tensor(9, device='cuda:0')


Training Batches:  39%|███▉      | 604/1547 [1:06:48<1:43:00,  6.55s/batch]

did not converge 1
not converged:  tensor(1, device='cuda:0')


Training Batches:  42%|████▏     | 651/1547 [1:11:58<1:38:33,  6.60s/batch]

did not converge 1
not converged:  tensor(9, device='cuda:0')


Training Batches:  43%|████▎     | 662/1547 [1:13:15<1:38:52,  6.70s/batch]

did not converge 1
not converged:  tensor(9, device='cuda:0')


Training Batches:  45%|████▌     | 703/1547 [1:17:46<1:31:41,  6.52s/batch]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Training Batches:  45%|████▌     | 700/1547 [1:17:50<1:33:32,  6.63s/batch]

did not converge 1
not converged:  tensor(20, device='cuda:0')


Training Batches:  54%|█████▍    | 833/1547 [1:32:39<1:19:18,  6.66s/batch]

did not converge 1
not converged:  tensor(25, device='cuda:0')


Training Batches:  58%|█████▊    | 901/1547 [1:40:10<1:10:44,  6.57s/batch]

did not converge 1
not converged:  tensor(1, device='cuda:0')


Training Batches:  83%|████████▎ | 1278/1547 [2:22:01<33:24,  7.45s/batch]  

did not converge 1
not converged:  tensor(19, device='cuda:0')


                                                                          

Training time:  10309.82 s
Validating...


Evaluating train:  14%|█▍        | 11/78 [01:14<07:34,  6.78s/batch]

did not converge 1
not converged:  tensor(49, device='cuda:0')


Evaluating train:  46%|████▌     | 36/78 [04:07<04:48,  6.86s/batch]

did not converge 1
not converged:  tensor(51, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch25/train
Saving plot at plots/epochs/epoch25/train/EtEi.png
Saving plot at plots/epochs/epoch25/train/grad.png
Saving plot at plots/epochs/epoch25/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:33<01:27,  7.25s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch25/valid
Saving plot at plots/epochs/epoch25/valid/EtEi.png
Saving plot at plots/epochs/epoch25/valid/grad.png
Saving plot at plots/epochs/epoch25/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.051769      0.051872
Force-RMSE  :        0.36352       0.36745
Force-MAE   :        0.24791       0.24766
Force-RSQ   :        0.96736       0.96648
MolEn-RMSE  :        0.76599       0.77002
MolEn-MAE   :        0.61619       0.61368
MolEn-RSQ   :        0.99668       0.99659
L2Reg       :         742.32        742.32
Loss-Err    :         1.7426        1.7476
Loss-Reg    :     0.00074232    0.00074232
Loss        :         1.7434        1.7484
Best Loss-Err so far:  0.90595
Epochs since last best: 2
Current max epochs: 43
Total epoch time:  11504.77 s
__________________________________________________
Epoch 26:
Learning rate:    0.0003


Training Batches:   3%|▎         | 45/1547 [04:56<2:44:26,  6.57s/batch]

did not converge 1
not converged:  tensor(14, device='cuda:0')


Training Batches:  18%|█▊        | 276/1547 [30:42<2:19:08,  6.57s/batch]

did not converge 1
not converged:  tensor(1, device='cuda:0')


Training Batches:  84%|████████▍ | 1303/1547 [2:24:21<26:44,  6.58s/batch]  

did not converge 1
not converged:  tensor(17, device='cuda:0')


Training Batches:  90%|████████▉ | 1386/1547 [2:33:34<18:55,  7.05s/batch]

did not converge 1
not converged:  tensor(28, device='cuda:0')


Training Batches:  90%|█████████ | 1395/1547 [2:34:35<16:43,  6.61s/batch]

did not converge 1
not converged:  tensor(7, device='cuda:0')


Training Batches:  97%|█████████▋| 1507/1547 [2:47:02<04:26,  6.67s/batch]

did not converge 1
not converged:  tensor(15, device='cuda:0')


                                                                          

Training time:  10295.75 s
Validating...


Evaluating train:  22%|██▏       | 17/78 [01:57<06:55,  6.81s/batch]

did not converge 1
not converged:  tensor(46, device='cuda:0')


Evaluating train:  29%|██▉       | 23/78 [02:40<06:24,  6.99s/batch]

did not converge 1
not converged:  tensor(39, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch26/train
Saving plot at plots/epochs/epoch26/train/EtEi.png
Saving plot at plots/epochs/epoch26/train/grad.png
Saving plot at plots/epochs/epoch26/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:39<01:29,  7.49s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch26/valid
Saving plot at plots/epochs/epoch26/valid/EtEi.png
Saving plot at plots/epochs/epoch26/valid/grad.png
Saving plot at plots/epochs/epoch26/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.018766      0.018376
Force-RMSE  :    *   0.30037   *    0.3003
Force-MAE   :    *   0.20329   *   0.20124
Force-RSQ   :        0.97748       0.97761
MolEn-RMSE  :        0.29212       0.27798
MolEn-MAE   :        0.22721       0.22068
MolEn-RSQ   :         0.9995       0.99956
L2Reg       :         759.26        759.26
Loss-Err    :        0.94816       0.92942
Loss-Reg    :     0.00075926    0.00075926
Loss        :        0.94892       0.93018
Best Loss-Err so far:  0.90595
Epochs since last best: 3
Current max epochs: 43
Total epoch time:  11502.34 s
__________________________________________________
Epoch 27:
Learning rate:    0.0003


Training Batches:   0%|          | 1/1547 [00:06<2:42:48,  6.32s/batch]

did not converge 1
not converged:  tensor(28, device='cuda:0')


Training Batches:  13%|█▎        | 205/1547 [22:44<2:28:27,  6.64s/batch]

did not converge 1
not converged:  tensor(27, device='cuda:0')


Training Batches:  14%|█▎        | 212/1547 [23:31<2:27:25,  6.63s/batch]

did not converge 1
not converged:  tensor(23, device='cuda:0')


Training Batches:  14%|█▍        | 223/1547 [24:46<2:25:36,  6.60s/batch]

did not converge 1
not converged:  tensor(1, device='cuda:0')


Training Batches:  16%|█▌        | 251/1547 [27:55<2:24:45,  6.70s/batch]

did not converge 1
not converged:  tensor(15, device='cuda:0')


Training Batches:  33%|███▎      | 515/1547 [57:20<1:58:11,  6.87s/batch]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Training Batches:  80%|███████▉  | 1234/1547 [2:18:16<35:36,  6.82s/batch]  

did not converge 1
not converged:  tensor(0, device='cuda:0')


Training Batches:  86%|████████▌ | 1330/1547 [2:29:14<24:19,  6.73s/batch]

did not converge 1
not converged:  tensor(19, device='cuda:0')


Training Batches:  98%|█████████▊| 1513/1547 [2:49:56<03:51,  6.82s/batch]

did not converge 1
not converged:  tensor(28, device='cuda:0')


                                                                          

Training time:  10423.62 s
Validating...


Evaluating train:  62%|██████▏   | 48/78 [05:34<03:26,  6.88s/batch]

did not converge 1
not converged:  tensor(9, device='cuda:0')


Evaluating train:  64%|██████▍   | 50/78 [05:49<03:20,  7.17s/batch]

did not converge 1
not converged:  tensor(45, device='cuda:0')


Evaluating train:  95%|█████████▍| 74/78 [08:37<00:27,  6.89s/batch]

did not converge 1
not converged:  tensor(17, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch28/train
Saving plot at plots/epochs/epoch28/train/EtEi.png
Saving plot at plots/epochs/epoch28/train/grad.png
Saving plot at plots/epochs/epoch28/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:41<01:28,  7.41s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch28/valid
Saving plot at plots/epochs/epoch28/valid/EtEi.png
Saving plot at plots/epochs/epoch28/valid/grad.png
Saving plot at plots/epochs/epoch28/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:    *  0.013818   *  0.013376
Force-RMSE  :    *   0.29873   *   0.29597
Force-MAE   :    *   0.19985   *   0.19833
Force-RSQ   :        0.97802       0.97825
MolEn-RMSE  :    *   0.22481   *   0.21419
MolEn-MAE   :    *   0.16831   *   0.16262
MolEn-RSQ   :        0.99973       0.99974
L2Reg       :          787.7         787.7
Loss-Err    :    *   0.84116   *    0.8238
Loss-Reg    :      0.0007877     0.0007877
Loss        :    *   0.84195   *   0.82459
Best Loss-Err so far:   0.8238
Epochs since last best: 0
Current max epochs: 48
Total epoch time:  11637.95 s
from callback update_scf_eps
SCF eps is updated:  3.031775006720001e-05 ==> 2.789233006182401e-05
**** NEW BEST MODE

Training Batches:  12%|█▏        | 187/1547 [21:18<2:33:17,  6.76s/batch]

did not converge 1
not converged:  tensor(23, device='cuda:0')


Training Batches:  24%|██▎       | 365/1547 [41:26<2:11:51,  6.69s/batch]

did not converge 1
not converged:  tensor(23, device='cuda:0')


Training Batches:  35%|███▌      | 549/1547 [1:02:13<1:50:08,  6.62s/batch]

did not converge 1
not converged:  tensor(0, device='cuda:0')


Training Batches:  38%|███▊      | 595/1547 [1:07:26<1:49:32,  6.90s/batch]

did not converge 1
not converged:  tensor(24, device='cuda:0')


Training Batches:  42%|████▏     | 656/1547 [1:14:20<1:39:07,  6.67s/batch]

did not converge 1
not converged:  tensor(7, device='cuda:0')


Training Batches:  49%|████▉     | 758/1547 [1:25:43<1:26:47,  6.60s/batch]

did not converge 1
not converged:  tensor(23, device='cuda:0')


Training Batches:  56%|█████▌    | 861/1547 [1:37:36<1:18:23,  6.86s/batch]

did not converge 1
not converged:  tensor(30, device='cuda:0')


Training Batches:  78%|███████▊  | 1204/1547 [2:16:19<39:32,  6.92s/batch]  

did not converge 1
not converged:  tensor(10, device='cuda:0')


Training Batches:  91%|█████████ | 1408/1547 [2:39:33<15:59,  6.90s/batch]

did not converge 1
not converged:  tensor(10, device='cuda:0')


Training Batches:  99%|█████████▉| 1532/1547 [2:53:39<01:41,  6.75s/batch]

did not converge 1
not converged:  tensor(18, device='cuda:0')


                                                                          

Training time:  10518.13 s
Validating...


                                                                    

Making plots. Saved location: plots/epochs/epoch29/train
Saving plot at plots/epochs/epoch29/train/EtEi.png
Saving plot at plots/epochs/epoch29/train/grad.png
Saving plot at plots/epochs/epoch29/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:47<01:29,  7.48s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch29/valid
Saving plot at plots/epochs/epoch29/valid/EtEi.png
Saving plot at plots/epochs/epoch29/valid/grad.png
Saving plot at plots/epochs/epoch29/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.043032      0.042354
Force-RMSE  :        0.30808        0.3088
Force-MAE   :        0.20746       0.20784
Force-RSQ   :        0.97621       0.97633
MolEn-RMSE  :        0.61497       0.60796
MolEn-MAE   :        0.51865       0.50959
MolEn-RSQ   :        0.99775       0.99788
L2Reg       :          803.6         803.6
Loss-Err    :         1.4468         1.435
Loss-Reg    :      0.0008036     0.0008036
Loss        :         1.4476        1.4358
Best Loss-Err so far:   0.8238
Epochs since last best: 1
Current max epochs: 48
Total epoch time:  11743.18 s
__________________________________________________
Epoch 30:
Learning rate:    0.0003


Training Batches:   1%|▏         | 23/1547 [02:37<2:53:35,  6.83s/batch]

did not converge 1
not converged:  tensor(26, device='cuda:0')


Training Batches:  13%|█▎        | 200/1547 [22:34<2:33:00,  6.82s/batch]

did not converge 1
not converged:  tensor(30, device='cuda:0')


Training Batches:  28%|██▊       | 440/1547 [49:52<2:07:40,  6.92s/batch]

did not converge 1
not converged:  tensor(22, device='cuda:0')


Training Batches:  37%|███▋      | 579/1547 [1:05:42<1:49:37,  6.79s/batch]

did not converge 1
not converged:  tensor(27, device='cuda:0')


Training Batches:  44%|████▍     | 677/1547 [1:16:55<1:37:58,  6.76s/batch]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Training Batches:  37%|███▋      | 565/1547 [1:04:04<1:51:35,  6.82s/batch]

did not converge 1
not converged:  tensor(1, device='cuda:0')


Training Batches:  41%|████      | 629/1547 [1:11:25<1:43:36,  6.77s/batch]

did not converge 1
not converged:  tensor(13, device='cuda:0')


Training Batches:  57%|█████▋    | 875/1547 [1:39:13<1:16:31,  6.83s/batch]

did not converge 1
not converged:  tensor(31, device='cuda:0')


Training Batches:  59%|█████▊    | 906/1547 [1:42:46<1:10:44,  6.62s/batch]

did not converge 1
not converged:  tensor(10, device='cuda:0')


Training Batches:  66%|██████▌   | 1022/1547 [1:56:01<1:00:40,  6.93s/batch]

did not converge 1
not converged:  tensor(17, device='cuda:0')


Training Batches:  68%|██████▊   | 1053/1547 [1:59:33<56:56,  6.92s/batch]  

did not converge 1
not converged:  tensor(3, device='cuda:0')


Training Batches:  78%|███████▊  | 1206/1547 [2:16:51<38:27,  6.77s/batch]  

did not converge 1
not converged:  tensor(21, device='cuda:0')


Training Batches:  79%|███████▉  | 1229/1547 [2:19:29<36:26,  6.88s/batch]

did not converge 1
not converged:  tensor(18, device='cuda:0')


Training Batches:  88%|████████▊ | 1355/1547 [2:33:47<21:45,  6.80s/batch]

did not converge 1
not converged:  tensor(5, device='cuda:0')


                                                                          

Training time:  10521.71 s
Validating...


Evaluating train:  29%|██▉       | 23/78 [02:40<06:20,  6.92s/batch]

did not converge 1
not converged:  tensor(50, device='cuda:0')


Evaluating train:  62%|██████▏   | 48/78 [05:34<03:27,  6.90s/batch]

did not converge 1
not converged:  tensor(62, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch33/train
Saving plot at plots/epochs/epoch33/train/EtEi.png
Saving plot at plots/epochs/epoch33/train/grad.png
Saving plot at plots/epochs/epoch33/train/PerAtomEn.png


                                                                    

Making plots. Saved location: plots/epochs/epoch33/valid
Saving plot at plots/epochs/epoch33/valid/EtEi.png
Saving plot at plots/epochs/epoch33/valid/grad.png
Saving plot at plots/epochs/epoch33/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.017114      0.017069
Force-RMSE  :    *   0.28688   *   0.28058
Force-MAE   :    *   0.18781   *   0.18717
Force-RSQ   :        0.97986       0.98046
MolEn-RMSE  :        0.26974       0.26419
MolEn-MAE   :         0.2045       0.20325
MolEn-RSQ   :        0.99959        0.9996
L2Reg       :         860.05        860.05
Loss-Err    :        0.88202       0.86955
Loss-Reg    :     0.00086005    0.00086005
Loss        :        0.88288       0.87041
Best Loss-Err so far:   0.8238
Epochs since last best: 5
Current max epochs: 48
Total epoch time:  11738.48 s
__________________________________________________
Epoch 34:
Learning rate:    0.0003


Training Batches:   6%|▌         | 90/1547 [10:09<2:41:52,  6.67s/batch]

did not converge 1
not converged:  tensor(8, device='cuda:0')


Training Batches:   9%|▉         | 142/1547 [16:03<2:35:43,  6.65s/batch]

did not converge 1
not converged:  tensor(16, device='cuda:0')


Training Batches:  14%|█▍        | 224/1547 [25:20<2:27:47,  6.70s/batch]

did not converge 1
not converged:  tensor(17, device='cuda:0')


Training Batches:  33%|███▎      | 510/1547 [57:20<1:55:17,  6.67s/batch]

did not converge 1
not converged:  tensor(27, device='cuda:0')


Training Batches:  41%|████▏     | 641/1547 [1:12:18<1:42:06,  6.76s/batch]

did not converge 1
not converged:  tensor(4, device='cuda:0')


Training Batches:  42%|████▏     | 657/1547 [1:14:09<1:42:32,  6.91s/batch]

did not converge 1
not converged:  tensor(6, device='cuda:0')


Training Batches:  48%|████▊     | 749/1547 [1:24:36<1:28:57,  6.69s/batch]

did not converge 1
not converged:  tensor(25, device='cuda:0')


Training Batches:  51%|█████     | 789/1547 [1:29:10<1:26:03,  6.81s/batch]

did not converge 1
not converged:  tensor(13, device='cuda:0')


Training Batches:  63%|██████▎   | 982/1547 [1:51:03<1:04:06,  6.81s/batch]

did not converge 1
not converged:  tensor(8, device='cuda:0')


Training Batches:  85%|████████▌ | 1322/1547 [2:29:35<25:47,  6.88s/batch]  

did not converge 1
not converged:  tensor(10, device='cuda:0')


                                                                          

Training time:  10489.81 s
Validating...


Evaluating train:  83%|████████▎ | 65/78 [07:35<01:31,  7.07s/batch]

did not converge 1
not converged:  tensor(0, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch34/train
Saving plot at plots/epochs/epoch34/train/EtEi.png
Saving plot at plots/epochs/epoch34/train/grad.png
Saving plot at plots/epochs/epoch34/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:44<01:29,  7.46s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch34/valid
Saving plot at plots/epochs/epoch34/valid/EtEi.png
Saving plot at plots/epochs/epoch34/valid/grad.png
Saving plot at plots/epochs/epoch34/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.046909       0.04696
Force-RMSE  :    *   0.27994   *   0.27845
Force-MAE   :        0.18952       0.18797
Force-RSQ   :         0.9807       0.98075
MolEn-RMSE  :        0.63145       0.63062
MolEn-MAE   :        0.57345       0.57077
MolEn-RSQ   :        0.99774       0.99771
L2Reg       :         872.05        872.05
Loss-Err    :         1.4594        1.4536
Loss-Reg    :     0.00087205    0.00087205
Loss        :         1.4603        1.4545
Best Loss-Err so far:   0.8238
Raising batch size to 64
Epochs since last best: 6
Current max epochs: 48
Total epoch time:  11710.32 s
__________________________________________________
Epoch 35:
Learning rate:    0.0003


Training Batches:  19%|█▉        | 148/774 [26:49<1:56:36, 11.18s/batch]

did not converge 1
not converged:  tensor(31, device='cuda:0')


Training Batches:  26%|██▋       | 204/774 [37:01<1:42:47, 10.82s/batch]

did not converge 1
not converged:  tensor(12, device='cuda:0')


Training Batches:  27%|██▋       | 207/774 [37:36<1:47:04, 11.33s/batch]

did not converge 1
not converged:  tensor(10, device='cuda:0')


Training Batches:  37%|███▋      | 285/774 [51:49<1:27:58, 10.79s/batch]

did not converge 1
not converged:  tensor(3, device='cuda:0')


Training Batches:  44%|████▍     | 339/774 [1:01:36<1:18:32, 10.83s/batch]

did not converge 1
not converged:  tensor(29, device='cuda:0')


Training Batches:  52%|█████▏    | 399/774 [1:12:34<1:09:16, 11.08s/batch]

did not converge 1
not converged:  tensor(23, device='cuda:0')


Training Batches:  64%|██████▍   | 498/774 [1:30:38<49:49, 10.83s/batch]  

did not converge 1
not converged:  tensor(42, device='cuda:0')


Training Batches:  93%|█████████▎| 716/774 [2:10:16<11:08, 11.53s/batch]

did not converge 1
not converged:  tensor(14, device='cuda:0')


Training Batches:  97%|█████████▋| 749/774 [2:16:17<04:28, 10.75s/batch]

did not converge 1
not converged:  tensor(12, device='cuda:0')


                                                                        

Training time:  8439.8 s
Validating...


Evaluating train:  49%|████▊     | 38/78 [04:28<04:41,  7.04s/batch]

did not converge 1
not converged:  tensor(14, device='cuda:0')


Evaluating train:  71%|███████   | 55/78 [06:28<02:42,  7.06s/batch]

did not converge 1
not converged:  tensor(31, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch35/train
Saving plot at plots/epochs/epoch35/train/EtEi.png
Saving plot at plots/epochs/epoch35/train/grad.png
Saving plot at plots/epochs/epoch35/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:48<01:29,  7.48s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch35/valid
Saving plot at plots/epochs/epoch35/valid/EtEi.png
Saving plot at plots/epochs/epoch35/valid/grad.png
Saving plot at plots/epochs/epoch35/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.032341      0.031812
Force-RMSE  :        0.28259       0.27961
Force-MAE   :        0.19033       0.18891
Force-RSQ   :        0.98004       0.98059
MolEn-RMSE  :        0.48115       0.47516
MolEn-MAE   :        0.40173       0.39402
MolEn-RSQ   :        0.99869        0.9987
L2Reg       :         879.28        879.28
Loss-Err    :         1.2079        1.1924
Loss-Reg    :     0.00087928    0.00087928
Loss        :         1.2088        1.1933
Best Loss-Err so far:   0.8238
Epochs since last best: 7
Current max epochs: 48
Total epoch time:  9667.41 s
__________________________________________________
Epoch 36:
Learning rate:    0.0003


Training Batches:   4%|▍         | 30/774 [05:25<2:13:02, 10.73s/batch]

did not converge 1
not converged:  tensor(22, device='cuda:0')


Training Batches:   7%|▋         | 56/774 [10:09<2:08:19, 10.72s/batch]

did not converge 1
not converged:  tensor(25, device='cuda:0')


Training Batches:  19%|█▉        | 149/774 [26:59<1:54:09, 10.96s/batch]

did not converge 1
not converged:  tensor(57, device='cuda:0')


Training Batches:  34%|███▎      | 261/774 [47:15<1:32:12, 10.78s/batch]

did not converge 1
not converged:  tensor(27, device='cuda:0')


Training Batches:  45%|████▌     | 351/774 [1:03:32<1:16:48, 10.90s/batch]

did not converge 1
not converged:  tensor(43, device='cuda:0')


Training Batches:  53%|█████▎    | 410/774 [1:14:16<1:07:07, 11.06s/batch]

did not converge 1
not converged:  tensor(5, device='cuda:0')


Training Batches:  68%|██████▊   | 524/774 [1:34:52<45:23, 10.89s/batch]  

did not converge 1
not converged:  tensor(7, device='cuda:0')


Training Batches:  78%|███████▊  | 604/774 [1:49:22<30:32, 10.78s/batch]

did not converge 1
not converged:  tensor(9, device='cuda:0')


Training Batches:  95%|█████████▍| 733/774 [2:12:41<07:37, 11.16s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                        

Training time:  8393.6 s
Validating...


Evaluating train:   4%|▍         | 3/78 [00:21<08:45,  7.01s/batch]

did not converge 1
not converged:  tensor(41, device='cuda:0')


Evaluating train:  27%|██▋       | 21/78 [02:29<06:40,  7.03s/batch]

did not converge 1
not converged:  tensor(14, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch36/train
Saving plot at plots/epochs/epoch36/train/EtEi.png
Saving plot at plots/epochs/epoch36/train/grad.png
Saving plot at plots/epochs/epoch36/train/PerAtomEn.png


Evaluating valid:  88%|████████▊ | 85/97 [09:50<01:30,  7.52s/batch]

did not converge 1
not converged:  tensor(44, device='cuda:0')


                                                                    

Making plots. Saved location: plots/epochs/epoch36/valid
Saving plot at plots/epochs/epoch36/valid/EtEi.png
Saving plot at plots/epochs/epoch36/valid/grad.png
Saving plot at plots/epochs/epoch36/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.024564       0.02486
Force-RMSE  :    *   0.27695   *    0.2746
Force-MAE   :    *    0.1859   *   0.18462
Force-RSQ   :        0.98135       0.98128
MolEn-RMSE  :        0.37403       0.37324
MolEn-MAE   :        0.31193       0.31271
MolEn-RSQ   :        0.99919        0.9992
L2Reg       :         884.64        884.64
Loss-Err    :         1.0385        1.0349
Loss-Reg    :     0.00088464    0.00088464
Loss        :         1.0394        1.0357
Best Loss-Err so far:   0.8238
Epochs since last best: 8
Current max epochs: 48
Total epoch time:  9629.77 s
__________________________________________________
Epoch 37:
Learning rate:    0.0003


Training Batches:   1%|          | 8/774 [01:28<2:21:40, 11.10s/batch]

did not converge 1
not converged:  tensor(13, device='cuda:0')


Training Batches:  15%|█▌        | 118/774 [21:27<1:57:34, 10.75s/batch]

did not converge 1
not converged:  tensor(19, device='cuda:0')


Training Batches:  29%|██▊       | 222/774 [40:15<1:39:36, 10.83s/batch]

did not converge 1
not converged:  tensor(53, device='cuda:0')


Training Batches:  38%|███▊      | 295/774 [53:28<1:27:15, 10.93s/batch]

did not converge 1
not converged:  tensor(47, device='cuda:0')


Training Batches:  42%|████▏     | 322/774 [58:25<1:22:44, 10.98s/batch]

did not converge 1
not converged:  tensor(32, device='cuda:0')


Training Batches:  54%|█████▎    | 415/774 [1:15:24<1:04:19, 10.75s/batch]

did not converge 1
not converged:  tensor(53, device='cuda:0')


Training Batches:  64%|██████▍   | 496/774 [1:30:07<54:17, 11.72s/batch]  

did not converge 1
not converged:  tensor(26, device='cuda:0')


Training Batches:  67%|██████▋   | 520/774 [1:34:26<44:53, 10.61s/batch]

did not converge 1
not converged:  tensor(10, device='cuda:0')


Training Batches:  82%|████████▏ | 631/774 [1:54:21<25:34, 10.73s/batch]

did not converge 1
not converged:  tensor(29, device='cuda:0')


Training Batches:  95%|█████████▍| 733/774 [2:12:47<07:20, 10.75s/batch]

did not converge 1
not converged:  tensor(33, device='cuda:0')


                                                                        

Training time:  8401.85 s
Validating...


                                                                    

Making plots. Saved location: plots/epochs/epoch37/train
Saving plot at plots/epochs/epoch37/train/EtEi.png
Saving plot at plots/epochs/epoch37/train/grad.png
Saving plot at plots/epochs/epoch37/train/PerAtomEn.png


                                                                    

Making plots. Saved location: plots/epochs/epoch37/valid
Saving plot at plots/epochs/epoch37/valid/EtEi.png
Saving plot at plots/epochs/epoch37/valid/grad.png
Saving plot at plots/epochs/epoch37/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.079587      0.079955
Force-RMSE  :        0.28881       0.28903
Force-MAE   :        0.19589       0.19441
Force-RSQ   :        0.97931       0.97927
MolEn-RMSE  :         1.0416        1.0368
MolEn-MAE   :         0.9918       0.98747
MolEn-RSQ   :        0.99387       0.99382
L2Reg       :         891.65        891.65
Loss-Err    :         2.1388        2.1302
Loss-Reg    :     0.00089165    0.00089165
Loss        :         2.1397        2.1311
Best Loss-Err so far:   0.8238
Epochs since last best: 9
Current max epochs: 48
Total epoch time:  9627.79 s
__________________________________________________
Epoch 38:
Learning rate:    0.0003


Training Batches:  24%|██▍       | 188/774 [34:08<1:46:47, 10.93s/batch]

did not converge 1
not converged:  tensor(49, device='cuda:0')


Training Batches:  31%|███       | 240/774 [43:36<1:35:51, 10.77s/batch]

did not converge 1
not converged:  tensor(35, device='cuda:0')


Training Batches:  35%|███▍      | 268/774 [48:45<1:34:36, 11.22s/batch]

did not converge 1
not converged:  tensor(25, device='cuda:0')


Training Batches:  44%|████▎     | 338/774 [1:01:29<1:18:32, 10.81s/batch]

did not converge 1
not converged:  tensor(41, device='cuda:0')


Training Batches:  53%|█████▎    | 408/774 [1:14:09<1:05:22, 10.72s/batch]

did not converge 1
not converged:  tensor(1, device='cuda:0')


Training Batches:  59%|█████▉    | 457/774 [1:23:04<57:26, 10.87s/batch]  

did not converge 1
not converged:  tensor(43, device='cuda:0')


Training Batches:  65%|██████▍   | 503/774 [1:31:21<48:01, 10.63s/batch]  

did not converge 1
not converged:  tensor(33, device='cuda:0')


Training Batches:  67%|██████▋   | 518/774 [1:34:05<45:58, 10.77s/batch]

did not converge 1
not converged:  tensor(34, device='cuda:0')


Training Batches:  78%|███████▊  | 600/774 [1:49:01<31:24, 10.83s/batch]

did not converge 1
not converged:  tensor(57, device='cuda:0')


Training Batches:  88%|████████▊ | 678/774 [2:03:23<17:13, 10.77s/batch]

did not converge 1
not converged:  tensor(23, device='cuda:0')


                                                                        

Training time:  8434.28 s
Validating...


                                                                    

Making plots. Saved location: plots/epochs/epoch38/train
Saving plot at plots/epochs/epoch38/train/EtEi.png
Saving plot at plots/epochs/epoch38/train/grad.png
Saving plot at plots/epochs/epoch38/train/PerAtomEn.png


                                                                    

Making plots. Saved location: plots/epochs/epoch38/valid
Saving plot at plots/epochs/epoch38/valid/EtEi.png
Saving plot at plots/epochs/epoch38/valid/grad.png
Saving plot at plots/epochs/epoch38/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.057585      0.057878
Force-RMSE  :        0.28485       0.28738
Force-MAE   :        0.19415       0.19443
Force-RSQ   :        0.97964        0.9795
MolEn-RMSE  :        0.75737       0.76093
MolEn-MAE   :        0.70334       0.70506
MolEn-RSQ   :        0.99668       0.99667
L2Reg       :         899.73        899.73
Loss-Err    :          1.679        1.6861
Loss-Reg    :     0.00089973    0.00089973
Loss        :         1.6799         1.687
Best Loss-Err so far:   0.8238
Epochs since last best: 10
Current max epochs: 48
Total epoch time:  9651.45 s
__________________________________________________
Epoch 39:
Learning rate:    0.0003


Training Batches:   7%|▋         | 52/774 [09:18<2:09:15, 10.74s/batch]

did not converge 1
not converged:  tensor(0, device='cuda:0')


Training Batches:   8%|▊         | 59/774 [10:36<2:09:21, 10.85s/batch]

did not converge 1
not converged:  tensor(20, device='cuda:0')


Training Batches:  18%|█▊        | 137/774 [24:45<1:57:40, 11.08s/batch]

did not converge 1
not converged:  tensor(36, device='cuda:0')


Training Batches:  31%|███▏      | 243/774 [43:55<1:35:10, 10.75s/batch]

did not converge 1
not converged:  tensor(48, device='cuda:0')


Training Batches:  32%|███▏      | 251/774 [45:26<1:35:15, 10.93s/batch]

did not converge 1
not converged:  tensor(36, device='cuda:0')


Training Batches:  58%|█████▊    | 446/774 [1:20:45<1:02:14, 11.39s/batch]

did not converge 1
not converged:  tensor(19, device='cuda:0')


Training Batches:  65%|██████▍   | 503/774 [1:31:06<49:30, 10.96s/batch]  

did not converge 1
not converged:  tensor(10, device='cuda:0')


Training Batches:  95%|█████████▍| 732/774 [2:12:08<07:30, 10.72s/batch]

did not converge 1
not converged:  tensor(18, device='cuda:0')


                                                                        

Training time:  8372.94 s
Validating...


                                                                    

Making plots. Saved location: plots/epochs/epoch39/train
Saving plot at plots/epochs/epoch39/train/EtEi.png
Saving plot at plots/epochs/epoch39/train/grad.png
Saving plot at plots/epochs/epoch39/train/PerAtomEn.png


                                                                    

Making plots. Saved location: plots/epochs/epoch39/valid
Saving plot at plots/epochs/epoch39/valid/EtEi.png
Saving plot at plots/epochs/epoch39/valid/grad.png
Saving plot at plots/epochs/epoch39/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:        0.02586       0.02648
Force-RMSE  :        0.29139        0.2899
Force-MAE   :        0.19483       0.19308
Force-RSQ   :          0.979       0.97914
MolEn-RMSE  :        0.37271       0.37738
MolEn-MAE   :        0.30998       0.31443
MolEn-RSQ   :        0.99917       0.99918
L2Reg       :         906.88        906.88
Loss-Err    :         1.0595        1.0635
Loss-Reg    :     0.00090688    0.00090688
Loss        :         1.0604        1.0645
Best Loss-Err so far:   0.8238
Epochs since last best: 11
Current max epochs: 48
Total epoch time:  9595.65 s
__________________________________________________
Epoch 40:
Learning rate:    0.0003


Training Batches:   5%|▍         | 38/774 [06:52<2:10:47, 10.66s/batch]

did not converge 1
not converged:  tensor(46, device='cuda:0')


Training Batches:  11%|█         | 87/774 [15:41<2:05:21, 10.95s/batch]

did not converge 1
not converged:  tensor(2, device='cuda:0')


Training Batches:  13%|█▎        | 102/774 [18:24<1:59:20, 10.66s/batch]

did not converge 1
not converged:  tensor(14, device='cuda:0')


Training Batches:  36%|███▌      | 278/774 [50:22<1:29:03, 10.77s/batch]

did not converge 1
not converged:  tensor(56, device='cuda:0')


Training Batches:  37%|███▋      | 286/774 [51:50<1:26:29, 10.63s/batch]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Training Batches:  96%|█████████▌| 370/387 [1:59:21<05:27, 19.26s/batch]

did not converge 1
not converged:  tensor(32, device='cuda:0')


                                                                        

Training time:  7480.41 s
Validating...


                                                                    

Making plots. Saved location: plots/epochs/epoch41/train
Saving plot at plots/epochs/epoch41/train/EtEi.png
Saving plot at plots/epochs/epoch41/train/grad.png
Saving plot at plots/epochs/epoch41/train/PerAtomEn.png


                                                                    

Making plots. Saved location: plots/epochs/epoch41/valid
Saving plot at plots/epochs/epoch41/valid/EtEi.png
Saving plot at plots/epochs/epoch41/valid/grad.png
Saving plot at plots/epochs/epoch41/valid/PerAtomEn.png
                       train         valid
------------------------------------------
TperAtom MAE:       0.037154      0.037456
Force-RMSE  :    *   0.27388       0.27588
Force-MAE   :    *   0.18397   *   0.18375
Force-RSQ   :        0.98112       0.98111
MolEn-RMSE  :        0.50599       0.50993
MolEn-MAE   :        0.44907       0.44952
MolEn-RSQ   :        0.99848       0.99851
L2Reg       :         917.43        917.43
Loss-Err    :         1.2533        1.2586
Loss-Reg    :     0.00091743    0.00091743
Loss        :         1.2542        1.2595
Best Loss-Err so far:   0.8238
Epochs since last best: 13
Current max epochs: 48
Total epoch time:  8699.38 s
__________________________________________________
Epoch 42:
Learning rate:    0.0003


Training Batches:   2%|▏         | 6/387 [01:53<2:00:22, 18.96s/batch]

did not converge 1
not converged:  tensor(53, device='cuda:0')


Training Batches:   3%|▎         | 11/387 [03:32<2:02:07, 19.49s/batch]

did not converge 1
not converged:  tensor(126, device='cuda:0')


Training Batches:  10%|▉         | 38/387 [12:12<1:52:29, 19.34s/batch]

did not converge 1
not converged:  tensor(104, device='cuda:0')


Training Batches:  33%|███▎      | 128/387 [41:15<1:21:25, 18.86s/batch]

did not converge 1
not converged:  tensor(48, device='cuda:0')


Training Batches:  38%|███▊      | 147/387 [47:24<1:17:34, 19.39s/batch]