Imports:

In [3]:
import sys

# change to your directory

sys.path.append("/home/saxelrod/Repo")
sys.path.append("/home/saxelrod/Repo/projects/multi_task")
sys.path.append("/home/saxelrod/Repo/projects/multi_task/NeuralForceField")



import os
import shutil
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.optim import Adam

from nff.data import Dataset, split_train_validation_test
from nff.train import Trainer, get_trainer, get_model, loss, hooks, metrics, evaluate

from nff.train.builders.model import get_model
import data_gathering
from nff.data import loader 
import pickle
import pdb

In [5]:
import os

import sys
import django
import pprint
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Change to your directory
sys.path.append('/Users/simonaxelrod/htvs/djangochem')
sys.path.append('/Users/simonaxelrod/htvs')


# setup the django settings file.  Change this to use the settings file that connects you to your desired database
os.environ["DJANGO_SETTINGS_MODULE"] = "djangochem.settings.orgel"
# this must be run to setup access to the django settings and make database access work etc.
django.setup()

# Shell Plus Model Imports
from django.contrib.auth.models import Group, Permission, User
from jobs.models import Job, JobConfig, WorkBatch
from pgmols.models import Batch, Calc, Cluster, Geom, Hessian, Jacobian, Mechanism, Method, Mol, MolGroupObjectPermission, MolSet, MolUserObjectPermission, ProductLink, ReactantLink, Reaction, ReactionType, SinglePoint, Species, Stoichiometry
from django.contrib.admin.models import LogEntry
from django.contrib.contenttypes.models import ContentType
from django.contrib.sessions.models import Session
from features.models import AtomDescriptor, BondDescriptor, ConnectivityMatrix, DistanceMatrix, Fingerprint, ProximityMatrix, SpeciesDescriptor, TrainingSet, Transformation

from pgmols.models import Trajectory, MDFrame


from guardian.models import GroupObjectPermission, UserObjectPermission
# Shell Plus Django Imports
from django.urls import reverse
from django.conf import settings
from django.utils import timezone
from django.core.cache import cache
from django.db.models import SmallIntegerField, Avg, Case, Count, F, Max, Min, Prefetch, Q, Sum, When, Exists, OuterRef, Subquery, FloatField
from django.db import transaction
from django.contrib.auth import get_user_model

Functions for gathering data from the database:

In [9]:
# Maps database field names (as they appear in the `values` query in
# `get_data`) to the key names used in the saved data.
TRANSLATE_DIC = {
    "calcs__jacobian__forces": "force_0",
    "calcs__props__totalenergy": "energy_0",
    "species__smiles": "smiles",
    "stoichiometry__formula": "formula",
    "xyz": "nxyz",
}

# Query settings that `get_and_save_data` passes to `get_data`.
GROUP_NAME = "switches"
METHOD_NAME = "sf_tddft_bhhlyp"
MAX_GEOMS = 10

# The pickle is written to <FILE_PATH>/data/<METHOD_NAME>.pickle.
FILE_PATH = ""
DATA_FILE = os.path.join(FILE_PATH, "data", "{}.pickle".format(METHOD_NAME))


def translate_excited(name):
    """Convert an excited-state database field name into a dataset key.

    The name is split on "__". The second-to-last piece is parsed as an
    integer state index and shifted up by one; the last piece is the
    quantity, with "forces" renamed to "force".

    Example: "calcs__props__excitedstates__0__energy" -> "energy_1".

    Args:
        name (str): field name whose last two "__"-separated pieces are
            the state index and the quantity.

    Returns:
        str: "<quantity>_<state index + 1>".
    """
    pieces = name.split("__")
    state_index, quantity = pieces[-2], pieces[-1]
    if quantity == "forces":
        quantity = "force"
    return "{}_{}".format(quantity, int(state_index) + 1)


def translate_all(name):
    """Translate a database field name into the key used in the dataset.

    Names containing "excitedstates" are handled by `translate_excited`.
    Any other name is looked up in `TRANSLATE_DIC` and returned unchanged
    when it has no entry there.

    Args:
        name (str): field name from the database query.

    Returns:
        str: the translated key, or `name` itself if no translation applies.
    """
    if "excitedstates" in name:
        return translate_excited(name)
    # Fall back to the original name when no translation is defined.
    return TRANSLATE_DIC.get(name, name)


def get_data(group_name, method_name, max_geoms):
    """Query the database for geometries and merge their rows into dicts.

    Geometries are filtered by calculation method name and species group
    name, ordered by stoichiometry and id, and read with a `values` query.
    The same geometry id can appear in several consecutive rows; those rows
    are merged into a single dict, with keys renamed by `translate_all`.

    Args:
        group_name (str): name of the species group to select.
        method_name (str): name of the calculation method to select.
        max_geoms (int): maximum number of geometries to return.

    Returns:
        list[dict]: one dict per geometry, at most `max_geoms` entries.
    """
    geoms = Geom.objects.filter(
        calcs__method__name=method_name, species__group__name=group_name).distinct()
    # Fetch up to twice as many rows as geometries requested, because one
    # geometry may occupy more than one row (merged in the loop below).
    rows = geoms.order_by('stoichiometry', 'id').values(
        'xyz', 'calcs__props__totalenergy', 'calcs__jacobian__forces',
        'calcs__props__excitedstates__0__energy', 'calcs__props__excitedstates__0__forces',
        'stoichiometry__formula', 'species__smiles', 'id')[:int(max_geoms * 2)]

    data = []
    old_pk = None
    for row in rows:
        pk = row["id"]
        if pk != old_pk:
            # First row of a new geometry: keep every field, including Nones.
            data.append({translate_all(key): val for key, val in row.items()})
        else:
            # Repeated row for the same geometry: only fill in real values so
            # that data from the earlier row is never overwritten with None.
            for key, val in row.items():
                if val is not None:
                    data[-1][translate_all(key)] = val
        old_pk = pk

    # The row limit above is only a heuristic; when geometries occupy fewer
    # than two rows each it lets extra geometries through, so enforce the
    # requested maximum here.
    return data[:int(max_geoms)]


def get_and_save_data():
    """Fetch the data with `get_data` and pickle it to `DATA_FILE`.

    Uses the module-level `GROUP_NAME`, `METHOD_NAME` and `MAX_GEOMS`
    settings. The directory containing `DATA_FILE` is created if it does
    not exist yet.
    """
    data = get_data(group_name=GROUP_NAME,
                    method_name=METHOD_NAME, max_geoms=MAX_GEOMS)
    # open() does not create missing parent directories, so make sure the
    # target directory exists before writing.
    directory = os.path.dirname(DATA_FILE)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(DATA_FILE, "wb") as handle:
        pickle.dump(data, handle)
        


Get the data, save it, and load it back up. You only need to fetch and save the data the first time around; after that, comment out the `get_and_save_data()` call. We then set some of the quantities to `None` to make sure that `Dataset` handles missing values properly.

In [10]:
# Query the database and write the pickle (only needed the first time).
get_and_save_data()

# Read the saved data back from disk.
with open(DATA_FILE, "rb") as f:
    data = pickle.load(f)

# `props` refers to the same list object as `data`. Blank out two
# quantities on the first entry to exercise the handling of missing values.
props = data
props[0].update({"force_1": None, "energy_1": None})



Here's the structure of `props`. Note that it's a list of dicts rather than a dict of lists, because this is how the data comes out of a `values` query of the database.

In [17]:
# Display the raw list of per-geometry dicts.
props

[{'energy_0': -569.0118621472,
  'energy_1': None,
  'force_0': [[-0.016909665, 0.024476257, 0.098884938],
   [0.013802112, -0.03424701, -0.037627308],
   [0.01895837, -0.008069412, 0.007082978],
   [-0.033425755, 0.004157544, 0.052195572],
   [0.022284687, -0.003379596, -0.016916221],
   [-0.056446686, -0.004053487, 0.009094824],
   [0.009112833, 0.018262588, -0.017639168],
   [0.014742618, -0.033972365, -0.059260794],
   [-0.013090793, 0.043369143, 0.081267158],
   [0.012081809, -0.033166524, -0.037385255],
   [0.065905431, -0.021741473, -0.017811093],
   [-0.057519661, 0.036994076, 0.066264044],
   [0.034000659, 0.019505285, -0.060536278],
   [0.001363385, -0.005077309, -0.08718219],
   [0.07606586, 0.000178634, 0.003268399],
   [0.007624037, 0.005428062, 0.032036146],
   [-0.015521493, 0.003730761, 0.044432814],
   [0.025968399, -0.00408894, 0.03358349],
   [0.018482651, -0.023836344, -0.073949602],
   [-0.068658394, 0.028853933, 0.002959216],
   [-0.02484006, 0.013752394, 0.021675

Create the dataset from props:

In [None]:
# Build a Dataset from the list of per-geometry dicts.
dataset = Dataset(props=props)

Visualize the structure of `dataset.props`:

In [16]:
# Display the properties as stored on the Dataset.
dataset.props

{'energy_0': [tensor(-357060.0312),
  tensor(-357056.4062),
  tensor(-357047.2500),
  tensor(-357037.5938),
  tensor(-357030.6875),
  tensor(-357027.0625),
  tensor(-357025.4062),
  tensor(-357023.6250),
  tensor(-357021.6562),
  tensor(-357020.2812)],
 'energy_1': [tensor(nan),
  tensor(-356986.1875),
  tensor(-356978.4688),
  tensor(-356970.9062),
  tensor(-356966.6562),
  tensor(-356973.6562),
  tensor(-356979.5625),
  tensor(-356981.7812),
  tensor(-356981.3750),
  tensor(-356981.1875)],
 'force_0': [tensor([[ -20.0518,   29.0244,  117.2598],
          [  16.3668,  -40.6108,  -44.6192],
          [  22.4812,   -9.5689,    8.3991],
          [ -39.6369,    4.9301,   61.8946],
          [  26.4256,   -4.0076,  -20.0596],
          [ -66.9356,   -4.8067,   10.7848],
          [  10.8062,   21.6562,  -20.9169],
          [  17.4821,  -40.2851,  -70.2727],
          [ -15.5233,   51.4280,   96.3683],
          [  14.3269,  -39.3295,  -44.3322],
          [  78.1520,  -25.7815,  -21.1208

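`dataset.props` is a dict of lists rather than a list of dicts, because this is the most natural way for the neural network to handle the data. Two other differences from `props` are visible in the output above: the entries that were set to `None` now appear as `nan` tensors, and the numerical values differ from those in `props` — the ratios (about 627.5 for the energies) suggest that `Dataset` converted them from atomic units to kcal/mol-based units.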