In [1]:
# test train task
# train a model with enough runs
# compare results with default MD DFT
!nvidia-smi

Tue Mar 26 23:17:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:04:00.0 Off |                  N/A |
| 31%   35C    P8               7W / 250W |      2MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:0B:00.0 Off |  

In [1]:
# import modules
from pathlib import Path
import torch
import logging
import shutil
from collections import deque
from dataclasses import dataclass
from random import shuffle, sample
from typing import Dict, Any, Optional
import json
from functools import partial, update_wrapper
import numpy as np
import time
import pickle

import ase
from ase.db import connect
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
from fff.learning.gc.ase import SchnetCalculator
from fff.learning.gc.functions import GCSchNetForcefield
from fff.learning.gc.models import SchNet, load_pretrained_model
from fff.learning.util.messages import TorchMessage
from fff.sampling.md import MolecularDynamics
from fff.simulation import run_calculator, _run_calculator
from fff.simulation.utils import read_from_string, write_to_string

In [3]:
## pynvml
import pynvml
from torch.utils.data.distributed import DistributedSampler

# def get_gpu_info():
#     gpu_info = {}
#     try:
#         pynvml.nvmlInit()
#         device_count = pynvml.nvmlDeviceGetCount()
#         for i in range(device_count):
#             handle = pynvml.nvmlDeviceGetHandleByIndex(i)
#             gpu_name = pynvml.nvmlDeviceGetName(handle).decode('utf-8')
#             gpu_info[f'GPU {i+1}'] = gpu_name.strip()
#         pynvml.nvmlShutdown()
#     except pynvml.NVMLError as error:
#         print("Error: Failed to retrieve GPU information -", error)
#     return gpu_info

# # 调用函数获取GPU信息并存储在字典中
# gpu_dict = get_gpu_info()

# # 打印GPU字典
# for gpu, model in gpu_dict.items():
#     print(f"{gpu}: {model}")

# torch.distributed.init_process_group(backend="nccl")
print(torch.cuda.device_count())
device = torch.device('cuda:1')
print(device)
# print(torch.distributed.get_rank())

4
cuda:1


In [2]:
# path and varaibles
multisite_path = "/home/lizz_lab/cse12232433/project/colmena/multisite_"
training_set = multisite_path + \
    "/data/forcefields/starting-model/initial-database.db"
model_path = multisite_path + "/data/forcefields/starting-model/starting-model"
search_path = training_set
out_dir = Path(multisite_path) / f"my_test/temp"
out_dir.mkdir(parents=True, exist_ok=True)

starting_model = torch.load(model_path, map_location='cpu')

num_epochs = 128
huber_deltas = (1, 10)
sampler_kwargs = {'device': "cpu", 'timestep': 0.1, 'log_interval': 10}
sampler = MolecularDynamics()
n_models = 1
n_qc_workers = 8
min_run_length = 200
max_run_length = 2000
energy_tolerance = 0.1

In [3]:
# train model pretreat

# Apply wrappers to functions that will be used to fix certain requirements
def _wrap(func, **kwargs):
    out = partial(func, **kwargs)
    update_wrapper(out, func)
    return out

# MD objectives


@dataclass
class Trajectory:
    """Tracks the state of searching along individual trajectories

    We mark the starting point, the last point produced from sampling,
    and the last point we produced that has been validated
    """
    id: int  # ID number of the
    starting: ase.Atoms  # Starting point of the trajectory
    current_timestep = 0  # How many timesteps have been used so far
    last_validated: ase.Atoms = None  # Last validated point on the trajectory
    current: ase.Atoms = None  # Last point produced along the trajectory
    last_run_length: int = 0  # How long between current and last_validated
    name: str = None  # Name of the trajectory

    def __post_init__(self):
        self.last_validated = self.current = self.starting

    def update_current_structure(self, strc: ase.Atoms, run_length: int):
        """Update the structure that has yet to be updated

        Args:
            strc: Structure produced by sampling
            run_length: How many timesteps were performed in sampling run
        """
        self.current = strc.copy()
        self.last_run_length = run_length

    def set_validation(self, success: bool):
        """Set whether the trajectory was successfully validated

        Args:
            success: Whether the validation was successful
        """
        if success:
            self.last_validated = self.current  # Move the last validated forward
            self.current_timestep += self.last_run_length


@dataclass
class SimulationTask:
    atoms: ase.Atoms  # Structure to be run
    traj_id: int  # Which trajectory this came from
    ml_eng: float  # Energy predicted from machine learning model
    ml_std: Optional[float] = None  # Uncertainty of the model


# get model
schnet = GCSchNetForcefield(starting_model)

# copy training data
train_path = out_dir / "train.db"
shutil.copyfile(training_set, train_path)

device = torch.device('cuda')
# wrap functions
# train model
my_train_schnet = _wrap(schnet.train, num_epochs=num_epochs, device=device,
                        patience=8, reset_weights=False,
                        huber_deltas=huber_deltas)

# evaluate model
my_eval_schnet = _wrap(schnet.evaluate, device=device)

# use model sampling
my_run_dynamics = _wrap(sampler.run_sampling, **sampler_kwargs)


# prepare input
# Load in the search space
with connect(search_path) as db:
    search_space = [Trajectory(i, x.toatoms(), name=x.get(
        'filename', f'traj-{i}')) for i, x in enumerate(db.select(''))]
    shuffle(search_space)
    search_space = deque(search_space)

# Load in the training dataset
with connect(train_path) as db:
    all_examples = np.array([x.toatoms() for x in db.select("")], dtype=object)

    # Remove the unrealistic structures
    # if self.max_force is not None:
    #     all_examples = [a for a in all_examples if np.abs(a.get_forces()).max() < max_force]

# search space queue
to_audit: dict[int, Trajectory] = {}  # Trajectories that need to be audited
audit_results: deque[float] = deque(maxlen=50)  # Results of the last 50 audits
task_queue_audit = []

# Prepare the initial model
StartModelMessage = TorchMessage(starting_model)
ActiveModelMessage = SchnetCalculator(starting_model)
# Prepare the dataset
train_sets = []
valid_sets = []
n_train = int(len(all_examples) * 0.9)
for _ in range(n_models):
    shuffle(all_examples)
    train_sets.append(all_examples[:n_train])
    valid_sets.append(all_examples[n_train:])

# store model and log
model_msgs = []
train_logs = []

In [4]:
# train model
for i in range(0, 1):
    for i, train_set in enumerate(valid_sets):
        model_msg, train_log = my_train_schnet(
            model_msg=StartModelMessage, train_data=train_set, valid_data=valid_sets[i])
        model_msgs.append(model_msg)
        train_logs.append(train_log)

    # store model
    # now we just test one model
    model_save_path = out_dir / "model.pth"
    with open(model_save_path, 'wb') as fp:
        torch.save(model_msgs[0].get_model(), fp)
    # Save the training data
    with open(out_dir / 'training-history.json', 'a') as fp:
        print(json.dumps(train_logs[0].to_dict(orient='list')), file=fp)

    active_model_proxy = SchnetCalculator(model_msgs[0].get_model())
    StartModelMessage = TorchMessage(model_msgs[0].get_model())
    model_msgs = []
    train_logs = []

StopIteration: Caught StopIteration in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/lizz_lab/cse12232433/miniconda3/envs/multisite/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/lizz_lab/cse12232433/miniconda3/envs/multisite/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/lizz_lab/cse12232433/project/colmena/multisite_/finetuning-surrogates/fff/learning/gc/models.py", line 191, in forward
    device = next(self.parameters()).device
StopIteration


In [6]:
# use model for sampling
# sampling tasks loop
for i in range(0, 100):
    # Pick the next eligible trajectory and start from the last validated structure
    trajectory = search_space.popleft()
    starting_point = trajectory.starting

    # Initialize the structure if need be
    if trajectory.current_timestep == 0:
        MaxwellBoltzmannDistribution(starting_point, temperature_K=100)
        print('Initialized temperature to 100K')
    # Add the structure to a list of those being validated
    to_audit[trajectory.id] = trajectory

    # Determine the run length based on observations of errors
    run_length = min_run_length
    if len(audit_results) > n_qc_workers:
        # Predict run length given audit error
        error_per_step = np.median(audit_results)
        target_error = energy_tolerance * 2
        estimated_run_length = int(target_error / error_per_step)
        print(
            f'Estimated run length of {estimated_run_length} steps to have an error of {target_error:.3f} eV/atom')
        # Keep to within the user-defined bounds
        run_length = max(min_run_length, min(
            max_run_length, estimated_run_length))

    # do sampling
    audit, traj = my_run_dynamics(
        atoms=starting_point, steps=run_length, calc=active_model_proxy)
    # print(audit)
    # print(len(traj))
    # add to list
    to_audit[trajectory.id].update_current_structure(audit, run_length)
    task_queue_audit.append(SimulationTask(
        atoms=traj[-1], traj_id=trajectory.id, ml_eng=traj[-1].get_potential_energy()))

print(len(task_queue_audit))

Initialized temperature to 100K


NameError: name 'active_model_proxy' is not defined

In [None]:
# store sampling result
import pickle
with open(out_dir / 'task_queue_audit', 'wb') as fp:
    pickle.dump(task_queue_audit, fp)

In [None]:
# test simulation
# real MD simulation
tempdir = "./temp"

# set calculator
calc = dict(calc='psi4', method='pbe0-d3', basis='aug-cc-pvdz', num_threads=24)
my_run_simulation = _wrap(run_calculator, calc=calc, temp_path=tempdir)

# prepare input
to_run = task_queue_audit[-1]
ml_eng = to_run.ml_eng
atoms = to_run.atoms
atoms.set_center_of_mass([0, 0, 0])
xyz = write_to_string(atoms, 'xyz')

# run
value = my_run_simulation(xyz)

# result
atoms = read_from_string(value, 'json')
dft_energy = atoms.get_potential_energy()
diff_energy = abs(dft_energy - ml_eng) / len(atoms)
print(diff_energy)

  Threads set to 24 by Python driver.
  Threads set to 24 by Python driver.
0.07407622481029345


In [None]:
# simple grid search
import time
import itertools
from ase.build import molecule
from concurrent.futures import ProcessPoolExecutor, as_completed
from fff.simulation import run_calculator, _run_calculator
from fff.simulation.utils import read_from_string, write_to_string

parameter_space = []
tempdir = "./temp"

def generate_search_space(num_cpus, max_parallelism):
    search_space = []
    for parallelism in range(1, max_parallelism + 1):
        combinations = itertools.combinations_with_replacement(
            range(1, num_cpus + 1), parallelism)
        for combination in combinations:
            if sum(combination) == num_cpus:
                search_space.append(combination)
    return search_space


def bundle_simulation_task(run_parames=[8, 8, 8], atoms_queue=[]):
    # simulation here
    atoms = []
    task_infos = []
    # for _ in range(0,len(run_parames)):
    #     ##TODO we should choose proper atoms here
    #     atoms.append(atoms_queue.pop())
    
    # simple test
    # atoms.append(molecule('H2O'))
    atoms = molecule('H2O')
    with ProcessPoolExecutor(max_workers=len(run_parames)) as exe:
        start_times = {}
        futs = []
        for i,cpus in enumerate(run_parames):
            # atoms[0].set_center_of_mass([0, 0, 0])
            xyz = write_to_string(atoms, 'xyz')
            calc = dict(calc='psi4', method='pbe0-d3', basis='aug-cc-pvdz', num_threads=cpus)            
            
            fut = exe.submit(_run_calculator, str(xyz), calc, tempdir)
            start_times[fut] = time.time()
            futs.append(fut)
            
    execution_times = []
    for fut in as_completed(futs):
        execution_times.append(time.time() - start_times[fut])
        calculated = read_from_string(fut.result(), 'json')
        dft_energy = calculated.get_potential_energy()
        # diff_energy = abs(dft_energy - ml_eng) / len(calculated)

    # task_info = {
    #     'atoms': calculated,
    #     'run_parames': cpus,
    #     'time': end - start,
    #     #TODO 'ml_eng' : to_run.ml_eng,
    #     'dft_eng': dft_energy,
    # }
    # task_infos.append(task_info)
    return execution_times


num_cpus = 24
max_parallelism = 8
parameter_space = generate_search_space(num_cpus, max_parallelism)

print(len(parameter_space))
print(parameter_space)

gird_results = []
for run_parames in parameter_space:
    gird_results.append((run_parames,bundle_simulation_task(run_parames)))


In [None]:
import json
print(len(gird_results))

with open('./temp/grid_results.json', 'w') as json_file:
    json.dump(gird_results, json_file)

821


In [None]:
# test pickle serialization

with open(out_dir / 'task_queue_audit', 'rb') as fp:
    task_queue_audit_test = pickle.load(fp)

# test simulation
# real MD simulation
tempdir = "./temp"

# set calculator
calc = dict(calc='psi4', method='pbe0-d3', basis='aug-cc-pvdz', num_threads=24)
my_run_simulation = _wrap(run_calculator, calc=calc, temp_path=tempdir)

# prepare input

# to_run = task_queue_audit_test[-1]
# ml_eng = to_run.ml_eng
# atoms = to_run.atoms
# atoms.set_center_of_mass([0, 0, 0])

# atoms = molecule('H2O')
# xyz = write_to_string(atoms, 'xyz')

# run
# value = my_run_simulation(xyz)

# result
# atoms = read_from_string(value, 'json')
# dft_energy = atoms.get_potential_energy()
# diff_energy = abs(dft_energy - ml_eng) / len(atoms)
# print(diff_energy)


execution_times = []
diff_energies = []
simulation_lists = []
for to_run in task_queue_audit_test:
    start = time.time()
    ml_eng = to_run.ml_eng
    atoms = to_run.atoms
    atoms.set_center_of_mass([0, 0, 0])
    xyz = write_to_string(atoms, 'xyz')

    # run
    value = my_run_simulation(xyz)
    
    # result
    atoms = read_from_string(value, 'json')
    dft_energy = atoms.get_potential_energy()
    diff_energy = abs(dft_energy - ml_eng) / len(atoms)
    diff_energies.append(diff_energy)
    simulation_lists.append(atoms)
    execution_times.append(time.time() - start)
    # print(diff_energy)

data = []
for x,y,z in simulation_lists,diff_energies,execution_times:
    data.append((x,y,z))
with open('./temp/simulation_tasks_execution_time.json', 'w') as json_file:
    json.dump(data, json_file)

  Threads set to 24 by Python driver.
  Threads set to 24 by Python driver.


ValueError: Calculation failed: 
Fatal Error: Matrix::power: C_DSYEV failed
Error occurred in file: /scratch/psilocaluser/conda-builds/psi4-multiout_1657298395608/work/psi4/src/psi4/libmints/matrix.cc on line: 2330
The most recent 5 function calls were:

psi::FittingMetric::form_eig_inverse(double)
psi::DiskDFJK::preiterations()


In [1]:
# import modules
from pathlib import Path
import torch
import logging
import shutil
from collections import deque
from dataclasses import dataclass
from random import shuffle, sample
from typing import Dict, Any, Optional
import json
from functools import partial, update_wrapper
import numpy as np
import time
import pickle
import multiprocessing

import ase
from ase.db import connect
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
import fff
from fff.learning.gc.ase import SchnetCalculator
from fff.learning.gc.functions import GCSchNetForcefield
from fff.learning.gc.models import SchNet, load_pretrained_model
from fff.learning.util.messages import TorchMessage
from fff.sampling.md import MolecularDynamics
from fff.simulation import run_calculator, _run_calculator
from fff.simulation.utils import read_from_string, write_to_string

import os
import time
from collections import defaultdict
from contextlib import redirect_stderr
from pathlib import Path
from tempfile import TemporaryDirectory

# path and varaibles
multisite_path = "/home/lizz_lab/cse12232433/project/colmena/multisite_"
training_set = multisite_path + \
    "/data/forcefields/starting-model/initial-database.db"
model_path = multisite_path + "/data/forcefields/starting-model/starting-model"
search_path = training_set
out_dir = Path(multisite_path) / f"my_test/temp"
out_dir.mkdir(parents=True, exist_ok=True)

starting_model = torch.load(model_path, map_location='cpu')

num_epochs = 12
huber_deltas = (1, 10)
sampler_kwargs = {'device': "cpu", 'timestep': 0.1, 'log_interval': 10}
sampler = MolecularDynamics()
n_models = 1
n_qc_workers = 8
min_run_length = 200
max_run_length = 2000
energy_tolerance = 0.1

# train model pretreat

# Apply wrappers to functions that will be used to fix certain requirements
def _wrap(func, **kwargs):
    out = partial(func, **kwargs)
    update_wrapper(out, func)
    return out

# MD objectives


@dataclass
class Trajectory:
    """Tracks the state of searching along individual trajectories

    We mark the starting point, the last point produced from sampling,
    and the last point we produced that has been validated
    """
    id: int  # ID number of the
    starting: ase.Atoms  # Starting point of the trajectory
    current_timestep = 0  # How many timesteps have been used so far
    last_validated: ase.Atoms = None  # Last validated point on the trajectory
    current: ase.Atoms = None  # Last point produced along the trajectory
    last_run_length: int = 0  # How long between current and last_validated
    name: str = None  # Name of the trajectory

    def __post_init__(self):
        self.last_validated = self.current = self.starting

    def update_current_structure(self, strc: ase.Atoms, run_length: int):
        """Update the structure that has yet to be updated

        Args:
            strc: Structure produced by sampling
            run_length: How many timesteps were performed in sampling run
        """
        self.current = strc.copy()
        self.last_run_length = run_length

    def set_validation(self, success: bool):
        """Set whether the trajectory was successfully validated

        Args:
            success: Whether the validation was successful
        """
        if success:
            self.last_validated = self.current  # Move the last validated forward
            self.current_timestep += self.last_run_length


@dataclass
class SimulationTask:
    atoms: ase.Atoms  # Structure to be run
    traj_id: int  # Which trajectory this came from
    ml_eng: float  # Energy predicted from machine learning model
    ml_std: Optional[float] = None  # Uncertainty of the model


# get model
schnet = GCSchNetForcefield(starting_model)

# copy training data
train_path = out_dir / "train.db"
shutil.copyfile(training_set, train_path)

device = torch.device('cuda')
# wrap functions
# train model
my_train_schnet = _wrap(schnet.train, num_epochs=num_epochs, device=device,
                        patience=8, reset_weights=False,
                        huber_deltas=huber_deltas)

# evaluate model
my_eval_schnet = _wrap(schnet.evaluate, device=device)

# use model sampling
my_run_dynamics = _wrap(sampler.run_sampling, **sampler_kwargs)


# prepare input
# Load in the search space
with connect(search_path) as db:
    search_space = [Trajectory(i, x.toatoms(), name=x.get(
        'filename', f'traj-{i}')) for i, x in enumerate(db.select(''))]
    shuffle(search_space)
    search_space = deque(search_space)

# Load in the training dataset
with connect(train_path) as db:
    all_examples = np.array([x.toatoms() for x in db.select("")], dtype=object)

    # Remove the unrealistic structures
    # if self.max_force is not None:
    #     all_examples = [a for a in all_examples if np.abs(a.get_forces()).max() < max_force]

# search space queue
to_audit: dict[int, Trajectory] = {}  # Trajectories that need to be audited
audit_results: deque[float] = deque(maxlen=50)  # Results of the last 50 audits
task_queue_audit = []

# Prepare the initial model
StartModelMessage = TorchMessage(starting_model)
ActiveModelMessage = SchnetCalculator(starting_model)
# Prepare the dataset
train_sets = []
valid_sets = []
n_train = int(len(all_examples) * 0.9)
for _ in range(n_models):
    shuffle(all_examples)
    train_sets.append(all_examples[:n_train])
    valid_sets.append(all_examples[n_train:])

# store model and log
model_msgs = []
train_logs = []



In [2]:
import importlib
importlib.reload(fff)


schnet = fff.learning.gc.functions.GCSchNetForcefield(starting_model)
## DDP
train_data = train_sets[0]
valid_data = valid_sets[0]
gpu=[0,1,2,3]
my_train_schnet = _wrap(schnet.train, num_epochs=2, patience=8, reset_weights=False, huber_deltas=huber_deltas,parallel=2)
gpu=[2,3]
# gpu_str = ','.join(map(str, gpu))
# os.environ['CUDA_VISIBLE_DEVICES'] = gpu_str
for i in range(1):
    start_time = time.time()
    result,log = my_train_schnet(train_data=train_data,valid_data=valid_data, device="cuda", gpu=gpu,cpu=4,model_msg=StartModelMessage)
    # result, log = schnet.train(num_epochs=2,patience=8,reset_weights=False,huber_deltas=huber_deltas,train_data=train_data,valid_data=valid_data, device="cuda", gpu=gpu,cpu=4,model_msg=StartModelMessage, parallel=2)
    print(type(result))
    print(log)
    print(f"running time {start_time - time.time()}")

# test multi process with DDP
# gpu0 = [0,1]
# gpu1 = [2,3]
# p1 = multiprocessing.Process(target=my_train_schnet, kwargs={"train_data":train_data,"valid_data":valid_data,"device":"cuda", "gpu":gpu0,"cpu":4,"model_msg":StartModelMessage})
# p2 = multiprocessing.Process(target=my_train_schnet, kwargs={"train_data":train_data,"valid_data":valid_data,"device":"cuda", "gpu":gpu1,"cpu":4,"model_msg":StartModelMessage})
# p1.start()
# p2.start()

<class 'fff.learning.util.messages.TorchMessage'>
   epoch      time  train_loss_force  train_loss_energy  train_loss_total  \
0      0  0.632660          0.174050           0.001451          0.156790   
1      1  1.082573          1.044265           0.000632          0.939902   

   valid_loss_force  valid_loss_energy  valid_loss_total  
0          0.945775           0.000142          0.851212  
1          0.200702           0.000388          0.180671  
running time -12.918474912643433


In [5]:
!export CUDA_VISIBLE_DEVICES=0,3
!echo $CUDA_VISIBLE_DEVICES




In [3]:
import parsl
from parsl.executors import HighThroughputExecutor, WorkQueueExecutor
from parsl.providers import CobaltProvider, AdHocProvider, SlurmProvider, LocalProvider
from parsl.addresses import address_by_hostname
from parsl.launchers import AprunLauncher,SrunLauncher
from parsl.channels import SSHChannel, LocalChannel, SSHInteractiveLoginChannel
from parsl import Config
from parsl.app.app import python_app

# 配置 Parsl 运行时
local_config = Config(
    executors=[
        HighThroughputExecutor(
            label="htex_Local",
            worker_debug=True,
            available_accelerators=2,
            provider=LocalProvider(
                channel=LocalChannel(),
                init_blocks=1,
                max_blocks=1,
                worker_init='''
                which python
                '''
            ),
        )
    ],
    strategy='none',
)
parsl.load(local_config)



<parsl.dataflow.dflow.DataFlowKernel at 0x7f9b08147040>

In [5]:
# options = {'executors': 'all'}
my_train = python_app(my_train_schnet)
result1 = my_train(train_data=train_data,valid_data=valid_data, device="cuda", gpu=[2,3],cpu=4,model_msg=starting_model)
# result2 = my_train(train_data=train_data,valid_data=valid_data, device="cuda", gpu=[1,2],cpu=4,model_msg=starting_model)
print(result1.result())
# print(result2.result())

(<fff.learning.util.messages.TorchMessage object at 0x7f9a3f600580>,    epoch      time  train_loss_force  train_loss_energy  train_loss_total  \
0      0  0.870774          0.174067           0.001451          0.156805   
1      1  1.414091          1.046277           0.000634          0.941713   

   valid_loss_force  valid_loss_energy  valid_loss_total  
0          0.945953           0.000142          0.851372  
1          0.201242           0.000388          0.181156  )


In [82]:
## origin
import ase
import numpy as np
import pandas as pd
import torch
import torch_geometric
from torch.nn import functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.loader import DataListLoader

from fff.learning.gc.data import AtomsDataset
from fff.learning.gc.models import SchNet
from fff.learning.base import BaseLearnableForcefield, ModelMsgType
from fff.learning.util.messages import TorchMessage

from torch_geometric.nn import data_parallel
from torch_geometric.data import Batch
## torch DDP, not completed
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel


sch = GCSchNetForcefield()
model = sch.get_model(StartModelMessage)

# model = data_parallel.DataParallel(model)
model.to('cuda')

# Unpack some inputs
huber_eng, huber_force = huber_deltas
batch_size: int = 32
learning_rate: float = 1e-3
huber_deltas: (float, float) = (0.5, 1)
energy_weight: float = 0.1
reset_weights: bool = False
patience: int = None
train_data = train_sets[0]
valid_data = valid_sets[0]
num_epochs = 32


# Start the training process
with TemporaryDirectory(prefix='spk') as td:
    td = Path(td)
    # Save the batch to an ASE Atoms database
    with open(os.devnull, 'w') as fp, redirect_stderr(fp):
        train_dataset = AtomsDataset.from_atoms(train_data, td / 'train')
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        # train_loader = DataListLoader(train_dataset, batch_size=batch_size, shuffle=True)

        valid_dataset = AtomsDataset.from_atoms(valid_data, td / 'valid')
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
        # valid_loader = DataListLoader(valid_dataset, batch_size=batch_size, shuffle=False)


    # Make the trainer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    if patience is None:
        patience = num_epochs // 8
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience, factor=0.8, min_lr=1e-6)

    # Store the best loss
    best_loss = torch.inf

    # Loop over epochs
    log = []
    model.train()
    for epoch in range(num_epochs):
        # Iterate over all batches in the training set
        train_losses = defaultdict(list)
        for batch in train_loader:
            batch.to(device)
            print(batch)
            batch.pos.requires_grad = True
            energy = model(batch)
            force = -torch.autograd.grad(energy, batch.pos, grad_outputs=torch.ones_like(energy), retain_graph=True)[0]

            # Get the forces in energy and forces
            energy_loss = F.huber_loss(energy / batch.n_atoms, batch.y / batch.n_atoms, reduction='mean', delta=huber_eng)
            force_loss = F.huber_loss(force, batch.f, reduction='mean', delta=huber_force)
            print(force.size())
            print(batch.f.size())
            break
        break
    

DataBatch(x=[1533], y=[32], pos=[1533, 3], z=[1533], f=[1533, 3], n_atoms=[32], size=[32], batch=[1533], ptr=[33])
torch.Size([1533, 3])
torch.Size([1533, 3])


In [88]:
## no dataparallel
print(batch.pos.requires_grad)
batch.pos.requires_grad = True
    
energ_batch = model(batch)
print(energ_batch.size())
force_batch = -torch.autograd.grad(energ_batch, batch.pos, grad_outputs=torch.ones_like(energ_batch), retain_graph=True)[0]
print(force_batch.size())
print(batch.pos.size())
print(len(batch.f))
print(batch.f.size())
print(batch.pos)

True
torch.Size([32])
torch.Size([1533, 3])
torch.Size([1533, 3])
1533
torch.Size([1533, 3])
tensor([[-2.4443, -1.2409,  2.5896],
        [-1.6220, -0.8657,  2.2367],
        [-3.1409, -0.8484,  2.0540],
        ...,
        [ 2.2207,  3.9629,  0.8153],
        [ 2.7830,  4.6864,  1.0768],
        [ 2.7988,  3.2752,  0.4353]], device='cuda:0', requires_grad=True)


In [2]:
import ase
import numpy as np
import pandas as pd
import torch
import torch_geometric
from torch.nn import functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.loader import DataListLoader

from fff.learning.gc.data import AtomsDataset
from fff.learning.gc.models import SchNet
from fff.learning.base import BaseLearnableForcefield, ModelMsgType
from fff.learning.util.messages import TorchMessage
from fff.learning.gc.functions import eval_batch

from torch_geometric.nn import data_parallel
from torch_geometric.data import Batch
## torch DDP, not completed
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel


sch = GCSchNetForcefield()
model = sch.get_model(StartModelMessage)

model = data_parallel.DataParallel(model)
model.to('cuda')

# Unpack some inputs
huber_eng, huber_force = huber_deltas
batch_size: int = 32
learning_rate: float = 1e-3
huber_deltas: (float, float) = (0.5, 1)
energy_weight: float = 0.1
reset_weights: bool = False
patience: int = None
train_data = train_sets[0]
valid_data = valid_sets[0]
num_epochs = 32


# Start the training process
with TemporaryDirectory(prefix='spk') as td:
    td = Path(td)
    # Save the batch to an ASE Atoms database
    with open(os.devnull, 'w') as fp, redirect_stderr(fp):
        train_dataset = AtomsDataset.from_atoms(train_data, td / 'train')
        # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        train_loader = DataListLoader(train_dataset, batch_size=batch_size, shuffle=True)

        valid_dataset = AtomsDataset.from_atoms(valid_data, td / 'valid')
        # valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
        valid_loader = DataListLoader(valid_dataset, batch_size=batch_size, shuffle=False)


    # Make the trainer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    if patience is None:
        patience = num_epochs // 8
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=patience, factor=0.8, min_lr=1e-6)

    # Store the best loss
    best_loss = torch.inf

    # Loop over epochs
    log = []
    model.train()
    
    for epoch in range(num_epochs):
        # Iterate over all batches in the training set
        train_losses = defaultdict(list)
        for batch in train_loader:
            # batch.to(device)
            print(batch)
            optimizer.zero_grad()
            # energy, force = eval_batch(model, batch)
            break
        break

[Data(x=[60], y=[1], pos=[60, 3], z=[60], f=[60, 3], n_atoms=[1], size=[1]), Data(x=[66], y=[1], pos=[66, 3], z=[66], f=[66, 3], n_atoms=[1], size=[1]), Data(x=[36], y=[1], pos=[36, 3], z=[36], f=[36, 3], n_atoms=[1], size=[1]), Data(x=[36], y=[1], pos=[36, 3], z=[36], f=[36, 3], n_atoms=[1], size=[1]), Data(x=[24], y=[1], pos=[24, 3], z=[24], f=[24, 3], n_atoms=[1], size=[1]), Data(x=[21], y=[1], pos=[21, 3], z=[21], f=[21, 3], n_atoms=[1], size=[1]), Data(x=[27], y=[1], pos=[27, 3], z=[27], f=[27, 3], n_atoms=[1], size=[1]), Data(x=[57], y=[1], pos=[57, 3], z=[57], f=[57, 3], n_atoms=[1], size=[1]), Data(x=[51], y=[1], pos=[51, 3], z=[51], f=[51, 3], n_atoms=[1], size=[1]), Data(x=[33], y=[1], pos=[33, 3], z=[33], f=[33, 3], n_atoms=[1], size=[1]), Data(x=[75], y=[1], pos=[75, 3], z=[75], f=[75, 3], n_atoms=[1], size=[1]), Data(x=[9], y=[1], pos=[9, 3], z=[9], f=[9, 3], n_atoms=[1], size=[1]), Data(x=[63], y=[1], pos=[63, 3], z=[63], f=[63, 3], n_atoms=[1], size=[1]), Data(x=[30], y=

In [3]:
## dataparallel

print(batch)
# energ_batch = model(batch)
# print(energ_batch)


batch_byBatch = Batch.from_data_list(batch)
print(batch_byBatch.pos.requires_grad)
print(batch_byBatch)
batch_byBatch.pos.requires_grad = True
print(type(batch))
batch_pos= [i.pos for i in batch]
for _ in batch_pos:
    _.requires_grad = True
print(len(batch_pos[:][0]))
pos_tensor = torch.cat(batch_pos,dim=0).to('cuda')
print(pos_tensor.size())
print(pos_tensor.requires_grad)
b_f = torch.cat([i.f for i in batch],dim=0)
print(b_f.size())
# force_batch = -torch.autograd.grad(energ_batch, batch.pos, grad_outputs=torch.ones_like(energ_batch), retain_graph=True)[0]
# print(len(force_batch[:][-1]))
# print(len(batch.pos[:][0]))

[Data(x=[60], y=[1], pos=[60, 3], z=[60], f=[60, 3], n_atoms=[1], size=[1]), Data(x=[66], y=[1], pos=[66, 3], z=[66], f=[66, 3], n_atoms=[1], size=[1]), Data(x=[36], y=[1], pos=[36, 3], z=[36], f=[36, 3], n_atoms=[1], size=[1]), Data(x=[36], y=[1], pos=[36, 3], z=[36], f=[36, 3], n_atoms=[1], size=[1]), Data(x=[24], y=[1], pos=[24, 3], z=[24], f=[24, 3], n_atoms=[1], size=[1]), Data(x=[21], y=[1], pos=[21, 3], z=[21], f=[21, 3], n_atoms=[1], size=[1]), Data(x=[27], y=[1], pos=[27, 3], z=[27], f=[27, 3], n_atoms=[1], size=[1]), Data(x=[57], y=[1], pos=[57, 3], z=[57], f=[57, 3], n_atoms=[1], size=[1]), Data(x=[51], y=[1], pos=[51, 3], z=[51], f=[51, 3], n_atoms=[1], size=[1]), Data(x=[33], y=[1], pos=[33, 3], z=[33], f=[33, 3], n_atoms=[1], size=[1]), Data(x=[75], y=[1], pos=[75, 3], z=[75], f=[75, 3], n_atoms=[1], size=[1]), Data(x=[9], y=[1], pos=[9, 3], z=[9], f=[9, 3], n_atoms=[1], size=[1]), Data(x=[63], y=[1], pos=[63, 3], z=[63], f=[63, 3], n_atoms=[1], size=[1]), Data(x=[30], y=