# 03 Train PIP-NN

## Prepare Dataset

In [1]:
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
X_train_tensor = torch.from_numpy(np.loadtxt("X_train.txt"))
X_valid_tensor = torch.from_numpy(np.loadtxt("X_valid.txt"))
Y_train_tensor = torch.from_numpy(np.loadtxt("Y_train.txt").reshape(-1, 1))
Y_valid_tensor = torch.from_numpy(np.loadtxt("Y_valid.txt").reshape(-1, 1))

X_train_tensor.shape, X_valid_tensor.shape, Y_train_tensor.shape, Y_valid_tensor.shape

(torch.Size([12720, 34]),
 torch.Size([3181, 34]),
 torch.Size([12720, 1]),
 torch.Size([3181, 1]))

## Min Max Scaling

To train a network, it is wise to perform min max scaler on input and target data, here we will map the data to range $[-1, 1]$.

$$
T_\text{scaled} = 2 \times \frac{T - T_\text{min}}{T_\text{max} - T_\text{min}} - 1
$$

In [3]:
def scale(t: torch.Tensor, t_min: torch.Tensor, t_max: torch.Tensor) -> torch.Tensor:
    return 2 * (t - t_min) / (t_max - t_min) - 1


X_all_tensor = torch.cat((X_train_tensor, X_valid_tensor))
X_min_tensor = X_all_tensor.min(dim=0).values
X_max_tensor = X_all_tensor.max(dim=0).values
X_scaled_train_tensor = scale(X_train_tensor, X_min_tensor, X_max_tensor)
X_scaled_valid_tensor = scale(X_valid_tensor, X_min_tensor, X_max_tensor)

Y_all_tensor = torch.cat((Y_train_tensor, Y_valid_tensor))
Y_min_tensor = Y_all_tensor.min(dim=0).values
Y_max_tensor = Y_all_tensor.max(dim=0).values
Y_scaled_train_tensor = scale(Y_train_tensor, Y_min_tensor, Y_max_tensor)
Y_scaled_valid_tensor = scale(Y_valid_tensor, Y_min_tensor, Y_max_tensor)

np.savetxt("X_min.txt", X_min_tensor.detach().numpy())
np.savetxt("X_max.txt", X_max_tensor.detach().numpy())
np.savetxt("Y_min.txt", Y_min_tensor.detach().numpy())
np.savetxt("Y_max.txt", Y_max_tensor.detach().numpy())

### DataLoader

Here we will wrap scaled data with `Dataset` and `Dataloader` from torch for later trainning. A customized dataset class `PIPDataset` inherited from `torch.utils.data.Dataset` will be loaded from `PIPDataset.py`.

In [4]:
from torch.utils.data import DataLoader
from PIP_Dataset import PIP_Dataset

In [5]:
# dataset
train_dataset = PIP_Dataset(pip=X_scaled_train_tensor, E=Y_scaled_train_tensor)
valid_dataset = PIP_Dataset(pip=X_scaled_valid_tensor, E=Y_scaled_valid_tensor)

# dataloader
batch_size = 128

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)

## Build a Model

### Define a Neural Network

To build a model, first we need to define the structure of nerual network. In PIP-NN, the NN part is a simple BPNN. Here a pre-defined model in `PIPNN.py` will be loaded.

In [6]:
from PIP_NN import PIP_NN

In [7]:
# the length of pip is required by the input layer in PIP-NN
n_pip = X_scaled_train_tensor.shape[1]
model = PIP_NN(n_pip)
model

PIP_NN(
  (layer_stack): Sequential(
    (0): Linear(in_features=34, out_features=10, bias=True)
    (1): Tanh()
    (2): Linear(in_features=10, out_features=50, bias=True)
    (3): Tanh()
    (4): Linear(in_features=50, out_features=1, bias=True)
  )
)

### Choose the Device

It is well known that GPU can accelerate the training progress, but not all machines have GPUs, and not all GPUs are compatibale with deep learning frameworks.

Here we try to use `CUDA` from nVIDIA as backend, if failed we can also turn back to CPU.

In [8]:
import torch

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# we need to move our model to the target device
model = model.to(device)
# check the device
device

device(type='cuda')

### Hyperparameters

Hyperparameters are parameters that can be tuned to enhance the performance of a model. Usually they are set by researchers or eigineers manually based on experience, but frameworks can tune the parameters automatically also exists. Here we directly define hyperparameters.

- Batch Size
- Learning Rate
- Epoches

In [10]:
# batch size is already defined in dataloader section
learning_rate = 1e-2
epoches = 300

### Optimizer and Loss Function

We want to build a model which has a great performance on tasks we concerned, so we have to find out how to access the model. Different loss functions are defined for different tasks. For classfication tasks we have cross entropy function and for regression tasks we have RMSE (root mean squared error) function.

PIP-NN is model which maps the structures of a molecule to its energy. Obviously it is a regression task, so MSE is a good choice.

We already have a loss function, but how can we lower the loss? Besides hyperparameters, a model has its internal parameters, aka weights and biases. Optimizer will tune the internal parameters in order to lower the total loss using some algorithms, such as SGD and Adam.

Here we use Adam as optimizer and a schedular to tune the learning rate.

In [11]:
from torch.nn import MSELoss
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [12]:
loss_fn = MSELoss()
optimizer = Adam(model.parameters(), lr=learning_rate)
schedular = ReduceLROnPlateau(optimizer)

### Train and Validate

When we train a model, these steps will be done in order:

1. give the model a pair of input and output data `(X, y)`
2. use the model to predict, then we get `y_pred = model(X)`
3. calculate the loss using loss function, `loss = loss_fn(y_pred, y)`
4. backward propagation and update the internal parameters

After training, we need to validate the model. The steps are almost the same as training, but backward propagation will never be done.

We are concerned about how the loss changes during training, which indicates the changes inside the model. It is wise to save the value of variables during training by using tools such as tensorboard.

In [13]:
from torch.utils.tensorboard import SummaryWriter
from time import time
from datetime import timedelta

In [14]:
def train(train_loader, model, loss_fn, optimizer):
    # size of train dataset
    train_size = len(train_loader.dataset)
    # set model to train mode
    model.train()
    # loss
    loss = 0.0

    # start train
    for batch, (X, y) in enumerate(train_loader):
        # move data to target device
        X = X.to(device)
        y = y.to(device)

        # predict and loss
        y_pred = model(X)
        loss = loss_fn(y_pred, y)

        # backward propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # logs
        if batch % 1000 == 0:
            current = batch * len(X)
            print(f"loss: {loss.item():.7f} [{current:7d} / {train_size:7d}]")

    print(f"loss: {loss.item():.7f} [{train_size:7d} / {train_size:7d}]")

    return loss

In [15]:
def valid(valid_loader, model, loss_fn):
    # size of valid dataset
    valid_size = len(valid_loader.dataset)
    # number of batches
    n_batches = len(valid_loader)

    # set model to eval mode
    model.eval()

    # total loss
    loss_tot = 0

    with torch.no_grad():
        for (X, y) in valid_loader:
            # to device
            X = X.to(device)
            y = y.to(device)
            
            # loss
            y_pred = model(X)
            loss = loss_fn(y_pred, y)
            loss_tot += loss.item()
    
    loss_tot /= n_batches

    print(f"loss: {loss_tot:.7f} [{valid_size:7d} / {valid_size:7d}]")

    return loss_tot

In [16]:
writer = SummaryWriter("logs/NaOH-PIP-NN")

In [17]:
# Training!

start = time()

for t in range(epoches):
    # start
    print(f"Epoch {t + 1}")
    print("-" * 80)
    epoch_start = time()

    # save learning rate
    current_lr = optimizer.state_dict()["param_groups"][0]["lr"]
    writer.add_scalar("learning_rate", current_lr, t)
    print(f"Learning rate: {current_lr:.7f}")

    # train
    print("Train:")
    train_loss = train(train_loader, model, loss_fn, optimizer)
    writer.add_scalar("train_loss", train_loss, t)
    train_end = time()
    print(f"Train time: {timedelta(seconds=(train_end - epoch_start))}")

    # valid
    print("Valid:")
    valid_loss = valid(valid_loader, model, loss_fn)
    writer.add_scalar("valid_loss", valid_loss, t)
    writer.flush()
    valid_end = time()
    print(f"Valid time: {timedelta(seconds=(valid_end - train_end))}")

    # update learning rate according to loss
    schedular.step(valid_loss)
    
    # total epoch
    epoch_end = time()
    print(f"Epoch time: {timedelta(seconds=(epoch_end - epoch_start))}")
    print()

end = time()

print("Done!")
print(f"Total time: {timedelta(seconds=(end - start))}")

Epoch 1
--------------------------------------------------------------------------------
Learning rate: 0.0100000
Train:
loss: 0.4943043 [      0 /   12720]
loss: 0.0038137 [  12720 /   12720]
Train time: 0:00:01.959363
Valid:
loss: 0.0069190 [   3181 /    3181]
Valid time: 0:00:00.062537
Epoch time: 0:00:02.021948

Epoch 2
--------------------------------------------------------------------------------
Learning rate: 0.0100000
Train:
loss: 0.0089397 [      0 /   12720]
loss: 0.0045110 [  12720 /   12720]
Train time: 0:00:00.611709
Valid:
loss: 0.0046500 [   3181 /    3181]
Valid time: 0:00:00.064368
Epoch time: 0:00:00.676151

Epoch 3
--------------------------------------------------------------------------------
Learning rate: 0.0100000
Train:
loss: 0.0040889 [      0 /   12720]
loss: 0.0064444 [  12720 /   12720]
Train time: 0:00:00.624989
Valid:
loss: 0.0028120 [   3181 /    3181]
Valid time: 0:00:00.064295
Epoch time: 0:00:00.689337

Epoch 4
--------------------------------------

### Save the Model

In [18]:
torch.save(model, f"NaOH-PIP-NN-model.pth")