Hyperparameter optimization for fully connected neural network (FCNN) models

In [None]:
# Loading modules

from __future__ import print_function, division

import random
import numpy as np
import torch.nn as nn
import torch
import os

from ray import tune
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.suggest.bayesopt import BayesOptSearch
from ray.tune.schedulers import ASHAScheduler
from copy import deepcopy
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data.sampler import SubsetRandomSampler
from ase.db import connect

In [None]:
# Data loader

def get_train_val_test_loader(dataset,
                              idx_validation=0,
                              idx_test=None,
                              collate_fn=default_collate,
                              batch_size=64,
                              num_workers=0,
                              pin_memory=False,
                              random_seed=None,
                              n_folds=10,
                              n_always_train=38):
    """Build train/validation/test loaders from a k-fold split of ``dataset``.

    The last ``n_always_train`` samples are kept out of the fold split and
    are always appended to the training set (the original code notes that
    the last 38 images are pure metals).  The remaining indices are
    shuffled and cut into ``n_folds`` folds; one fold is used for
    validation, optionally one for testing, and the rest for training.

    Parameters
    ----------
    dataset : sequence
        Anything supporting ``len()`` and integer indexing.
    idx_validation : int
        Fold used for validation.  Negative values count from the end.
    idx_test : int or None
        Fold used for testing; ``None`` yields an empty test loader.
        Negative values count from the end.
    collate_fn, num_workers, pin_memory
        Forwarded unchanged to every ``DataLoader``.
    batch_size : int
        Batch size of the training loader only; the validation and test
        loaders use a fixed batch size of 4096.
    random_seed : int or None
        Seed for the index shuffles.  ``None`` uses the global ``random``
        state; any integer (including ``0``) gives a reproducible split.
    n_folds : int
        Number of folds (default 10).
    n_always_train : int
        Number of trailing samples that always go to the training set.

    Returns
    -------
    (train_loader, val_loader, test_loader)

    Raises
    ------
    IndexError
        If ``idx_validation`` or a non-``None`` ``idx_test`` does not name
        one of the ``n_folds`` folds.
    """
    eval_batch_size = 4096

    def shuffle_in_place(values):
        # `is not None` so that a seed of 0 is honoured instead of being
        # mistaken for "no seed".
        if random_seed is not None:
            random.Random(random_seed).shuffle(values)
        else:
            random.shuffle(values)

    all_indices = np.arange(len(dataset))
    # Explicit split point: a plain `[:-n]` slice would return an empty
    # array for n == 0.
    split_at = max(len(dataset) - n_always_train, 0)
    indices = all_indices[:split_at]
    always_train = all_indices[split_at:]

    shuffle_in_place(indices)
    folds = np.array_split(indices, n_folds)

    # Normalise through a range so that negative indices map to their real
    # fold number (otherwise the fold would be selected for validation or
    # testing *and* stay in the training set) and out-of-range values raise
    # IndexError instead of being silently ignored.
    fold_ids = range(n_folds)
    idx_validation = fold_ids[idx_validation]
    if idx_test is not None:
        idx_test = fold_ids[idx_test]

    val_indices = folds[idx_validation].copy()
    if idx_test is None:
        test_indices = np.array([], dtype=indices.dtype)
    else:
        test_indices = folds[idx_test].copy()

    held_out = (idx_validation, idx_test)
    train_parts = [fold for i, fold in enumerate(folds) if i not in held_out]
    train_indices = np.concatenate(train_parts + [always_train])

    # Mix the always-train tail into the training folds.
    shuffle_in_place(train_indices)

    train_loader = DataLoader(dataset, batch_size=batch_size,
                              sampler=SubsetRandomSampler(train_indices),
                              num_workers=num_workers,
                              collate_fn=collate_fn,
                              pin_memory=pin_memory)

    val_loader = DataLoader(dataset, batch_size=eval_batch_size,
                            sampler=SubsetRandomSampler(val_indices),
                            num_workers=num_workers,
                            collate_fn=collate_fn,
                            pin_memory=pin_memory)

    test_loader = DataLoader(dataset, batch_size=eval_batch_size,
                             sampler=SubsetRandomSampler(test_indices),
                             num_workers=num_workers,
                             collate_fn=collate_fn,
                             pin_memory=pin_memory)

    return train_loader, val_loader, test_loader

In [None]:
# Network

class Net(nn.Module):
    """Fully connected network with Softplus activations.

    Layout: an input layer ``n_feature -> h_fea_len`` followed by Softplus,
    then ``n_h - 1`` hidden ``h_fea_len -> h_fea_len`` layers (each followed
    by Softplus), and finally a linear output layer
    ``h_fea_len -> n_output``.  The hidden-layer containers ``fcs`` and
    ``softpluses`` are only created when ``n_h > 1``.
    """

    def __init__(self, n_feature, n_h, h_fea_len, n_output):
        super().__init__()

        # Registration order (fc_in, fcs, fc_out) fixes the state_dict
        # key order, so keep it when editing.
        self.fc_in = nn.Linear(n_feature, h_fea_len)
        self.fc_in_softplus = nn.Softplus()

        n_extra = n_h - 1
        if n_extra > 0:
            hidden_linears = [nn.Linear(h_fea_len, h_fea_len)
                              for _ in range(n_extra)]
            hidden_activations = [nn.Softplus() for _ in range(n_extra)]
            self.fcs = nn.ModuleList(hidden_linears)
            self.softpluses = nn.ModuleList(hidden_activations)

        self.fc_out = nn.Linear(h_fea_len, n_output)

    def forward(self, x):
        """Map ``x`` through all layers and return the linear output."""
        hidden = self.fc_in_softplus(self.fc_in(x))

        # The containers are absent for a single-hidden-layer network; an
        # empty default makes the loop a no-op in that case.
        linears = getattr(self, 'fcs', ())
        activations = getattr(self, 'softpluses', ())
        for linear, activation in zip(linears, activations):
            hidden = activation(linear(hidden))

        return self.fc_out(hidden)

In [None]:
# Train the network
class TrainFCNN(tune.Trainable):
    """Ray Tune trainable that fits one ``Net`` per sampled configuration.

    Hyperparameters read from the trial config are ``lr`` (learning rate),
    ``h_fea_len`` (hidden width) and ``n_h`` (number of hidden layers).
    Every call to ``_train`` builds a fresh network, trains it with early
    stopping on the validation MSE, writes the four best-epoch metrics to
    text files and reports the test MSE of the best-validation epoch as
    ``mean_loss``.
    """

    def _setup(self, config):
        """Store the sampled hyperparameters on the instance."""
        self.lr = config.get('lr', 0.01)
        # The search space is continuous; integer-valued hyperparameters
        # are obtained by truncation.
        self.h_fea_len = int(config.get('h_fea_len', 128))
        self.n_h = int(config.get('n_h', 1))

    @staticmethod
    def _pick_device():
        """Return the CUDA device when available, otherwise the CPU."""
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    @staticmethod
    def _output_prefix():
        """Return the string prepended to every metric file name.

        The original code concatenated a module-level ``path`` that this
        file never defines.  Use it when it exists; otherwise fall back to
        an empty prefix (files land in the current working directory)
        instead of failing with ``NameError`` after training has finished.
        """
        try:
            return path
        except NameError:
            return ''

    @staticmethod
    def _load_features_and_targets(db_path):
        """Read the ASE database once and return ``(fea, target)`` tensors.

        ``target`` has the columns ``d_cen`` and ``full_width``; ``fea``
        has four columns built from the ``tabulated_*`` entries.
        """
        records = [row['data'] for row in connect(db_path).select()]

        def column(key):
            return np.array([record[key] for record in records])

        target = np.stack((column('d_cen'), column('full_width'))).T

        v2_total = column('tabulated_v2ds') + column('tabulated_v2dd')
        full_width_inf = column('tabulated_full_width_inf')
        fea = np.stack((np.sum(v2_total, axis=1),
                        column('tabulated_mulliken'),
                        column('tabulated_d_cen_inf'),
                        full_width_inf**2.0/12.0)).T

        # torch.Tensor(...) converts to the default float dtype, as before.
        return torch.Tensor(fea), torch.Tensor(target)

    def _evaluate_loader(self, loader, target, device, loss_func):
        """Return ``(mae, mse)`` of ``self.net`` over ``loader`` as floats.

        Both metrics are per-batch means multiplied by the number of
        outputs, averaged over batches weighted by batch size, so every
        batch contributes (not only the last one).  An empty loader
        yields ``(nan, nan)``.
        """
        mae_total = 0.0
        mse_total = 0.0
        n_seen = 0
        # No gradients are needed for evaluation; this also keeps the
        # stored best metrics from holding on to autograd graphs.
        with torch.no_grad():
            for inputs, sample_ids in loader:
                prediction = self.net(inputs.to(device, non_blocking=True))
                truth = target[sample_ids].to(device, non_blocking=True)
                n_out = prediction.shape[-1]
                n_batch = len(sample_ids)
                batch_mae = torch.mean(torch.abs(truth - prediction))*n_out
                batch_mse = loss_func(prediction, truth)*n_out
                mae_total += batch_mae.item() * n_batch
                mse_total += batch_mse.item() * n_batch
                n_seen += n_batch
        if n_seen == 0:
            return float('nan'), float('nan')
        return mae_total / n_seen, mse_total / n_seen

    def _train(self):
        """Train a fresh network to convergence and report its metrics.

        Returns a dict with ``mean_loss`` set to the test MSE recorded at
        the epoch with the lowest validation MSE (as a Python float).

        NOTE(review): the reported objective is the *test* MSE, so the
        hyperparameter search is steered by test data - confirm intended.
        NOTE(review): the network is rebuilt on every call, so each Tune
        training iteration is an independent restart, not a continuation.
        """
        random_seed = 1234    # reproducible data split
        batch_size = 64
        num_workers = 0
        weight_decay = 0.0001
        max_epochs = 100000
        patience = 50         # epochs without validation improvement

        device = self._pick_device()
        fea, target = self._load_features_and_targets('../Database.db')

        # Each sample is (feature row, row index); the index is used to
        # look up the matching target row for a batch.
        dataset = [(row, idx) for idx, row in enumerate(fea)]

        train_loader, val_loader, test_loader =\
            get_train_val_test_loader(dataset=dataset,
                                      collate_fn=default_collate,
                                      batch_size=batch_size,
                                      idx_validation=0,
                                      idx_test=1,
                                      num_workers=num_workers,
                                      pin_memory=torch.cuda.is_available(),
                                      random_seed=random_seed)

        self.net = Net(n_feature=fea.shape[-1],
                       n_h=self.n_h,
                       h_fea_len=self.h_fea_len,
                       n_output=2).to(device)
        optimizer = torch.optim.AdamW(self.net.parameters(),
                                      lr=self.lr,
                                      weight_decay=weight_decay)
        loss_func = nn.MSELoss()

        # Sentinels are reported unchanged if no epoch ever improves on
        # them (e.g. the loss is NaN from the first epoch).
        best = {'val_mae': 1e10, 'val_mse': 1e10,
                'test_mae': 1e10, 'test_mse': 1e10}
        stale_epochs = 0

        for epoch in range(max_epochs):

            # switch to train mode
            self.net.train()

            for inputs, sample_ids in train_loader:
                prediction = self.net(inputs.to(device, non_blocking=True))
                truth = target[sample_ids].to(device, non_blocking=True)
                # loss must be (1. nn output, 2. target)
                loss = loss_func(prediction, truth)*prediction.shape[-1]
                optimizer.zero_grad()   # clear gradients for next train
                loss.backward()         # backpropagation, compute gradients
                optimizer.step()        # apply gradients

            # switch to evaluate mode
            self.net.eval()

            val_mae, val_mse = self._evaluate_loader(
                val_loader, target, device, loss_func)
            test_mae, test_mse = self._evaluate_loader(
                test_loader, target, device, loss_func)

            if best['val_mse'] > val_mse:
                best = {'val_mae': val_mae, 'val_mse': val_mse,
                        'test_mae': test_mae, 'test_mse': test_mse}
                stale_epochs = 0
            else:
                stale_epochs += 1

            # Exit when converged or when the loss has become NaN.
            if (stale_epochs >= patience
                    or np.isnan(val_mse) or np.isnan(test_mse)):
                break

        prefix = self._output_prefix()
        suffix = ('_' + str(self.lr)
                  + '_' + str(self.h_fea_len)
                  + '_' + str(self.n_h)
                  + '.txt')
        for metric in ('val_mae', 'val_mse', 'test_mae', 'test_mse'):
            np.savetxt(prefix + 'final_ans_' + metric + suffix,
                       [best[metric]])

        return {'mean_loss': best['test_mse']}

    def _save(self, checkpoint_dir):
        """Write the network weights to ``model.pth`` and return its path."""
        checkpoint_path = os.path.join(checkpoint_dir, 'model.pth')
        torch.save(self.net.state_dict(), checkpoint_path)
        return checkpoint_path

    def _restore(self, checkpoint_path):
        """Load network weights from ``checkpoint_path``.

        ``self.net`` only exists after ``_train`` has run, so when
        restoring into a fresh instance the network is rebuilt first, with
        input/output sizes taken from the stored weight shapes.

        NOTE(review): ``_train`` replaces ``self.net`` on its next call, so
        restored weights do not carry over into further training.
        """
        device = self._pick_device()
        state = torch.load(checkpoint_path, map_location=device)
        if not hasattr(self, 'net'):
            # nn.Linear weights are (out_features, in_features).
            self.net = Net(n_feature=state['fc_in.weight'].shape[1],
                           n_h=self.n_h,
                           h_fea_len=self.h_fea_len,
                           n_output=state['fc_out.weight'].shape[0]).to(device)
        self.net.load_state_dict(state)



In [None]:
if __name__ == '__main__':

    # Ranges sampled by the search algorithm.  All three are continuous;
    # TrainFCNN truncates 'h_fea_len' and 'n_h' to integers.
    search_space = {
        'lr': tune.loguniform(lower=0.0001, upper=0.01, base=10),
        'h_fea_len': tune.uniform(lower=16, upper=301),
        'n_h': tune.uniform(lower=1, upper=11),
    }

    # A trial ends once either criterion is met.
    stop_criteria = {
        'mean_loss': 0.001,
        'training_iteration': 20,
    }

    trial_resources = {
        'cpu': 12,
        'gpu': 1
    }

    # Bayesian optimisation with an upper-confidence-bound acquisition
    # function, limited to four concurrently running suggestions.
    acquisition = {
        'kind': 'ucb',
        'kappa': 2.5,
        'xi': 0.0
    }
    algo = ConcurrencyLimiter(BayesOptSearch(utility_kwargs=acquisition),
                              max_concurrent=4)
    scheduler = AsyncHyperBandScheduler()

    analysis = tune.run(
        TrainFCNN,
        name='TrainFCNN',
        metric='mean_loss',
        mode='min',
        search_alg=algo,
        scheduler=scheduler,
        stop=stop_criteria,
        resources_per_trial=trial_resources,
        num_samples=500,
        checkpoint_at_end=True,
        checkpoint_freq=20,
        config=search_space)

    best_config = analysis.get_best_config(metric='mean_loss', mode='min')
    print('Best config is:', best_config)
