In [None]:
# -*- coding: utf-8 -*-
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
import time
import sys
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from copy import deepcopy
from dgllife.utils import Meter, EarlyStopping
from hyperopt import fmin, tpe
from shutil import copyfile
from torch.optim import Adam
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

from hyper import init_hyper_space
from utils import get_configure, mkdir_p, init_trial_path, \
    split_dataset, collate_molgraphs, load_model, predict, init_featurizer, load_dataset

In [None]:
# Reference timestamp for the elapsed-time reports at the end of the notebook.
start_time = time.time()

# Early-stopping patience; it is stored as 'patience' in every experiment config.
patienceNum = 50

# Enable cuDNN together with its benchmark mode.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True

In [None]:
def run_a_train_epoch(args, epoch, model, data_loader, loss_criterion, optimizer):
    """Train ``model`` for one pass over ``data_loader`` and score the epoch.

    Parameters
    ----------
    args : dict
        Reads ``'device'``, ``'metric'``, ``'print_every'`` and ``'num_epochs'``.
    epoch : int
        Zero-based epoch index, only used in the progress messages.
    model
        Model to optimise; it is switched to training mode.
    data_loader
        Iterable of ``(smiles, bg, labels, masks)`` batches supporting ``len()``.
    loss_criterion
        Callable applied to ``(prediction, labels)``; its output is multiplied
        element-wise by the label mask before being averaged.
    optimizer
        Optimizer stepped once per processed batch.

    Returns
    -------
    dict
        ``{'r2', 'mae', 'rmse'}`` when ``args['metric']`` is any of these three
        regression metrics, otherwise ``{'roc_auc', 'prc_auc', 'acc'}``. Each
        value is the mean over tasks.
    """
    regression_metrics = ('r2', 'mae', 'rmse')

    model.train()
    train_meter = Meter()
    for batch_id, batch_data in enumerate(data_loader):
        smiles, bg, labels, masks = batch_data
        if len(smiles) == 1:
            # Avoid potential issues with batch normalization
            continue

        labels, masks = labels.to(args['device']), masks.to(args['device'])
        prediction = predict(args, model, bg)
        # Masked-out labels contribute zero loss.
        loss = (loss_criterion(prediction, labels) * (masks != 0).float()).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        torch.cuda.empty_cache()

        train_meter.update(prediction, labels, masks)
        if batch_id % args['print_every'] == 0:
            print('epoch {:d}/{:d}, batch {:d}/{:d}, loss {:.4f}'.format(
                epoch + 1, args['num_epochs'], batch_id + 1, len(data_loader), loss.item()))

    # Mean over tasks (multi-task case); computed once and reused below.
    train_score = np.mean(train_meter.compute_metric(args['metric']))

    print('epoch {:d}/{:d}, training {} {:.4f}'.format(
        epoch + 1, args['num_epochs'], args['metric'], train_score))

    if args['metric'] in regression_metrics:
        # Every regression metric gets the regression report. Previously only
        # 'r2' did, so 'mae' and 'rmse' fell through to the classification
        # branch and the returned dict lacked the requested key.
        scores = {}
        for name in regression_metrics:
            if name == args['metric']:
                scores[name] = train_score
            else:
                scores[name] = np.mean(train_meter.compute_metric(name))
        return scores

    prc_score = np.mean(train_meter.compute_metric('prc_auc'))  # in case of multi-tasks
    acc_score = np.mean(train_meter.compute_metric('acc'))  # in case of multi-tasks
    return {'roc_auc': train_score, 'prc_auc': prc_score, 'acc': acc_score}

def run_an_eval_epoch(args, model, data_loader):
    """Evaluate ``model`` on every batch of ``data_loader`` without gradients.

    Parameters
    ----------
    args : dict
        Reads ``'device'`` and ``'metric'``.
    model
        Model to evaluate; it is switched to evaluation mode.
    data_loader
        Iterable of ``(smiles, bg, labels, masks)`` batches.

    Returns
    -------
    dict
        ``{'r2', 'mae', 'rmse'}`` when ``args['metric']`` is any of these three
        regression metrics, otherwise ``{'roc_auc', 'prc_auc', 'acc'}``. Each
        value is the mean over tasks.
    """
    regression_metrics = ('r2', 'mae', 'rmse')

    model.eval()
    eval_meter = Meter()
    with torch.no_grad():
        for batch_data in data_loader:
            smiles, bg, labels, masks = batch_data
            labels = labels.to(args['device'])
            prediction = predict(args, model, bg)

            torch.cuda.empty_cache()

            eval_meter.update(prediction, labels, masks)

    if args['metric'] in regression_metrics:
        # Every regression metric gets the regression report. Previously only
        # 'r2' did, so 'mae' and 'rmse' fell through to the classification
        # branch and the returned dict lacked the requested key.
        return {name: np.mean(eval_meter.compute_metric(name))  # mean over tasks
                for name in regression_metrics}

    roc_score = np.mean(eval_meter.compute_metric(args['metric']))  # in case of multi-tasks
    prc_score = np.mean(eval_meter.compute_metric('prc_auc'))  # in case of multi-tasks
    acc_score = np.mean(eval_meter.compute_metric('acc'))  # in case of multi-tasks
    return {'roc_auc': roc_score, 'prc_auc': prc_score, 'acc': acc_score}

def main(args, exp_config, train_set, val_set, test_set):
    """Run one training trial and save its checkpoint, scores and configuration.

    ``exp_config`` is updated in place with the run settings taken from ``args``
    and the notebook-level ``patienceNum``. A trial directory is obtained from
    ``init_trial_path``; ``model.pth``, ``eval.txt`` and ``configure.json`` are
    written into it.

    Returns
    -------
    tuple
        ``(args['trial_path'], stopper.best_score)``.
    """
    # Record settings
    run_settings = {
        'model': args['model'],
        'n_tasks': args['n_tasks'],
        'atom_featurizer_type': args['atom_featurizer_type'],
        'bond_featurizer_type': args['bond_featurizer_type'],
        'patience': patienceNum
    }
    exp_config.update(run_settings)
    if args['atom_featurizer_type'] != 'pre_train':
        exp_config['in_node_feats'] = args['node_featurizer'].feat_size()
    has_edge_featurizer = args['edge_featurizer'] is not None
    if has_edge_featurizer and args['bond_featurizer_type'] != 'pre_train':
        exp_config['in_edge_feats'] = args['edge_featurizer'].feat_size()

    # Set up directory for saving results
    args = init_trial_path(args)

    def build_loader(subset, **extra):
        # All three loaders share batch size, collate function and worker count.
        return DataLoader(dataset=subset, batch_size=exp_config['batch_size'],
                          collate_fn=collate_molgraphs, num_workers=args['num_workers'],
                          **extra)

    train_loader = build_loader(train_set, shuffle=True)
    val_loader = build_loader(val_set)
    test_loader = build_loader(test_set)

    model = load_model(exp_config).to(args['device'])
    loss_criterion = nn.SmoothL1Loss(reduction='none')
    optimizer = Adam(model.parameters(), lr=exp_config['lr'],
                     weight_decay=exp_config['weight_decay'])
    stopper = EarlyStopping(patience=exp_config['patience'],
                            filename=args['trial_path'] + '/model.pth',
                            metric=args['metric'])

    for epoch in range(args['num_epochs']):
        run_a_train_epoch(args, epoch, model, train_loader, loss_criterion, optimizer)

        # Validation score drives early stopping.
        val_score = run_an_eval_epoch(args, model, val_loader)
        if stopper.step(val_score[args['metric']], model):
            break

    # Score the checkpoint kept by the stopper on every subset.
    stopper.load_checkpoint(model)
    final_scores = {}
    for subset_name, loader in (('train', train_loader),
                                ('valid', val_loader),
                                ('test', test_loader)):
        final_scores[subset_name] = run_an_eval_epoch(args, model, loader)
    print(final_scores)

    torch.cuda.empty_cache()

    with open(args['trial_path'] + '/eval.txt', 'w') as report:
        report.write('Best val {}: {}\n'.format(args['metric'], stopper.best_score))
        report.write('Test {}: {}\n'.format(args['metric'], final_scores['test']))

    with open(args['trial_path'] + '/configure.json', 'w') as config_out:
        json.dump(exp_config, config_out, indent=2)

    return args['trial_path'], stopper.best_score

def bayesian_optimization(args, train_set, val_set, test_set):
    """Search hyperparameters with hyperopt's TPE and return the best trial path.

    Each trial calls ``main`` with a deep copy of ``args`` and the sampled
    hyperparameters. The value handed to hyperopt is the validation metric,
    negated when the metric is R2 so that minimising it maximises R2.

    Returns
    -------
    The trial path whose minimised value is lowest.
    """
    trials = []
    search_space = init_hyper_space(args['model'])

    def objective(hyperparams):
        # A private copy keeps one trial's changes to args away from the next.
        trial_path, val_metric = main(deepcopy(args), hyperparams,
                                      train_set, val_set, test_set)

        # Maximize R2 is equivalent to minimize the negative of it
        to_minimize = -1 * val_metric if args['metric'] in ['r2'] else val_metric

        trials.append((trial_path, to_minimize))
        return to_minimize

    fmin(objective, search_space, algo=tpe.suggest, max_evals=args['num_evals'])

    ranked = sorted(trials, key=lambda trial: trial[1])
    best_trial_path, _ = ranked[0]
    return best_trial_path

In [None]:
import argparse
# Command-line interface of the original training script. The notebook feeds it
# an explicit argument list instead of sys.argv.
parser = argparse.ArgumentParser()

# --- Dataset -----------------------------------------------------------------
parser.add_argument('-c', '--csv-path', type=str, required=True,
                    help='Path to a csv file for loading a dataset')
parser.add_argument('-sc', '--smiles-column', type=str, required=True,
                    help='Header for the SMILES column in the CSV file')
parser.add_argument('-lv', '--log-values', action='store_true', default=False,
                    help='Whether to take logarithm of the labels for modeling')
parser.add_argument('-t', '--task-names', default=None, type=str,
                    help='Header for the tasks to model. If None, we will model '
                         'all the columns except for the smiles_column in the CSV file. '
                         '(default: None)')
parser.add_argument('-s', '--split',
                    choices=['scaffold_decompose', 'scaffold_smiles', 'random'],
                    default='scaffold_smiles',
                    help='Dataset splitting method (default: scaffold_smiles). For scaffold '
                         'split based on rdkit.Chem.AllChem.MurckoDecompose, '
                         'use scaffold_decompose. For scaffold split based on '
                         'rdkit.Chem.Scaffolds.MurckoScaffold.MurckoScaffoldSmiles, '
                         'use scaffold_smiles.')
parser.add_argument('-sr', '--split-ratio', default='0.8,0.1,0.1', type=str,
                    help='Proportion of the dataset to use for training, validation and test '
                         '(default: 0.8,0.1,0.1)')

# --- Model and evaluation ----------------------------------------------------
parser.add_argument('-me', '--metric', choices=['r2', 'mae', 'rmse'], default='r2',
                    help='Metric for evaluation (default: r2)')
parser.add_argument('-mo', '--model', choices=['GCN', 'GAT', 'Weave', 'MPNN', 'AttentiveFP',
                                               'gin_supervised_contextpred',
                                               'gin_supervised_infomax',
                                               'gin_supervised_edgepred',
                                               'gin_supervised_masking',
                                               'NF'],
                    default='GCN', help='Model to use (default: GCN)')
parser.add_argument('-a', '--atom-featurizer-type', choices=['canonical', 'attentivefp'],
                    default='canonical',
                    help='Featurization for atoms (default: canonical)')
parser.add_argument('-b', '--bond-featurizer-type', choices=['canonical', 'attentivefp'],
                    default='canonical',
                    help='Featurization for bonds (default: canonical)')

# --- Training loop and output ------------------------------------------------
parser.add_argument('-n', '--num-epochs', type=int, default=1000,
                    help='Maximum number of epochs allowed for training. '
                         'We set a large number by default as early stopping '
                         'will be performed. (default: 1000)')
# The help text used to claim a default of 1 while the actual default is 0.
parser.add_argument('-nw', '--num-workers', type=int, default=0,
                    help='Number of processes for data loading (default: 0)')
parser.add_argument('-pe', '--print-every', type=int, default=20,
                    help='Print the training progress every X mini-batches')
parser.add_argument('-p', '--result-path', type=str, default='regression_results',
                    help='Path to save training results (default: regression_results)')
parser.add_argument('-ne', '--num-evals', type=int, default=None,
                    help='Number of trials for hyperparameter search (default: None)')

In [None]:
# Index of the CUDA device to use, kept as a string because it is concatenated
# onto 'cuda:' later in the notebook.
GPUNum = '0'
# Number of repeated train/evaluate runs performed after the hyperparameter search.
repetitions = 10

# Arguments that would otherwise be typed on the command line.
# Optional switches not used here: --log-values, --split-ratio, --num-workers,
# --print-every.
cli_arguments = [
    '--csv-path', 'SurFace1881_seed0.csv',
    '--task-names', 'Surface',
    '--smiles-column', 'smiles',
    '--result-path', 'result/SurFace1881_GAT',
    '--num-evals', '50',
    '--num-epochs', '1000',
    '--split', 'random',
    '--metric', 'r2',
    '--model', 'GAT',
    '--atom-featurizer-type', 'attentivefp',
    '--bond-featurizer-type', 'attentivefp',
]
# The rest of the notebook treats the parsed namespace as a plain dict.
args = vars(parser.parse_args(args=cli_arguments))
args

In [None]:
import os
import shutil

def del_file(filepath):
    """
    Delete every file and sub-directory directly inside a directory.

    The directory itself is kept. A path that does not exist is treated as
    already empty, so the notebook also runs on a fresh checkout where the
    result directory has not been created yet.

    :param filepath: path of the directory to empty
    :return: None
    """
    if not os.path.exists(filepath):
        # Nothing to clean up yet.
        return

    for entry in os.listdir(filepath):
        entry_path = os.path.join(filepath, entry)
        if os.path.islink(entry_path) or os.path.isfile(entry_path):
            # Remove symlinks as links: shutil.rmtree refuses to operate on a
            # symbolic link to a directory.
            os.remove(entry_path)
        elif os.path.isdir(entry_path):
            shutil.rmtree(entry_path)
            
# Empty the results directory so that this run starts from a clean slate.
path_data = args['result_path']
del_file(path_data)

# Folder that receives the per-split best checkpoints of the repeated runs.
dirs = path_data + '/saved_model'
if not os.path.exists(dirs):
    os.makedirs(dirs)

In [None]:
# Select the compute device: the configured GPU when CUDA is available, else CPU.
if torch.cuda.is_available():
    args['device'] = torch.device('cuda:' + GPUNum)
else:
    args['device'] = torch.device('cpu')

# Turn the comma-separated task names into a list. The isinstance check keeps
# the cell re-runnable: after the first run the value is already a list, which
# has no .split().
if isinstance(args['task_names'], str):
    args['task_names'] = args['task_names'].split(',')

args = init_featurizer(args)
df = pd.read_csv(args['csv_path'])
mkdir_p(args['result_path'])
dataset = load_dataset(args, df)
# Whether to take the logarithm of labels for narrowing the range of values
if args['log_values']:
    dataset.labels = dataset.labels.log()
args['n_tasks'] = dataset.n_tasks
train_set, val_set, test_set = split_dataset(args, dataset)

if args['num_evals'] is not None:
    assert args['num_evals'] > 0, 'Expect the number of hyperparameter search trials to ' \
                                  'be greater than 0, got {:d}'.format(args['num_evals'])
    print('Start hyperparameter search with Bayesian '
          'optimization for {:d} trials'.format(args['num_evals']))
    trial_path = bayesian_optimization(args, train_set, val_set, test_set)
else:
    print('Use the manually specified hyperparameters')
    exp_config = get_configure(args['model'])
    # Use the trial directory reported by main() instead of assuming it is
    # always '<result_path>/1'.
    trial_path, _ = main(args, exp_config, train_set, val_set, test_set)

# Copy the artefacts of the selected trial to the top of the results directory.
copyfile(trial_path + '/model.pth', args['result_path'] + '/model.pth')
copyfile(trial_path + '/configure.json', args['result_path'] + '/configure.json')
copyfile(trial_path + '/eval.txt', args['result_path'] + '/eval.txt')

In [None]:
# Reload the configuration that was copied to the top of the results directory.
with open(args['result_path']+'/configure.json', 'r') as config_file:
    config = json.load(config_file)

In [None]:
# Display the configuration loaded from configure.json.
config

In [None]:
# Per-split score dicts for the train, validation and test subsets; one entry
# is appended to each list by every trainWithHyper call.
tr_res, val_res, te_res = [], [], []

def trainWithHyper(args, exp_config, train_set, val_set, test_set, split_id=None):
    """Train with a fixed configuration on one split and record its scores.

    Unlike ``main``, no trial directory is created: the best checkpoint is
    written to ``<result_path>/saved_model/<model>_bst_<split_id>.pth``. The
    score dicts of the restored checkpoint are appended to the notebook-level
    lists ``tr_res``, ``val_res`` and ``te_res``.

    Parameters
    ----------
    args : dict
        Run settings; reads the model, featurizer, device, metric, epoch,
        worker and result-path entries.
    exp_config : dict
        Hyperparameters; updated in place with the run settings.
    train_set, val_set, test_set
        Dataset subsets for this split.
    split_id : optional
        Identifier used in the checkpoint file name. When omitted, the
        notebook-global loop variable ``split`` is used, which is what the
        function always read before this parameter existed.

    Returns
    -------
    dict
        ``{'train': ..., 'valid': ..., 'test': ...}`` with the score dicts that
        were appended to the global lists.
    """
    if split_id is None:
        # Backward-compatible fallback on the loop variable of the calling cell.
        split_id = split

    # Record settings
    exp_config.update({
        'model': args['model'],
        'n_tasks': args['n_tasks'],
        'atom_featurizer_type': args['atom_featurizer_type'],
        'bond_featurizer_type': args['bond_featurizer_type'],
        'patience': patienceNum
    })
    if args['atom_featurizer_type'] != 'pre_train':
        exp_config['in_node_feats'] = args['node_featurizer'].feat_size()
    if args['edge_featurizer'] is not None and args['bond_featurizer_type'] != 'pre_train':
        exp_config['in_edge_feats'] = args['edge_featurizer'].feat_size()

    train_loader = DataLoader(dataset=train_set, batch_size=exp_config['batch_size'], shuffle=True,
                              collate_fn=collate_molgraphs, num_workers=args['num_workers'])
    val_loader = DataLoader(dataset=val_set, batch_size=exp_config['batch_size'],
                            collate_fn=collate_molgraphs, num_workers=args['num_workers'])
    test_loader = DataLoader(dataset=test_set, batch_size=exp_config['batch_size'],
                             collate_fn=collate_molgraphs, num_workers=args['num_workers'])
    model = load_model(exp_config).to(args['device'])

    best_model_file = args['result_path']+'/saved_model/%s_bst_%s.pth' % (args['model'], split_id)

    loss_criterion = nn.SmoothL1Loss(reduction='none')
    optimizer = Adam(model.parameters(), lr=exp_config['lr'],
                     weight_decay=exp_config['weight_decay'])

    stopper = EarlyStopping(patience=exp_config['patience'],
                            filename=best_model_file,
                            metric=args['metric'])

    for epoch in range(args['num_epochs']):
        # Train
        run_a_train_epoch(args, epoch, model, train_loader, loss_criterion, optimizer)

        # Validation and early stop
        val_score = run_an_eval_epoch(args, model, val_loader)
        early_stop = stopper.step(val_score[args['metric']], model)

        if early_stop:
            break

    # Score the checkpoint kept by the stopper rather than the last epoch.
    stopper.load_checkpoint(model)

    tr_scores = run_an_eval_epoch(args, model, train_loader)
    val_scores = run_an_eval_epoch(args, model, val_loader)
    te_scores = run_an_eval_epoch(args, model, test_loader)

    tr_res.append(tr_scores)
    val_res.append(val_scores)
    te_res.append(te_scores)

    return {'train': tr_scores, 'valid': val_scores, 'test': te_scores}
        

In [None]:
# Repeat training and evaluation once per split index (1..repetitions), using
# the configuration loaded from configure.json.
for split in range(1, repetitions + 1):
    # NOTE: the loop variable must keep the name `split`; trainWithHyper reads
    # it as a global to name the checkpoint file of each repetition.
    # NOTE(review): split_dataset receives a third argument here but only two
    # earlier in the notebook; presumably it seeds or selects the split. Confirm
    # against utils.split_dataset.
    train_set, val_set, test_set = split_dataset(args,dataset,split)
    
    trainWithHyper(args, config, train_set, val_set, test_set)




In [None]:
# Display the training subset left over from the last iteration of the split loop.
train_set

In [None]:
# Column labels for the score values; they follow the key order of the score
# dicts returned for the r2 metric.
cols = ['r2', 'mae', 'rmse']


def _scores_frame(score_rows, set_name):
    """Tabulate one subset's per-split scores and tag them with split and set."""
    frame = pd.DataFrame(score_rows, columns=cols)
    frame['split'] = range(1, repetitions + 1)
    frame['set'] = set_name
    return frame


# One row of metric values per repetition, for each subset.
tr = [list(scores.values()) for scores in tr_res]
val = [list(scores.values()) for scores in val_res]
te = [list(scores.values()) for scores in te_res]

tr_pd = _scores_frame(tr, 'train')
val_pd = _scores_frame(val, 'validation')
te_pd = _scores_frame(te, 'test')

# Stack the three subsets into one table and save it next to the notebook.
sta_pd = pd.concat([tr_pd, val_pd, te_pd], ignore_index=True)
sta_pd['model'] = args['model']
sta_pd.to_csv('{}_statistical_results_split10.csv'.format(args['model']), index=False)

# Mean and standard deviation of every metric across the repetitions.
print('training mean:', np.mean(tr, axis=0), 'training std:', np.std(tr, axis=0))
print('validation mean:', np.mean(val, axis=0), 'validation std:', np.std(val, axis=0))
print('testing mean:', np.mean(te, axis=0), 'test std:', np.std(te, axis=0))

end_time = time.time()
print('the total elapsed time is', end_time - start_time, 'S')

In [None]:
# Refresh the end timestamp and report the elapsed wall-clock time in hours.
end_time = time.time()
elapsed_hours = (end_time - start_time)/3600
print('the total elapsed time is', elapsed_hours, 'H')

In [None]:
# Repeat the elapsed time (from the timestamps taken earlier) and name the model.
hours_elapsed = (end_time - start_time)/3600
print('the total elapsed time is', hours_elapsed, 'H')
print(args['model'])