In [40]:
# インストールはgit cloneで
# https://github.com/chainer/chainer-chemistry/issues/406

# この辺の例をもとに作成
# https://github.com/chainer/chainer-chemistry/tree/master/examples/own_dataset
# https://github.com/chainer/chainer-chemistry/tree/master/examples/qm9

In [1]:
import numpy as  np
import pandas as pd
import os
from sklearn.metrics import mean_squared_error

import chainer
import numpy
import os

from chainer.datasets import split_dataset_random
from chainer import functions as F

from argparse import ArgumentParser
from chainer.iterators import SerialIterator
from chainer.training.extensions import Evaluator

from chainer_chemistry.models.prediction import Regressor
from chainer_chemistry.models.prediction import set_up_predictor
from chainer_chemistry.dataset.parsers import CSVFileParser
from chainer_chemistry.dataset.converters import converter_method_dict
from chainer_chemistry.dataset.preprocessors import preprocess_method_dict
from chainer_chemistry import datasets as D
from chainer_chemistry.datasets import NumpyTupleDataset

# These imports are necessary for pickle to work.
from chainer_chemistry.links.scaler.standard_scaler import StandardScaler  # NOQA
from chainer_chemistry.models.prediction import GraphConvPredictor  # NOQA
from chainer_chemistry.utils import save_json
from chainer_chemistry.utils import run_train

# 作図関連
from bokeh.plotting import output_notebook, figure, show
from bokeh.models import ColumnDataSource, CDSView, GroupFilter,HoverTool,Range1d
output_notebook() 



In [2]:
# 予測値の項目名、数を設定
labels = ['value1', 'value2']
class_num = len(labels) if isinstance(labels, list) else 1
class_num

2

In [3]:
# 予測方式のリスト
method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'relgat', 'mpnn', 'gnnfilm', 'megnet']

# 選択する予測方式
i = 4

In [4]:
# CSVファイルの読み込み
def postprocess_label(label_list):
    return np.asarray(label_list, dtype=np.float32)


preprocessor = preprocess_method_dict[method_list[i]]()
parser = CSVFileParser(preprocessor, postprocess_label=postprocess_label,
                       labels=labels, smiles_col='SMILES')

dataset = parser.parse('dataset_train.csv')['dataset']

100%|████████████████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 2249.43it/s]


In [5]:
# 必要な場合は標準化を以下のように行う
scaler = StandardScaler()
scaler.fit(dataset.get_datasets()[-1])

<chainer_chemistry.links.scaler.standard_scaler.StandardScaler at 0x1b7ef3644c8>

In [6]:
# トレーニングとバリデーションのデータを分割
train_data_size = int(len(dataset) * 1)
train, _ = split_dataset_random(dataset, train_data_size, seed = 0)

In [7]:
# predictorのセットアップ
# n_unitはモデルのレイヤー数
predictor = set_up_predictor(
    method = method_list[0], n_unit = 16,
    #conv_layers=4,class_num=class_num)
    conv_layers=4,class_num=class_num, label_scaler=scaler)

Set up NFP predictor...


In [8]:
# 条件の設定、device=-1 Numpy(CPU)で計算？、+1でCupy(GPU)で計算？
def rmse(x0, x1):
    return F.sqrt(F.mean_squared_error(x0, x1))

device = chainer.get_device(-1)
metrics_fun = {'mae': F.mean_absolute_error, 'rmse': rmse}
regressor = Regressor(predictor, lossfun=F.mean_squared_error,metrics_fun=metrics_fun, device=device)

In [9]:
# トレーニングの実施
converter_t = converter_method_dict[method_list[0]]
    
run_train(regressor, train, valid=None,
          batch_size=32, epoch=10,
          out='result', extensions_list=None,
          device=device, converter=converter_t,
          resume_path=None)


epoch       main/loss   main/mae    main/rmse   elapsed_time
1           0.00693981  0.0608485   0.0832953   0.146191      
2           0.00471831  0.0491045   0.0684569   0.297799      
3           0.00380194  0.0448476   0.0614458   0.436167      
     total [#################.................................] 35.56%
this epoch [###########################.......................] 55.56%
        10 iter, 3 epoch / 10 epochs
       inf iters/sec. Estimated time to finish: 0:00:00.
4           0.00242349  0.0354384   0.0490348   0.574075      
5           0.00197217  0.0329826   0.0443258   0.697776      
6           0.001788    0.0321756   0.0421888   0.784516      
7           0.001345    0.0284455   0.0366497   0.911064      
     total [###################################...............] 71.11%
this epoch [#####.............................................] 11.11%
        20 iter, 7 epoch / 10 epochs
    23.387 iters/sec. Estimated time to finish: 0:00:00.347414.
8           0.0013

In [14]:
# テストデータの読み込み
dataset = None
dataset = parser.parse('dataset_test.csv')['dataset']

100%|████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 2885.06it/s]


In [38]:
# この辺参考
# https://github.com/chainer/chainer-chemistry/issues/362
if isinstance(dataset, NumpyTupleDataset):
    converter = converter_method_dict[method_list[i]]

    @chainer.dataset.converter()
    def extract_inputs(batch, device=None):
        return converter(batch, device=device)[:-1]

    # Extract the ground-truth labels as numpy array.
    original_t = converter(dataset, device=-1)[-1]
else:
    converter = dataset.converter
    extract_inputs = converter

    # Extract the ground-truth labels as numpy array.
    original_t = converter(dataset, device=-1).y

def postprocess_fn(x): 
    if scaler is not None: 
        scaled_x = scaler.inverse_transform(x) 
        return scaled_x 
    else: 
        return x

In [34]:
# データの予測
#y_pred = regressor.predict(dataset, converter=extract_inputs,postprocess_fn=postprocess_fn)
y_pred = regressor.predict(dataset, converter=extract_inputs,postprocess_fn=lambda x:x)

In [35]:
y_pred

array([[-0.22826725, -0.00034008],
       [-0.22826725, -0.00034008],
       [-0.22826725, -0.00034008],
       [-0.22826725, -0.00034008],
       [-0.22826725, -0.00034008],
       [-0.22826725, -0.00034008],
       [-0.22826725, -0.00034008],
       [-0.22826725, -0.00034008],
       [-0.22826727, -0.00034007],
       [-0.22826727, -0.00034007]], dtype=float32)

In [36]:
# 予測結果のdf作成
df_dict = {}
for i, l in enumerate(labels):
    df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
                    't_{}'.format(l): original_t[:, i], })
df = pd.DataFrame(df_dict)

# Show a prediction/ground truth table with 5 random examples.
df.head(10)

Unnamed: 0,y_pred_value1,t_value1,y_pred_value2,t_value2
0,-0.228267,-0.2191,-0.00034,0.0859
1,-0.228267,-0.2751,-0.00034,-0.033
2,-0.228267,-0.2308,-0.00034,-0.0537
3,-0.228267,-0.2626,-0.00034,-0.0437
4,-0.228267,-0.1958,-0.00034,-0.0227
5,-0.228267,-0.2444,-0.00034,0.0191
6,-0.228267,-0.2335,-0.00034,-0.0118
7,-0.228267,-0.2748,-0.00034,0.0225
8,-0.228267,-0.2505,-0.00034,-0.0208
9,-0.228267,-0.2787,-0.00034,-0.0694


In [37]:
# データの可視化
source = ColumnDataSource(df)

x = np.linspace(-1,1)
y = x

common_para = {'fill_alpha':0.3,'line_alpha':0.5}

p = figure(plot_height=400, plot_width=400)


p.line(x,y,line_color='Silver')
p.circle(x="t_value1", y="y_pred_value1", size=10,source=source,**common_para,fill_color='OrangeRed',line_color='OrangeRed',legend_label='test')


p.x_range=Range1d(start=-0.6,end=0.1)
p.y_range=Range1d(start=-0.6,end=0.1)
p.legend.location = "top_left"
p.xaxis.axis_label = 't_value'
p.yaxis.axis_label = 'Pred'
show(p)

## 予測値が全部ほぼ同じ値になってしまう