In [1]:
!pip install deepchem

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepchem
  Downloading deepchem-2.6.1-py3-none-any.whl (608 kB)
[K     |████████████████████████████████| 608 kB 30.2 MB/s 
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.8 MB)
[K     |████████████████████████████████| 36.8 MB 37 kB/s 
Installing collected packages: rdkit-pypi, deepchem
Successfully installed deepchem-2.6.1 rdkit-pypi-2022.3.5


In [2]:
import deepchem as dc
import tensorflow as tf
import torch

# TensorFlow Keras Model 만들기
- dc.models 모듈이 제공하는 KerasModel을 사용

In [3]:
# Regression network: one 1000-unit ReLU hidden layer, 50% dropout, and a
# single linear output unit, assembled layer by layer.
keras_model = tf.keras.Sequential()
keras_model.add(tf.keras.layers.Dense(1000, activation='relu'))
keras_model.add(tf.keras.layers.Dropout(0.5))
keras_model.add(tf.keras.layers.Dense(1))

# Wrap the Keras network in DeepChem's KerasModel, trained with an L2 loss.
model = dc.models.KerasModel(keras_model, loss=dc.models.losses.L2Loss())

In [4]:
# Load the Delaney dataset with ECFP features and a random split.
tasks, datasets, transformers = dc.molnet.load_delaney(
    featurizer='ECFP',
    splitter='random',
)
train_dataset, valid_dataset, test_dataset = datasets

# Mean squared error is the score reported for both splits below.
metric = dc.metrics.Metric(dc.metrics.mean_squared_error)

model.fit(train_dataset, nb_epoch=50)

for label, split in (('training set score:', train_dataset),
                     ('test set score:', test_dataset)):
    print(label, model.evaluate(split, [metric]))

training set score: {'mean_squared_error': 0.016998797631284805}
test set score: {'mean_squared_error': 0.37252119427686536}


# Torch Model 만들기
- torch.nn.Sequential(torch.nn.Module의 하위 클래스)로 정의한 모델을 dc.models.TorchModel로 감싸서 사용

In [5]:
# Same architecture as the Keras regressor: 1024 input features -> 1000 ReLU
# units -> 50% dropout -> one linear output.
torch_layers = [
    torch.nn.Linear(1024, 1000),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.Linear(1000, 1),
]
pytorch_model = torch.nn.Sequential(*torch_layers)

# Wrap the PyTorch network in DeepChem's TorchModel, trained with an L2 loss.
model = dc.models.TorchModel(pytorch_model, loss=dc.models.losses.L2Loss())


In [6]:
# Train on the training split, then score both splits with the metric
# defined earlier in the notebook.
model.fit(train_dataset, nb_epoch=50)

train_scores = model.evaluate(train_dataset, [metric])
test_scores = model.evaluate(test_dataset, [metric])

print('training set score:', train_scores)
print('test set score:', test_scores)

training set score: {'mean_squared_error': 0.01648045789747482}
test set score: {'mean_squared_error': 0.39780620162985697}


# 이진 분류 모델 만들기
- 알츠하이머의 원인으로 지목되는 BACE-1 효소를 분자가 저해하는지 예측하는 이진 분류
- dc.molnet.load_bace_classification()
- 치매 환자의 뇌를 관찰한 결과, BACE-1의 발현량이 정상인의 2~3배를 넘는 것으로 밝혀져 BACE-1을 타깃으로 하는 약물 개발이 활발히 이뤄졌지만 임상에서 실패

In [7]:
class MyModel(tf.keras.Model):
    """Binary classifier with one hidden layer.

    `call` returns a pair `(output, logits)`: the sigmoid of the logits and
    the raw logits themselves.
    """

    def __init__(self):
        super().__init__()
        # 1000-unit ReLU hidden layer followed by a single-logit output layer.
        self.dense1 = tf.keras.layers.Dense(1000, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1)

    def call(self, inputs, training=False):
        """Run the forward pass; dropout (rate 0.5) is applied only when training."""
        hidden = self.dense1(inputs)
        if training:
            hidden = tf.nn.dropout(hidden, 0.5)
        logits = self.dense2(hidden)
        return tf.nn.sigmoid(logits), logits

In [8]:
keras_model = MyModel()

# MyModel.call returns (sigmoid output, logits); label the first output as the
# prediction and the second as the input to the loss.
output_types = ['prediction', 'loss']

model = dc.models.KerasModel(
    keras_model,
    loss=dc.models.losses.SigmoidCrossEntropy(),
    output_types=output_types,
)

In [9]:
# Load the BACE classification dataset with ECFP features and a scaffold split.
# The keyword must be spelled `featurizer`: the previous `feturizer` was not a
# recognised argument, so the featurizer choice was never actually passed.
tasks, datasets, transformers = dc.molnet.load_bace_classification(featurizer='ECFP', splitter='scaffold')
train_dataset, valid_dataset, test_dataset = datasets

model.fit(train_dataset, nb_epoch=50)

# Report both ROC AUC and accuracy for each split.
metric1 = dc.metrics.Metric(dc.metrics.roc_auc_score)
metric2 = dc.metrics.Metric(dc.metrics.accuracy_score)

print('training set score:', model.evaluate(train_dataset, [metric1, metric2]))
print('test set score:', model.evaluate(test_dataset, [metric1, metric2]))

training set score: {'roc_auc_score': 0.9995334218062443, 'accuracy_score': 0.9859504132231405}
test set score: {'roc_auc_score': 0.7708333333333334, 'accuracy_score': 0.6710526315789473}


# 모델 최적화하기
- HIV 데이터셋: 40,000개 이상의 분자 데이터로 HIV 복제 억제(inhibit HIV replication) 여부를 예측하는 모델
- 에이즈 치료제는 인간 면역결핍 바이러스(HIV)의 증식을 억제하여 질병의 진행을 지연시키는 약물

In [10]:
# Load the HIV dataset with ECFP features and a scaffold split, then unpack
# the three splits.
tasks, datasets, transformers = dc.molnet.load_hiv(
    featurizer='ECFP',
    splitter='scaffold',
)
(train_dataset, valid_dataset, test_dataset) = datasets

In [11]:
# Candidate values for each MultitaskClassifier constructor argument; the grid
# search tries every combination.
params_dict = dict(
    n_tasks=[len(tasks)],
    n_features=[1024],
    layer_sizes=[[500], [1000], [1000, 1000]],
    dropouts=[0.2, 0.5],
    learning_rate=[0.001, 0.0001],
)

# Candidates are scored with ROC AUC on the validation split.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
optimizer = dc.hyper.GridHyperparamOpt(dc.models.MultitaskClassifier)

search_outcome = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, metric, transformers)
best_model, best_hyperparams, all_results = search_outcome

In [12]:
# Scores recorded by the grid search, keyed by hyperparameter combination.
all_results

{'_dropouts_0.200000_layer_sizes[500]_learning_rate_0.001000_n_features_1024_n_tasks_1': 0.7680071771506956,
 '_dropouts_0.200000_layer_sizes[500]_learning_rate_0.000100_n_features_1024_n_tasks_1': 0.7797542499510092,
 '_dropouts_0.500000_layer_sizes[500]_learning_rate_0.001000_n_features_1024_n_tasks_1': 0.7755624755046051,
 '_dropouts_0.500000_layer_sizes[500]_learning_rate_0.000100_n_features_1024_n_tasks_1': 0.7593128429355281,
 '_dropouts_0.200000_layer_sizes[1000]_learning_rate_0.001000_n_features_1024_n_tasks_1': 0.7815026087595531,
 '_dropouts_0.200000_layer_sizes[1000]_learning_rate_0.000100_n_features_1024_n_tasks_1': 0.7747020747599451,
 '_dropouts_0.500000_layer_sizes[1000]_learning_rate_0.001000_n_features_1024_n_tasks_1': 0.7730027067411327,
 '_dropouts_0.500000_layer_sizes[1000]_learning_rate_0.000100_n_features_1024_n_tasks_1': 0.7579104815794631,
 '_dropouts_0.200000_layer_sizes[1000, 1000]_learning_rate_0.001000_n_features_1024_n_tasks_1': 0.7494220617773859,
 '_dropo

In [13]:
# Hyperparameter combination returned as the best one by the grid search.
best_hyperparams

{'n_tasks': 1,
 'n_features': 1024,
 'layer_sizes': [1000],
 'dropouts': 0.2,
 'learning_rate': 0.001}

# Early Stopping 이용하기
- ValidationCallback()을 사용하여 일정 스텝(여기서는 1000 스텝)마다 검증 세트 성능을 기록해준다

In [14]:
# ROC AUC on the validation split, reported every 1000 training steps.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
callback = dc.models.ValidationCallback(valid_dataset, 1000, metrics=metric)

model = dc.models.MultitaskClassifier(
    n_tasks=len(tasks),
    n_features=1024,
    layer_sizes=[500],
    dropouts=0.2,
    learning_rate=0.001,
)

# Kept as the cell's last expression so the value returned by fit() is shown.
model.fit(train_dataset, nb_epoch=50, callbacks=callback)

Step 1000 validation: roc_auc_score=0.772182
Step 2000 validation: roc_auc_score=0.777096
Step 3000 validation: roc_auc_score=0.767224
Step 4000 validation: roc_auc_score=0.753462
Step 5000 validation: roc_auc_score=0.756655
Step 6000 validation: roc_auc_score=0.75419
Step 7000 validation: roc_auc_score=0.75793
Step 8000 validation: roc_auc_score=0.74253
Step 9000 validation: roc_auc_score=0.738502
Step 10000 validation: roc_auc_score=0.729027
Step 11000 validation: roc_auc_score=0.720942
Step 12000 validation: roc_auc_score=0.73735
Step 13000 validation: roc_auc_score=0.73015
Step 14000 validation: roc_auc_score=0.725297
Step 15000 validation: roc_auc_score=0.718996
Step 16000 validation: roc_auc_score=0.708228


0.0195554518699646

# 학습률 조정하기
- ExponentialDecay()를 사용

In [15]:
# Learning-rate schedule: initial rate 0.0002, decay factor 0.9, decay period
# of 1000 steps.
learning_rate = dc.models.optimizers.ExponentialDecay(
    initial_rate=0.0002,
    decay_rate=0.9,
    decay_steps=1000,
)

model = dc.models.MultitaskClassifier(
    n_tasks=len(tasks),
    n_features=1024,
    layer_sizes=[1000],
    dropouts=0.2,
    learning_rate=learning_rate,
)

# Reuses the validation callback created earlier in the notebook; kept as the
# last expression so the value returned by fit() is shown.
model.fit(train_dataset, nb_epoch=50, callbacks=callback)

Step 1000 validation: roc_auc_score=0.753691
Step 2000 validation: roc_auc_score=0.759846
Step 3000 validation: roc_auc_score=0.76894
Step 4000 validation: roc_auc_score=0.767041
Step 5000 validation: roc_auc_score=0.766646
Step 6000 validation: roc_auc_score=0.774791
Step 7000 validation: roc_auc_score=0.767038
Step 8000 validation: roc_auc_score=0.75814
Step 9000 validation: roc_auc_score=0.769319
Step 10000 validation: roc_auc_score=0.7625
Step 11000 validation: roc_auc_score=0.762794
Step 12000 validation: roc_auc_score=0.76916
Step 13000 validation: roc_auc_score=0.763477
Step 14000 validation: roc_auc_score=0.763887
Step 15000 validation: roc_auc_score=0.766563
Step 16000 validation: roc_auc_score=0.767583


0.5475979995727539

In [16]:
# Score the current model on the validation split with the current metric.
model.evaluate(valid_dataset, metrics=[metric])

{'roc_auc_score': 0.7676810822065452}