<a href="https://colab.research.google.com/github/jinwoo3239/rdkit/blob/main/Deepchem_model_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install deepchem

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepchem
  Downloading deepchem-2.6.1-py3-none-any.whl (608 kB)
[K     |████████████████████████████████| 608 kB 30.0 MB/s 
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.8 MB)
[K     |████████████████████████████████| 36.8 MB 43 kB/s 
Installing collected packages: rdkit-pypi, deepchem
Successfully installed deepchem-2.6.1 rdkit-pypi-2022.3.5


In [None]:
import deepchem as dc

import torch
import torch.nn as nn

import tensorflow as tf
from tensorflow.keras import layers

## Tensorflow keras model, Regression

In [None]:
# Fully connected regressor: two ReLU hidden layers (128 and 32 units)
# followed by a single linear output unit.
keras_model = tf.keras.Sequential(
    [
        layers.Dense(128, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ]
)

# Wrap the Keras network in a DeepChem model that trains against an L2 loss.
model_tf = dc.models.KerasModel(
    model=keras_model,
    loss=dc.models.losses.L2Loss(),
)

In [None]:
# R^2 is the score reported for the regression models in this notebook.
metric = dc.metrics.Metric(dc.metrics.r2_score)

# Delaney dataset, featurized with ECFP and split at random.
task, datasets, transforms = dc.molnet.load_delaney(
    featurizer='ECFP',
    splitter='random',
)
(train_datasets, validation_datasets, test_datasets) = datasets

# Kept as the cell's last expression so the value returned by fit() is displayed.
model_tf.fit(train_datasets, nb_epoch=50)

0.02209074258804321

In [None]:
# Print the score dict for the train, validation and test splits, in that order.
for split in (train_datasets, validation_datasets, test_datasets):
    print(model_tf.evaluate(split, metrics=[metric]))

{'r2_score': 0.9772463197424119}
{'r2_score': 0.7550559321417838}
{'r2_score': 0.6656071487929602}


## Torch model, Regression

In [None]:
# Show the shape of the training feature matrix (rows, features per row).
train_datasets.X.shape

(902, 1024)

In [None]:
# Take the input width from the dataset instead of hard-coding 1024, so the
# first layer keeps matching the features if the featurizer settings change.
n_input_features = int(train_datasets.get_data_shape()[0])

# Same layout as the Keras regressor: hidden layers of 128 and 32 units with
# ReLU activations, then a single linear output unit.
torch_model = nn.Sequential(
    nn.Linear(n_input_features, 128),
    nn.ReLU(),
    nn.Linear(128, 32),
    nn.ReLU(),
    nn.Linear(32, 1),
)

# Wrap the PyTorch network in a DeepChem model that trains against an L2 loss.
model_torch = dc.models.TorchModel(model=torch_model,
                                   loss=dc.models.losses.L2Loss())

# No nb_epoch is passed, so fit() runs for its default number of epochs.
# Kept as the cell's last expression so the returned value is displayed.
model_torch.fit(train_datasets)

0.3304560089111328

In [None]:
# R^2 metric, rebuilt here and bound to the same name as before.
metric = dc.metrics.Metric(dc.metrics.r2_score)

# Print the score dict for the train, validation and test splits, in that order.
for split in (train_datasets, validation_datasets, test_datasets):
    print(model_torch.evaluate(split, metrics=[metric]))

{'r2_score': 0.9186426908785275}
{'r2_score': 0.6893643042280762}
{'r2_score': 0.5619844997067731}


# Classification (Binary)

In [None]:
# BACE classification dataset, featurized with ECFP and split at random.
tasks, datasets, transforms = dc.molnet.load_bace_classification(
    featurizer='ECFP',
    splitter='random',
)
(train_dataset, validation_dataset, test_dataset) = datasets

# Bare last expression: displays the training split's repr.
train_dataset

<DiskDataset X.shape: (1210, 1024), y.shape: (1210, 1), w.shape: (1210, 1), task_names: ['Class']>

## Tensorflow keras model, Classification

In [None]:
class tf_model(tf.keras.Model):
    """Feed-forward network ending in a single sigmoid unit.

    The stack is three ReLU dense layers (128, 64 and 32 units), a dropout
    layer with rate 0.1, and a one-unit sigmoid output layer.
    """

    def __init__(self):
        super().__init__()

        # Hidden layers are created in order 128 -> 64 -> 32.
        hidden_layers = [layers.Dense(units, activation='relu')
                         for units in (128, 64, 32)]
        output_layers = [layers.Dropout(0.1),
                         layers.Dense(1, activation='sigmoid')]

        # The whole stack lives in one Sequential under the attribute `layer1`.
        self.layer1 = tf.keras.Sequential(hidden_layers + output_layers)

    def call(self, x):
        """Pass `x` through the layer stack and return the result."""
        return self.layer1(x)

# Create an instance of the classification network defined in this cell.
model = tf_model()

In [None]:
# The earlier `output_type=['precision', 'loss']` argument was dropped: it is not
# KerasModel's `output_types` keyword and 'precision' is not one of its documented
# output types, so it never took effect. The network returns a single tensor, which
# needs no output-type list.
#
# A fresh tf_model() is built here rather than wrapping the global `model`, because
# this cell rebinds `model` to the wrapper; re-running the cell would otherwise wrap
# the previous wrapper instead of a Keras network.
model = dc.models.KerasModel(
    model=tf_model(),
    loss=dc.models.losses.BinaryCrossEntropy(),
    optimizer=dc.models.optimizers.RMSProp(),
)

# Kept as the cell's last expression so the value returned by fit() is displayed.
model.fit(train_dataset, nb_epoch=10)

0.33840678532918295

In [None]:
# ROC-AUC and accuracy are both reported for each split.
metric1 = dc.metrics.Metric(dc.metrics.roc_auc_score)
metric2 = dc.metrics.Metric(dc.metrics.accuracy_score)

# Only the training and test splits are scored here.
for label, split in (('training set score:', train_dataset),
                     ('test set score:', test_dataset)):
    print(label, model.evaluate(split, [metric1, metric2]))

training set score: {'roc_auc_score': 0.9952230571882863, 'accuracy_score': 0.9619834710743802}
test set score: {'roc_auc_score': 0.8838833595250568, 'accuracy_score': 0.8092105263157895}


# Multitask Model

- 데이터 수가 적거나 클래스 분포가 비대칭(불균형)일 때 멀티태스크 학습이 성능을 개선할 수 있다
- MUV 데이터를 이용한 예
- MUV dataset에는 17개의 타겟에 대해서 소수의 액티브 샘플만 보유하고 있다
- 총 93,087 개의 화합물이 있는데 태스크별로 30개 이하의 액티브 샘플만 존재한다
- 멀티태스크 모델을 사용하여 이러한 문제를 일부 개선할 수 있다. 한 가지 태스크 예측에 도움이 되는 특성은 다른 태스크 예측에도 도움이 될 수 있기 때문이다

In [None]:
# MUV dataset with a stratified split; no featurizer is passed, so the
# loader's default is used.
tasks, datasets, transformers = dc.molnet.load_muv(
    splitter='stratified',
)
(train_dataset, valid_dataset, test_dataset) = datasets

# Bare last expression: displays the training split's repr.
train_dataset

<DiskDataset X.shape: (74470, 1024), y.shape: (74470, 17), w.shape: (74470, 17), task_names: ['MUV-466' 'MUV-548' 'MUV-600' ... 'MUV-852' 'MUV-858' 'MUV-859']>

In [None]:
# One output task per entry in the MUV task list.
n_tasks = len(tasks)
# The first entry of the dataset's data shape is used as the feature count.
n_features = train_dataset.get_data_shape()[0]

model = dc.models.MultitaskClassifier(n_tasks=n_tasks, n_features=n_features)

# Kept as the cell's last expression so the value returned by fit() is displayed.
model.fit(train_dataset, nb_epoch=2)

0.16953183280097114