In [1]:
#DeepChem is a fantastic package for exploring deep learning in chemistry and the biological sciences.
#It has been a steep learning curve, especially coming from Keras, but everything is very well documented.
#This is just my effort to practice more and get better acquainted with the concepts and the package.
#I hope this document will help others who want to learn deep learning in chemistry.

In [2]:
#I am assuming you have already installed the "deepchem" and "rdkit" packages.
#Let's import both of them, along with numpy.
import deepchem as dc
import rdkit
import numpy as np

In [4]:
# Load the Tox21 toxicity dataset.
# DeepChem's `molnet` module has built-in loaders for many scientific datasets,
# most of them already organized for you. The loader returns three things, which
# you can inspect individually: the task names, the datasets, and the transformers.
#
# If this cell fails, make sure the `reload` parameter is set to False; I struggled
# with this for hours.
# NOTE(review): presumably `reload` controls reuse of a previously cached copy on
# disk -- confirm against the MoleculeNet loader documentation.
tasks, datasets, transformers = dc.molnet.load_tox21(reload=False)



In [7]:
# `tasks` lists the 12 assays; each compound has one outcome (label) per assay,
# i.e. 12 outcomes for 12 assays.
print(tasks)

['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']


In [8]:
# `datasets` holds all the data; unpack it into its three parts: train, valid, test.
train_data, valid_data, test_data = datasets

In [9]:
# Check the shape of each split; it is important to know the shapes before building the model.
# Every split exposes three arrays:
#   X - the featurized molecules
#   y - the assay outcomes
#   w - the weights (set to zero for assays where no data is available)
for subset in (train_data, valid_data, test_data):
    print(subset.X.shape, subset.y.shape, subset.w.shape)

(6264, 1024) (6264, 12) (6264, 12)
(783, 1024) (783, 12) (783, 12)
(784, 1024) (784, 12) (784, 12)


In [10]:
#We have 6264 compounds in the training set, and 783 and 784 compounds in the validation and test sets.
#1024 is the dimension of the feature vector created by featurizing each molecule.
#We used the default featurization: ECFP (extended-connectivity fingerprints).
#ECFP converts each molecule into a fixed-dimension (1024) vector, which is used as the input to the deep learning model.

In [11]:
# The last thing the load function returns is the list of transformers it applied.
# Printing it shows which kind was used; here it is a BalancingTransformer,
# which balances the uneven dataset.
print(transformers)

[<deepchem.trans.transformers.BalancingTransformer object at 0x7f7f2dbe2290>]


In [12]:
# Now let's create the model: a multitask classifier with a single hidden layer.
#   n_tasks     - number of assays (12), read from the label array
#   n_features  - length of each fingerprint vector (1024), read from the feature array
#   layer_sizes - one hidden layer with 500 neurons; the number is arbitrary, so play around with it
#   dropouts    - randomly drop 50 percent of the neurons
# The model already uses the ReLU activation function by default.
model = dc.models.MultitaskClassifier(
    n_tasks=train_data.y.shape[1],
    n_features=train_data.X.shape[1],
    layer_sizes=[500],
    dropouts=0.5,
)

2021-12-30 15:56:14.474775: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-30 15:56:14.477354: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [13]:
# This is the easy part: fit the model to the training dataset.
# One epoch is one full pass of the entire training data through the network, so
# nb_epoch=50 (an arbitrary choice) cycles the whole training set through 50 times.
# NOTE(review): the number displayed below this cell is the value returned by fit();
# presumably the average training loss -- confirm against the DeepChem docs.
model.fit(train_data, nb_epoch=50)

0.15401579856872558

In [15]:
# Evaluate the model, first on the training split and then on the test split.
# Before evaluating we have to pick a metric. DeepChem's `dc.metrics.Metric` class
# wraps common metric functions, which makes it easy to score any dataset you like.
# Here it wraps ROC AUC and combines the per-task values with np.mean.
met = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

# Same evaluation call for both splits, in train -> test order.
train_score, test_score = (
    model.evaluate(subset, [met], transformers)
    for subset in (train_data, test_data)
)
print(train_score, test_score, sep="\n")

{'mean-roc_auc_score': 0.9905262531017529}
{'mean-roc_auc_score': 0.6713550605074426}


In [None]:
# We definitely overfitted the model: the training score is far higher than the test score.
# If you want to predict the toxicity of any molecule, here are the few things you have to do:
# 1. Get the SMILES notation of the molecule.
# 2. Convert it into a mol object using rdkit.
# 3. Featurize the mol object using DeepChem's ECFP featurizer; the result should be an array with 1024 features per molecule.
# 4. Then use the predict function to make the prediction.
# I will explore more of the different datasets in future projects.
# This is a recreation of the toxicity project from the book "Deep Learning for the Life Sciences". I highly recommend reading it.