# Image Classification with DNN

## DATASETS:
(a) Carbonic Anhydrase II (ChEMBL205), a protein lyase,  
(b) Cyclin-dependent kinase 2 (CHEMBL301), a protein kinase,  
(c) ether-a-go-go-related gene potassium channel 1 (HERG) (CHEMBL240), a voltage-gated ion channel,  
(d) Dopamine D4 receptor (CHEMBL219), a monoamine GPCR,  
(e) Coagulation factor X (CHEMBL244), a serine protease,  
(f) Cannabinoid CB1 receptor (CHEMBL218), a lipid-like GPCR and  
(g) Cytochrome P450 19A1 (CHEMBL1978), a cytochrome P450.  
The activity classes were selected based on data availability and as representatives of therapeutically important target classes or as anti-targets.

In [1]:
!nvidia-smi

Wed Oct  6 14:18:22 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 1080    Off  | 00000000:01:00.0  On |                  N/A |
|  0%   52C    P8    16W / 240W |    453MiB /  8116MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
#%%capture
#!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
#!chmod +x Miniconda3-latest-Linux-x86_64.sh
#!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
#!time conda install -q -y -c conda-forge rdkit

In [2]:
# Import
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
sys.path.append('/usr/local/lib/python3.7/site-packages/')
from rdkit import Chem
from rdkit.Chem import AllChem



In [4]:
path = Path('../dataset/13321_2017_226_MOESM1_ESM/')

In [5]:
list(path.iterdir())

[PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL1978_cl_ecfp_1024.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205_cl-data-with-ecfp-activations.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL244_cl_ecfp_512.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL218_cl.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/RdkitDescriptors.py'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205_cl_ecfp_512.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL301_cl.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205_cl_ecfp_1024.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL1978_cl_ecfp_512.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL240_cl.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL301_cl_ecfp_512.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL218_cl_ecfp_1024.csv'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL301_cl_ecfp_1024.cs

# Generate images

In [6]:
#DElETE DIRECTORY
#import shutil
#import pathlib
#import os  # for checking results

#print(os.listdir())
# ["a_directory", "foo.py", ...]

#DATA = DATA/'directory'

#shutil.rmtree(DATA)
#print(os.listdir())
# ["foo.py", ...]

In [7]:
datasets = ['CHEMBL205_cl','CHEMBL1978_cl', 'CHEMBL301_cl', 'CHEMBL218_cl', 
            'CHEMBL240_cl', 'CHEMBL219_cl', 
            'CHEMBL244_cl']

In [31]:
DATA = path/'mol_images'
DATA.mkdir(exist_ok=True)
PATH = DATA/datasets[2]
len(list(PATH.iterdir()))

7755

In [10]:
datasets[2:3]

['CHEMBL301_cl']

In [11]:
for dataset in datasets[2:3]:
    
    df = pd.read_csv(path/f'{dataset}.csv')
    IMAGES = DATA/dataset
    if not IMAGES.is_dir():
        IMAGES.mkdir(exist_ok=True)
        for i, r in df.iterrows():
    
            cid = r.CID
            smile = r.SMILES
            mol = Chem.MolFromSmiles(smile)
            Chem.Draw.MolToFile(mol, IMAGES/f'{cid}.png', size = (224, 224), imageType='png')
    

In [32]:
df = pd.read_csv(path/f'{datasets[2]}.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7755 entries, 0 to 7754
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CID       7755 non-null   object
 1   SMILES    7755 non-null   object
 2   Activity  7755 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 181.9+ KB


In [23]:
df[df.CID == 'CHEMBL310529']

Unnamed: 0,CID,SMILES,Activity
0,CHEMBL310529,OCCS(=O)(=O)c1cc(c2ccc[nH]2)c3C(=O)Nc4ccc(F)c1c34,1


In [25]:
df = pd.read_csv(path/f'{datasets[0]}.csv')
df.head()

Unnamed: 0,CID,SMILES,Activity
0,CHEMBL188002,S(=O)(=O)(N)c1cc(N/C(/S)=N\c2cc(C(=O)[O-])c(cc...,1
1,CHEMBL364127,Clc1ccc(cc1)C(=O)NC1Cc2cc(S(=O)(=O)N)ccc2C1,1
2,CHEMBL1683469,S(=O)(=O)(N)c1ccc(cc1)CNS(=O)(=O)CC12CCC(CC1=O...,1
3,CHEMBL52564,Oc1ccccc1\C=C\C(=O)[O-],1
4,CHEMBL21427,OB(O)c1ccc(OC)cc1,1


In [26]:
df[df.CID == 'CHEMBL364127']

Unnamed: 0,CID,SMILES,Activity
1,CHEMBL364127,Clc1ccc(cc1)C(=O)NC1Cc2cc(S(=O)(=O)N)ccc2C1,1


In [12]:
IMAGES = DATA/datasets[2]

In [16]:
images = list(IMAGES.glob('CHEMBL364127.png'))

In [17]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [18]:
img = mpimg.imread(images[0])
imgplot = plt.imshow(img)
plt.show()

IndexError: list index out of range