# Image Classification with DNN

## DATASETS:
(a) Carbonic Anhydrase II (ChEMBL205), a protein lyase,  
(b) Cyclin-dependent kinase 2 (CHEMBL301), a protein kinase,  
(c) ether-a-go-go-related gene potassium channel 1 (HERG) (CHEMBL240), a voltage-gated ion channel,  
(d) Dopamine D4 receptor (CHEMBL219), a monoamine GPCR,  
(e) Coagulation factor X (CHEMBL244), a serine protease,  
(f) Cannabinoid CB1 receptor (CHEMBL218), a lipid-like GPCR and  
(g) Cytochrome P450 19A1 (CHEMBL1978), a cytochrome P450.  
The activity classes were selected based on data availability and as representatives of therapeutically important target classes or as anti-targets.

In [1]:
# Import
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
path = Path('../dataset/13321_2017_226_MOESM1_ESM/')

In [3]:
list(path.iterdir())

[PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/.ipynb_checkpoints'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL301'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL218'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL219'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL244'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/mol_images'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL1978'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL240')]

# Create Test Datasets

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
datasets = ['CHEMBL205','CHEMBL1978', 'CHEMBL301', 'CHEMBL218', 
            'CHEMBL240', 'CHEMBL219', 
            'CHEMBL244']

In [6]:
dataset=datasets[0]

In [7]:
DATA = path
DATA.mkdir(exist_ok=True)
PATH = DATA/datasets[2]
len(list(PATH.iterdir()))

4

In [8]:
DATASET = DATA/dataset
df = pd.read_csv(DATASET/f'{dataset}_cl.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17941 entries, 0 to 17940
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CID       17941 non-null  object
 1   SMILES    17941 non-null  object
 2   Activity  17941 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 420.6+ KB


In [14]:
len(df[df['Activity']==1]), len(df[df['Activity']==0])

(1631, 16310)

In [25]:
for dataset in datasets[0:1]:
    
    DATASET = DATA/dataset
    df = pd.read_csv(DATASET/f'{dataset}_cl.csv')
    df.info()
    x_train, x_test = train_test_split(df.index, test_size=0.2, random_state=666, stratify=df['Activity'])
    test1 = df.loc[x_test]
    test1 = df_test.reset_index(drop=True)
    test1.info()
    df = df.loc[x_train]
    df = df.reset_index(drop=True)
    df.info()
    x_train, x_test = train_test_split(df.index, test_size=0.25, random_state=666, stratify=df['Activity'])
    test2 = df.loc[x_test]
    test2 = df_test.reset_index(drop=True)
    test2.info()
    df = df.loc[x_train]
    df = df.reset_index(drop=True)
    df.info()
    x_train, x_valid = train_test_split(df.index, test_size=0.2, random_state=666, stratify=df['Activity'])
    df.loc[x_train, 'is_valid']=False
    df.loc[x_valid, 'is_valid']=True
    df = df.reset_index(drop=True)
    df.to_csv(DATASET/f'{dataset}_train_valid.csv', index=False)
    test1.to_csv(DATASET/f'{dataset}_test1.csv', index=False)
    test2.to_csv(DATASET/f'{dataset}_test2.csv', index=False)
    test1.info()
    test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17941 entries, 0 to 17940
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CID       17941 non-null  object
 1   SMILES    17941 non-null  object
 2   Activity  17941 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 420.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3589 entries, 0 to 3588
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CID       3589 non-null   object
 1   SMILES    3589 non-null   object
 2   Activity  3589 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 84.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14352 entries, 0 to 14351
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CID       14352 non-null  object
 1   SMILES    14352 non-null  object
 2   Activity  14352 non-null  int64 
dtype

In [26]:
df = pd.read_csv(DATA/dataset/f'{datasets[0]}_train_valid.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10764 entries, 0 to 10763
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CID       10764 non-null  object
 1   SMILES    10764 non-null  object
 2   Activity  10764 non-null  int64 
 3   is_valid  10764 non-null  bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 262.9+ KB


In [27]:
df = pd.read_csv(DATA/dataset/f'{datasets[0]}_test1.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3589 entries, 0 to 3588
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CID       3589 non-null   object
 1   SMILES    3589 non-null   object
 2   Activity  3589 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 84.2+ KB


In [29]:
df = pd.read_csv(DATA/dataset/f'{datasets[0]}_test2.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3589 entries, 0 to 3588
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CID       3589 non-null   object
 1   SMILES    3589 non-null   object
 2   Activity  3589 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 84.2+ KB
