# Drug Discovery Project

## DATASETS:
(a) Carbonic Anhydrase II (ChEMBL205), a protein lyase,  
(b) Cyclin-dependent kinase 2 (CHEMBL301), a protein kinase,  
(c) ether-a-go-go-related gene potassium channel 1 (HERG) (CHEMBL240), a voltage-gated ion channel,  
(d) Dopamine D4 receptor (CHEMBL219), a monoamine GPCR,  
(e) Coagulation factor X (CHEMBL244), a serine protease,  
(f) Cannabinoid CB1 receptor (CHEMBL218), a lipid-like GPCR and  
(g) Cytochrome P450 19A1 (CHEMBL1978), a cytochrome P450.  
The activity classes were selected based on data availability and as representatives of therapeutically important target classes or as anti-targets.

In [1]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [2]:
# Import
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem



In [4]:
dataset = 'CHEMBL205'

In [5]:
path = Path('../dataset/13321_2017_226_MOESM1_ESM/')
df = pd.read_csv(path/f'{dataset}/{dataset}_cl.csv', index_col=0)

In [6]:
df.head()
list(path.iterdir())

[PosixPath('../dataset/13321_2017_226_MOESM1_ESM/.DS_Store'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/mol_images'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL240'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL301'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL244'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL219'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL1978'),
 PosixPath('../dataset/13321_2017_226_MOESM1_ESM/CHEMBL218')]

In [7]:
df[df['Activity'] == 1].SMILES[0]

'S(=O)(=O)(N)c1cc(N/C(/S)=N\\c2cc(C(=O)[O-])c(cc2)C=2c3c(OC4=CC(=O)C=CC=24)cc(O)cc3)ccc1'

# Create fingerprints for all datasets

In [8]:
# function for returning fingerprint from a specific smile.

def fp(smile, diam = 2, bits = 1024):

    mol = Chem.MolFromSmiles(smile)
    Chem.SanitizeMol(mol)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, diam, nBits = bits) 
    return fp

In [9]:
#ECFP4
#Generated Circular fingerprints hashed into n bits length vectors.

def ECFP(ifile, ofile, diam, bits):
    
    print(f"Making fingerprints for file: {ifile}")
    df = pd.read_csv(ifile)
    
    df.insert(2, "ECFP4_", df.SMILES.apply(fp))
    
    for i in range(len(df.ECFP4_[0])):
        df.insert(i + 2, f"ECFP4_{i + 1}", 0)
    
    df[[f"ECFP4_{i+1}" for i in range(len(df.ECFP4_[0]))]] = np.array(df.ECFP4_.to_list())
    
    df.drop("ECFP4_", axis = 1, inplace = True)
    
    
    df.to_csv(path/ofile, index = None)
    return df

# Run the functions on a file from dataset and store the results

In [10]:
datasets = ['CHEMBL205', 'CHEMBL301', 'CHEMBL218', 
            'CHEMBL240', 'CHEMBL219', 
            'CHEMBL244', 'CHEMBL1978']

In [11]:
def create_fingerprints(dataset, bits, data='train_valid'):
    ECFP(path/f'{dataset}/{dataset}_{data}.csv', f'{dataset}/{dataset}_ecfp_{bits}_{data}.csv', 2, bits)

In [12]:
create_fingerprints(dataset, 1024, data='train_valid')

Making fingerprints for file: ../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205/CHEMBL205_train_valid.csv


In [13]:
create_fingerprints(dataset, 1024, data='test1')

Making fingerprints for file: ../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205/CHEMBL205_test1.csv


In [14]:
create_fingerprints(dataset, 1024, data='test2')

Making fingerprints for file: ../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205/CHEMBL205_test2.csv


In [18]:
for dataset in datasets: 
    create_fingerprints(dataset, 1024)

Making fingerprints for file: ../dataset/13321_2017_226_MOESM1_ESM/CHEMBL205/CHEMBL205_cl.csv


  


ValueError: Columns must be same length as key

In [None]:
for dataset in datasets:
    df = pd.read_csv(path/f'{dataset}_ecfp_1024.csv')
    df.info()
    print()

In [None]:
df.head()

In [None]:
df[df['Activity'] == 1].SMILES[0]