In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import table
from collections import Counter
import numpy as np
import scipy as sp
from padelpy import from_smiles
%matplotlib inline

In [4]:
data = pd.read_csv("cleaned-data.csv", usecols = ["Molecule ChEMBL ID", "Smiles", "Standard Value", 
                                                                "Standard Units", "Target ChEMBL ID", 
                                                                "Standard Relation", "Target Name"])

In [5]:
data.shape

(18050, 7)

In [6]:
data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Relation,Standard Value,Standard Units,Target ChEMBL ID,Target Name
0,CHEMBL2321905,COC(=O)c1cc(-c2ccc(NC(=O)c3cc(NC(=O)CCCOc4cc5c...,'=',0.0013,nM,CHEMBL614725,MIA PaCa-2
1,CHEMBL2324630,COC(=O)c1cc(-c2ccc(NC(=O)c3nc(NC(=O)CCCOc4cc5c...,'=',0.0021,nM,CHEMBL614725,MIA PaCa-2
2,CHEMBL63786,Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1,'=',0.003,nM,CHEMBL203,Epidermal growth factor receptor erbB1
3,CHEMBL53711,CN(C)c1cc2c(Nc3cccc(Br)c3)ncnc2cn1,'=',0.006,nM,CHEMBL203,Epidermal growth factor receptor erbB1
4,CHEMBL35820,CCOc1cc2ncnc(Nc3cccc(Br)c3)c2cc1OCC,'=',0.006,nM,CHEMBL203,Epidermal growth factor receptor erbB1


In [7]:
def add_active(row):
    sv = row["Standard Value"]
    sr = row["Standard Relation"]
    if int(sv) < 1000 and sr != '>':
        return 1
    else:
        return 0

In [8]:
data['Active'] = data.apply(add_active, axis=1)

In [9]:
data["Active"].value_counts()

1    10451
0     7599
Name: Active, dtype: int64

In [10]:
new_data = data

In [11]:
new_data.drop(columns=['Standard Relation', 'Standard Value', 'Standard Units', 'Target Name'])

Unnamed: 0,Molecule ChEMBL ID,Smiles,Target ChEMBL ID,Active
0,CHEMBL2321905,COC(=O)c1cc(-c2ccc(NC(=O)c3cc(NC(=O)CCCOc4cc5c...,CHEMBL614725,1
1,CHEMBL2324630,COC(=O)c1cc(-c2ccc(NC(=O)c3nc(NC(=O)CCCOc4cc5c...,CHEMBL614725,1
2,CHEMBL63786,Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1,CHEMBL203,1
3,CHEMBL53711,CN(C)c1cc2c(Nc3cccc(Br)c3)ncnc2cn1,CHEMBL203,1
4,CHEMBL35820,CCOc1cc2ncnc(Nc3cccc(Br)c3)c2cc1OCC,CHEMBL203,1
...,...,...,...,...
18045,CHEMBL1956069,COc1cccc(C(=O)c2sc(Nc3ccc(N4CCN(C(C)C)CC4)cc3)...,CHEMBL1957,0
18046,CHEMBL1956070,CN1CCN(c2ccc(Nc3nc(N)c(C(=O)c4ccc5c(c4)OCCO5)s...,CHEMBL1957,0
18047,CHEMBL1431,CN(C)C(=N)NC(=N)N,CHEMBL614725,0
18048,CHEMBL45068,O=C(CCc1ccc(O)cc1)c1c(O)cc(O)cc1O,CHEMBL203,0


In [12]:
new_data["Molecule ChEMBL ID"].value_counts()

CHEMBL4160854    3
CHEMBL553        3
CHEMBL1336       3
CHEMBL1684800    3
CHEMBL1834657    3
                ..
CHEMBL3355475    1
CHEMBL3639542    1
CHEMBL1910278    1
CHEMBL3947072    1
CHEMBL73820      1
Name: Molecule ChEMBL ID, Length: 17230, dtype: int64

In [13]:
new_data[new_data["Molecule ChEMBL ID"] == "CHEMBL2321905"]

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Relation,Standard Value,Standard Units,Target ChEMBL ID,Target Name,Active
0,CHEMBL2321905,COC(=O)c1cc(-c2ccc(NC(=O)c3cc(NC(=O)CCCOc4cc5c...,'=',0.0013,nM,CHEMBL614725,MIA PaCa-2,1


In [14]:
target_ids = new_data["Target ChEMBL ID"].tolist()
targets = list(set(target_ids))
targets

['CHEMBL1957',
 'CHEMBL4026',
 'CHEMBL209',
 'CHEMBL203',
 'CHEMBL2842',
 'CHEMBL614725']

In [15]:
columns = ["Molecule_id", "Smiles"]
columns = columns + targets
columns

['Molecule_id',
 'Smiles',
 'CHEMBL1957',
 'CHEMBL4026',
 'CHEMBL209',
 'CHEMBL203',
 'CHEMBL2842',
 'CHEMBL614725']

In [16]:
transformed_data = new_data[["Molecule ChEMBL ID", "Smiles"]]
transformed_data = transformed_data.drop_duplicates(subset=["Molecule ChEMBL ID", "Smiles"])
for target in targets:
    transformed_data[target] = 0
transformed_data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,CHEMBL1957,CHEMBL4026,CHEMBL209,CHEMBL203,CHEMBL2842,CHEMBL614725
0,CHEMBL2321905,COC(=O)c1cc(-c2ccc(NC(=O)c3cc(NC(=O)CCCOc4cc5c...,0,0,0,0,0,0
1,CHEMBL2324630,COC(=O)c1cc(-c2ccc(NC(=O)c3nc(NC(=O)CCCOc4cc5c...,0,0,0,0,0,0
2,CHEMBL63786,Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1,0,0,0,0,0,0
3,CHEMBL53711,CN(C)c1cc2c(Nc3cccc(Br)c3)ncnc2cn1,0,0,0,0,0,0
4,CHEMBL35820,CCOc1cc2ncnc(Nc3cccc(Br)c3)c2cc1OCC,0,0,0,0,0,0


In [17]:
transformed_data[transformed_data["Molecule ChEMBL ID"] == "CHEMBL63786"]["Smiles"]

2    Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1
Name: Smiles, dtype: object

In [18]:
data[data["Molecule ChEMBL ID"] == "CHEMBL63786"]["Smiles"]

2    Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1
Name: Smiles, dtype: object

In [19]:
df = new_data[new_data["Molecule ChEMBL ID"] == "CHEMBL1684800"]

for index, row in df.iterrows():
    print(row["Target ChEMBL ID"])

CHEMBL1957
CHEMBL203
CHEMBL2842


In [20]:
new_data.dtypes

Molecule ChEMBL ID     object
Smiles                 object
Standard Relation      object
Standard Value        float64
Standard Units         object
Target ChEMBL ID       object
Target Name            object
Active                  int64
dtype: object

In [21]:
def get_target_list():
    target_list = {}
    for t in targets:
        target_list[t] = 0
    return target_list

In [22]:
def add_values(row):
    mol = row["Molecule ChEMBL ID"]
    mol_data = new_data[new_data["Molecule ChEMBL ID"] == mol]
    for index, r in mol_data.iterrows():
        row[r["Target ChEMBL ID"]] = 1
    return row

In [23]:
transformed_data = transformed_data.apply(add_values, axis = 1)

In [24]:
transformed_data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,CHEMBL1957,CHEMBL4026,CHEMBL209,CHEMBL203,CHEMBL2842,CHEMBL614725
0,CHEMBL2321905,COC(=O)c1cc(-c2ccc(NC(=O)c3cc(NC(=O)CCCOc4cc5c...,0,0,0,0,0,1
1,CHEMBL2324630,COC(=O)c1cc(-c2ccc(NC(=O)c3nc(NC(=O)CCCOc4cc5c...,0,0,0,0,0,1
2,CHEMBL63786,Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1,0,0,0,1,0,0
3,CHEMBL53711,CN(C)c1cc2c(Nc3cccc(Br)c3)ncnc2cn1,0,0,0,1,0,0
4,CHEMBL35820,CCOc1cc2ncnc(Nc3cccc(Br)c3)c2cc1OCC,0,0,0,1,0,0


In [25]:
set1 = set(new_data[new_data["Target ChEMBL ID"] == "CHEMBL614725"]["Molecule ChEMBL ID"].tolist())

In [26]:
set2 = set(transformed_data[transformed_data["CHEMBL614725"] == 1]["Molecule ChEMBL ID"].tolist())

In [27]:
set1 == set2

True

In [28]:
transformed_data = transformed_data.drop(columns = ["Molecule ChEMBL ID"], axis = 1)
transformed_data

Unnamed: 0,Smiles,CHEMBL1957,CHEMBL4026,CHEMBL209,CHEMBL203,CHEMBL2842,CHEMBL614725
0,COC(=O)c1cc(-c2ccc(NC(=O)c3cc(NC(=O)CCCOc4cc5c...,0,0,0,0,0,1
1,COC(=O)c1cc(-c2ccc(NC(=O)c3nc(NC(=O)CCCOc4cc5c...,0,0,0,0,0,1
2,Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1,0,0,0,1,0,0
3,CN(C)c1cc2c(Nc3cccc(Br)c3)ncnc2cn1,0,0,0,1,0,0
4,CCOc1cc2ncnc(Nc3cccc(Br)c3)c2cc1OCC,0,0,0,1,0,0
...,...,...,...,...,...,...,...
18044,O=C(/C=C/c1ccc(O)c(O)c1)c1ccc(O[C@@H]2O[C@H](C...,0,0,0,1,0,0
18045,COc1cccc(C(=O)c2sc(Nc3ccc(N4CCN(C(C)C)CC4)cc3)...,1,0,0,0,0,0
18046,CN1CCN(c2ccc(Nc3nc(N)c(C(=O)c4ccc5c(c4)OCCO5)s...,1,0,0,0,0,0
18047,CN(C)C(=N)NC(=N)N,0,0,0,0,0,1


In [29]:
transformed_data.to_csv("Training-Data.csv")