In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import table
from collections import Counter
import numpy as np
import scipy as sp
%matplotlib inline

In [4]:
data = pd.read_csv("Cleaned-data.csv", usecols = ["Molecule ChEMBL ID", "Smiles", "Standard Value", 
                                                                "Standard Units", "Target ChEMBL ID", 
                                                                "Standard Relation", "Target Name", "Active"])

In [5]:
data.shape

(5723, 8)

In [6]:
data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Relation,Standard Value,Standard Units,Target ChEMBL ID,Target Name,Active
0,CHEMBL3645910,CCc1ncnc(-c2cc(F)c(C(=O)N3CCN4CCC[C@H]4C3)c(Cl...,'=',0.07,nM,CHEMBL2842,Serine/threonine-protein kinase mTOR,1
1,CHEMBL3693444,COc1ccc(-c2ccc3c(N4CCOC[C@@H]4C)nc(N4CCOC[C@@H...,'=',0.15,nM,CHEMBL2842,Serine/threonine-protein kinase mTOR,1
2,CHEMBL574680,O=C(Nc1ccc(-c2nc(N3CCOCC3)c3cnn(CC(F)(F)F)c3n2...,'=',0.2,nM,CHEMBL2842,Serine/threonine-protein kinase mTOR,1
3,CHEMBL1765602,Nc1ccc(-c2ccc3ncc4ccc(=O)n(-c5cccc(C(F)(F)F)c5...,'=',0.25,nM,CHEMBL2842,Serine/threonine-protein kinase mTOR,1
4,CHEMBL1092389,C[C@@H]1COCCN1c1nc(-c2ccc(NC(=O)Nc3ccc(C(=O)N4...,'=',0.25,nM,CHEMBL2842,Serine/threonine-protein kinase mTOR,1


In [7]:
new_data = data

In [8]:
new_data.drop(columns=['Standard Relation', 'Standard Value', 'Standard Units', 'Target Name'])

Unnamed: 0,Molecule ChEMBL ID,Smiles,Target ChEMBL ID,Active
0,CHEMBL3645910,CCc1ncnc(-c2cc(F)c(C(=O)N3CCN4CCC[C@H]4C3)c(Cl...,CHEMBL2842,1
1,CHEMBL3693444,COc1ccc(-c2ccc3c(N4CCOC[C@@H]4C)nc(N4CCOC[C@@H...,CHEMBL2842,1
2,CHEMBL574680,O=C(Nc1ccc(-c2nc(N3CCOCC3)c3cnn(CC(F)(F)F)c3n2...,CHEMBL2842,1
3,CHEMBL1765602,Nc1ccc(-c2ccc3ncc4ccc(=O)n(-c5cccc(C(F)(F)F)c5...,CHEMBL2842,1
4,CHEMBL1092389,C[C@@H]1COCCN1c1nc(-c2ccc(NC(=O)Nc3ccc(C(=O)N4...,CHEMBL2842,1
...,...,...,...,...
5718,CHEMBL4216200,COc1ccc(CNc2ccnc3oc4ccc(N)cc4c23)cc1,CHEMBL1957,0
5719,CHEMBL4213021,Nc1ccc2oc3nccc(NCc4cccc(Br)c4)c3c2c1,CHEMBL1957,0
5720,CHEMBL3353410,C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(O...,CHEMBL1957,0
5721,CHEMBL1956069,COc1cccc(C(=O)c2sc(Nc3ccc(N4CCN(C(C)C)CC4)cc3)...,CHEMBL1957,0


In [9]:
new_data["Molecule ChEMBL ID"].value_counts()

CHEMBL1945559    3
CHEMBL1421       3
CHEMBL3263869    3
CHEMBL3651966    3
CHEMBL4438748    3
                ..
CHEMBL446019     1
CHEMBL487533     1
CHEMBL3357655    1
CHEMBL4451792    1
CHEMBL3655751    1
Name: Molecule ChEMBL ID, Length: 5494, dtype: int64

In [10]:
new_data[new_data["Molecule ChEMBL ID"] == "CHEMBL2321905"]

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Relation,Standard Value,Standard Units,Target ChEMBL ID,Target Name,Active
4147,CHEMBL2321905,COC(=O)c1cc(-c2ccc(NC(=O)c3cc(NC(=O)CCCOc4cc5c...,'=',0.0013,nM,CHEMBL614725,MIA PaCa-2,1


In [11]:
target_ids = new_data["Target ChEMBL ID"].tolist()
targets = list(set(target_ids))
targets

['CHEMBL1957', 'CHEMBL2842', 'CHEMBL614725', 'CHEMBL203']

In [12]:
columns = ["Molecule_id", "Smiles"]
columns = columns + targets
columns

['Molecule_id',
 'Smiles',
 'CHEMBL1957',
 'CHEMBL2842',
 'CHEMBL614725',
 'CHEMBL203']

In [13]:
transformed_data = new_data[["Molecule ChEMBL ID", "Smiles"]]
transformed_data = transformed_data.drop_duplicates(subset=["Molecule ChEMBL ID", "Smiles"])
for target in targets:
    transformed_data[target] = 0
transformed_data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,CHEMBL1957,CHEMBL2842,CHEMBL614725,CHEMBL203
0,CHEMBL3645910,CCc1ncnc(-c2cc(F)c(C(=O)N3CCN4CCC[C@H]4C3)c(Cl...,0,0,0,0
1,CHEMBL3693444,COc1ccc(-c2ccc3c(N4CCOC[C@@H]4C)nc(N4CCOC[C@@H...,0,0,0,0
2,CHEMBL574680,O=C(Nc1ccc(-c2nc(N3CCOCC3)c3cnn(CC(F)(F)F)c3n2...,0,0,0,0
3,CHEMBL1765602,Nc1ccc(-c2ccc3ncc4ccc(=O)n(-c5cccc(C(F)(F)F)c5...,0,0,0,0
4,CHEMBL1092389,C[C@@H]1COCCN1c1nc(-c2ccc(NC(=O)Nc3ccc(C(=O)N4...,0,0,0,0


In [14]:
transformed_data[transformed_data["Molecule ChEMBL ID"] == "CHEMBL63786"]["Smiles"]

1190    Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1
Name: Smiles, dtype: object

In [15]:
data[data["Molecule ChEMBL ID"] == "CHEMBL63786"]["Smiles"]

1190    Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1
Name: Smiles, dtype: object

In [16]:
df = new_data[new_data["Molecule ChEMBL ID"] == "CHEMBL1684800"]

for index, row in df.iterrows():
    print(row["Target ChEMBL ID"])

CHEMBL2842


In [17]:
new_data.dtypes

Molecule ChEMBL ID     object
Smiles                 object
Standard Relation      object
Standard Value        float64
Standard Units         object
Target ChEMBL ID       object
Target Name            object
Active                  int64
dtype: object

In [18]:
def get_target_list():
    target_list = {}
    for t in targets:
        target_list[t] = 0
    return target_list

In [19]:
def add_values(row):
    mol = row["Molecule ChEMBL ID"]
    mol_data = new_data[new_data["Molecule ChEMBL ID"] == mol]
    for index, r in mol_data.iterrows():
        row[r["Target ChEMBL ID"]] = 1
    return row

In [20]:
transformed_data = transformed_data.apply(add_values, axis = 1)

In [21]:
transformed_data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,CHEMBL1957,CHEMBL2842,CHEMBL614725,CHEMBL203
0,CHEMBL3645910,CCc1ncnc(-c2cc(F)c(C(=O)N3CCN4CCC[C@H]4C3)c(Cl...,0,1,0,0
1,CHEMBL3693444,COc1ccc(-c2ccc3c(N4CCOC[C@@H]4C)nc(N4CCOC[C@@H...,0,1,0,0
2,CHEMBL574680,O=C(Nc1ccc(-c2nc(N3CCOCC3)c3cnn(CC(F)(F)F)c3n2...,0,1,0,0
3,CHEMBL1765602,Nc1ccc(-c2ccc3ncc4ccc(=O)n(-c5cccc(C(F)(F)F)c5...,0,1,0,0
4,CHEMBL1092389,C[C@@H]1COCCN1c1nc(-c2ccc(NC(=O)Nc3ccc(C(=O)N4...,0,1,0,0


In [22]:
set1 = set(new_data[new_data["Target ChEMBL ID"] == "CHEMBL614725"]["Molecule ChEMBL ID"].tolist())

In [23]:
set2 = set(transformed_data[transformed_data["CHEMBL614725"] == 1]["Molecule ChEMBL ID"].tolist())

In [24]:
set1 == set2

True

In [25]:
transformed_data = transformed_data.drop(columns = ["Molecule ChEMBL ID"], axis = 1)
transformed_data

Unnamed: 0,Smiles,CHEMBL1957,CHEMBL2842,CHEMBL614725,CHEMBL203
0,CCc1ncnc(-c2cc(F)c(C(=O)N3CCN4CCC[C@H]4C3)c(Cl...,0,1,0,0
1,COc1ccc(-c2ccc3c(N4CCOC[C@@H]4C)nc(N4CCOC[C@@H...,0,1,0,0
2,O=C(Nc1ccc(-c2nc(N3CCOCC3)c3cnn(CC(F)(F)F)c3n2...,0,1,0,0
3,Nc1ccc(-c2ccc3ncc4ccc(=O)n(-c5cccc(C(F)(F)F)c5...,0,1,0,0
4,C[C@@H]1COCCN1c1nc(-c2ccc(NC(=O)Nc3ccc(C(=O)N4...,0,1,0,0
...,...,...,...,...,...
5716,O=C1Nc2ccccc2/C1=C1/Nc2ccccc2C1=O,1,0,0,0
5719,Nc1ccc2oc3nccc(NCc4cccc(Br)c4)c3c2c1,1,0,0,0
5720,C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(O...,1,0,0,0
5721,COc1cccc(C(=O)c2sc(Nc3ccc(N4CCN(C(C)C)CC4)cc3)...,1,0,0,0


In [28]:
transformed_data.to_csv("Training-Data.csv")