In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import table
from collections import Counter
import numpy as np
import scipy as sp
%matplotlib inline

In [7]:
data = pd.read_csv("Cleaned.csv", usecols = ["Molecule ChEMBL ID", "Smiles", "Standard Value", 
                                                                "Standard Units", "Target ChEMBL ID", 
                                                                "Standard Relation", "Target Name", "Active"])

In [8]:
data.shape

(8727, 8)

In [9]:
data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Relation,Standard Value,Standard Units,Target ChEMBL ID,Target Name,Active
0,CHEMBL3905758,CN1CCN(CCn2ccc(Nc3ncc4c(n3)-c3c(nn(C)c3-c3ccc(...,'=',0.05,nM,CHEMBL1957,Insulin-like growth factor I receptor,1
1,CHEMBL3950012,COc1cc(N2CCN(C(C)C)CC2)ccc1Nc1ncc2c(n1)-c1c(nn...,'=',0.05,nM,CHEMBL1957,Insulin-like growth factor I receptor,1
2,CHEMBL3928243,Cn1cc(-c2cccc(Cc3c4c(nn3C)CCc3cnc(Nc5ccn(CCCN6...,'=',0.08,nM,CHEMBL1957,Insulin-like growth factor I receptor,1
3,CHEMBL3955626,COc1cc(N2CCN(CCO)CC2)ccc1Nc1ncc2c(n1)-c1c(nn(C...,'=',0.08,nM,CHEMBL1957,Insulin-like growth factor I receptor,1
4,CHEMBL3979064,COc1cc(N2CCN(C3CCOCC3)CC2)ccc1Nc1ncc2c(n1)-c1c...,'=',0.08,nM,CHEMBL1957,Insulin-like growth factor I receptor,1


In [10]:
new_data = data

In [11]:
new_data.drop(columns=['Standard Relation', 'Standard Value', 'Standard Units', 'Target Name'])

Unnamed: 0,Molecule ChEMBL ID,Smiles,Target ChEMBL ID,Active
0,CHEMBL3905758,CN1CCN(CCn2ccc(Nc3ncc4c(n3)-c3c(nn(C)c3-c3ccc(...,CHEMBL1957,1
1,CHEMBL3950012,COc1cc(N2CCN(C(C)C)CC2)ccc1Nc1ncc2c(n1)-c1c(nn...,CHEMBL1957,1
2,CHEMBL3928243,Cn1cc(-c2cccc(Cc3c4c(nn3C)CCc3cnc(Nc5ccn(CCCN6...,CHEMBL1957,1
3,CHEMBL3955626,COc1cc(N2CCN(CCO)CC2)ccc1Nc1ncc2c(n1)-c1c(nn(C...,CHEMBL1957,1
4,CHEMBL3979064,COc1cc(N2CCN(C3CCOCC3)CC2)ccc1Nc1ncc2c(n1)-c1c...,CHEMBL1957,1
...,...,...,...,...
8722,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,CHEMBL203,0
8723,CHEMBL261238,CN(c1cccnc1)c1cc2c(Nc3ccc(F)c(Cl)c3)c(C#N)cnc2cn1,CHEMBL203,0
8724,CHEMBL76587,N#CC(C#N)Cc1ccc(O)cc1,CHEMBL203,0
8725,CHEMBL490510,O=C(/C=C/c1ccc(O)c(O)c1)c1ccc(O[C@@H]2O[C@H](C...,CHEMBL203,0


In [12]:
new_data["Molecule ChEMBL ID"].value_counts()

CHEMBL4438748    3
CHEMBL1834657    3
CHEMBL1945559    3
CHEMBL1421       3
CHEMBL553        3
                ..
CHEMBL431977     1
CHEMBL437879     1
CHEMBL258940     1
CHEMBL2165029    1
CHEMBL1091883    1
Name: Molecule ChEMBL ID, Length: 8429, dtype: int64

In [13]:
new_data[new_data["Molecule ChEMBL ID"] == "CHEMBL2321905"]

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Relation,Standard Value,Standard Units,Target ChEMBL ID,Target Name,Active
1460,CHEMBL2321905,COC(=O)c1cc(-c2ccc(NC(=O)c3cc(NC(=O)CCCOc4cc5c...,'=',0.0013,nM,CHEMBL614725,MIA PaCa-2,1


In [14]:
target_ids = new_data["Target ChEMBL ID"].tolist()
targets = list(set(target_ids))
targets

['CHEMBL2842', 'CHEMBL203', 'CHEMBL1957', 'CHEMBL614725']

In [15]:
columns = ["Molecule_id", "Smiles"]
columns = columns + targets
columns

['Molecule_id',
 'Smiles',
 'CHEMBL2842',
 'CHEMBL203',
 'CHEMBL1957',
 'CHEMBL614725']

In [16]:
transformed_data = new_data[["Molecule ChEMBL ID", "Smiles"]]
transformed_data = transformed_data.drop_duplicates(subset=["Molecule ChEMBL ID", "Smiles"])
for target in targets:
    transformed_data[target] = 0
transformed_data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,CHEMBL2842,CHEMBL203,CHEMBL1957,CHEMBL614725
0,CHEMBL3905758,CN1CCN(CCn2ccc(Nc3ncc4c(n3)-c3c(nn(C)c3-c3ccc(...,0,0,0,0
1,CHEMBL3950012,COc1cc(N2CCN(C(C)C)CC2)ccc1Nc1ncc2c(n1)-c1c(nn...,0,0,0,0
2,CHEMBL3928243,Cn1cc(-c2cccc(Cc3c4c(nn3C)CCc3cnc(Nc5ccn(CCCN6...,0,0,0,0
3,CHEMBL3955626,COc1cc(N2CCN(CCO)CC2)ccc1Nc1ncc2c(n1)-c1c(nn(C...,0,0,0,0
4,CHEMBL3979064,COc1cc(N2CCN(C3CCOCC3)CC2)ccc1Nc1ncc2c(n1)-c1c...,0,0,0,0


In [17]:
transformed_data[transformed_data["Molecule ChEMBL ID"] == "CHEMBL63786"]["Smiles"]

4480    Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1
Name: Smiles, dtype: object

In [18]:
data[data["Molecule ChEMBL ID"] == "CHEMBL63786"]["Smiles"]

4480    Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1
Name: Smiles, dtype: object

In [19]:
df = new_data[new_data["Molecule ChEMBL ID"] == "CHEMBL1684800"]

for index, row in df.iterrows():
    print(row["Target ChEMBL ID"])

CHEMBL2842


In [20]:
new_data.dtypes

Molecule ChEMBL ID     object
Smiles                 object
Standard Relation      object
Standard Value        float64
Standard Units         object
Target ChEMBL ID       object
Target Name            object
Active                  int64
dtype: object

In [21]:
def get_target_list():
    target_list = {}
    for t in targets:
        target_list[t] = 0
    return target_list

In [22]:
def add_values(row):
    mol = row["Molecule ChEMBL ID"]
    mol_data = new_data[new_data["Molecule ChEMBL ID"] == mol]
    for index, r in mol_data.iterrows():
        row[r["Target ChEMBL ID"]] = 1
    return row

In [23]:
transformed_data = transformed_data.apply(add_values, axis = 1)

In [24]:
transformed_data.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,CHEMBL2842,CHEMBL203,CHEMBL1957,CHEMBL614725
0,CHEMBL3905758,CN1CCN(CCn2ccc(Nc3ncc4c(n3)-c3c(nn(C)c3-c3ccc(...,0,0,1,0
1,CHEMBL3950012,COc1cc(N2CCN(C(C)C)CC2)ccc1Nc1ncc2c(n1)-c1c(nn...,0,0,1,0
2,CHEMBL3928243,Cn1cc(-c2cccc(Cc3c4c(nn3C)CCc3cnc(Nc5ccn(CCCN6...,0,0,1,0
3,CHEMBL3955626,COc1cc(N2CCN(CCO)CC2)ccc1Nc1ncc2c(n1)-c1c(nn(C...,0,0,1,0
4,CHEMBL3979064,COc1cc(N2CCN(C3CCOCC3)CC2)ccc1Nc1ncc2c(n1)-c1c...,0,0,1,0


In [25]:
set1 = set(new_data[new_data["Target ChEMBL ID"] == "CHEMBL614725"]["Molecule ChEMBL ID"].tolist())

In [26]:
set2 = set(transformed_data[transformed_data["CHEMBL614725"] == 1]["Molecule ChEMBL ID"].tolist())

In [27]:
set1 == set2

True

In [28]:
transformed_data = transformed_data.drop(columns = ["Molecule ChEMBL ID"], axis = 1)
transformed_data

Unnamed: 0,Smiles,CHEMBL2842,CHEMBL203,CHEMBL1957,CHEMBL614725
0,CN1CCN(CCn2ccc(Nc3ncc4c(n3)-c3c(nn(C)c3-c3ccc(...,0,0,1,0
1,COc1cc(N2CCN(C(C)C)CC2)ccc1Nc1ncc2c(n1)-c1c(nn...,0,0,1,0
2,Cn1cc(-c2cccc(Cc3c4c(nn3C)CCc3cnc(Nc5ccn(CCCN6...,0,0,1,0
3,COc1cc(N2CCN(CCO)CC2)ccc1Nc1ncc2c(n1)-c1c(nn(C...,0,0,1,0
4,COc1cc(N2CCN(C3CCOCC3)CC2)ccc1Nc1ncc2c(n1)-c1c...,0,0,1,0
...,...,...,...,...,...
8722,O=C(O)/C=C/c1ccc(O)cc1,0,1,0,0
8723,CN(c1cccnc1)c1cc2c(Nc3ccc(F)c(Cl)c3)c(C#N)cnc2cn1,0,1,0,0
8724,N#CC(C#N)Cc1ccc(O)cc1,0,1,0,0
8725,O=C(/C=C/c1ccc(O)c(O)c1)c1ccc(O[C@@H]2O[C@H](C...,0,1,0,0


In [29]:
transformed_data.to_csv("Training-Data.csv")