# *Data Load*

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('./data/cmpd.csv')
df.head()

Unnamed: 0,inchikey,smiles,group,activity
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,train,active
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,active
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,active
3,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,train,active
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,active


In [3]:
df.shape

(5530, 4)

# *Data Processing*

In [4]:
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem

In [5]:
df['mol'] = df.smiles.apply(Chem.MolFromSmiles)

In [6]:
df.head()

Unnamed: 0,inchikey,smiles,group,activity,mol
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,train,active,<rdkit.Chem.rdchem.Mol object at 0x7f7a3042b3a0>
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,active,<rdkit.Chem.rdchem.Mol object at 0x7f7a3042b3f0>
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7f7a3042b350>
3,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,train,active,<rdkit.Chem.rdchem.Mol object at 0x7f7a3042b440>
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7f7a3042b490>


In [7]:
temp = AllChem.GetMorganFingerprintAsBitVect(df.mol[0], 4, nBits=2048)
print(np.vstack(temp))

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [8]:
# with minimal modification, we obtain the fingerprint vector using RDKit

def get_Xy(df):
    X = np.vstack(df.mol.apply(lambda m: list(AllChem.GetMorganFingerprintAsBitVect(m, 4, nBits=2048))))
    y = df.activity.eq('active').astype(float).to_numpy()
    return X, y 

In [9]:
X_train, y_train = get_Xy(df[df.group.eq('train')])
X_test, y_test = get_Xy(df[df.group.eq('test')])

In [10]:
print(X_train)

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 1 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]


In [11]:
print(y_train)

[1. 1. 1. ... 0. 0. 0.]


In [12]:
print(X_test)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:
print(y_test)

[1. 1. 1. ... 0. 0. 0.]


# *Sklearn Model Test*

In [14]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics

## *RF*

In [15]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8622021893110109

In [16]:
y_pred = clf.predict_proba(X_test)[:, 1]
print(y_pred)

[0.52       0.48       0.33333333 ... 0.49       0.25       0.4       ]


### *metrics*

In [17]:
# logloss
metrics.log_loss(y_test, y_pred, labels=[0, 1])

0.42002783476381417

In [18]:
# AUC PRC
precision, recall, _ = metrics.precision_recall_curve(y_test, y_pred, pos_label=1)
metrics.auc(recall, precision)

0.8761509861674194

In [19]:
# AUC ROC
fpr_roc, tpr_roc, _ = metrics.roc_curve(y_test, y_pred, pos_label=1)
metrics.auc(fpr_roc, tpr_roc)

0.8937501125700187

## *SVM*

In [20]:
clf = SVC(C=1, kernel='linear')
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7475853187379266

In [21]:
y_pred = clf.predict(X_test)
print(y_pred)

[1. 1. 0. ... 0. 0. 0.]


### *metrics*

In [22]:
accuracy = metrics.accuracy_score(y_test, y_pred)
mac_f1score = metrics.f1_score(y_test, y_pred, average='macro')
mic_f1score = metrics.f1_score(y_test, y_pred, average='micro')
                               
print(f"accuracy : {accuracy}")
print(f"mac_f1score : {mac_f1score}")
print(f"mic_f1score : {mic_f1score}")

accuracy : 0.7475853187379266
mac_f1score : 0.739777930520362
mic_f1score : 0.7475853187379266


### *labeling*

In [23]:
test_data = df[df.group.eq('test')]

In [24]:
print(test_data.count())
print(len(test_data))

inchikey    1553
smiles      1553
group       1553
activity    1553
mol         1553
dtype: int64
1553


In [25]:
test_data['predict'] = y_pred
test_data = test_data[['inchikey', 'smiles', 'predict']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [26]:
test_data.head(30)

Unnamed: 0,inchikey,smiles,predict
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,1.0
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,1.0
6,JMGXJHWTVBGOKG-UHFFFAOYSA-N,Cc1cc(-c2cc(OC(=O)c3ccccc3)ccc2Cl)cc2nnc(Nc3cc...,0.0
10,DXCUKNQANPLTEJ-UHFFFAOYSA-N,CCN(CC)CCCCNc1ncc2cc(-c3cc(OC)cc(OC)c3)c(NC(=O...,1.0
11,PIQCTGMSNWUMAF-UHFFFAOYSA-N,CN1CCN(c2ccc3nc(-c4c(N)c5c(F)cccc5[nH]c4=O)[nH...,1.0
16,JGEBLDKNWBUGRZ-HXUWFJFHSA-N,CN(C[C@@H]1COCCO1)S(=O)(=O)Nc1ccc2ccc3ncc(-c4c...,0.0
18,AQHXGQTWGFVXTB-UHFFFAOYSA-N,COc1cc(CCc2[nH]nc(Nc3ccnc(NCc4onc(C)c4)n3)c2)c...,1.0
23,BUSNTKOLFQPMBH-UHFFFAOYSA-N,Cc1cc(CNc2ncc(Br)c(Nc3[nH]nc(CCc4ccccc4)c3)n2)on1,1.0
24,COJBCAMFZDFGFK-TVSWGBMESA-N,O[CH]1O[CH]([CH](O)[CH](O)[CH]1O[S](O)(=O)=O)C...,0.0
25,COJBCAMFZDFGFK-VCSGLWQLSA-N,O[CH]1O[CH]([CH](O)[CH](O)[CH]1O[S](O)(=O)=O)C...,0.0


# *NN(Neural Network) Model*

In [27]:
from keras import models
from keras import layers
from keras import optimizers
import tensorflow as tf

In [31]:
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(2048,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [32]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mse',
              metrics=['accuracy'])

In [33]:
model.fit(X_train, y_train, epochs=100, batch_size=128)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f793c236ac0>

In [34]:
results = model.evaluate(X_test, y_test)



In [35]:
print(results)

[0.19068124890327454, 0.7707662582397461]


In [36]:
pred = model.predict(X_test)
print(pred)

[[9.8509246e-01]
 [9.9908018e-01]
 [1.5291163e-03]
 ...
 [4.6342513e-05]
 [4.2366234e-04]
 [1.4108245e-04]]


## *labling*

In [37]:
test_data2 = df[df.group.eq('test')]
test_data2['predict'] = pred
test_data2 = test_data2[['inchikey', 'smiles', 'predict']]

print(test_data2)

                         inchikey  \
2     TTZSNFLLYPYKIL-UHFFFAOYSA-N   
4     CUIHSIWYWATEQL-UHFFFAOYSA-N   
6     JMGXJHWTVBGOKG-UHFFFAOYSA-N   
10    DXCUKNQANPLTEJ-UHFFFAOYSA-N   
11    PIQCTGMSNWUMAF-UHFFFAOYSA-N   
...                           ...   
5525  UBAHPEHGSJRHGA-UHFFFAOYSA-N   
5526  RTTIKBHDHKOSNI-UHFFFAOYSA-N   
5527  HVUOSZANYULBJR-UHFFFAOYSA-N   
5528  SNFWCJIVWUVRNO-UHFFFAOYSA-N   
5529  MKSAGABLDNGEAP-DHIUTWEWSA-N   

                                                 smiles   predict  
2     Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...  0.985092  
4     Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...  0.999080  
6     Cc1cc(-c2cc(OC(=O)c3ccccc3)ccc2Cl)cc2nnc(Nc3cc...  0.001529  
10    CCN(CC)CCCCNc1ncc2cc(-c3cc(OC)cc(OC)c3)c(NC(=O...  0.999924  
11    CN1CCN(c2ccc3nc(-c4c(N)c5c(F)cccc5[nH]c4=O)[nH...  0.999876  
...                                                 ...       ...  
5525  Cc1cccc2nc(-c3ccc(-c4cccc(CN5CCC(C(N)=O)CC5)c4...  0.000004  
5526       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
