# *Compound Classification*

## *Data Load*

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('./data/cmpd.csv')
df.head()

Unnamed: 0,inchikey,smiles,group,activity
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,train,active
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,active
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,active
3,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,train,active
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,active


In [3]:
df.shape

(5530, 4)

## *Data Processing*

In [4]:
import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem

In [5]:
df['mol'] = df.smiles.apply(Chem.MolFromSmiles)

In [6]:
df.head()

Unnamed: 0,inchikey,smiles,group,activity,mol
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,train,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e2e0>
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e340>
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e3a0>
3,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,train,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e400>
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e460>


In [7]:
temp = AllChem.GetMorganFingerprintAsBitVect(df.mol[0], 4, nBits=2048)
print(np.vstack(temp))

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [8]:
# with minimal modification, we obtain the fingerprint vector using RDKit

def get_Xy(df):
    X = np.vstack(df.mol.apply(lambda m: list(AllChem.GetMorganFingerprintAsBitVect(m, 4, nBits=2048))))
    y = df.activity.eq('active').astype(float).to_numpy()
    return X, y 

In [9]:
X_train, y_train = get_Xy(df[df.group.eq('train')])
X_test, y_test = get_Xy(df[df.group.eq('test')])

In [10]:
print(X_train)

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 1 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]


In [11]:
print(y_train)

[1. 1. 1. ... 0. 0. 0.]


In [12]:
print(X_test)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:
print(y_test)

[1. 1. 1. ... 0. 0. 0.]


## *Sklearn Model Test*

In [14]:
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics

### *RF*

In [15]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8718609143593046

In [16]:
y_pred = clf.predict_proba(X_test)[:, 1]
print(y_pred)

[0.47 0.52 0.3  ... 0.28 0.33 0.43]


In [17]:
# logloss
metrics.log_loss(y_test, y_pred, labels=[0, 1])

0.4206760610348625

In [18]:
# AUC PRC
precision, recall, _ = metrics.precision_recall_curve(y_test, y_pred, pos_label=1)
metrics.auc(recall, precision)

0.8773222420558681

In [19]:
# AUC ROC
fpr_roc, tpr_roc, _ = metrics.roc_curve(y_test, y_pred, pos_label=1)
metrics.auc(fpr_roc, tpr_roc)

0.886617676194593

### *SVM*

In [20]:
clf = SVC()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.871216999356085

In [33]:
y_pred = clf.predict(X_test)
print(y_pred)

[1. 0. 0. ... 0. 0. 0.]


In [36]:
test_data = df[df.group.eq('test')]

In [39]:
print(test_data.count())
print(len(test_data))

inchikey    1553
smiles      1553
group       1553
activity    1553
mol         1553
dtype: int64
1553


In [43]:
test_data['predict'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predict'] = y_pred


In [47]:
test_data.head(30)

Unnamed: 0,inchikey,smiles,group,activity,mol,predict
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e3a0>,1.0
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e460>,0.0
6,JMGXJHWTVBGOKG-UHFFFAOYSA-N,Cc1cc(-c2cc(OC(=O)c3ccccc3)ccc2Cl)cc2nnc(Nc3cc...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e520>,0.0
10,DXCUKNQANPLTEJ-UHFFFAOYSA-N,CCN(CC)CCCCNc1ncc2cc(-c3cc(OC)cc(OC)c3)c(NC(=O...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e6a0>,1.0
11,PIQCTGMSNWUMAF-UHFFFAOYSA-N,CN1CCN(c2ccc3nc(-c4c(N)c5c(F)cccc5[nH]c4=O)[nH...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e700>,1.0
16,JGEBLDKNWBUGRZ-HXUWFJFHSA-N,CN(C[C@@H]1COCCO1)S(=O)(=O)Nc1ccc2ccc3ncc(-c4c...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e8e0>,0.0
18,AQHXGQTWGFVXTB-UHFFFAOYSA-N,COc1cc(CCc2[nH]nc(Nc3ccnc(NCc4onc(C)c4)n3)c2)c...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125e9a0>,1.0
23,BUSNTKOLFQPMBH-UHFFFAOYSA-N,Cc1cc(CNc2ncc(Br)c(Nc3[nH]nc(CCc4ccccc4)c3)n2)on1,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125eb80>,1.0
24,COJBCAMFZDFGFK-TVSWGBMESA-N,O[CH]1O[CH]([CH](O)[CH](O)[CH]1O[S](O)(=O)=O)C...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125ebe0>,0.0
25,COJBCAMFZDFGFK-VCSGLWQLSA-N,O[CH]1O[CH]([CH](O)[CH](O)[CH]1O[S](O)(=O)=O)C...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7fac9125ec40>,0.0
