In [1]:
import numpy as np
import rdkit.Chem as Chem
from rdkit.Chem import AllChem
import csv

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier

In [2]:
def smiles_to_fps(smiles, fp_radius, fp_length):
    """Build an integer fingerprint array from an iterable of SMILES strings.

    Each SMILES is passed to ``smile_to_fp`` together with ``fp_radius`` and
    ``fp_length``; the bit string that comes back is split into its individual
    characters, and the collected rows are converted to a NumPy array with
    ``dtype=int`` (one row per input SMILES).
    """
    bit_rows = []
    for smi in smiles:
        bit_string = smile_to_fp(smi, fp_radius, fp_length)
        bit_rows.append(list(bit_string))  # one character per fingerprint bit
    # the character -> integer conversion happens here, via dtype=int
    return np.array(bit_rows, dtype=int)


def smile_to_fp(s, fp_radius, fp_length):
    """Convert a single SMILES string to a Morgan fingerprint bit string.

    Args:
        s: SMILES string describing the molecule.
        fp_radius: radius forwarded to ``GetMorganFingerprintAsBitVect``.
        fp_length: fingerprint size, forwarded as ``nBits``.

    Returns:
        The fingerprint as produced by the bit vector's ``ToBitString()``.

    Raises:
        ValueError: if RDKit cannot build a molecule from ``s``.
    """
    mol = Chem.MolFromSmiles(s)  # get molecule object
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of letting the fingerprint call
        # raise an opaque argument-type error.
        raise ValueError("could not parse SMILES: %r" % (s,))
    # NOTE(review): newer RDKit releases appear to deprecate this function in
    # favour of rdFingerprintGenerator - confirm against the installed version.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        mol, fp_radius, nBits=fp_length)
    return fingerprint.ToBitString()


def read_csv(filename, input_name, target_name):
    """Read one input column and one integer target column from a CSV file.

    The first row of the file is used as the header (``csv.DictReader``).

    Args:
        filename: path of the CSV file to read.
        input_name: header of the column returned as-is (strings).
        target_name: header of the column whose values are parsed with ``int``.

    Returns:
        A tuple ``(inputs, targets)`` of NumPy arrays, in file order.

    Raises:
        KeyError: if a requested column is not present in a row.
        ValueError: if a target value cannot be converted to ``int``.
    """
    inputs = []   # raw values of the input column
    targets = []  # integer values of the target column
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields are
    # rewritten by the text layer before the parser sees them.
    with open(filename, newline='') as handle:
        for row in csv.DictReader(handle):
            inputs.append(row[input_name])
            targets.append(int(row[target_name]))
    return np.array(inputs), np.array(targets)


# load the SMILES strings (x) and their integer class labels (y) from the dataset
x, y = read_csv(filename='pdb_data_protein_mc.csv',
                input_name='smiles',
                target_name='class')

# featurize the SMILES as Morgan fingerprints: radius 2, 8192 bits (ECFP4)
X = smiles_to_fps(smiles=x, fp_radius=2, fp_length=8192)

In [3]:
# Models to benchmark, as (display name, estimator) pairs in evaluation order.
# random_state is fixed to 999 so repeated runs give reproducible results.
classifiers = dict([
    ("BernoulliNB", BernoulliNB()),
    ("ExtraTreesClassifier", ExtraTreesClassifier(random_state=999)),
    ("GaussianNB", GaussianNB()),
    ("KNeighborsClassifier", KNeighborsClassifier()),
    ("LinearSVC", LinearSVC(random_state=999)),
    ("LogisticRegression", LogisticRegression(random_state=999)),
    ("MLPClassifier", MLPClassifier(random_state=999)),
    ("NearestCentroid", NearestCentroid()),
    ("RandomForestClassifier", RandomForestClassifier(random_state=999)),
    ("RidgeClassifier", RidgeClassifier(random_state=999)),
])

# Result accumulators, filled with one entry per classifier:
#   predictions   - class predictions
#   confusion_m   - confusion matrices
#   scores_f1     - f1-scores for each class
#   scores_f1_avg - f1-scores with weighted average
predictions, confusion_m, scores_f1, scores_f1_avg = [], [], [], []

In [4]:
# compare all classifiers using 5-fold cross-validated predictions

# Start from empty accumulators so that re-running this cell replaces the
# stored results instead of appending a second copy of each of them.
predictions.clear()
confusion_m.clear()
scores_f1.clear()
scores_f1_avg.clear()

for name, clf in classifiers.items():
    preds = cross_val_predict(clf, X, y, cv=5, n_jobs=-1)

    # compute every metric once; the same values are printed and stored
    conf_mat = confusion_matrix(y, preds)
    f1_each = f1_score(y, preds, average=None)
    f1_weighted = f1_score(y, preds, average="weighted")

    # print results
    print("\n\n" + name + ":")
    print(conf_mat)
    print(f1_each)
    print(f1_weighted)

    predictions.append(preds)
    confusion_m.append(conf_mat)
    scores_f1.append(f1_each)
    scores_f1_avg.append(f1_weighted)



BernoulliNB:
[[3061   71  109  253  665  965]
 [  81   72   21   77   90  101]
 [  79    4  114   18   55   73]
 [  70   11   33  522  175   70]
 [ 276   45   85  110 1202  437]
 [ 586   59  207  374  602 3965]]
[0.65991161 0.20454545 0.25       0.46711409 0.48624595 0.69537005]
0.6137340480758626


ExtraTreesClassifier:
[[3895    9   14   47  211  948]
 [ 128  105    4   22   40  143]
 [ 101    2  130    3   19   88]
 [ 167    9    2  464   67  172]
 [ 529   14   12   39 1003  558]
 [ 937   22   16   96  204 4518]]
[0.71592684 0.34825871 0.49904031 0.59793814 0.54230873 0.73944354]
0.6766566392535242


GaussianNB:
[[3100   50   41  115  719 1099]
 [ 132   66    3   13   83  145]
 [ 147    3   42    7   56   88]
 [ 257   36    6  143  178  261]
 [ 574   47   21   65  881  567]
 [1398   44   37  100  820 3394]]
[0.57771152 0.19186047 0.1703854  0.21601208 0.36017989 0.59821979]
0.5112918913244991


KNeighborsClassifier:
[[3794   64   51   97  753  365]
 [ 159  111    2   25   79   66]