In [1]:
import numpy as np
import rdkit.Chem as Chem
from rdkit.Chem import AllChem
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
def smiles_to_fps(smiles, fp_radius, fp_length):
    """Convert a sequence of SMILES strings to a 2-D array of Morgan fingerprint bits.

    Args:
        smiles: iterable of SMILES strings.
        fp_radius: fingerprint radius, forwarded to ``smile_to_fp``.
        fp_length: number of bits per fingerprint, forwarded to ``smile_to_fp``.

    Returns:
        Integer array of shape ``(number of SMILES, fp_length)`` holding 0/1
        values, one row per input SMILES. Empty input yields an array of shape
        ``(0, fp_length)`` so the feature axis is always present.
    """
    bit_strings = [smile_to_fp(s, fp_radius, fp_length) for s in smiles]
    if not bit_strings:
        # keep the result 2-D so downstream code can rely on the feature axis
        return np.zeros((0, fp_length), dtype=int)
    # decode every ASCII '0'/'1' character in a single vectorized pass instead of
    # building an intermediate unicode array and parsing it character by character
    raw = np.frombuffer("".join(bit_strings).encode("ascii"), dtype=np.uint8)
    return (raw - ord("0")).reshape(len(bit_strings), fp_length).astype(int)


def smile_to_fp(s, fp_radius, fp_length):
    """Convert one SMILES string to a Morgan fingerprint bit string.

    Args:
        s: SMILES string.
        fp_radius: radius passed to the Morgan fingerprint.
        fp_length: number of fingerprint bits (passed as ``nBits``).

    Returns:
        The fingerprint rendered by ``ToBitString()``.

    Raises:
        ValueError: if ``Chem.MolFromSmiles`` returns None for ``s``,
            i.e. the SMILES could not be turned into a molecule.
    """
    mol = Chem.MolFromSmiles(s)  # get molecule object
    if mol is None:
        # fail with the offending input instead of an opaque error from the
        # fingerprint call receiving None
        raise ValueError('could not parse SMILES: {!r}'.format(s))
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, fp_radius, nBits=fp_length)
    return fingerprint.ToBitString()  # bit string form, consumed by smiles_to_fps


def read_csv(filename, input_name, target_name):
    """Read one input column and one integer target column from a CSV file.

    Args:
        filename: path of a CSV file whose first row holds the column names.
        input_name: header of the column whose values are returned unchanged.
        target_name: header of the column whose values are parsed with ``int``.

    Returns:
        Tuple ``(x, y)`` of NumPy arrays: ``x`` holds the input column values
        (SMILES in this notebook) and ``y`` the integer targets (enzyme class),
        in file order.

    Raises:
        KeyError: if a requested column is missing from the header.
        ValueError: if a target value is not a valid integer literal.
    """
    x = []  # SMILES
    y = []  # enzyme class
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise quoted fields with embedded line
    # endings are altered on read
    with open(filename, newline='') as file:
        for row in csv.DictReader(file):
            x.append(row[input_name])
            y.append(int(row[target_name]))
    # explicit dtype keeps the targets integral even when there are no data rows
    return np.array(x), np.array(y, dtype=int)


# load the SMILES strings and their class labels from the dataset file
DATA_FILE = 'pdb_data_protein_mc.csv'
x, y = read_csv(DATA_FILE, 'smiles', 'class')

# featurize as Morgan fingerprints: radius 2 (ECFP4), 8192 bits per molecule
FP_RADIUS, FP_LENGTH = 2, 8192
X = smiles_to_fps(x, FP_RADIUS, FP_LENGTH)

In [3]:
# every run uses class_weight='balanced' because the classes are imbalanced;
# only the iteration budget (max_iter) changes between runs
args = [{'class_weight': 'balanced', 'max_iter': n_iter}
        for n_iter in (100, 500, 1000, 3000, 5000)]

In [4]:
# score every configuration: out-of-fold predictions from 5-fold
# cross-validation, summarized as a weighted F1 over the whole dataset
scores_f1 = []
for arg in args:
    # classifier for this configuration
    clf = LogisticRegression(n_jobs=-1, random_state=999, **arg)
    # one out-of-fold prediction per sample
    preds = cross_val_predict(clf, X, y, cv=5, n_jobs=-1)
    score = f1_score(y, preds, average="weighted")
    scores_f1.append(score)
    # report the score of this configuration
    print(score)

0.681259653969988
0.681259653969988
0.681259653969988
0.681259653969988
0.681259653969988


In [5]:
# sweep over (penalty, solver) pairs; class weighting and the iteration
# budget are held fixed
args = [
    {'penalty': penalty, 'solver': solver, 'class_weight': 'balanced', 'max_iter': 500}
    for penalty, solver in (
        ('l1', 'liblinear'),
        ('l1', 'saga'),
        ('l2', 'lbfgs'),
        ('l2', 'liblinear'),
        ('l2', 'newton-cg'),
        ('l2', 'sag'),
        ('l2', 'saga'),
    )
]

In [6]:
# weighted F1 of the out-of-fold predictions for each penalty/solver pair
scores_f1 = []
for arg in args:
    clf = LogisticRegression(n_jobs=-1, random_state=999, **arg)
    preds = cross_val_predict(clf, X, y, cv=5, n_jobs=-1)
    score = f1_score(y, preds, average="weighted")
    scores_f1.append(score)
    print(score)

0.664596861637438
0.6647251207279857
0.6784906371749834
0.681259653969988
0.6784908194847554
0.6783947774953267
0.67846805855283


In [7]:
# experimenting with the multi-class strategy; all other settings are fixed
args = [
    {'penalty': 'l2', 'solver': 'saga', 'class_weight': 'balanced',
     'max_iter': 500, 'multi_class': strategy}
    for strategy in ('ovr', 'multinomial')
]

In [8]:
# weighted F1 of the out-of-fold predictions for each multi-class strategy
scores_f1 = []
for arg in args:
    # the saga solver shuffles the data, so fix the seed to make the run
    # reproducible; 999 is the seed used by the earlier experiments
    clf = LogisticRegression(**arg, random_state=999)
    preds = cross_val_predict(clf, X, y, cv=5, n_jobs=-1)
    scores_f1.append(f1_score(y, preds, average="weighted"))
    print(scores_f1[-1])

0.67846805855283
0.6691240801808801
