In [23]:
from glob import glob
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import useful_rdkit_utils as uru
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score, matthews_corrcoef, f1_score, precision_score, recall_score

 Read the training and test data

In [25]:
def read_input_data(dirname, prefix):
    """Load one cross-validation fold and attach fingerprint features.

    Reads ``{prefix}_train_NNN.csv``, ``{prefix}_val_NNN.csv`` and
    ``{prefix}_test_NNN.csv`` from ``dirname``, where ``NNN`` is the
    zero-padded fold number taken from the directory name
    (e.g. ``data/Sol003`` -> ``003``). The validation rows are appended to
    the training rows, rows without a ``Sol`` label are dropped from both
    frames, and an ``fp`` column is computed from ``SMILES`` with
    ``uru.smi2numpy_fp``.

    Parameters
    ----------
    dirname : str
        Fold directory whose name ends in ``Sol`` followed by the fold
        number. Forward- and back-slash separators are both accepted, so
        the result of ``glob`` can be passed on any platform.
    prefix : str
        Leading part of the CSV file names selecting the split type.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``; ``train`` contains the train and validation
        rows. The index of ``train`` is not reset after concatenation, so
        it may contain duplicate labels.

    Raises
    ------
    ValueError
        If the text after the last ``Sol`` in ``dirname`` is not an integer.
    """
    # Parse the fold number from whatever follows the final "Sol" so the
    # lookup does not depend on the path separator or the parent directory.
    idx = int(dirname.rstrip("/\\").rsplit("Sol", 1)[-1])

    # read the datafiles
    train = pd.read_csv(f"{dirname}/{prefix}_train_{idx:03d}.csv")
    val = pd.read_csv(f"{dirname}/{prefix}_val_{idx:03d}.csv")
    test = pd.read_csv(f"{dirname}/{prefix}_test_{idx:03d}.csv")

    # The validation split is folded into the training data.
    train = pd.concat([train, val])
    train = train.dropna(subset=["Sol"])
    test = test.dropna(subset=["Sol"])

    # Note: the first number is the combined train+val size after dropping
    # unlabeled rows; the validation count is the raw file length.
    print(len(train),len(val),len(test))

    train['fp'] = train.SMILES.apply(uru.smi2numpy_fp)
    test['fp'] = test.SMILES.apply(uru.smi2numpy_fp)
    return train, test


 Build the LightGBM model

In [26]:
def build_model(train, test, thresh=0.5):
    """Fit a LightGBM classifier on ``train`` and score it on ``test``.

    Both frames must provide an ``fp`` column of equal-length arrays (stacked
    into the feature matrix) and a ``Sol`` label column.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and evaluation data with ``fp`` and ``Sol`` columns.
    thresh : float, optional
        Probability cut-off applied to column 1 of ``predict_proba`` when
        computing precision and recall. Defaults to 0.5, the value that was
        previously hard-coded. MCC is computed from ``lgbm.predict`` and is
        not affected by this argument.

    Returns
    -------
    tuple
        ``(roc_auc, pr_auc, mcc, prec, recall, prob, pred)`` where ``prob``
        is the full ``predict_proba`` output and ``pred`` the output of
        ``predict`` on the test features.
    """
    lgbm = LGBMClassifier(verbose=-1)
    lgbm.fit(np.stack(train.fp),train.Sol)

    # Build the test matrix once and reuse it for both prediction calls.
    test_features = np.stack(test.fp)
    prob = lgbm.predict_proba(test_features)
    pred = lgbm.predict(test_features)

    # Column 1 is used as the score throughout.
    # NOTE(review): assumes binary labels with the second class as positive.
    pos_prob = prob[:,1]

    roc_auc = roc_auc_score(test.Sol, pos_prob)
    # Keep the curve arrays under their own names so they cannot be confused
    # with the scalar precision/recall scores computed below.
    curve_precision, curve_recall, _ = precision_recall_curve(test.Sol, pos_prob)
    pr_auc = auc(curve_recall, curve_precision)
    mcc = matthews_corrcoef(test.Sol, pred)

    thresholded = (pos_prob >= thresh).astype(int)
    prec = precision_score(test.Sol, thresholded)
    recall = recall_score(test.Sol, thresholded)
    return roc_auc, pr_auc, mcc, prec, recall, prob, pred



 Build models and store the data

In [27]:
# Per-molecule prediction frames (one per fold/split) and per-fold metric rows.
df_list = []
result_list = []
for dirname in sorted(glob("data/Sol0*")):
    # The recorded output shows glob returning backslash paths on the machine
    # this was run on; parse the fold number from whatever follows the final
    # "Sol" so the loop also works where the separator is "/".
    idx = int(dirname.rstrip("/\\").rsplit("Sol", 1)[-1])
    for prefix in ["random", "scaffold"]:
        train, test = read_input_data(dirname,prefix)
        roc_auc, pr_auc, mcc, prec, recall, prob, pred = build_model(train, test)
        # Annotate the test frame with predictions and bookkeeping columns.
        # Item assignment is used rather than attribute assignment so the
        # column write is explicit.
        test['Sol'] = test.Sol.astype(int)
        test['method'] = 'lightGBM'
        test['Sol_prob'] = prob[:,1]
        test['Sol_pred'] = pred.astype(int)
        test['cv_cycle'] = idx
        test['split'] = prefix
        df_list.append(test)
        print(f"{prefix} {dirname} {roc_auc:.2f} {pr_auc:.2f} {mcc: .2f} {prec:.2f} {recall:.2f}")
        # "ST" is stored as a constant in the task column for every row.
        result_list.append([prefix, dirname, "ST", roc_auc, pr_auc, mcc, prec, recall])



1955 217 218
random data\Sol000 0.81 0.85  0.50 0.71 0.98
1955 217 218
scaffold data\Sol000 0.74 0.80  0.25 0.63 0.91
1955 217 218
random data\Sol001 0.79 0.81  0.48 0.72 0.90
1955 217 218
scaffold data\Sol001 0.80 0.85  0.39 0.71 0.86
1955 217 218
random data\Sol002 0.82 0.85  0.46 0.72 0.95
1955 217 218
scaffold data\Sol002 0.80 0.85  0.45 0.77 0.85
1955 217 218
random data\Sol003 0.85 0.88  0.54 0.80 0.85
1955 217 218
scaffold data\Sol003 0.77 0.82  0.37 0.62 0.97
1955 217 218
random data\Sol004 0.82 0.87  0.49 0.75 0.91
1955 217 218
scaffold data\Sol004 0.83 0.90  0.46 0.75 0.91
1955 217 218
random data\Sol005 0.83 0.87  0.52 0.77 0.89
1955 217 218
scaffold data\Sol005 0.74 0.81  0.22 0.66 0.99
1955 217 218
random data\Sol006 0.84 0.88  0.44 0.73 0.91
1955 217 218
scaffold data\Sol006 0.77 0.82  0.44 0.72 0.84
1955 217 218
random data\Sol007 0.80 0.86  0.40 0.69 0.96
1955 217 218
scaffold data\Sol007 0.77 0.78  0.37 0.71 0.81
1955 217 218
random data\Sol008 0.82 0.87  0.48 0.76 0.8

 Select the output columns and preview the last fold's predictions

In [28]:
# Columns kept in the exported predictions, in output order.
cols = [
    'cv_cycle', 'split', 'method',
    'SMILES', 'Name',
    'Sol', 'Sol_prob', 'Sol_pred',
]
# `test` here is whichever frame was assigned last by the model loop, so this
# previews a single fold/split rather than the combined results.
test.loc[:, cols]


Unnamed: 0,cv_cycle,split,method,SMILES,Name,Sol,Sol_prob,Sol_pred
0,9,scaffold,lightGBM,CC(C)Oc1ccc(C(=O)Nc2cccc(-c3nc4cnccc4o3)c2)cc1,Mol3349,0,0.080745,0
1,9,scaffold,lightGBM,Brc1cc2c(cc1Cn1cncn1)OCCCO2,Mol1176,1,0.711642,1
2,9,scaffold,lightGBM,Cc1ccc2c(c1)C(=O)CC1(CCN(C(=O)c3ccc4nc(C)cc(O)...,Mol1526,0,0.917070,1
3,9,scaffold,lightGBM,O=C(Nc1ccccc1)N1CCCc2cnc(N3CCOCC3)nc21,Mol1620,0,0.160344,0
4,9,scaffold,lightGBM,CC(C)c1ccccc1OC(=O)NCCc1ccc2ccccc2c1,Mol3201,0,0.331978,0
...,...,...,...,...,...,...,...,...
213,9,scaffold,lightGBM,COc1cc(Nc2ncc3c(n2)-c2ccc(Cl)cc2C(c2c(F)cccc2O...,Mol1155,1,0.439188,0
214,9,scaffold,lightGBM,CCCCC1=NC2(CCCC2)C(=O)N1Cc1ccc(-c2ccccc2-c2nn[...,Mol340,0,0.094580,0
215,9,scaffold,lightGBM,N#Cc1ccc2nc([C@H]3CCN(C4CCC4)C3)[nH]c2c1,Mol1451,1,0.658023,1
216,9,scaffold,lightGBM,N#CC1(NC(=O)CN2CCOCC2)CCCCCC1,Mol1211,1,0.981131,1


 Write the individual predictions to disk

In [29]:
# Stack every fold/split frame and write the selected columns to one CSV.
# NOTE(review): the file name spells "classifciation"; left as-is because
# other code may read the file under this exact name.
(
    pd.concat(df_list)
    .loc[:, cols]
    .to_csv("lightgbm_classifciation_results.csv", index=False)
)


 Write the summary statistics to disk

In [30]:
# One row per (split, fold) as appended to result_list by the model loop.
result_df = pd.DataFrame(
    data=result_list,
    columns=[
        "split", "dataset", "task",
        "roc_auc", "pr_auc", "mcc",
        "prec", "recall",
    ],
)


In [31]:
# Display the summary table of per-fold metrics.
result_df


Unnamed: 0,split,dataset,task,roc_auc,pr_auc,mcc,prec,recall
0,random,data\Sol000,ST,0.814931,0.853788,0.500984,0.710227,0.976562
1,scaffold,data\Sol000,ST,0.742007,0.796034,0.247981,0.630058,0.908333
2,random,data\Sol001,ST,0.792101,0.811002,0.480479,0.72327,0.898438
3,scaffold,data\Sol001,ST,0.802254,0.84979,0.393198,0.714286,0.860656
4,random,data\Sol002,ST,0.819965,0.853872,0.459807,0.724551,0.945312
5,scaffold,data\Sol002,ST,0.800825,0.848783,0.446936,0.767123,0.854962
6,random,data\Sol003,ST,0.847135,0.878549,0.541356,0.79562,0.851562
7,scaffold,data\Sol003,ST,0.768955,0.822975,0.369292,0.621053,0.967213
8,random,data\Sol004,ST,0.82283,0.865803,0.49083,0.745223,0.914062
9,scaffold,data\Sol004,ST,0.825502,0.89516,0.4564,0.751515,0.911765


In [32]:
# Persist the summary metrics table without the row index.
result_df.to_csv(path_or_buf="lgbm_result.csv", index=False)
