In [22]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier,LGBMRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn.metrics import roc_curve,roc_auc_score, confusion_matrix, precision_recall_curve, auc, mean_squared_error, \
    r2_score, mean_absolute_error,cohen_kappa_score,accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score
import pickle

def Standardize(col):
    """Rescale ``col`` to z-scores.

    The mean of ``col`` (``np.mean``) is subtracted and the result is divided
    by its standard deviation (``np.std``). No guard is applied when the
    standard deviation is zero.
    """
    centre = np.mean(col)
    spread = np.std(col)
    centred = col - centre
    return centred / spread

def GetPreTable(trueDate,modelName,repetitions,path,DateDesc):
    """Collect per-repetition positive-class probabilities and their mean.

    Parameters
    ----------
    trueDate : pandas.DataFrame
        Must provide the columns ``'SMILES'`` and ``'label'``.
    modelName : str
        Model tag; used to locate ``output/output_<modelName>_cols.csv`` and
        the pickles ``<path>/<modelName>_<k>.pkl``.
    repetitions : int
        Number of pickled models to load (k = 1 .. repetitions).
    path : str
        Directory that holds the pickled models.
    DateDesc : pandas.DataFrame
        Descriptor table; only the columns listed in the cols CSV are passed
        to the models.

    Returns
    -------
    pandas.DataFrame
        Columns ``smiles``, ``Test``, ``label_model_1`` ..
        ``label_model_<repetitions>`` and ``label`` (row-wise mean of the
        per-model probability columns).
    """
    # The feature list is stored as a one-column CSV whose header is '0'.
    cols_ = pd.read_csv('output/output_'+modelName+'_cols.csv')['0'].tolist()
    # Descriptors are used as provided: no standardisation or NaN filling here.
    sub_predate = DateDesc[cols_]

    # Identifier and ground-truth columns do not depend on the model, so they
    # are written once instead of on every loop iteration.
    predict_data = pd.DataFrame()
    predict_data['smiles'] = trueDate['SMILES'].to_list()
    predict_data['Test'] = trueDate['label'].tolist()

    model_cols = []
    for i in range(repetitions):
        modelpath = path+'/{}_{}.pkl'.format(modelName,i+1)
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # model files that come from a trusted source.
        # The context manager closes the file handle after loading.
        with open(modelpath, "rb") as model_file:
            model = pickle.load(model_file)
        col_name = 'label_model_'+str(i+1)
        # Column 1 of predict_proba is kept as the positive-class probability.
        predict_data[col_name] = model.predict_proba(sub_predate)[:, 1]
        model_cols.append(col_name)

    # Average over the per-model columns by name rather than by position.
    predict_data['label'] = predict_data[model_cols].mean(axis=1)
    return predict_data

def Statistical(y_true, y_pred, y_pro):
    """Compute a fixed set of binary-classification metrics.

    Parameters
    ----------
    y_true :
        Ground-truth labels.
    y_pred :
        Hard class predictions; used for acc, recall, precision, f1, kappa
        and mcc.
    y_pro :
        Scores for the positive class (label 1); used for auc_roc and auc_prc.

    Returns
    -------
    collections.OrderedDict
        Keys, in order: ``acc``, ``auc_roc``, ``recall``, ``precision``,
        ``f1``, ``kappa``, ``mcc``, ``auc_prc``.
    """
    import collections

    # precision_recall_curve returns (precision, recall, thresholds). Compute
    # it once and integrate precision over recall for the PR-AUC.
    precision_pts, recall_pts, _ = precision_recall_curve(y_true, y_pro, pos_label=1)
    auc_prc = auc(recall_pts, precision_pts)
    auc_roc = roc_auc_score(y_true, y_pro)

    # Threshold-dependent metrics come straight from scikit-learn; the former
    # hand-rolled confusion-matrix arithmetic was never part of the result.
    acc = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    return collections.OrderedDict([
        ('acc', acc),
        ('auc_roc', auc_roc),
        ('recall', recall),
        ('precision', precision),
        ('f1', f1),
        ('kappa', kappa),
        ('mcc', mcc),
        ('auc_prc', auc_prc),
    ])

def GetPreMetricTabel(repetitions,preDate,tureData,name,result_path,threshold=0.5):
    """Score every repetition and the averaged prediction, and save both tables.

    Parameters
    ----------
    repetitions : int
        Number of per-model probability columns to evaluate.
    preDate : pandas.DataFrame
        Must provide ``label_model_1`` .. ``label_model_<repetitions>`` and
        ``label`` (the averaged probability).
    tureData : pandas.DataFrame
        Must provide the columns ``'SMILES'`` and ``'label'``.
    name : str
        Prefix for the two output file names.
    result_path : str
        Prepended verbatim to the file names, so it has to end with a path
        separator when it denotes a directory.
    threshold : float, optional
        A probability strictly greater than this value is mapped to class 1,
        otherwise to class 0. Defaults to 0.5.

    Returns
    -------
    pandas.DataFrame
        One row per repetition (index 0 .. repetitions-1) followed by the rows
        ``'Mean'``, ``'Std'`` and ``'avg'`` (metrics of the averaged
        prediction).

    Side effects
    ------------
    Writes ``<result_path><name>_Cal_Metric_Data_bak.csv`` (binarised
    predictions) and ``<result_path><name>_Cal_Metric_Summarize.csv`` (the
    returned table, without its index).
    """
    smiles = tureData['SMILES'].to_list()
    y_true = tureData['label'].to_list()
    predict_data = pd.DataFrame()
    predict_data['smiles'] = smiles
    predict_data['Test'] = y_true

    # Metrics of the ensemble (mean probability across repetitions).
    y_pro_avg = preDate['label']
    y_pred_avg = [1 if p > threshold else 0 for p in y_pro_avg]
    avg_scores_dict = Statistical(y_true, y_pred_avg, y_pro_avg)
    avg_scores_dict = pd.DataFrame(avg_scores_dict,index=['avg'])

    # Metrics of each individual repetition.
    pre_scores = []
    for i in range(repetitions):
        col_name = 'label_model_'+str(i+1)
        y_pro = preDate[col_name]
        y_pred = [1 if p > threshold else 0 for p in y_pro]
        predict_data[col_name] = y_pred
        pre_scores.append(Statistical(y_true, y_pred, y_pro))

    predict_data['label'] =  y_pred_avg
    predict_data.to_csv(result_path+name+'_Cal_Metric_Data_bak.csv',index = False)

    data_df = pd.DataFrame(pre_scores)
    # Compute both summary rows from the per-repetition rows only. Appending
    # 'Mean' first and then calling std() would include the mean row in the
    # standard deviation and bias it.
    rep_mean = data_df.mean()
    rep_std = data_df.std()
    data_df.loc['Mean'] = rep_mean
    data_df.loc['Std'] = rep_std
    data_df = pd.concat([data_df, avg_scores_dict], axis=0)
    # The file is written without the index, so its rows are, in order: one
    # per repetition, then Mean, Std and avg.
    data_df.to_csv(result_path+name+'_Cal_Metric_Summarize.csv',index = False)
    return data_df

In [23]:
# Read the descriptor table once and split it by column position: the first
# three columns become trueDate (GetPreTable / GetPreMetricTabel read its
# 'SMILES' and 'label' columns), the remaining columns become DateDesc.
_normalized_table = pd.read_csv('1_AR_Alva_32_slim_Normalize.csv')
trueDate = _normalized_table.iloc[:,:3]
DateDesc = _normalized_table.iloc[:,3:]
path = 'model'          # directory holding the pickled models
repetitions = 10        # number of pickled models per model name
result_path = 'preresult/'  # prefix for result files; keep the trailing slash

In [13]:
# Evaluate the 'LGB' models: collect probabilities, then score and save them.
modelName = 'LGB'
predictTable = GetPreTable(
    trueDate=trueDate,
    modelName=modelName,
    repetitions=repetitions,
    path=path,
    DateDesc=DateDesc,
)
metricTable = GetPreMetricTabel(
    repetitions=repetitions,
    preDate=predictTable,
    tureData=trueDate,
    name=modelName,
    result_path=result_path,
)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.75,0.823529,0.6,0.818182,0.692308,0.49004,0.506791,0.795409
1,0.625,0.67451,0.4,0.666667,0.5,0.228916,0.248096,0.612256
2,0.59375,0.631373,0.466667,0.583333,0.518519,0.174603,0.177859,0.629644
3,0.71875,0.796078,0.6,0.75,0.666667,0.428571,0.436564,0.774498
4,0.75,0.756863,0.6,0.818182,0.692308,0.49004,0.506791,0.692026
5,0.75,0.784314,0.6,0.818182,0.692308,0.49004,0.506791,0.81883
6,0.84375,0.803922,0.733333,0.916667,0.814815,0.68254,0.695269,0.716815
7,0.6875,0.788235,0.4,0.857143,0.545455,0.352227,0.411842,0.689734
8,0.8125,0.776471,0.733333,0.846154,0.785714,0.620553,0.625577,0.708603
9,0.625,0.631373,0.533333,0.615385,0.571429,0.241107,0.243059,0.525621


In [14]:
# Evaluate the 'xgb' models: collect probabilities, then score and save them.
modelName = 'xgb'
predictTable = GetPreTable(
    trueDate=trueDate,
    modelName=modelName,
    repetitions=repetitions,
    path=path,
    DateDesc=DateDesc,
)
metricTable = GetPreMetricTabel(
    repetitions=repetitions,
    preDate=predictTable,
    tureData=trueDate,
    name=modelName,
    result_path=result_path,
)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.65625,0.686275,0.466667,0.7,0.56,0.296,0.312428,0.676009
1,0.625,0.678431,0.533333,0.615385,0.571429,0.241107,0.243059,0.683586
2,0.78125,0.8,0.6,0.9,0.72,0.552,0.582636,0.790844
3,0.75,0.737255,0.6,0.818182,0.692308,0.49004,0.506791,0.712664
4,0.71875,0.717647,0.533333,0.8,0.64,0.424,0.447532,0.758589
5,0.6875,0.713725,0.6,0.692308,0.642857,0.367589,0.370565,0.687782
6,0.75,0.776471,0.666667,0.769231,0.714286,0.494071,0.498071,0.797247
7,0.75,0.72549,0.666667,0.769231,0.714286,0.494071,0.498071,0.659322
8,0.625,0.65098,0.466667,0.636364,0.538462,0.23506,0.243095,0.650576
9,0.5625,0.627451,0.466667,0.538462,0.5,0.114625,0.115553,0.621652


In [27]:
# Evaluate the 'cat' models: collect probabilities, then score and save them.
modelName = 'cat'
predictTable = GetPreTable(
    trueDate=trueDate,
    modelName=modelName,
    repetitions=repetitions,
    path=path,
    DateDesc=DateDesc,
)
metricTable = GetPreMetricTabel(
    repetitions=repetitions,
    preDate=predictTable,
    tureData=trueDate,
    name=modelName,
    result_path=result_path,
)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.75,0.839216,0.666667,0.769231,0.714286,0.494071,0.498071,0.776474
1,0.8125,0.803922,0.733333,0.846154,0.785714,0.620553,0.625577,0.784572
2,0.71875,0.803922,0.466667,0.875,0.608696,0.419355,0.470016,0.751922
3,0.84375,0.803922,0.733333,0.916667,0.814815,0.68254,0.695269,0.775647
4,0.78125,0.733333,0.666667,0.833333,0.740741,0.555556,0.565916,0.720276
5,0.75,0.803922,0.6,0.818182,0.692308,0.49004,0.506791,0.708883
6,0.71875,0.827451,0.6,0.75,0.666667,0.428571,0.436564,0.856929
7,0.78125,0.784314,0.8,0.75,0.774194,0.5625,0.563602,0.737089
8,0.8125,0.788235,0.733333,0.846154,0.785714,0.620553,0.625577,0.817755
9,0.78125,0.760784,0.8,0.75,0.774194,0.5625,0.563602,0.708492


In [26]:
# Evaluate the 'ext' models: collect probabilities, then score and save them.
modelName = 'ext'
predictTable = GetPreTable(
    trueDate=trueDate,
    modelName=modelName,
    repetitions=repetitions,
    path=path,
    DateDesc=DateDesc,
)
metricTable = GetPreMetricTabel(
    repetitions=repetitions,
    preDate=predictTable,
    tureData=trueDate,
    name=modelName,
    result_path=result_path,
)
metricTable

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.5,0.447059,0.0,0.0,0.0,-0.062241,-0.16871,0.410043
1,0.53125,0.447059,0.0,0.0,0.0,0.0,0.0,0.40747
2,0.53125,0.486275,0.0,0.0,0.0,0.0,0.0,0.431103
3,0.53125,0.478431,0.0,0.0,0.0,0.0,0.0,0.42673
4,0.53125,0.490196,0.0,0.0,0.0,0.0,0.0,0.429488
5,0.53125,0.482353,0.0,0.0,0.0,0.0,0.0,0.426074
6,0.5,0.490196,0.0,0.0,0.0,-0.062241,-0.16871,0.436309
7,0.53125,0.482353,0.0,0.0,0.0,0.0,0.0,0.433953
8,0.53125,0.517647,0.0,0.0,0.0,0.0,0.0,0.463741
9,0.53125,0.466667,0.0,0.0,0.0,0.0,0.0,0.425073


In [17]:
# Evaluate the 'gbc' models: collect probabilities, then score and save them.
modelName = 'gbc'
predictTable = GetPreTable(
    trueDate=trueDate,
    modelName=modelName,
    repetitions=repetitions,
    path=path,
    DateDesc=DateDesc,
)
metricTable = GetPreMetricTabel(
    repetitions=repetitions,
    preDate=predictTable,
    tureData=trueDate,
    name=modelName,
    result_path=result_path,
)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.78125,0.768627,0.666667,0.833333,0.740741,0.555556,0.565916,0.774054
1,0.71875,0.760784,0.466667,0.875,0.608696,0.419355,0.470016,0.738915
2,0.71875,0.768627,0.466667,0.875,0.608696,0.419355,0.470016,0.752492
3,0.71875,0.796078,0.466667,0.875,0.608696,0.419355,0.470016,0.767312
4,0.78125,0.737255,0.6,0.9,0.72,0.552,0.582636,0.691234
5,0.71875,0.772549,0.466667,0.875,0.608696,0.419355,0.470016,0.75309
6,0.75,0.831373,0.533333,0.888889,0.666667,0.485944,0.52666,0.83687
7,0.84375,0.784314,0.733333,0.916667,0.814815,0.68254,0.695269,0.762084
8,0.65625,0.741176,0.333333,0.833333,0.47619,0.284553,0.350966,0.728358
9,0.8125,0.788235,0.666667,0.909091,0.769231,0.61753,0.638639,0.76106


In [18]:
# Evaluate the 'mlp' models: collect probabilities, then score and save them.
modelName = 'mlp'
predictTable = GetPreTable(
    trueDate=trueDate,
    modelName=modelName,
    repetitions=repetitions,
    path=path,
    DateDesc=DateDesc,
)
metricTable = GetPreMetricTabel(
    repetitions=repetitions,
    preDate=predictTable,
    tureData=trueDate,
    name=modelName,
    result_path=result_path,
)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.5625,0.682353,0.6,0.529412,0.5625,0.128405,0.129412,0.650353
1,0.6875,0.784314,0.733333,0.647059,0.6875,0.377432,0.380392,0.742558
2,0.78125,0.823529,0.8,0.75,0.774194,0.5625,0.563602,0.6818
3,0.71875,0.741176,0.733333,0.6875,0.709677,0.4375,0.438357,0.611556
4,0.6875,0.737255,0.733333,0.647059,0.6875,0.377432,0.380392,0.706212
5,0.75,0.788235,0.733333,0.733333,0.733333,0.498039,0.498039,0.72559
6,0.65625,0.843137,0.866667,0.590909,0.702703,0.328244,0.363092,0.845197
7,0.6875,0.784314,0.533333,0.727273,0.615385,0.36255,0.374943,0.751895
8,0.71875,0.780392,0.733333,0.6875,0.709677,0.4375,0.438357,0.718108
9,0.71875,0.752941,0.8,0.666667,0.727273,0.44186,0.449712,0.712747


In [19]:
# Evaluate the 'rf' models: collect probabilities, then score and save them.
modelName = 'rf'
predictTable = GetPreTable(
    trueDate=trueDate,
    modelName=modelName,
    repetitions=repetitions,
    path=path,
    DateDesc=DateDesc,
)
metricTable = GetPreMetricTabel(
    repetitions=repetitions,
    preDate=predictTable,
    tureData=trueDate,
    name=modelName,
    result_path=result_path,
)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.625,0.678431,0.266667,0.8,0.4,0.216327,0.285653,0.745139
1,0.625,0.72549,0.266667,0.8,0.4,0.216327,0.285653,0.690207
2,0.625,0.619608,0.266667,0.8,0.4,0.216327,0.285653,0.664161
3,0.65625,0.768627,0.333333,0.833333,0.47619,0.284553,0.350966,0.738926
4,0.71875,0.756863,0.466667,0.875,0.608696,0.419355,0.470016,0.73025
5,0.625,0.654902,0.266667,0.8,0.4,0.216327,0.285653,0.694331
6,0.6875,0.690196,0.466667,0.777778,0.583333,0.35743,0.387378,0.722729
7,0.6875,0.796078,0.4,0.857143,0.545455,0.352227,0.411842,0.791024
8,0.6875,0.764706,0.4,0.857143,0.545455,0.352227,0.411842,0.732718
9,0.65625,0.733333,0.333333,0.833333,0.47619,0.284553,0.350966,0.733022


In [20]:
# Evaluate the 'svm' models: collect probabilities, then score and save them.
modelName = 'svm'
predictTable = GetPreTable(
    trueDate=trueDate,
    modelName=modelName,
    repetitions=repetitions,
    path=path,
    DateDesc=DateDesc,
)
metricTable = GetPreMetricTabel(
    repetitions=repetitions,
    preDate=predictTable,
    tureData=trueDate,
    name=modelName,
    result_path=result_path,
)
metricTable

Unnamed: 0,acc,auc_roc,recall,precision,f1,kappa,mcc,auc_prc
0,0.6875,0.827451,0.866667,0.619048,0.722222,0.386973,0.416146,0.82575
1,0.78125,0.74902,0.8,0.75,0.774194,0.5625,0.563602,0.788576
2,0.78125,0.764706,0.8,0.75,0.774194,0.5625,0.563602,0.797882
3,0.75,0.784314,0.8,0.705882,0.75,0.501946,0.505882,0.815928
4,0.75,0.784314,0.8,0.705882,0.75,0.501946,0.505882,0.787302
5,0.78125,0.788235,0.866667,0.722222,0.787879,0.565891,0.575947,0.80494
6,0.84375,0.905882,0.866667,0.8125,0.83871,0.6875,0.688847,0.928819
7,0.75,0.784314,0.8,0.705882,0.75,0.501946,0.505882,0.808073
8,0.8125,0.815686,0.8,0.8,0.8,0.623529,0.623529,0.855149
9,0.75,0.74902,0.8,0.705882,0.75,0.501946,0.505882,0.737126
