In [None]:
import pandas as pd
import numpy as np
import sklearn
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score,mean_squared_error,accuracy_score,auc,roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier
from sklearn import svm
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the "training" and "test" sheets of the label workbook and of the
# molecular-descriptor workbook.
train_label_df, test_label_df = (
    pd.read_excel("../data/ADMET.xlsx", sheet_name=sheet)
    for sheet in ("training", "test")
)
train_data_df, test_data_df = (
    pd.read_excel("../data/Molecular_Descriptor.xlsx", sheet_name=sheet)
    for sheet in ("training", "test")
)

In [None]:
# Show every row when a frame is displayed.
pd.set_option("display.max_rows", None)
# Render floats with two fixed decimals instead of scientific notation,
# which is easier to read at a glance.
pd.set_option("display.float_format", "{:.2f}".format)

In [None]:
# Find the training columns (features) whose value is 0 for every row --
# they carry no information.
all_zero_mask = train_data_df.eq(0).all(axis=0)
df1 = train_data_df.loc[:, all_zero_mask]
zero_features = df1.columns.tolist()
print(f"全部为0的列有{len(zero_features)}列")
# Columns to leave out of the feature matrix: the SMILES column plus the
# all-zero training columns.
exclude_col = ['SMILES'] + zero_features

In [None]:
## Prepare the training and test feature matrices
# Set lookup keeps the column filter linear in the number of columns.
excluded_cols = set(exclude_col)
all_cols = [f for f in train_data_df.columns if f not in excluded_cols]
# Take explicit copies: these frames are modified column by column in later
# cells, and writing into a selection of the raw frame is ambiguous (pandas
# flags it with SettingWithCopyWarning). The raw frames stay untouched.
x_train = train_data_df[all_cols].copy()
x_test = test_data_df[all_cols].copy()

In [None]:
# Min-max normalise the training features before computing similarities, so
# that all features share a common scale.
_TRAIN_NORM_EPS = 1e-6  # same guard the test-set scaler uses; avoids 0/0 on constant columns

# Start from the raw descriptors every time so that re-running this cell
# gives the same result, and compute each column's range once instead of
# once per element.
_train_raw = train_data_df[list(x_train.columns)]
_train_col_min = _train_raw.min()
_train_col_max = _train_raw.max()


def kkk(x,col):
    """Min-max scale ``x`` using the training-set range of column ``col``.

    Parameters
    ----------
    x : scalar or array-like
        Raw value(s) of feature ``col``.
    col : column label
        Feature whose training minimum / maximum define the scaling.

    Returns
    -------
    ``(x - min) / (max - min + 1e-6)``. The small epsilon keeps the result
    finite when the column is constant in the training set.
    """
    lo = _train_col_min[col]
    hi = _train_col_max[col]
    return (x - lo) / (hi - lo + _TRAIN_NORM_EPS)


# Vectorised equivalent of applying kkk to every element of every column.
x_train = (_train_raw - _train_col_min) / (_train_col_max - _train_col_min + _TRAIN_NORM_EPS)
x_train.head()

In [None]:
# Scale the test features with the TRAINING range. Using the test set's own
# min/max would put train and test on different scales, which distorts both
# the train/test cosine similarity and the predictions of models fitted on
# the training scale. Test values may therefore fall outside [0, 1].
_test_cols = list(x_test.columns)
_ref_min = train_data_df[_test_cols].min()
_ref_max = train_data_df[_test_cols].max()


def jjj(x,col):
    """Min-max scale ``x`` using the training-set range of column ``col``.

    Parameters
    ----------
    x : scalar or array-like
        Raw value(s) of feature ``col``.
    col : column label
        Feature whose training minimum / maximum define the scaling.

    Returns
    -------
    ``(x - min) / (max - min + 1e-6)``; the epsilon avoids division by zero
    for columns that are constant in the training set.
    """
    lo = _ref_min[col]
    hi = _ref_max[col]
    return (x - lo) / (hi - lo + 1e-6)


# Recompute from the raw test descriptors so the cell is idempotent; this is
# the vectorised form of applying jjj to every element.
x_test = (test_data_df[_test_cols] - _ref_min) / (_ref_max - _ref_min + 1e-6)
x_test.head()

In [None]:
# Cosine similarity between two vectors.
def caculateCos(vec1,vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    Parameters
    ----------
    vec1, vec2 : 1-D array-like of equal length.

    Returns
    -------
    ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has zero
    norm the similarity is undefined; ``0.0`` is returned instead of NaN so
    that downstream ``argsort`` / ``argmax`` ranking stays well-defined.
    """
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        return 0.0
    return np.dot(vec1, vec2) / denom

In [None]:
# Proposed KN validation-set selection: for every test sample take the
# top_k training samples with the highest cosine similarity; the union of
# those training positions becomes the validation set.
all_index = []
top_k = 5

# Convert to arrays once; building a row Series with .iloc for every
# (test, train) pair inside the nested loop is needlessly slow.
train_matrix = x_train.to_numpy()
test_matrix = x_test.to_numpy()

for test_vec in test_matrix:
    cosdis = np.array([caculateCos(test_vec, train_vec) for train_vec in train_matrix])
    # Positions of the top_k largest similarities, most similar first.
    top_k_idx = cosdis.argsort()[::-1][:top_k]
    all_index += list(top_k_idx)

# Duplicates collapse: one training sample may be a neighbour of several
# test samples.
out_index = set(all_index)
print(len(out_index))
print(out_index)

In [None]:
# Positions held out for validation; keep a set for O(1) membership tests
# and a list for positional indexing with .iloc.
held_out = set(out_index)
out_index = list(out_index)
# Every remaining training position goes to the training split.
train_idx = [pos for pos in range(len(x_train)) if pos not in held_out]
len(train_idx)

In [None]:
# Split the normalised training features by position: rows selected as
# neighbours of the test set form the validation split, the rest are used
# for fitting.
val_data = x_train.iloc[out_index]
train_data = x_train.iloc[train_idx]
val_data

In [None]:
# One label series per target column of the training label sheet.
y_A, y_D, y_M, y_E, y_T = (
    train_label_df[endpoint]
    for endpoint in ("Caco-2", "CYP3A4", "hERG", "HOB", "MN")
)

In [None]:
# Split every label series with the same positions used for the features,
# so labels stay aligned with train_data / val_data.

# Caco-2
val_y_A = y_A.iloc[out_index]
train_y_A = y_A.iloc[train_idx]

# CYP3A4
val_y_D = y_D.iloc[out_index]
train_y_D = y_D.iloc[train_idx]

# hERG
val_y_M = y_M.iloc[out_index]
train_y_M = y_M.iloc[train_idx]

# HOB
val_y_E = y_E.iloc[out_index]
train_y_E = y_E.iloc[train_idx]

# MN -- this split was previously a duplicate of the HOB one, which left
# val_y_T / train_y_T undefined for the MN model.
val_y_T = y_T.iloc[out_index]
train_y_T = y_T.iloc[train_idx]

In [None]:
def generateClsOut(model,train_y, val_y):
    """Fit ``model``, print validation metrics and predict the test set.

    Uses the notebook-level frames ``train_data``, ``val_data`` and
    ``x_test`` as feature matrices.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    train_y : labels aligned with ``train_data``.
    val_y : labels aligned with ``val_data``.

    Returns
    -------
    (test_pred, val_score) : the predicted labels for ``x_test`` and the
    ``predict_proba`` output for ``val_data``.
    """
    model.fit(train_data, train_y)

    # Hard predictions for both splits, then class probabilities on validation.
    val_labels = model.predict(val_data)
    test_labels = model.predict(x_test)
    val_proba = model.predict_proba(val_data)

    # Validation metrics, all computed from the hard predictions.
    acc = accuracy_score(val_y, val_labels)
    recall = recall_score(val_y, val_labels)
    f1 = f1_score(val_y, val_labels)
    precision = precision_score(val_y, val_labels)
    spec = calculate_Spec(val_y, val_labels)

    report = f"acc:{acc},recall:{recall},F1:{f1},Precision:{precision}SPec{spec}"
    print(report)
    return test_labels, val_proba

In [None]:
# Helpers for computing specificity.
def get_basic_metrics(labels_true, labels_pred, class_names=[0,1]):
    """Return per-class FP, TP, FN and TN counts from the confusion matrix.

    Parameters
    ----------
    labels_true, labels_pred : true and predicted class labels.
    class_names : only its length is used; the confusion matrix is built
        for the labels ``0 .. len(class_names) - 1``.

    Returns
    -------
    (FP, TP, FN, TN) : float32 arrays with one entry per class, where entry
    ``i`` counts class ``i`` as the positive class.
    """
    n_classes = len(class_names)
    confusion = sklearn.metrics.confusion_matrix(
        labels_true, labels_pred, labels=range(n_classes)
    ).astype(np.float32)

    TP = np.diag(confusion)
    # Column totals are "predicted as class i", row totals are "truly class i".
    FP = confusion.sum(axis=0) - TP
    FN = confusion.sum(axis=1) - TP
    TN = confusion.sum() - (FP + FN + TP)
    return FP, TP, FN, TN

def calculate_Spec(labels_true, labels_pred):
    """Return the per-class specificity ``TN / (TN + FP)``.

    Parameters
    ----------
    labels_true, labels_pred : true and predicted class labels.

    Returns
    -------
    Array with one specificity per class, in the order produced by
    ``get_basic_metrics`` (entry ``i`` treats class ``i`` as positive).
    The value is returned so callers can report it; previously the function
    only printed it and implicitly returned ``None``.
    """
    FP, TP, FN, TN = get_basic_metrics(labels_true, labels_pred)
    spec = TN / (TN + FP)
    return spec

In [None]:
# Caco-2: XGBoost classifier with default hyper-parameters.
model_A = XGBClassifier()
pred_A, score_A = generateClsOut(
    model_A,
    train_y_A,
    val_y_A,
)

In [None]:
# CYP3A4: XGBoost classifier with default hyper-parameters.
# This cell used to re-fit model_A with the undefined names train_y / val_y,
# so pred_D (written back to the test sheet later) was never produced.
model_D = XGBClassifier()
pred_D, score_D = generateClsOut(model_D, train_y_D, val_y_D)

In [None]:
# hERG: XGBoost classifier with default hyper-parameters.
model_M = XGBClassifier()
pred_M, score_M = generateClsOut(
    model_M,
    train_y_M,
    val_y_M,
)

In [None]:
# HOB: LightGBM classifier with default hyper-parameters.
model_E = lgb.LGBMClassifier()
pred_E, score_E = generateClsOut(
    model_E,
    train_y_E,
    val_y_E,
)

In [None]:
# MN: LightGBM classifier with default hyper-parameters.
model_T = lgb.LGBMClassifier()
pred_T, score_T = generateClsOut(
    model_T,
    train_y_T,
    val_y_T,
)

In [None]:
# Write each model's test-set predictions into its column of the test
# label sheet, in the same column order as before.
endpoint_predictions = (
    ("Caco-2", pred_A),
    ("CYP3A4", pred_D),
    ("hERG", pred_M),
    ("HOB", pred_E),
    ("MN", pred_T),
)
for endpoint, predicted in endpoint_predictions:
    test_label_df[endpoint] = predicted

In [None]:
# Display the test label sheet after the predicted columns have been filled in.
test_label_df

In [None]:
# ROC curve for Caco-2 on the validation split, scored with the probability
# of class 1 (second column of predict_proba). y_true is passed as a 1-D
# array, as roc_curve documents, rather than as an (n, 1) column vector.
fpr, tpr, threshold = roc_curve(val_y_A.to_numpy(), score_A[:, 1])

In [None]:
# Draw the ROC curve.
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots(figsize=(6 * 1.2, 6))
# False positive rate on the x-axis, true positive rate on the y-axis.
ax.plot(fpr, tpr, color='orange', lw=lw, label='AUC = %0.2f' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Caco-2(A) ROC plot')
ax.legend(loc="lower right")

fig.savefig('A_ROC_plot.png')
print(roc_auc)
print('ROC plot has finished!')