In [1]:
import pandas as pd
import numpy as np
import xgboost
from sklearn import metrics
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier

from vecstack import stacking
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.utils import resample

In [2]:
covid_df = pd.read_csv("dataset.csv")
covid_df.shape

(600, 21)

In [3]:
covid_cols = ['Patient age quantile', 'Hematocrit', 'Hemoglobin',
        'Platelets', 'Red blood Cells', 'Lymphocytes', 'Leukocytes',
        'Basophils', 'Eosinophils', 'Monocytes', 'Serum Glucose', 'Neutrophils',
      'Urea', 'Proteina C reativa mg/dL', 'Creatinine', 'Potassium', 'Sodium',
       'Alanine transaminase', 'Aspartate transaminase','Label']

#covid_df = covid_df[covid_cols]
# print(covid_df)



clinicalOutput = covid_df["Label"]
clinicalInput = covid_df.drop(["Patient ID"],axis=1)
# print(clinicalOutput)
# print(clinicalInput)
clinicalInput.shape

(600, 20)

In [8]:
ET = ExtraTreesClassifier(n_estimators=300, max_depth=17)
     
RF = RandomForestClassifier(n_estimators=300, max_depth=17)
GBDT = GradientBoostingClassifier(learning_rate=0.1, n_estimators=290,min_samples_split=200,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10)
   
LR = LogisticRegression(solver = 'liblinear', max_iter= 200, multi_class='ovr',
                       class_weight='balanced')

from sklearn.svm import SVC
SVC = SVC(probability=True)

In [9]:
imputer = KNNImputer(n_neighbors=7)
Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(Ximputer, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=600, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Label # Target variable
X = dataframe.drop(['Label'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []


skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = ET.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 5.3
Average TN  = 38.9
Average FP  = 1.3
Average FN  = 1.5
Average accuracy  = 0.9404
95.0 confidence interval 85.6% and 99.5%
Average AUC       = 0.9689
95.0 confidence interval 90.5% and 100.0%
Average precision = 0.8143
95.0 confidence interval 51.6% and 100.0%
Average recall    = 0.7762
95.0 confidence interval 46.1% and 100.0%
Average f1 score = 0.7871
95.0 confidence interval 48.6% and 98.5%
Sensitivity score = 0.7762
95.0 confidence interval 46.1% and 100.0%
Specificity score = 0.9677
95.0 confidence interval 92.5% and 100.0%


In [10]:
imputer = KNNImputer(n_neighbors=7)
Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(Ximputer, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=600, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Label # Target variable
X = dataframe.drop(['Label'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []


skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = RF.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 6.8
Average TN  = 37.1
Average FP  = 1.6
Average FN  = 1.5
Average accuracy  = 0.934
95.0 confidence interval 86.5% and 97.9%
Average AUC       = 0.9763
95.0 confidence interval 93.6% and 100.0%
Average precision = 0.8172
95.0 confidence interval 59.4% and 97.5%
Average recall    = 0.8181
95.0 confidence interval 62.5% and 100.0%
Average f1 score = 0.8101
95.0 confidence interval 61.7% and 94.1%
Sensitivity score = 0.8181
95.0 confidence interval 62.5% and 100.0%
Specificity score = 0.9588
95.0 confidence interval 90.3% and 99.4%


In [11]:
imputer = KNNImputer(n_neighbors=7)
Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(Ximputer, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=600, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Label # Target variable
X = dataframe.drop(['Label'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []


skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = GBDT.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 4.7
Average TN  = 39.2
Average FP  = 1.9
Average FN  = 1.2
Average accuracy  = 0.934
95.0 confidence interval 89.4% and 97.4%
Average AUC       = 0.9711
95.0 confidence interval 90.1% and 99.9%
Average precision = 0.7321
95.0 confidence interval 57.8% and 96.8%
Average recall    = 0.7933
95.0 confidence interval 52.2% and 100.0%
Average f1 score = 0.7476
95.0 confidence interval 56.1% and 90.8%
Sensitivity score = 0.7933
95.0 confidence interval 52.2% and 100.0%
Specificity score = 0.9537
95.0 confidence interval 90.8% and 99.5%


In [14]:
XGBC = XGBClassifier(learning_rate=0.4, 
                     n_estimators=50, max_depth=17,eval_metric='error')

In [15]:
imputer = KNNImputer(n_neighbors=7)
Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(Ximputer, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=600, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Label # Target variable
X = dataframe.drop(['Label'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []


skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = XGBC.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 4.6
Average TN  = 38.5
Average FP  = 2.2
Average FN  = 1.7
Average accuracy  = 0.917
95.0 confidence interval 83.5% and 97.4%
Average AUC       = 0.9398
95.0 confidence interval 81.8% and 99.3%
Average precision = 0.6548
95.0 confidence interval 33.3% and 85.7%
Average recall    = 0.7262
95.0 confidence interval 20.4% and 100.0%
Average f1 score = 0.6817
95.0 confidence interval 24.7% and 90.8%
Sensitivity score = 0.7262
95.0 confidence interval 20.4% and 100.0%
Specificity score = 0.946
95.0 confidence interval 90.8% and 97.6%


In [17]:
imputer = KNNImputer(n_neighbors=7)
Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(Ximputer, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=600, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Label # Target variable
X = dataframe.drop(['Label'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []


skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = LR.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 4.4
Average TN  = 36.1
Average FP  = 5.7
Average FN  = 0.8
Average accuracy  = 0.8617
95.0 confidence interval 75.9% and 93.6%
Average AUC       = 0.9031
95.0 confidence interval 81.1% and 94.2%
Average precision = 0.4736
95.0 confidence interval 32.4% and 73.1%
Average recall    = 0.8433
95.0 confidence interval 64.5% and 100.0%
Average f1 score = 0.5888
95.0 confidence interval 46.5% and 71.4%
Sensitivity score = 0.8433
95.0 confidence interval 64.5% and 100.0%
Specificity score = 0.8631
95.0 confidence interval 74.3% and 97.1%


In [18]:
imputer = KNNImputer(n_neighbors=7)
Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(Ximputer, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=600, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Label # Target variable
X = dataframe.drop(['Label'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []


skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = SVC.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 4.1
Average TN  = 36.4
Average FP  = 5.5
Average FN  = 1.0
Average accuracy  = 0.8617
95.0 confidence interval 80.9% and 94.8%
Average AUC       = 0.9349
95.0 confidence interval 82.1% and 100.0%
Average precision = 0.4476
95.0 confidence interval 33.3% and 68.2%
Average recall    = 0.8067
95.0 confidence interval 60.0% and 100.0%
Average f1 score = 0.5688
95.0 confidence interval 42.9% and 80.7%
Sensitivity score = 0.8067
95.0 confidence interval 60.0% and 100.0%
Specificity score = 0.8688
95.0 confidence interval 79.1% and 94.7%
