In [24]:
import pandas as pd
import numpy as np
import xgboost
from sklearn import metrics
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier

from vecstack import stacking
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.utils import resample

In [25]:
covid_df = pd.read_csv("dataset.csv")
covid_df.shape

(196, 9)

In [26]:
covid_cols = ['Age' ,'WBC','LYMC','LYMPH','NEUT','NEU','NLR','Severity']

#covid_df = covid_df[covid_cols]
# print(covid_df)
clinicalOutput = covid_df["Severity"]
clinicalInput = covid_df.drop(["Gender"],axis=1)
# print(clinicalOutput)
# print(clinicalInput)
clinicalInput.shape

(196, 8)

In [32]:
et = ExtraTreesClassifier(n_estimators=300, max_depth=15)
rf = RandomForestClassifier(n_estimators=100, max_depth=19)
gbdt = GradientBoostingClassifier()
#        learning_rate=0.1, n_estimators=300,max_depth=7, min_samples_leaf =10, 
#                min_samples_split =100, max_features=11, random_state=10,subsample=0.85),
#     LogisticRegression(solver = 'saga', max_iter= 600, multi_class='ovr',
#                         class_weight='balanced'),
xgbc = XGBClassifier(n_estimators=80,learning_rate=0.45,max_depth=7, eval_metric='error')

In [36]:
# imputer = KNNImputer(n_neighbors=7)
# Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(clinicalInput, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=196, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Severity # Target variable
X = dataframe.drop(['Severity'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []

from sklearn.svm import SVC
model = SVC(probability=True, degree=3)

skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = et.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 9.7
Average TN  = 3.6
Average FP  = 0.7
Average FN  = 1.3
Average accuracy  = 0.8688
95.0 confidence interval 80.0% and 93.8%
Average AUC       = 0.9409
95.0 confidence interval 82.8% and 100.0%
Average precision = 0.9363
95.0 confidence interval 85.8% and 100.0%
Average recall    = 0.8818
95.0 confidence interval 81.8% and 100.0%
Average f1 score = 0.9059
95.0 confidence interval 85.7% and 95.6%
Sensitivity score = 0.8818
95.0 confidence interval 81.8% and 100.0%
Specificity score = 0.835
95.0 confidence interval 55.6% and 100.0%


In [37]:
# imputer = KNNImputer(n_neighbors=7)
# Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(clinicalInput, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=196, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Severity # Target variable
X = dataframe.drop(['Severity'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []

from sklearn.svm import SVC
model = SVC(probability=True, degree=3)

skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = rf.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 8.6
Average TN  = 5.1
Average FP  = 0.5
Average FN  = 1.1
Average accuracy  = 0.895
95.0 confidence interval 80.0% and 100.0%
Average AUC       = 0.9495
95.0 confidence interval 83.8% and 100.0%
Average precision = 0.9484
95.0 confidence interval 81.7% and 100.0%
Average recall    = 0.8867
95.0 confidence interval 71.8% and 100.0%
Average f1 score = 0.9125
95.0 confidence interval 82.4% and 100.0%
Sensitivity score = 0.8867
95.0 confidence interval 71.8% and 100.0%
Specificity score = 0.91
95.0 confidence interval 69.7% and 100.0%


In [38]:
# imputer = KNNImputer(n_neighbors=7)
# Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(clinicalInput, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=196, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Severity # Target variable
X = dataframe.drop(['Severity'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []

from sklearn.svm import SVC
model = SVC(probability=True, degree=3)

skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = gbdt.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 8.5
Average TN  = 5.7
Average FP  = 0.4
Average FN  = 0.7
Average accuracy  = 0.9283
95.0 confidence interval 81.7% and 100.0%
Average AUC       = 0.9694
95.0 confidence interval 84.1% and 100.0%
Average precision = 0.96
95.0 confidence interval 82.2% and 100.0%
Average recall    = 0.9244
95.0 confidence interval 88.9% and 100.0%
Average f1 score = 0.9402
95.0 confidence interval 85.5% and 100.0%
Sensitivity score = 0.9244
95.0 confidence interval 88.9% and 100.0%
Specificity score = 0.9333
95.0 confidence interval 70.4% and 100.0%


In [40]:
# imputer = KNNImputer(n_neighbors=7)
# Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(clinicalInput, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=196, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Severity # Target variable
X = dataframe.drop(['Severity'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []

from sklearn.svm import SVC
model = SVC(probability=True, degree=3)

skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = xgbc.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 10.2
Average TN  = 3.7
Average FP  = 0.8
Average FN  = 0.6
Average accuracy  = 0.9083
95.0 confidence interval 86.7% and 93.8%
Average AUC       = 0.9539
95.0 confidence interval 85.6% and 100.0%
Average precision = 0.9344
95.0 confidence interval 84.6% and 100.0%
Average recall    = 0.9445
95.0 confidence interval 83.7% and 100.0%
Average f1 score = 0.936
95.0 confidence interval 90.2% and 95.7%
Sensitivity score = 0.9445
95.0 confidence interval 83.7% and 100.0%
Specificity score = 0.815
95.0 confidence interval 52.2% and 100.0%


In [41]:
LR = LogisticRegression(solver = 'liblinear', max_iter= 90, multi_class='ovr',
                       class_weight='balanced')

In [42]:
# imputer = KNNImputer(n_neighbors=7)
# Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(clinicalInput, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=196, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Severity # Target variable
X = dataframe.drop(['Severity'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []

from sklearn.svm import SVC
model = SVC(probability=True, degree=3)

skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = LR.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 7.4
Average TN  = 5.5
Average FP  = 0.9
Average FN  = 1.5
Average accuracy  = 0.8425
95.0 confidence interval 68.5% and 98.5%
Average AUC       = 0.9157
95.0 confidence interval 80.2% and 99.6%
Average precision = 0.8981
95.0 confidence interval 75.6% and 100.0%
Average recall    = 0.8292
95.0 confidence interval 63.4% and 97.5%
Average f1 score = 0.8579
95.0 confidence interval 72.0% and 98.7%
Sensitivity score = 0.8292
95.0 confidence interval 63.4% and 97.5%
Specificity score = 0.8571
95.0 confidence interval 66.7% and 100.0%


In [43]:
from sklearn.svm import SVC
model = SVC(probability=True)    
    

In [44]:
# imputer = KNNImputer(n_neighbors=7)
# Ximputer = imputer.fit_transform(clinicalInput)
dataframe=pd.DataFrame(clinicalInput, columns=covid_cols)

outlier_detect = IsolationForest(n_estimators=150, max_samples=196, contamination=float(0.02), max_features=clinicalInput.shape[1])

dataframe = dataframe.values
outlier_detect.fit(dataframe)
outliers_predicted = outlier_detect.predict(dataframe)
covid_check = dataframe[outlier_detect.predict(dataframe) == -1]
dataframe = dataframe[outlier_detect.predict(dataframe) != -1]

     #values = dataframe.values
n_size = int(len(dataframe) * 0.80)

    # prepare train and test sets
    #data_sample = resample(values, n_samples=n_size)
data_sample = resample(dataframe, n_samples=n_size)


dataframe = pd.DataFrame(data_sample, columns=covid_cols)
#    print(dataframe)
    
    # split into input and output elements
y = dataframe.Severity # Target variable
X = dataframe.drop(['Severity'], axis = 1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
n_iterations = 10

accuracies_stats = []
AUC_stats = []
pre_stats = []
recall_stats = []
f1_stats = []
sensitivity_stats = []
specificity_stats = []
TP_list = []
TN_list = []
FP_list = []
FN_list = []

from sklearn.svm import SVC
model = SVC(probability=True, degree=3)

skf = StratifiedKFold(n_splits=n_iterations, shuffle=True)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_resample(X_train, y_train)
    
    # 训练模型
    model = model.fit(x_res, y_res)
    
  # 预测测试集
    y_pred = model.predict(X_test)
    
    # 计算评估指标
    accuracies_stats.append(metrics.accuracy_score(y_test, y_pred))
    probs = model.predict_proba(X_test)[:, 1]
    AUC_stats.append(roc_auc_score(y_test, probs))
    pre_stats.append(precision_score(y_test, y_pred, average='binary'))
    recall_stats.append(recall_score(y_test, y_pred, average='binary'))
    f1_stats.append(f1_score(y_test, y_pred, average='binary'))
    
    confusion = confusion_matrix(y_test, y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    
    sensitivity = TP / float(FN + TP)
    specificity = TN / (TN + FP)
    
    sensitivity_stats.append(sensitivity)
    specificity_stats.append(specificity)
    
    TP_list.append(TP)
    TN_list.append(TN)
    FP_list.append(FP)
    FN_list.append(FN)
    

    
print("Average TP  = {}".format(np.round(np.array(TP_list).mean(), 4)))
print("Average TN  = {}".format(np.round(np.array(TN_list).mean(), 4)))
print("Average FP  = {}".format(np.round(np.array(FP_list).mean(), 4)))
print("Average FN  = {}".format(np.round(np.array(FN_list).mean(), 4)))

# confidence intervals
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(accuracies_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(accuracies_stats, p))

print("Average accuracy  = {}".format(np.round(np.array(accuracies_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(AUC_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(AUC_stats, p))

print("Average AUC       = {}".format(np.round(np.array(AUC_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(pre_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(pre_stats, p))
print("Average precision = {}".format(np.round(np.array(pre_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(recall_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(recall_stats, p))
print("Average recall    = {}".format(np.round(np.array(recall_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(f1_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(f1_stats, p))
print("Average f1 score = {}".format(np.round(np.array(f1_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(sensitivity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(sensitivity_stats, p))
print("Sensitivity score = {}".format(np.round(np.array(sensitivity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))

p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(specificity_stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(specificity_stats, p))
print("Specificity score = {}".format(np.round(np.array(specificity_stats).mean(), 4)))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))


Average TP  = 7.0
Average TN  = 4.4
Average FP  = 0.5
Average FN  = 3.4
Average accuracy  = 0.745
95.0 confidence interval 51.2% and 91.8%
Average AUC       = 0.8352
95.0 confidence interval 69.5% and 97.9%
Average precision = 0.9274
95.0 confidence interval 75.6% and 100.0%
Average recall    = 0.6718
95.0 confidence interval 35.5% and 88.2%
Average f1 score = 0.7706
95.0 confidence interval 49.1% and 93.7%
Sensitivity score = 0.6718
95.0 confidence interval 35.5% and 88.2%
Specificity score = 0.9
95.0 confidence interval 64.5% and 100.0%
