In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale 
from sklearn.metrics import mean_squared_error

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils import class_weight
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import matplotlib.pyplot as plt 
%matplotlib inline
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

# 加载数据集

In [None]:
# Load the first CSV export (04-07_carbonhealth_and_braidhealth.csv) into `data1`.
try:
    data1 = pd.read_csv("data/04-07_carbonhealth_and_braidhealth.csv", delimiter=",")
except FileNotFoundError:
    # Print the friendly hint, then re-raise: later cells need `data1`, so
    # swallowing the error would only defer the failure to a confusing NameError.
    print("Dataset could not be loaded. Is the dataset missing?")
    raise
print(" dataset has {} samples with {} features each.".format(*data1.shape))

In [None]:
# Load the second CSV export (04-14_carbonhealth_and_braidhealth.csv) into `data2`.
try:
    data2 = pd.read_csv("data/04-14_carbonhealth_and_braidhealth.csv", delimiter=",")
except FileNotFoundError:
    # Print the friendly hint, then re-raise: later cells need `data2`, so
    # swallowing the error would only defer the failure to a confusing NameError.
    print("Dataset could not be loaded. Is the dataset missing?")
    raise
print(" dataset has {} samples with {} features each.".format(*data2.shape))

In [None]:
# Load the third CSV export (04-21_carbonhealth_and_braidhealth.csv) into `data3`.
try:
    data3 = pd.read_csv("data/04-21_carbonhealth_and_braidhealth.csv", delimiter=",")
except FileNotFoundError:
    # Print the friendly hint, then re-raise: later cells need `data3`, so
    # swallowing the error would only defer the failure to a confusing NameError.
    print("Dataset could not be loaded. Is the dataset missing?")
    raise
print(" dataset has {} samples with {} features each.".format(*data3.shape))

In [None]:
# Stack the three exports into one frame. ignore_index=True renumbers the rows
# 0..n-1; without it each file contributes its own 0..k labels, so labels repeat
# and any later label-based row operation would hit rows from several files.
data = pd.concat([data1, data2, data3], ignore_index=True)
data.head()

In [None]:
# Summarise the combined frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Encode every boolean column as a 0/1 integer (False -> 0, True -> 1).
# A direct cast is used instead of categorical codes because codes are assigned
# by rank among the values present: a column containing only True would be
# encoded as 0.
list_of_columns = list(data.select_dtypes(['bool']).columns)
if list_of_columns:
    data[list_of_columns] = data[list_of_columns].astype("int8")

In [None]:
# Frequency of each distinct cough_severity value.
data.cough_severity.value_counts()

In [None]:
# Peek at the first rows of the cxr_impression column.
data.cxr_impression.head()

In [None]:
# Frequency of each distinct value in the cancer column.
data.cancer.value_counts()

# 处理成one-hot数据集

In [None]:
# Columns to expand into one indicator column per distinct value.
# Each name appears exactly once: listing a column twice makes get_dummies
# encode it twice and emit duplicated indicator columns.
person_onehot_columns = [
    "swab_type", "test_name", "cough_severity", "sob_severity",
    "high_risk_exposure_occupation", "high_risk_interactions",
    "rapid_flu_results", "rapid_strep_results", "ctab", "labored_respiration",
    "rhonchi", "wheezes", "cough", "fever", "sob", "diarrhea", "fatigue",
    "headache", "loss_of_smell", "loss_of_taste", "runny_nose", "muscle_sore",
    "sore_throat",
]
# Indicator columns are prefixed with their source column's name, so the prefix
# list is derived from the column list rather than maintained by hand.
onehot_columns_prefix = list(person_onehot_columns)

one_hot_data = pd.get_dummies(data, prefix=onehot_columns_prefix, columns=person_onehot_columns)
# head(-5) selects every row except the last five.
one_hot_data.head(-5)

In [None]:
# Remove the three chest X-ray text/link columns from the encoded frame.
for cxr_column in ("cxr_findings", "cxr_impression", "cxr_link"):
    del one_hot_data[cxr_column]

In [None]:
# Discard rows whose age is negative. A boolean row mask is used instead of
# drop(<index labels>): the frame is built by concatenating several files, so
# index labels can repeat and a label-based drop would also delete unrelated
# rows that share a label with a negative-age row.
# `~(age < 0)` keeps rows with a missing age, as the original filter did.
# .copy() makes the result an independent frame so later column assignments
# do not raise SettingWithCopyWarning.
one_hot_data = one_hot_data.loc[~(one_hot_data["age"] < 0)].copy()

In [None]:
# head(-5) selects every row except the last five.
one_hot_data.head(-5)

In [None]:
# (rows, columns) of the encoded frame at this point.
one_hot_data.shape

In [None]:
# Number of rows per distinct batch_date value.
one_hot_data.batch_date.value_counts()

In [None]:
# First five rows of the encoded frame.
one_hot_data.head()

In [None]:
# Replace the covid19_test_results column with an integer-coded `target` column.
# pop() removes the source column; the category codes number the distinct
# values in sorted order, and missing values receive code -1.
test_result_categories = one_hot_data.pop("covid19_test_results").astype("category")
one_hot_data['target'] = test_result_categories.cat.codes

In [None]:
# (rows, columns) after the target column replaced covid19_test_results.
one_hot_data.shape

In [None]:
# Distribution of the raw days_since_symptom_onset values before bucketing.
one_hot_data['days_since_symptom_onset'].value_counts()

In [None]:
# Bucket days_since_symptom_onset into week numbers 1..6 in a single pass.
# All conditions are evaluated on the untouched input column, so no temporary
# sentinel values (1111, 2222, ...) are needed and a genuine value can never be
# mistaken for a sentinel.
onset_days = one_hot_data['days_since_symptom_onset']
week_conditions = [
    onset_days <= 7,              # week 1 (also covers zero / negative values)
    onset_days.between(8, 14),    # week 2
    onset_days.between(15, 21),   # week 3
    onset_days.between(22, 28),   # week 4
    onset_days.between(29, 35),   # week 5
    onset_days.between(36, 150),  # week 6
]
week_numbers = [1, 2, 3, 4, 5, 6]

# Rows matching no condition (missing values, values above 150, fractional
# values falling between two ranges) keep their original value.
# NOTE(review): values above 150 stay as raw day counts and will become their
# own indicator columns when this column is one-hot encoded - confirm intended.
one_hot_data['days_since_symptom_onset'] = np.select(
    [condition.to_numpy() for condition in week_conditions],
    week_numbers,
    default=onset_days.to_numpy(),
)



In [None]:
# One-hot encode the bucketed onset week; the new indicator columns are
# prefixed with "days_since_symptom_onset_week".
person_onehot_columns = ["days_since_symptom_onset"]
onehot_columns_prefix = ["days_since_symptom_onset_week"]

one_hot_data_onset_sym = pd.get_dummies(
    one_hot_data,
    columns=person_onehot_columns,
    prefix=onehot_columns_prefix,
)
one_hot_data_onset_sym.head()

In [None]:
# Replace missing vital-sign readings with the column mean.
# The filled columns are assigned back explicitly: `frame.col.fillna(...,
# inplace=True)` operates on an intermediate Series, which pandas deprecates
# and which leaves the frame unchanged when Copy-on-Write is enabled.
vital_columns = ["temperature", "sys", "dia", "rr", "sats"]
vital_means = one_hot_data_onset_sym[vital_columns].mean()
one_hot_data_onset_sym[vital_columns] = one_hot_data_onset_sym[vital_columns].fillna(vital_means)

In [None]:
# Build a six-column vital-signs frame with gaps filled by each column's mean.
vital_sign_columns = ["temperature", "pulse", "sys", "dia", "rr", "sats"]
raw_vitals = one_hot_data_onset_sym[vital_sign_columns]
column_means = raw_vitals.mean()
small_data = raw_vitals.fillna(column_means)
small_data.info()

In [None]:
# First five rows of the imputed vital-signs frame.
small_data.head()

In [None]:
# Remove the six vital-sign columns from the main frame; their imputed copies
# live in `small_data`.
for vital_column in ("temperature", "pulse", "sys", "dia", "rr", "sats"):
    del one_hot_data_onset_sym[vital_column]

In [None]:
# Join the imputed vital signs back on as extra columns, then discard batch_date.
final_df = (
    pd.concat([one_hot_data_onset_sym, small_data], axis=1, sort=False)
    .drop(columns="batch_date")
)

In [None]:
# Class balance: number of rows per target code.
final_df.target.value_counts()

In [None]:
# Column names, dtypes and non-null counts of the assembled frame.
final_df.info()

In [None]:
# Discard the cxr_label column before modelling.
final_df = final_df.drop(columns=["cxr_label"])

In [None]:
# Re-check dtypes and non-null counts after removing cxr_label.
final_df.info()

In [None]:
# Split the frame into the feature matrix X (every column except `target`)
# and the label vector y.
# drop() is used instead of re-selecting by a list of labels: label-based
# selection returns every column matching each label, so any duplicated column
# name would be multiplied in X.
X = final_df.drop(columns="target")
cols = list(X.columns)  # feature names, kept under the original variable name
y = final_df.target
X.shape, y.shape

# 建模训练

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=31,
)

In [None]:
from sklearn import preprocessing

# Learn each feature's min/max from the training rows only, then rescale both
# splits with those training statistics.
min_max_scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train_minmax, X_test_minmax = (
    min_max_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Cross-validated accuracy comparison of nine default classifiers.
# NOTE(review): the loop scores on X_train as currently bound, not on
# X_train_minmax - confirm that scale-sensitive models (KNN, SVM, NN, LR)
# are meant to see unscaled features here.
seed = 7

# Estimators with internal randomness receive the seed so repeated runs print
# the same scores; the remaining estimators are deterministic by default.
models = [
    ('NN', MLPClassifier(random_state=seed)),
    ('RF', RandomForestClassifier(random_state=seed)),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('GB', GradientBoostingClassifier(random_state=seed)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
]

# One shuffled 10-fold splitter shared by all models: with an integer seed it
# produces the same folds on every use, so each model is scored on identical splits.
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # mean accuracy over the folds, with its standard deviation in parentheses
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold accuracies, one box per model, saved as PNG and PDF.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
for output_path in ("fig1.png", "fig1.pdf"):
    fig.savefig(output_path)
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project the min-max-scaled features onto two principal components. The
# projection is fitted on the training split and then applied to the test split.
# NOTE: this rebinds X_train / X_test to the 2-column PCA output, so every later
# cell that reads those names works on the projection until they are reassigned.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train_minmax)
X_test = pca.transform(X_test_minmax)

In [None]:
# Fraction of the total variance captured by each retained principal component.
explained_variance = pca.explained_variance_ratio_

In [None]:
# Display the per-component explained-variance ratios.
explained_variance

In [None]:

# Cross-validated accuracy comparison of nine default classifiers, scored on
# whatever X_train currently holds (the 2-component PCA projection when the
# notebook is run top to bottom).
seed = 7

# Estimators with internal randomness receive the seed so repeated runs print
# the same scores; the remaining estimators are deterministic by default.
models = [
    ('NN', MLPClassifier(random_state=seed)),
    ('RF', RandomForestClassifier(random_state=seed)),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('GB', GradientBoostingClassifier(random_state=seed)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
]

# One shuffled 10-fold splitter shared by all models: with an integer seed it
# produces the same folds on every use, so each model is scored on identical splits.
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # mean accuracy over the folds, with its standard deviation in parentheses
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold accuracies, one box per model, saved as PNG and PDF.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
for output_path in ("fig2.png", "fig2.pdf"):
    fig.savefig(output_path)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a shallow (max_depth=2) seeded random forest on the training split and
# report confusion matrix, accuracy and per-class metrics on the test split.
classifier = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(
    cm,
    accuracy_score(y_test, y_pred),
    "RF: Classification Report",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
# Fit a default multi-layer perceptron and report its test-set accuracy,
# confusion matrix and per-class metrics.
mlp = MLPClassifier().fit(X_train, y_train)
predictions = mlp.predict(X_test)

print(accuracy_score(y_test, predictions))
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, predictions),
    "",
    "NN: Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
# Fit a default decision tree and report its test-set accuracy,
# confusion matrix and per-class metrics.
CART = DecisionTreeClassifier().fit(X_train, y_train)
predictions = CART.predict(X_test)

print(accuracy_score(y_test, predictions))
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, predictions),
    "",
    "CART: Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
# Fit a default gradient-boosting classifier and report its test-set accuracy,
# confusion matrix and per-class metrics.
gb = GradientBoostingClassifier().fit(X_train, y_train)
predictions = gb.predict(X_test)

print(accuracy_score(y_test, predictions))
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, predictions),
    "",
    "GB: Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
# Fit a default k-nearest-neighbours classifier and report its test-set
# accuracy, confusion matrix and per-class metrics.
KNN = KNeighborsClassifier().fit(X_train, y_train)
predictions = KNN.predict(X_test)

print(accuracy_score(y_test, predictions))
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, predictions),
    "",
    "KNN: Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
# Fit a support-vector classifier (one-vs-one decision function shape) and
# report its test-set accuracy, confusion matrix and per-class metrics.
SVM = SVC(decision_function_shape="ovo")
SVM.fit(X_train, y_train)
predictions = SVM.predict(X_test)

print(accuracy_score(y_test, predictions))
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, predictions),
    "",
    "SVM: Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
# Fit a Gaussian naive Bayes classifier and report its test-set accuracy,
# confusion matrix and per-class metrics.
NB = GaussianNB().fit(X_train, y_train)
predictions = NB.predict(X_test)

print(accuracy_score(y_test, predictions))
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, predictions),
    "",
    "NB: Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
# Fit a default logistic regression and report its test-set accuracy,
# confusion matrix and per-class metrics.
LG = LogisticRegression().fit(X_train, y_train)
predictions = LG.predict(X_test)

print(accuracy_score(y_test, predictions))
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, predictions),
    "",
    "LG: Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
# Fit a linear discriminant analysis classifier and report its test-set
# accuracy, confusion matrix and per-class metrics.
LDA = LinearDiscriminantAnalysis().fit(X_train, y_train)
predictions = LDA.predict(X_test)

print(accuracy_score(y_test, predictions))
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, predictions),
    "",
    "LDA: Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Fit seven classifiers on the training split, record each one's test-set
# accuracy and confusion matrix, and save a bar chart of the accuracies.

# Estimators with internal randomness are seeded so the chart is reproducible.
random_state = 0
classifiers = [
    KNeighborsClassifier(100),
    SVC(kernel="rbf", C=0.025, probability=True, gamma="auto", random_state=random_state),
    RandomForestClassifier(100, random_state=random_state),
    DecisionTreeClassifier(random_state=random_state),
    AdaBoostClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    GaussianNB()]

# Logging for Visual Comparison

class_al = []               # classifier class names, in fit order
acc1 = []                   # test-set accuracy per classifier
confusion_matrix_list = []  # test-set confusion matrix per classifier
# Sorted class labels, so every matrix has the same, predictable row/column
# order (Series.unique() would return labels in order of first appearance).
labels = np.unique(y)

for clf in classifiers:
    clf.fit(X_train, y_train)
    class_al.append(clf.__class__.__name__)
    test_predictions = clf.predict(X_test)
    acc1.append(accuracy_score(y_test, test_predictions))
    confusion_matrix_list.append(confusion_matrix(y_test, test_predictions, labels=labels))

max_acc = max(acc1)

fig, ax = plt.subplots(figsize=(10, 20))
ax.bar(class_al, acc1)
ax.set_xlabel('Classification Al')
ax.set_ylabel('Accuracy')
ax.set_title('Classifier Accuracy')
for tick in ax.get_xticklabels():
    tick.set_rotation(90)

# Dashed reference line at the best accuracy achieved by any classifier.
# NOTE(review): the annotation calls this line "Baseline Acc" although it marks
# the maximum accuracy - confirm the intended wording.
ax.axhline(y=max_acc, color="red", linestyle="--")
ax.annotate("Baseline Acc", xy=(4.5, max_acc))

fig.savefig("output.png")
fig.savefig("output.pdf")

# 其相关性结果

In [None]:
# Re-split from X and y (same 30% hold-out, same seed) so that X_train / X_test
# hold the original feature columns again rather than the PCA projection that
# earlier cells assigned to those names.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=31,
)

In [None]:
alphas = 10**np.linspace(10,-2,100)*0.5

lasso = Lasso(max_iter = 10000, normalize = True)
coefs = []

for a in alphas:
    lasso.set_params(alpha=a)
    lasso.fit(scale(X_train), y_train)
    coefs.append(lasso.coef_)
    
np.shape(coefs)

In [None]:
lassocv = LassoCV(alphas = None, cv = 10, max_iter = 100000, normalize = True)
lassocv.fit(X_train, y_train)

lasso.set_params(alpha=lassocv.alpha_)
lasso.fit(X_train, y_train)
mean_squared_error(y_test, lasso.predict(X_test))

In [None]:
# Some of the coefficients are now reduced to exactly zero.
# Label each fitted Lasso coefficient with its feature name and show the first 60.
pd.Series(lasso.coef_, index=X.columns).head(60)