In [1]:
import math
import os
from collections import Counter

import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Column names of one candidate ECG feature subset (order is significant:
# it determines the column order when a DataFrame is indexed with this list).
feature_list = [
    "Summated_QRS_integral_V5",
    "S_Peak_Amplitude_C_V5",
    "S_Peak_Amplitude_C_V4",
    "R_Area_C_V5",
    "T_Peak_Amplitude_C_V2",
    "Q_Duration_C_II",
    "S_Area_C_V3",
    "Summated_QRS_integral_V3",
    "Summated_QRS_integral_V6",
    "Summated_QRS_integral_V4",
    "T_Area_C_V2",
    "Q_Peak_Amplitude_C_aVR",
    "Q_Area_C_V1",
    "S_Duration_C_V1",
    "S_Area_C_V4",
    "R_Duration_C_II",
    "R_Peak_Amplitude_C_V2",
    "P_Peak_Amplitude_C_aVR",
    "P_Peak_Amplitude_C_V3",
]

# Second candidate feature subset. Its active entries currently coincide with
# `feature_list`; the last candidate column is kept here but disabled.
feature_list_all_6 = [
    "Summated_QRS_integral_V5",
    "S_Peak_Amplitude_C_V5",
    "S_Peak_Amplitude_C_V4",
    "R_Area_C_V5",
    "T_Peak_Amplitude_C_V2",
    "Q_Duration_C_II",
    "S_Area_C_V3",
    "Summated_QRS_integral_V3",
    "Summated_QRS_integral_V6",
    "Summated_QRS_integral_V4",
    "T_Area_C_V2",
    "Q_Peak_Amplitude_C_aVR",
    "Q_Area_C_V1",
    "S_Duration_C_V1",
    "S_Area_C_V4",
    "R_Duration_C_II",
    "R_Peak_Amplitude_C_V2",
    "P_Peak_Amplitude_C_aVR",
    "P_Peak_Amplitude_C_V3",
    # "S_Peak_Amplitude_C_V3",  # disabled candidate
]

# Feature subset used by the experiment cell: five global measurements followed
# by, for each listed lead, the T-wave peak amplitude and then the T-wave area.
# (The double underscore in 'QRS_Duration__ms_C' is the actual column name.)
feature_list1 = [
    "QRS_Duration__ms_C",
    "T_Axis_C",
    "QTc_Calculation_C",
    "QTc_Bazett_C",
    "QTc_Friderica_C",
] + [
    f"{measure}_C_{lead}"
    for lead in ("II", "V1", "V2", "V3", "V4", "V5", "V6")
    for measure in ("T_Peak_Amplitude", "T_Area")
]

# Each entry pairs a display label with the estimator it describes. `names` and
# `classifiers` are derived from this single list so the two always stay aligned
# (they are zipped together when the models are trained).
#
# Alternatives that were tried and are currently disabled:
#   SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42)
#   MLPClassifier(alpha=1, max_iter=500)
# TODO (original note on the RBF SVM): "nuSVM how many sup vectors".
_LABELLED_MODELS = [
    ("Nearest Neighbors", KNeighborsClassifier(n_neighbors=10)),
    ("Linear SVM", SVC(kernel="linear", probability=True)),
    ("RBF SVM", SVC(kernel="rbf", probability=True, C=1.2)),
    (
        "Neural Net",
        MLPClassifier(
            hidden_layer_sizes=(10,),
            learning_rate="adaptive",
            verbose=True,
            activation="relu",
            solver="adam",
            alpha=10,
            max_iter=500,
        ),
    ),
    (
        "AdaBoost",
        AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=3),
            n_estimators=10,
        ),
    ),
]

names = [label for label, _model in _LABELLED_MODELS]
classifiers = [model for _label, model in _LABELLED_MODELS]

In [2]:
def get_data(split: str,data_dir:str, silent: bool = False, cache_dir: str = None):
    """Load one split of the ECG disease-classification dataset from the Hugging Face Hub.

    Parameters
    ----------
    split : str
        Split name forwarded to ``datasets.load_dataset`` (e.g. 'train' or 'test').
    data_dir : str
        Sub-directory of the 'honggen-organization/ECG-disease-clf' repository
        to load (e.g. 'normal').
    silent : bool, default False
        When True, the progress messages printed by this function are suppressed.
        (Previously this flag was accepted but ignored.)
    cache_dir : str, optional
        Cache directory forwarded unchanged to ``datasets.load_dataset``.

    Returns
    -------
    The dataset object returned by ``datasets.load_dataset`` for the requested split.
    """
    if not silent:
        print(f'Loading ECG dataset ({split} split) from Huggingface...')
    dataset = datasets.load_dataset(
        'honggen-organization/ECG-disease-clf',
        split=split,
        data_dir=data_dir,
        cache_dir=cache_dir,
    )
    if not silent:
        print('done')
    return dataset

def binary_proprocess(disease_data, normal_data):
    """Convert a disease and a normal dataset to DataFrames, keeping only normal
    rows whose 'Group' value also occurs in the disease data.

    Parameters
    ----------
    disease_data, normal_data
        Objects exposing ``.data.to_pandas()`` (as the loaded datasets do).

    Returns
    -------
    (disease_df, normal_df) : tuple of pandas.DataFrame
        ``disease_df`` is the unmodified conversion of ``disease_data``.
        ``normal_df`` is restricted to the groups present in ``disease_df`` when
        both frames have a 'Group' column; otherwise it is returned unfiltered.

    Notes
    -----
    The previous implementation wrapped the filter in a bare ``except`` and then
    unconditionally printed ``set(df['Group'])``, so a missing 'Group' column
    still ended in a ``KeyError``. The fallback is now explicit and reported.
    """
    disease_df = disease_data.data.to_pandas()
    normal_df = normal_data.data.to_pandas()

    if 'Group' in disease_df.columns and 'Group' in normal_df.columns:
        disease_groups = set(disease_df['Group'])
        # For large normal cohorts a further `.sample(n=..., random_state=...)`
        # could be applied here to limit the number of patients.
        normal_df = normal_df[normal_df['Group'].isin(list(disease_groups))]
        print(disease_groups)
        print(set(normal_df['Group']))
    else:
        print("Warning: 'Group' column missing; normal data left unfiltered.")

    return disease_df,normal_df
#[feature_list_all_6][feature_list_all_6]


In [3]:
def clf(disease_df, health_df, test_disease, test_health, interation = 1, downsample = True, agument=False):
    """Fit every estimator in ``classifiers`` on disease-vs-healthy data and
    evaluate each one on a fixed test set.

    Disease rows are labelled 1 and healthy rows 0. Features are standardised
    with a ``StandardScaler`` fitted on the training data only.

    Parameters
    ----------
    disease_df, health_df : pandas.DataFrame
        Training features for the disease and healthy classes (same columns).
    test_disease, test_health : pandas.DataFrame
        Test features for the disease and healthy classes.
    interation : int, default 1
        Number of fit/evaluate repetitions (parameter name kept, typo included,
        for backward compatibility). Resampling uses fixed seeds, so repetitions
        differ only through the estimators' own randomness.
    downsample : bool, default True
        Randomly subsample the healthy training rows (seed 42) down to the
        number of disease training rows.
    agument : bool, default False
        Upsample the disease training rows to the size of the healthy set:
        whole copies plus a random remainder drawn with replacement (seed 40).
        (Parameter name kept, typo included, for backward compatibility.)

    Returns
    -------
    dic_roc : dict[str, pandas.DataFrame]
        Per model name, a frame with columns 'test' (true label) and 'prob'
        (predicted probability of class 1) from the LAST repetition.
    df_conf_tp_data : pandas.DataFrame
        One row per repetition, one column per model: tp / (tp + fn).
    df_conf_tn_data : pandas.DataFrame
        One row per repetition, one column per model: tn / (tn + fp).

    Notes
    -----
    * The module-level estimators in ``classifiers`` are fitted in place.
    * The values printed under 'True_negative' / 'True_Positive' are rates
      (specificity / sensitivity), not counts.
    * Fixes relative to the earlier version: the test matrix is scaled from the
      raw values on every repetition (it used to be re-scaled cumulatively);
      each repetition records its own accuracy dict (one shared dict used to be
      appended repeatedly); the confusion matrix is always 2x2; augmentation and
      downsampling no longer raise when the class sizes make them a no-op.
    """
    # Fixed evaluation set: disease rows first (label 1), healthy rows after (label 0).
    X_test_raw = np.array(test_disease.values.tolist() + test_health.values.tolist())
    y_test = np.append(np.ones(len(test_disease)), np.zeros(len(test_health)))

    accu_data = []
    conf_tn_data = []
    conf_tp_data = []
    dic_roc = {}  # stays empty if interation == 0

    for _ in range(interation):
        # Always start from the caller's frames so repetitions are independent.
        train_disease, train_health = disease_df, health_df

        if agument:
            n_disease, n_health = len(train_disease), len(train_health)
            # Only upsample when the disease class is non-empty and actually smaller.
            if 0 < n_disease < n_health:
                upsample_factor = n_health // n_disease
                remainder = train_disease.sample(
                    n_health - upsample_factor * n_disease, replace=True, random_state=40
                )
                train_disease = pd.concat(
                    [train_disease] * upsample_factor + [remainder]
                ).reset_index(drop=True)
            print(len(train_disease))

        if downsample:
            # Cap at the population size: sampling without replacement cannot exceed it.
            n_keep = min(len(train_disease), len(train_health))
            train_health = train_health.sample(n=n_keep, random_state=42).reset_index(drop=True)

        disease_X = train_disease.values.tolist()
        h_X = train_health.values.tolist()
        print(len(disease_X))
        print(len(h_X))

        X_train = np.array(disease_X + h_X)
        y_train = np.append(np.ones(len(disease_X)), np.zeros(len(h_X)))

        # Fit the scaler on training data only, then apply it to the RAW test data.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test_raw)

        dic_acc = {}
        conf_tn = {}
        conf_tp = {}
        dic_roc = {}
        for name, model in zip(names, classifiers):
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)
            # Explicit labels keep the matrix 2x2 even if a class is absent.
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0.0, 1.0]).ravel()

            dic_roc[name] = pd.DataFrame({'test': y_test, 'prob': y_prob[:, 1]})
            # Plain accuracy; equals model.score(X_test, y_test) without predicting twice.
            dic_acc[name] = float(np.mean(y_pred == y_test))
            conf_tn[name] = tn/(tn+fp)
            conf_tp[name] = tp/(tp+fn)

        accu_data.append(dic_acc)
        conf_tn_data.append(conf_tn)
        conf_tp_data.append(conf_tp)

    df_accu_data = pd.DataFrame(accu_data)
    df_conf_tn_data = pd.DataFrame(conf_tn_data)
    df_conf_tp_data = pd.DataFrame(conf_tp_data)
    print('Accuracy: \n', df_accu_data.mean())
    print('-----------------')
    print('True_negative: \n', df_conf_tn_data.mean())
    print('-----------------')
    print('True_Positive: \n', df_conf_tp_data.mean())
    return dic_roc,df_conf_tp_data,df_conf_tn_data
    

In [6]:
# --- Experiment configuration -------------------------------------------------
data_method = 'augment'   # label used in the output directory name
data = 'kd'               # disease dataset prefix; sub-datasets are f"{data}/data{i}"
augment = True            # upsample the disease class inside clf()
downsample = False        # do not subsample the normal class inside clf()

CACHE_DIR = './cache/huggingface/datasets'
SUBSET_IDS = range(1, 6)  # numbered sub-datasets data1 ... data5
TEST_GROUP = 6            # only test rows with this 'Group' value are evaluated
                          # NOTE(review): meaning of group 6 is not visible here -- confirm
ROC_DIR = f"./{data}_roc20/{data_method}"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(ROC_DIR, exist_ok=True)

# --- Load the normal cohort once; it is shared by every sub-dataset -----------
normal_data_train = get_data(split= 'train', data_dir = 'normal', cache_dir = CACHE_DIR)
normal_data_test = get_data(split= 'test', data_dir = 'normal', cache_dir = CACHE_DIR)

# One row per sub-dataset, one column per model (sized from the actual lists
# instead of a hard-coded 5x5).
tpr_all = np.zeros((len(SUBSET_IDS), len(names)))
tnr_all = np.zeros((len(SUBSET_IDS), len(names)))
for row, i in enumerate(SUBSET_IDS):
    kd_data_train = get_data(split= 'train', data_dir = f"{data}/data{i}", cache_dir = CACHE_DIR)
    kd_data_test = get_data(split= 'test', data_dir = f"{data}/data{i}", cache_dir = CACHE_DIR)
    disease_train, normal_train = binary_proprocess(kd_data_train, normal_data_train)
    disease_test, normal_test = binary_proprocess(kd_data_test, normal_data_test)

    # Evaluate on a single group only; training keeps every matching group.
    disease_test = disease_test[disease_test['Group'] == TEST_GROUP]
    normal_test = normal_test[normal_test['Group'] == TEST_GROUP]

    # Restrict all four frames to the selected feature columns.
    disease_train = disease_train[feature_list1]
    normal_train = normal_train[feature_list1]
    disease_test = disease_test[feature_list1]
    normal_test = normal_test[feature_list1]

    df_dic,df_tpr,df_tnr = clf(disease_df = disease_train, health_df =normal_train,
                                    test_disease = disease_test, test_health = normal_test,
                                    interation = 1,downsample = downsample,agument=augment)

    # Persist the per-model (true label, predicted probability) pairs for ROC plots.
    for name in names:
        df_dic[name].to_csv(f"{ROC_DIR}/{name}_clinical_roc{i}.csv", index=False)

    # Average over repetitions (a single row here) and fix the column order to
    # `names`, so this keeps working if `interation` is raised above 1.
    tpr_all[row,:] = df_tpr[names].mean().to_numpy()
    tnr_all[row,:] = df_tnr[names].mean().to_numpy()


# Disabled: export of the aggregated rates.
# NOTE(review): these paths use f"./{data}_roc/..." whereas the ROC files above
# go to f"./{data}_roc20/..." -- confirm the intended directory before enabling.
#df_tnr = pd.DataFrame(tnr_all, columns = names)
#df_tnr.to_csv(f"./{data}_roc/{data_method}/tnr.csv", index=False)
#df_tpr = pd.DataFrame(tpr_all, columns = names)
#df_tpr.to_csv(f"./{data}_roc/{data_method}/tpr.csv", index=False)
    

Loading ECG dataset (train split) from Huggingface...
done
Loading ECG dataset (test split) from Huggingface...
done
Loading ECG dataset (train split) from Huggingface...
done
Loading ECG dataset (test split) from Huggingface...
done
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
5777
5777
5777
Iteration 1, loss = 1.45636660
Iteration 2, loss = 1.10098763
Iteration 3, loss = 0.92667496
Iteration 4, loss = 0.83153238
Iteration 5, loss = 0.77255910
Iteration 6, loss = 0.73136095
Iteration 7, loss = 0.70092196
Iteration 8, loss = 0.67760562
Iteration 9, loss = 0.65943527
Iteration 10, loss = 0.64525371
Iteration 11, loss = 0.63415023
Iteration 12, loss = 0.62570683
Iteration 13, loss = 0.61864253
Iteration 14, loss = 0.61280244
Iteration 15, loss = 0.60804153
Iteration 16, loss = 0.60390421
Iteration 17, loss = 0.60076293
Iteration 18, lo



Accuracy: 
 Nearest Neighbors    0.821053
Linear SVM           0.673684
RBF SVM              0.742105
Neural Net           0.673684
AdaBoost             0.757895
dtype: float64
-----------------
True_negative: 
 Nearest Neighbors    0.829412
Linear SVM           0.647059
RBF SVM              0.752941
Neural Net           0.652941
AdaBoost             0.747059
dtype: float64
-----------------
True_Positive: 
 Nearest Neighbors    0.75
Linear SVM           0.90
RBF SVM              0.65
Neural Net           0.85
AdaBoost             0.85
dtype: float64
Loading ECG dataset (train split) from Huggingface...
done
Loading ECG dataset (test split) from Huggingface...
done
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
5777
5777
5777
Iteration 1, loss = 1.11718608
Iteration 2, loss = 0.93865005
Iteration 3, loss = 0.83792233
Iteration 4, loss 



Accuracy: 
 Nearest Neighbors    0.788360
Linear SVM           0.629630
RBF SVM              0.724868
Neural Net           0.666667
AdaBoost             0.751323
dtype: float64
-----------------
True_negative: 
 Nearest Neighbors    0.800000
Linear SVM           0.605882
RBF SVM              0.723529
Neural Net           0.652941
AdaBoost             0.758824
dtype: float64
-----------------
True_Positive: 
 Nearest Neighbors    0.684211
Linear SVM           0.842105
RBF SVM              0.736842
Neural Net           0.789474
AdaBoost             0.684211
dtype: float64
Loading ECG dataset (train split) from Huggingface...
done
Loading ECG dataset (test split) from Huggingface...
done
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
5777
5777
5777
Iteration 1, loss = 1.32233770
Iteration 2, loss = 1.05022404
Iteration 3, loss = 0.9167193



Accuracy: 
 Nearest Neighbors    0.784211
Linear SVM           0.668421
RBF SVM              0.763158
Neural Net           0.689474
AdaBoost             0.778947
dtype: float64
-----------------
True_negative: 
 Nearest Neighbors    0.829412
Linear SVM           0.647059
RBF SVM              0.770588
Neural Net           0.664706
AdaBoost             0.794118
dtype: float64
-----------------
True_Positive: 
 Nearest Neighbors    0.40
Linear SVM           0.85
RBF SVM              0.70
Neural Net           0.90
AdaBoost             0.65
dtype: float64
Loading ECG dataset (train split) from Huggingface...
done
Loading ECG dataset (test split) from Huggingface...
done
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
5777
5777
5777
Iteration 1, loss = 1.05280361
Iteration 2, loss = 0.91011306
Iteration 3, loss = 0.82549597
Iteration 4, loss 



Accuracy: 
 Nearest Neighbors    0.800000
Linear SVM           0.668421
RBF SVM              0.763158
Neural Net           0.678947
AdaBoost             0.747368
dtype: float64
-----------------
True_negative: 
 Nearest Neighbors    0.829412
Linear SVM           0.641176
RBF SVM              0.776471
Neural Net           0.658824
AdaBoost             0.758824
dtype: float64
-----------------
True_Positive: 
 Nearest Neighbors    0.55
Linear SVM           0.90
RBF SVM              0.65
Neural Net           0.85
AdaBoost             0.65
dtype: float64
Loading ECG dataset (train split) from Huggingface...
done
Loading ECG dataset (test split) from Huggingface...
done
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
5777
5777
5777
Iteration 1, loss = 1.12491485
Iteration 2, loss = 0.93201301
Iteration 3, loss = 0.84016747
Iteration 4, loss 



Accuracy: 
 Nearest Neighbors    0.786096
Linear SVM           0.673797
RBF SVM              0.754011
Neural Net           0.652406
AdaBoost             0.780749
dtype: float64
-----------------
True_negative: 
 Nearest Neighbors    0.805882
Linear SVM           0.652941
RBF SVM              0.752941
Neural Net           0.623529
AdaBoost             0.811765
dtype: float64
-----------------
True_Positive: 
 Nearest Neighbors    0.588235
Linear SVM           0.882353
RBF SVM              0.764706
Neural Net           0.941176
AdaBoost             0.470588
dtype: float64
