In [19]:
import datasets
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import seaborn as sns
import math
import pandas as pd
from joblib import dump, load

In [23]:
def get_data(split: str, data_dir: str, silent: bool = False, cache_dir: str = None):
    """Load one split of the ECG disease-classification dataset from the Hugging Face Hub.

    Args:
        split: Split name forwarded to ``datasets.load_dataset``.
        data_dir: Sub-directory of the dataset repository to load.
        silent: When True, the progress messages are not printed.
        cache_dir: Cache directory forwarded to ``datasets.load_dataset``
            (``None`` lets the library pick its default).

    Returns:
        The object returned by ``datasets.load_dataset`` for the requested split.
    """
    if not silent:
        print(f'Loading ECG dataset ({split} split) from Huggingface...')
    dataset = datasets.load_dataset(
        'honggen-organization/ECG-disease-clf',
        split=split,
        data_dir=data_dir,
        cache_dir=cache_dir,
    )
    if not silent:
        print('done')
    return dataset

def binary_proprocess(disease_data, normal_data):
    """Turn the disease and normal datasets into DataFrames with matching groups.

    Both arguments must expose ``.data.to_pandas()``. Normal rows are kept
    only when their ``Group`` value also occurs in the disease frame. The set
    of ``Group`` values of each resulting frame is printed.

    Returns:
        Tuple ``(disease_df, normal_df)``; the ``Group`` column is kept in both.
    """
    disease_df = disease_data.data.to_pandas()
    all_normal_df = normal_data.data.to_pandas()

    disease_groups = set(disease_df['Group'])
    in_disease_groups = all_normal_df['Group'].isin(list(disease_groups))
    # For very large normal cohorts the filtered frame could additionally be
    # subsampled here (e.g. .sample(n=10000, random_state=42)).
    normal_df = all_normal_df[in_disease_groups]

    print(disease_groups)
    print(set(normal_df['Group']))
    return disease_df, normal_df

In [24]:
def clf_fun(disease_df, health_df, test_disease, test_health, interation = 1, downsample = True, agument=False,
            model_path='./svm_KD_clf.joblib', scaler_path='./svm_KD_scaler.joblib'):
    """Train a linear SVM separating disease rows (label 1) from healthy rows (label 0).

    For each iteration the training set is optionally rebalanced, standardised
    with a ``StandardScaler`` fitted on the training rows only, and used to fit
    an ``SVC(kernel="linear", probability=True)``. The specificity and
    sensitivity on the test rows are printed, and the fitted classifier and
    scaler are written with ``joblib.dump``.

    Args:
        disease_df: Training rows of the positive class (DataFrame).
        health_df: Training rows of the negative class (DataFrame).
        test_disease: Test rows of the positive class (DataFrame).
        test_health: Test rows of the negative class (DataFrame).
        interation: Number of train/evaluate passes.
        downsample: If True, sample the healthy training rows down to the
            number of disease training rows (``random_state=42``).
        agument: If True, oversample the disease training rows (whole copies
            plus a remainder drawn with replacement, ``random_state=40``) up to
            the number of healthy training rows.
        model_path: File the fitted classifier is dumped to.
        scaler_path: File the fitted scaler is dumped to.

    Returns:
        None. Results are printed and persisted to disk.

    Notes:
        Every column of the input frames is used as a feature, positionally.
        NOTE(review): the sampling seeds are fixed, so every iteration trains
        on the same data and overwrites the same files; vary the seeds if
        repeated runs are meant to differ.
    """
    # Build the raw test matrix once. It must stay unscaled: each iteration
    # applies its own freshly fitted scaler to this raw copy, otherwise the
    # test set would be re-scaled cumulatively when interation > 1.
    disease_test = test_disease.values.tolist()
    h_test = test_health.values.tolist()
    X_test_raw = np.array(disease_test + h_test)
    y_test = np.append(np.ones(len(disease_test)), np.zeros(len(h_test)))

    for _ in range(interation):
        # Start every pass from the caller's frames so that resampling does
        # not compound from one iteration to the next.
        train_disease = disease_df
        train_health = health_df

        if agument:
            upsample_factor = math.floor(len(train_health) / len(train_disease))
            remainder = len(train_health) - upsample_factor * len(train_disease)
            parts = [train_disease] * int(upsample_factor)
            if remainder > 0:
                parts.append(
                    train_disease.sample(remainder, replace=True, random_state=40).reset_index(drop=True)
                )
            # A single concat also covers upsample_factor == 0 (fewer healthy
            # than disease rows), where concatenating zero copies would raise.
            train_disease = pd.concat(parts).reset_index(drop=True)
            print(len(train_disease))

        if downsample:
            train_health = train_health.sample(n=len(train_disease), random_state=42).reset_index(drop=True)

        disease_X = train_disease.values.tolist()
        h_X = train_health.values.tolist()
        print(len(disease_X))
        print(len(h_X))

        X_train = np.array(disease_X + h_X)
        y_train = np.append(np.ones(len(disease_X)), np.zeros(len(h_X)))

        # Fit the scaler on training rows only, then apply it to both sets.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test_raw)

        clf = SVC(kernel="linear", probability=True)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Pin the label order so the matrix is 2x2 even when a class is
        # absent from y_test / y_pred; ravel() then always yields four values.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0.0, 1.0]).ravel()
        # Specificity, sensitivity.
        print(tn/(tn+fp),tp/(tp+fn))
        dump(clf, model_path)
        dump(scaler, scaler_path)



In [25]:
# Run configuration for this experiment.
data_method = 'downsample'
data = 'kd'
augment = False
downsample = True

# All dataset loads share one local Hugging Face cache directory.
hf_cache_dir = './cache/huggingface/datasets'

normal_data_train = get_data(split='train', data_dir='normal', cache_dir=hf_cache_dir)
normal_data_test = get_data(split='test', data_dir='normal', cache_dir=hf_cache_dir)

for i in range(1, 2):
    # Each disease subset lives under "<data>/data<i>" in the dataset repo.
    fold_dir = f"{data}/data{i}"
    kd_data_train = get_data(split='train', data_dir=fold_dir, cache_dir=hf_cache_dir)
    kd_data_test = get_data(split='test', data_dir=fold_dir, cache_dir=hf_cache_dir)

    disease_train, normal_train = binary_proprocess(kd_data_train, normal_data_train)
    disease_test, normal_test = binary_proprocess(kd_data_test, normal_data_test)
    # To train on a feature subset, restrict all four frames to the same
    # column list (e.g. feature_list1) before calling clf_fun.

    clf_fun(
        disease_df=disease_train,
        health_df=normal_train,
        test_disease=disease_test,
        test_health=normal_test,
        interation=1,
        downsample=downsample,
        agument=augment,
    )

    
    

Loading ECG dataset (train split) from Huggingface...
done
Loading ECG dataset (test split) from Huggingface...
done
Loading ECG dataset (train split) from Huggingface...
done
Loading ECG dataset (test split) from Huggingface...
done
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
{4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0}
297
297
0.772984441301273 0.76


In [9]:
#normal_data_test = get_data(split= 'test', data_dir = 'normal', cache_dir = './cache/huggingface/datasets')


In [11]:
#normal_data_test