In [18]:
#IMPORTS
import pandas as pd
import wfdb
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from sklearn.linear_model import RidgeClassifierCV
from sktime.datasets import load_from_tsfile_to_dataframe
from sktime.datatypes._panel._convert import from_nested_to_2d_array
from sklearn.model_selection import cross_validate, RepeatedKFold
import lib_all_data_analysis_v3 as lda
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

In [19]:
# Root folder of the local MIT-BIH Arrhythmia Database copy.
# Record ids are appended to this string directly (data + '100'), so it must
# end with a path separator.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data = r'C:/Users/jujuv/OneDrive/Documentos/2020.2/Otimização/wale_xexeo/mit-bih-arrhythmia-database-1.0.0 (1)/mit-bih-arrhythmia-database-1.0.0/'

In [20]:
# CONSTANTS

# Record ids of the patients to load (kept as strings: they are appended to the
# data folder path to form each record name).
patients = """
100 101 102 103 104 105 106 107
108 109 111 112 113 114 115 116
117 118 119 121 122 123 124 200
201 202 203 205 207 208 209 210
212 213 214 215 217 219 220 221
222 223 228 230 231 232 233 234
""".split()

# Annotation symbols that do not mark a beat (every symbol is one character).
nonbeat = list("[!]x()ptu`'^|~+sT*D=\"@Q?")

# Annotation symbols that mark an abnormal beat (every symbol is one character).
abnormal = list("LRV/AfFjaEJeS")

# Normal beats have no constant of their own: the code that needs them uses
# the literal symbol 'N' inline.

In [21]:
# Build one table with the number of occurrences of every annotation symbol
# for every patient: columns 'symbol', 'Counts', 'Patient Number'.
#
# The per-patient frames are collected in a list and concatenated once at the
# end. Concatenating inside the loop re-copies all previous rows on every
# iteration, and seeding the result with an empty DataFrame makes the column
# dtypes depend on how pandas treats empty frames in concat.
symbol_frames = []

for pts in patients:
    # Record name = data folder + patient id
    file = data + pts
    # Annotation object read with the 'atr' extension
    annotation = wfdb.rdann(file, 'atr')
    # NOTE: sym / values / counts are deliberately left as notebook-level
    # names; a later cell inspects `sym` for the last patient processed.
    sym = annotation.symbol
    # Unique symbols and how often each one occurs in this record
    values, counts = np.unique(sym, return_counts=True)
    df_sub = pd.DataFrame({'symbol': values,
                           'Counts': counts,
                           'Patient Number': [pts] * len(counts)})
    symbol_frames.append(df_sub)

# Same row order and (repeating, per-patient) index as the incremental version.
# With no patients we fall back to an empty frame instead of letting
# pd.concat raise on an empty list.
symbols_df = pd.concat(symbol_frames, axis=0) if symbol_frames else pd.DataFrame()

In [23]:
# Label every (patient, symbol) row:
#   0 -> normal beat ('N'), 1 -> abnormal beat, -1 -> everything else.
is_normal_row = symbols_df['symbol'] == 'N'
is_abnormal_row = symbols_df['symbol'].isin(abnormal)

symbols_df['category'] = -1
symbols_df.loc[is_normal_row, 'category'] = 0
symbols_df.loc[is_abnormal_row, 'category'] = 1

In [24]:
# Total number of annotations per category over all patients
symbols_df.groupby('category')['Counts'].sum()


category
-1     3186
 0    75052
 1    34409
Name: Counts, dtype: int64

In [25]:
def load_ecg(file):
    """Read one record's signal and its annotations with wfdb.

    Parameters
    ----------
    file : str
        Record name handed to ``wfdb.rdrecord`` and to ``wfdb.rdann`` (the
        latter with the 'atr' extension). Callers in this notebook build it
        as data folder + patient id.

    Returns
    -------
    tuple
        ``(p_signal, atr_sym, atr_sample)``: the record's ``p_signal``
        attribute, and the annotation's ``symbol`` and ``sample`` attributes.
    """
    # Signal first, then annotations (same read order as before)
    rec = wfdb.rdrecord(file)
    ann = wfdb.rdann(file, 'atr')
    return rec.p_signal, ann.symbol, ann.sample

In [26]:
# Annotation symbol frequencies for a single record.
# `sym` is not set in this cell: it is the value left behind by the
# symbol-counting loop above, i.e. the symbols of the last patient it processed.
values, counts = np.unique(sym, return_counts=True)
for symbol, n_occurrences in zip(values, counts):
    print(symbol, n_occurrences)

+ 3
J 50
N 2700
V 3
~ 8


In [27]:
def make_dataset(pts, num_sec, fs, abnormal):
    """Build the beat-window dataset for a list of patients, ignoring non-beats.

    Parameters
    ----------
    pts : iterable of str
        Patient/record ids; each one is appended to the module-level ``data``
        folder to form the record name.
    num_sec : int
        Number of seconds to include before and after each beat.
    fs : int
        Sampling frequency; each window spans ``2 * num_sec * fs`` samples.
    abnormal : sequence of str
        Annotation symbols counted as abnormal beats. Only these symbols and
        'N' are kept; every other annotation is dropped.

    Returns
    -------
    X_all : numpy.ndarray, shape (nbeats, 2 * num_sec * fs)
        One signal window per kept beat (first signal channel only).
    Y_all : numpy.ndarray, shape (nbeats, 1)
        1.0 where the beat symbol is in ``abnormal``, else 0.0.
    sym_all : list
        Annotation symbol of every kept beat, in row order.
    """
    num_cols = 2 * num_sec * fs

    # Per-patient pieces are gathered and stacked once at the end; appending to
    # the full arrays inside the loop would re-copy every previous row for
    # each new patient.
    X_parts = []
    Y_parts = []
    sym_all = []

    for pt in pts:
        file = data + pt

        p_signal, atr_sym, atr_sample = load_ecg(file)

        # grab the first signal
        p_signal = p_signal[:, 0]

        # keep only the beat annotations (abnormal symbols + normal 'N')
        df_ann = pd.DataFrame({'atr_sym': atr_sym,
                               'atr_sample': atr_sample})
        df_ann = df_ann.loc[df_ann.atr_sym.isin(list(abnormal) + ['N'])]

        X, Y, sym = build_XY(p_signal, df_ann, num_cols, abnormal)
        X_parts.append(X)
        Y_parts.append(Y)
        sym_all.extend(sym)

    if not X_parts:
        # No patients: keep the documented 2-D shapes with zero rows
        return np.zeros((0, num_cols)), np.zeros((0, 1)), sym_all

    X_all = np.concatenate(X_parts, axis=0)
    Y_all = np.concatenate(Y_parts, axis=0)
    return X_all, Y_all, sym_all

In [28]:
def build_XY(p_signal, df_ann, num_cols, abnormal):
    """Cut one fixed-width signal window per annotated beat.

    Parameters
    ----------
    p_signal : 1-D array
        Signal samples of a single channel.
    df_ann : pandas.DataFrame
        Annotations with columns ``atr_sample`` (sample index of the beat)
        and ``atr_sym`` (annotation symbol).
    num_cols : int
        Window width in samples. The window is centred on the annotation and
        takes ``num_cols // 2`` samples on each side, so an even value is
        expected (with an odd value no window can match and nothing is kept).
    abnormal : container of str
        Symbols labelled as abnormal.

    Returns
    -------
    X : numpy.ndarray, shape (n_kept, num_cols)
        Signal windows of the beats that fit entirely inside the signal.
    Y : numpy.ndarray, shape (n_kept, 1)
        1.0 if the beat symbol is in ``abnormal``, else 0.0.
    sym : list
        Original annotation symbol of each kept beat.

    Beats closer than half a window to either end of the signal are skipped,
    because their (clipped) window is shorter than ``num_cols``.
    """
    # Half-window comes from num_cols itself. The previous code read the names
    # `num_sec` and `fs`, which are not parameters of this function, so it only
    # worked when matching notebook globals happened to exist.
    half_window = num_cols // 2
    n_samples = len(p_signal)
    num_rows = len(df_ann)

    X = np.zeros((num_rows, num_cols))
    Y = np.zeros((num_rows, 1))
    sym = []

    # number of rows actually filled so far
    max_row = 0

    for atr_sample, atr_sym in zip(df_ann.atr_sample.values, df_ann.atr_sym.values):
        # Clip the window to the signal bounds
        left = max(0, atr_sample - half_window)
        right = min(n_samples, atr_sample + half_window)
        x = p_signal[left:right]
        # A clipped window is shorter than num_cols and is discarded
        if len(x) == num_cols:
            X[max_row, :] = x
            Y[max_row, :] = int(atr_sym in abnormal)
            sym.append(atr_sym)
            max_row += 1

    # Drop the pre-allocated rows that were never filled
    X = X[:max_row, :]
    Y = Y[:max_row, :]
    return X, Y, sym

In [30]:
num_sec = 3
fs = 360
num_runs = 10
cross_val = 5
num_kernels = 10000
_results = np.zeros(num_runs)
dataset_name = ['ECG']

In [32]:
# Build the full dataset: signal windows, abnormal flags and original symbols
X_all, Y_all, sym_all = make_dataset(
    pts=patients,
    num_sec=num_sec,
    fs=fs,
    abnormal=abnormal,
)


In [None]:

# Cross-validation splitter used by the evaluation loop: 2 shuffled folds with
# a fixed seed so the split is reproducible.
# NOTE(review): the config cell defines cross_val = 5 and num_runs = 10, but
# neither is used here - confirm that 2 folds is intended.
kf = KFold(n_splits=2, shuffle=True, random_state=42)

In [None]:


# Results table: one row per entry of dataset_name, every metric starts at 0.0
results = pd.DataFrame(
    data=0,
    index=dataset_name,
    columns=[
        "accuracy_mean",
        "accuracy_standard_deviation",
        "time_training_seconds",
        "time_test_seconds",
    ],
    dtype='float',
).rename_axis("datasets")

# --- run experiment in Dataset ------------------------------------------------
print("RUNNING".center(80, "="))

# Single-dataset run (the per-dataset loop is disabled); banner shows the name
print(str(dataset_name).center(80, "-"))

# Timing buffer, one column per run. The evaluation loop fills rows 0-3
# (transform train, transform test, fit, score); row 4 is not written by the
# cells visible here.
_timings = np.zeros((5, num_runs))


--------------------------------['earthquakes']---------------------------------


In [None]:
def _timed_call(func, *args):
    """Call ``func(*args)`` and return ``(result, elapsed_seconds)``.

    Elapsed time is measured with ``time.perf_counter``.
    """
    started = time.perf_counter()
    outcome = func(*args)
    return outcome, time.perf_counter() - started


# Arguments shared by the train and the test wavelet transform, kept in one
# place so both sets are always transformed with identical settings
# (WALE-a v1, after tuning).
transform_args = (100, ['bior3.1', 'bior1.5', 'db11'], 0)

# `i` ends up holding the number of completed folds, as before.
i = 0
for fold, (train_idx, test_idx) in enumerate(kf.split(X_all)):
    # kf.split yields (train indices, test indices) for each fold
    X_train, X_test = X_all[train_idx], X_all[test_idx]
    # Labels are stored as an (n, 1) float column; the classifier and the
    # metrics work on 1-D integer class labels, so flatten them here.
    y_train = Y_all[train_idx].astype(int).ravel()
    y_test = Y_all[test_idx].astype(int).ravel()

    # -- transform training ------------------------------------------------
    X_train_transform, _timings[0, fold] = _timed_call(
        lda.wave_layer_ts_v2, X_train, *transform_args)
    print('terminei 1')

    # -- transform test ----------------------------------------------------
    X_test_transform, _timings[1, fold] = _timed_call(
        lda.wave_layer_ts_v2, X_test, *transform_args)
    print('terminei 2')

    # -- training ----------------------------------------------------------
    # Only the fit itself is timed; building the estimator object is not.
    classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
    _timings[2, fold] = _timed_call(classifier.fit, X_train_transform, y_train)[1]
    print('terminei 3')

    # -- test --------------------------------------------------------------
    _results[fold], _timings[3, fold] = _timed_call(
        classifier.score, X_test_transform, y_test)
    i = fold + 1
    print('terminei 4')

    print("Done.")

    

NameError: name 'X_all' is not defined

In [None]:
# Confusion matrix for the LAST cross-validation fold only: classifier,
# X_test_transform and y_test are the values left over from the final
# iteration of the evaluation loop.
y_pred = classifier.predict(X_test_transform)
cm = confusion_matrix(y_test, y_pred)  # Compute the confusion matrix
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[2568   58]
 [ 154 1769]]


In [None]:

# -- store results ---------------------------------------------------------
# _results and _timings are allocated with num_runs slots, but the evaluation
# loop writes exactly one slot per cross-validation fold. Aggregate only over
# the folds that exist; otherwise the untouched zero slots drag the mean
# accuracy and the mean timings down and inflate the standard deviation.
n_folds = min(kf.get_n_splits(), len(_results))
fold_scores = _results[:n_folds]
mean_timings = _timings[:, :n_folds].mean(axis=1)

results.loc[dataset_name, "accuracy_mean"] = fold_scores.mean()
results.loc[dataset_name, "accuracy_standard_deviation"] = fold_scores.std()
# training time = transform of the training set (row 0) + classifier fit (row 2)
results.loc[dataset_name, "time_training_seconds"] = mean_timings[[0, 2]].sum()
# test time = transform of the test set (row 1) + scoring (row 3)
results.loc[dataset_name, "time_test_seconds"] = mean_timings[[1, 3]].sum()
# TODO: also report precision, recall and the confusion matrix.
# Accuracy alone is not a good metric here.
# The dataset must be described before any learning is done.
# Should the classes be balanced?

In [None]:
# Display the summary table (accuracy mean/std and train/test timings)
results

Unnamed: 0_level_0,accuracy_mean,accuracy_standard_deviation,time_training_seconds,time_test_seconds
datasets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
earthquakes,0.951967,0.001429,519.07272,519.889057
