In [2]:
from dataloading import *
from preprocessing import *
from utils import *
from modelling import *
import os
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_curve, roc_auc_score
import tensorflow as tf
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import json

db_dir = "data/physionet"
pkl_path = db_dir + "normalisedrecords_fslist_labels.pkl"
cutoff = 60 # Hz
resample_fs = 120 # Hz
crop_length = 30 # s
afib_dup_factor = 2

if not os.path.exists(pkl_path):
    # Read filter, and normalise
    record_list, fs_list, labels = read_challenge17_data(db_dir)
    resampled_records = lowpass_filter_and_resample_record_list(record_list, fs_list, 512, cutoff, resample_fs)
    normalised_records = normalise_record_list(resampled_records)
    # Save it out 
    save_challenge17_pkl(pkl_path, (normalised_records, fs_list, labels))
else:
    # Read in the pkl file
    normalised_records, fs_list, labels = load_challenge17_pkl(pkl_path)

normalised_records, labels = drop_other_class_records_and_labels(normalised_records, labels)


2024-04-20 17:20:13.139964: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-20 17:20:13.139988: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-20 17:20:13.140823: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-20 17:20:13.144994: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
print(len(normalised_records))

In [3]:
# dup_records, dup_labels = duplicate_afib_records_in_list(normalised_records, labels, afib_dup_factor)
# cropped_records = crop_and_pad_record_list(dup_records, resample_fs, crop_length)

BATCH_SIZE = 64
EPOCHS = 400
K_FOLDS = 10
STREAM2_SIZE = 9

kf = KFold(n_splits=K_FOLDS, shuffle=True)

test_scores = []
fold_id = 0

for train_index, test_index in kf.split(normalised_records):

    train_samples = [normalised_records[index] for index in train_index]
    test_samples = [normalised_records[index] for index in test_index]
    train_labels = labels.iloc[train_index].reset_index(drop=True)
    test_labels = labels.iloc[test_index].reset_index(drop=True)

    train_samples, train_labels = duplicate_afib_records_in_list(train_samples, train_labels, afib_dup_factor)

    a_cases = len(train_labels[train_labels['A']])
    print(f"Percentage of Afib cases: {a_cases/len(train_labels)}")

    train_samples = crop_and_pad_record_list(train_samples, resample_fs, crop_length)
    test_samples = crop_and_pad_record_list(test_samples, resample_fs, crop_length)

    X_train, X_test = np.array(train_samples), np.array(test_samples)
    X_train, X_test = np.expand_dims(X_train, -1), np.expand_dims(X_test, -1)
    y_train, y_test = train_labels['A'].to_numpy().astype(int), test_labels['A'].to_numpy().astype(int)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1)


Percentage of Afib cases: 0.3696406271680311
Percentage of Afib cases: 0.3652076944521885
Percentage of Afib cases: 0.3668243807403284
Percentage of Afib cases: 0.3684356765768269
Percentage of Afib cases: 0.3704422570359074
Percentage of Afib cases: 0.3712425543704114
Percentage of Afib cases: 0.36399162595952544
Percentage of Afib cases: 0.3720415224913495
Percentage of Afib cases: 0.3723889887951307
Percentage of Afib cases: 0.3687864482088309


In [None]:


    model = create_dual_stream_cnn_model((X_train.shape[1], 1), stream2_size = STREAM2_SIZE)
    print_gpu_availability()

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logs_dir, histogram_freq=0)
    lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=20, mode='min', restore_best_weights=True, verbose=1)
    model.fit(train_dataset, validation_data = validation_dataset,
            epochs=EPOCHS, verbose=1,
            callbacks=[lr_scheduler, tensorboard_callback, early_stopping_callback])
    
    models_path = models_dir + f'/fold_{fold_id}_model_weights.h5'
    model.save_weights(models_path)

    test_loss, test_accuracy, test_precision, test_recall = model.evaluate(test_dataset)
    
    y_scores = model.predict(X_test, verbose=0)
    y_scores = y_scores.flatten()
    test_fpr, test_tpr, _ = roc_curve(y_test, y_scores)
    test_auc = roc_auc_score(y_test, y_scores)

    test_scores.append({'loss':test_loss, 
                        'acc': test_accuracy, 
                        'prec':test_precision, 
                        'rec':test_recall, 
                        'auc':test_auc, 
                        'fpr':test_fpr.tolist(), 
                        'tpr':test_tpr.tolist()})

    fold_id += 1

test_scores_path = models_dir + "/test_scores.json"
with open(test_scores_path, "w") as file:
    json.dump(test_scores, file, indent=4)

kfold_accuracy = np.array([elem['acc'] for elem in test_scores])
kfold_precision = np.array([elem['prec'] for elem in test_scores])
kfold_recall = np.array([elem['rec'] for elem in test_scores])
print(f"Accuracy:\t{100*kfold_accuracy.mean():.1f}% \nPrecision:\t{100*kfold_precision.mean():.1f}% \nRecall:\t\t{100*kfold_recall.mean():.1f}%")

tpr = test_scores[0]['tpr']
fpr = test_scores[0]['fpr']
auc = test_scores[0]['auc']

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], '--', color='black')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f"AUC: {auc:.4f}")
plt.savefig(models_dir+'/roc_curve.png')  # Save the plot as a PNG file

In [None]:
X = [1, 2, 3, 4, 5, 6, 7, 8, 9]
kf = KFold(n_splits=3, shuffle=True)

for train_index, test_index in kf.split(X):
    print(labels.iloc[train_index])
    

In [23]:
y = np.array(labels)
print(y.shape)
y = y[:,0].astype(int) # Atrial Fibrillation column
print(y.shape)

y2 = labels['A'].to_numpy().astype(int)
print(y2.shape)

print(np.array_equal(y, y2))

(5788, 2)
(5788,)
(5788,)
True
