In [1]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import pandas as pd
import math
import os
from sklearn.preprocessing import scale
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold
from keras.regularizers import l2


# Function to map categorical probabilities to class labels
def categorical_probas_to_classes(p):
    """Return the index of the largest entry in each row of ``p``.

    Args:
        p: 2-D array-like of per-class scores, one row per sample.

    Returns:
        1-D NumPy array holding, for every row, the column index of its
        maximum value (the first such index when there are ties).
    """
    class_indices = np.argmax(p, axis=1)
    return class_indices


# Function to calculate performance metrics
def calculate_performance(test_num, pred_y, labels):
    """Compute binary-classification metrics from predicted and true labels.

    Only the first ``test_num`` entries of ``pred_y`` and ``labels`` are
    examined. A sample is treated as positive when its true label equals 1;
    any other label value is treated as negative.

    Args:
        test_num: Number of samples to evaluate.
        pred_y: Indexable sequence of predicted class labels.
        labels: Indexable sequence of true class labels.

    Returns:
        Tuple ``(acc, precision, npv, sensitivity, specificity, mcc, f1,
        tp, tn, fp, fn)``. The ratio metrics add ``1e-6`` to their
        denominators, so they evaluate to 0.0 instead of raising when a
        denominator would be zero. Accuracy is 0.0 when ``test_num`` is 0.
    """
    tp = fp = tn = fn = 0
    for index in range(test_num):
        is_positive = labels[index] == 1
        is_correct = labels[index] == pred_y[index]
        if is_positive:
            if is_correct:
                tp += 1
            else:
                fn += 1
        elif is_correct:
            tn += 1
        else:
            fp += 1

    # Accuracy is the only metric without an epsilon in its denominator, so
    # guard it explicitly: an empty evaluation set yields 0.0 like the other
    # metrics instead of raising ZeroDivisionError.
    acc = (tp + tn) / test_num if test_num else 0.0
    precision = tp / (tp + fp + 1e-6)
    npv = tn / (tn + fn + 1e-6)
    sensitivity = tp / (tp + fn + 1e-6)
    specificity = tn / (tn + fp + 1e-6)
    mcc = (tp * tn - fp * fn) / (math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + 1e-6)
    f1 = 2 * tp / (2 * tp + fp + fn + 1e-6)
    return acc, precision, npv, sensitivity, specificity, mcc, f1, tp, tn, fp, fn


# Function to define and return the stacked LSTM model
def get_LSTM_model(input_dim, out_dim):
    """Build and compile a stacked-LSTM classifier.

    The network is two LSTM layers (each followed by 50% dropout), a
    flatten step, two ReLU dense layers and a softmax output layer named
    ``"Dense_2"``. Layer widths are derived from ``input_dim`` (one half,
    one quarter and one eighth, truncated to integers). No input shape is
    declared in this function.

    Args:
        input_dim: Number of input features; used only to size the layers.
        out_dim: Number of units in the softmax output layer.

    Returns:
        A ``Sequential`` model compiled with categorical cross-entropy loss,
        the Adam optimizer and an accuracy metric.
    """
    half_units = int(input_dim/2)
    quarter_units = int(input_dim/4)
    eighth_units = int(input_dim/8)

    # Layers are listed in the exact order they are stacked.
    layer_stack = (
        LSTM(half_units, return_sequences=True),
        Dropout(0.5),
        LSTM(quarter_units, return_sequences=True),
        Dropout(0.5),
        Flatten(),
        Dense(quarter_units, activation='relu'),
        Dense(eighth_units, activation='relu'),
        Dense(out_dim, activation='softmax', name="Dense_2"),
    )

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return model



# Folder containing CSV files
input_dir = './New_/'
csv_files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]

# Loop through each CSV file: run 10-fold cross-validation on the first part
# of its rows, save every fold's model, then score the saved models on the
# remaining rows. All outputs go to a per-file folder.
for file_name in csv_files:
    # NOTE(review): base_name is never used below; the output folder name is
    # built from file_name, so it keeps the ".csv" suffix — confirm intended.
    base_name = os.path.splitext(file_name)[0]
    output_base_dir = f"./LSTM-{file_name}/"
    os.makedirs(output_base_dir, exist_ok=True)

    # Load data
    data_ = pd.read_csv(os.path.join(input_dir, file_name))
    # Drop the first column (presumably an identifier column — TODO confirm).
    data = data_.iloc[:, 1:]
    # .loc slices by index label and includes both endpoints, so this keeps
    # the rows labelled 0..2420 (2421 rows with the default integer index).
    data = data.loc[0:2420]

    # Labels are assigned purely by row position: 1288 ones followed by
    # 1133 zeros (2421 in total).
    # Assumes the CSV lists all positive rows first — TODO confirm.
    ones_vector1 = np.ones(1288)
    zeros_vector1 = np.zeros(1133)

    y = np.hstack((ones_vector1, zeros_vector1))
    # NOTE(review): features are standardised once over all cross-validation
    # rows, before the folds are split.
    X = scale(data)

    # NOTE(review): sepscores is not used in the cross-validation phase; it is
    # re-initialised before the testing phase below.
    sepscores = []
    # Accumulators seeded with a single placeholder row (0.5, 0.5) so that
    # np.vstack has something to stack onto; the placeholder row is dropped
    # before the arrays are saved.
    ytest = np.ones((1, 2)) * 0.5
    yscore = np.ones((1, 2)) * 0.5

    [sample_num, input_dim] = np.shape(X)
    out_dim = 2

    probas_cnn = []
    tprs_cnn = []
    sepscore_cnn = []

    # Callbacks for model training
    # NOTE(review): the same callback instances are passed to every fold's
    # fit() call.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)
    call = [EarlyStopping(monitor='val_loss', patience=25), reduce_lr]

    # 10-fold cross-validation
    # shuffle=True is used without a random_state, so the fold assignment is
    # not fixed between runs.
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    for i, (train, test) in enumerate(skf.split(X, y)):
        clf_cnn = get_LSTM_model(input_dim, out_dim)
        # Reshape to (samples, 1, input_dim): each sample becomes a sequence
        # of length 1 carrying input_dim features.
        X_train_cnn = np.reshape(X[train], (-1, 1, input_dim))
        X_test_cnn = np.reshape(X[test], (-1, 1, input_dim))
        y_test = to_categorical(y[test])
        ytest = np.vstack((ytest, y_test))
        y_test_tmp = y[test]

        # Training
        # NOTE(review): the held-out fold is also passed as validation_data,
        # so the val_loss-driven callbacks are steered by the same samples
        # this fold is scored on.
        history = clf_cnn.fit(X_train_cnn, to_categorical(y[train]), validation_data=(X_test_cnn, y_test),
                              batch_size=8, epochs=50, callbacks=call)

        # Prediction
        y_cnn_probas = clf_cnn.predict(X_test_cnn)
        probas_cnn.append(y_cnn_probas)
        y_class = np.argmax(y_cnn_probas, axis=1)
        yscore = np.vstack((yscore, y_cnn_probas))

        # Calculate performance metrics
        # ROC/AUC use column 1 of the predicted probabilities as the score
        # for the positive class.
        acc, precision, npv, sensitivity, specificity, mcc, f1, tp, tn, fp, fn = calculate_performance(len(y_class), y_class, y[test])
        fpr, tpr, thresholds = roc_curve(y[test], y_cnn_probas[:, 1])
        roc_auc = auc(fpr, tpr)
        sepscore_cnn.append([acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc, tp, tn, fp, fn])
        # The "CNN" tag in the message is only a label; the model is the one
        # returned by get_LSTM_model.
        print(f'{i} CNN: acc={acc:.6f}, precision={precision:.6f}, npv={npv:.6f}, sensitivity={sensitivity:.6f}, '
              f'specificity={specificity:.6f}, mcc={mcc:.6f}, f1={f1:.6f}, roc_auc={roc_auc:.6f}')

        # Save the model
        # The architecture is written as JSON and the weights as a separate
        # file, one pair per fold. output_base_dir already ends with "/", so
        # these paths contain a doubled slash.
        model_json = clf_cnn.to_json()
        with open(f"{output_base_dir}/CNN_BiLSTM_{str(i)}model.json", "w") as json_file:
            json_file.write(model_json)
        clf_cnn.save_weights(f"{output_base_dir}/CNN_BiLSTM_{str(i)}model.weights.h5")
        print("Saved model to disk")

    # Save ytest and yscore to CSV
    # Row 0 is the (0.5, 0.5) placeholder; keep rows 1..row-1 only. The same
    # row count is reused for yscore, which was stacked in step with ytest.
    row = ytest.shape[0]
    ytest = ytest[np.array(range(1, row)), :]
    ytest_sum = pd.DataFrame(data=ytest)
    ytest_sum.to_csv(f'{output_base_dir}/ytest.csv')

    yscore_ = yscore[np.array(range(1, row)), :]
    yscore_sum = pd.DataFrame(data=yscore_)
    yscore_sum.to_csv(f'{output_base_dir}/yscore.csv')

    # Save results of cross-validation
    # One row per fold, plus a final row holding the column-wise mean over
    # the folds (the tp/tn/fp/fn counts are averaged as well).
    scores = np.array(sepscore_cnn)
    result1 = np.mean(scores, axis=0)
    H1 = result1.tolist()
    sepscore_cnn.append(H1)
    result = sepscore_cnn
    data_csv = pd.DataFrame(data=result, columns=['acc', 'precision', 'npv', 'sensitivity', 'specificity', 'mcc', 'f1', 'roc_auc', 'tp', 'tn', 'fp', 'fn'])
    data_csv.to_csv(f'{output_base_dir}/results_CV.csv', index=False)

    # history is whatever the last fold's fit() returned; this prints that
    # object, not a per-epoch table.
    print(history)

    # Testing phase with another part of data
    # Every row from index label 2421 onwards, first column dropped as above.
    test_data = data_.iloc[:, 1:]
    test_data = test_data.loc[2421:]
    # Labels again assigned by position: 258 ones followed by 227 zeros.
    # Assumes exactly 485 remaining rows, positives first — TODO confirm.
    ones_vector = np.ones(258)
    zeros_vector = np.zeros(227)
    yt = np.hstack((ones_vector, zeros_vector))

    # NOTE(review): the held-out rows are standardised with a separate
    # scale() call, independently of the scaling applied to the
    # cross-validation rows.
    Xt = scale(test_data)
    Xt = np.reshape(Xt, (-1, 1, input_dim))

    # Reset the accumulators (with the same placeholder first row) for the
    # testing phase.
    sepscores = []
    ytest = np.ones((1, 2)) * 0.5
    yscore = np.ones((1, 2)) * 0.5

    # Load and test saved models
    # Each of the 10 fold models is evaluated separately on the same
    # held-out rows.
    for i in range(10):
        with open(f"{output_base_dir}CNN_BiLSTM_{str(i)}model.json", 'r') as json_file:
            loaded_model_json = json_file.read()

        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(f"{output_base_dir}CNN_BiLSTM_{i}model.weights.h5")
        print("Loaded model from disk")

        # NOTE(review): the model was trained with categorical_crossentropy
        # but is recompiled here with binary_crossentropy. Only predict() is
        # called on it below.
        loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        y_score = loaded_model.predict(Xt)
        y_class = categorical_probas_to_classes(y_score)

        y_test = to_categorical(yt)
        acc, precision, npv, sensitivity, specificity, mcc, f1, tp, tn, fp, fn = calculate_performance(len(y_class), y_class, yt)
        fpr, tpr, thresholds = roc_curve(yt, y_score[:, 1])
        roc_auc = auc(fpr, tpr)
        sepscores.append([acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc, tp, tn, fp, fn])
        print(f'{i} CNN Test: acc={acc:.6f}, precision={precision:.6f}, npv={npv:.6f}, sensitivity={sensitivity:.6f}, '
              f'specificity={specificity:.6f}, mcc={mcc:.6f}, f1={f1:.6f}, roc_auc={roc_auc:.6f}')

        # Save prediction results
        # The same one-hot labels are appended once per model, so the saved
        # ytest_test.csv repeats the label block 10 times alongside the 10
        # score blocks.
        ytest = np.vstack((ytest, y_test))
        yscore = np.vstack((yscore, y_score))

    # Save testing results
    # Drop the placeholder first row before writing, as in the CV phase.
    row = ytest.shape[0]
    ytest = ytest[np.array(range(1, row)), :]
    ytest_sum = pd.DataFrame(data=ytest)
    ytest_sum.to_csv(f'{output_base_dir}/ytest_test.csv')

    yscore_ = yscore[np.array(range(1, row)), :]
    yscore_sum = pd.DataFrame(data=yscore_)
    yscore_sum.to_csv(f'{output_base_dir}/yscore_test.csv')

    # Save testing results of cross-validation
    # One row per saved fold model, plus a final row with the column-wise
    # mean.
    scores = np.array(sepscores)
    result1 = np.mean(scores, axis=0)
    H1 = result1.tolist()
    sepscores.append(H1)
    result = sepscores
    data_csv = pd.DataFrame(data=result, columns=['acc', 'precision', 'npv', 'sensitivity', 'specificity', 'mcc', 'f1', 'roc_auc', 'tp', 'tn', 'fp', 'fn'])
    data_csv.to_csv(f'{output_base_dir}/results_test.csv', index=False)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
0 CNN: acc=0.962963, precision=0.961538, npv=0.964602, sensitivity=0.968992, specificity=0.956140, mcc=0.925636, f1=0.965251, roc_auc=0.993812
Saved model to disk
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
1 CNN: acc=0.958678, precision=0.961240, npv=0.955752, sensitivity=0.961240, specificity=0.955752, mcc=0.916993, f1=0.96124