In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Lasso, RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, LSTM, GRU
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.optimizers import Adam



In [2]:
# Define sensitivity and specificity
def sensitivity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tp / (tp + fn)

def specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

### Load data

In [44]:
df = pd.read_csv('./JM006_0901_whole.csv')
df = df.dropna(subset=['milkweightlbs'])
df = df.dropna(subset=['cells'])
df = df.dropna(subset=['conductivity'])
df_f = df[df['disease'] == 0]
# df_d = df[df['disease'] == 1][df['disease_in'] > 14] 
# df_d = df[df['disease'] == 1][df['disease_in'] <= 14][df['disease_in'] >= 11] 
df_d = df[df['disease'] == 1][df['disease_in'] <= 10][df['disease_in'] >= 8] 
# df_d = df[df['disease'] == 1][df['disease_in'] <= 7][df['disease_in'] >= 6] 
# df_d = df[df['disease'] == 1][df['disease_in'] <= 5][df['disease_in'] >= 4] 
# df_d = df[df['disease'] == 1][df['disease_in'] == 3] 
# df_d = df[df['disease'] == 1][df['disease_in'] == 2] 
# df_d = df[df['disease'] == 1][df['disease_in'] == 1] 
df = pd.concat([df_f, df_d])
df = df.reset_index(drop=True)
X = np.hstack((df.iloc[:,1:488].values, df['milkweightlbs_sca'].values.reshape(-1, 1), df['cells_sca'].values.reshape(-1, 1), df['conductivity_sca'].values.reshape(-1, 1), df['parity_sca'].values.reshape(-1, 1)))
y = df['disease']



### ML models

PLS-DA

In [45]:
plsda = PLSRegression(n_components=2)

cv = StratifiedKFold(n_splits=10, shuffle=True)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_majority = X_train[y_train == 0]
    X_train_minority = X_train[y_train == 1]
    y_train_majority = y_train[y_train == 0]
    
    X_train_majority_downsampled = resample(X_train_majority, 
                                            replace=False,
                                            n_samples=len(X_train_minority),
                                            random_state=42)
    y_train_downsampled = np.array([0]*len(X_train_minority) + [1]*len(X_train_minority))
    
    X_train_downsampled = np.vstack([X_train_majority_downsampled, X_train_minority])
    
    # fit PLSRegression
    plsda.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = plsda.predict(X_test)
    
    # convert continuous values to class labels via a threshold
    y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f'Average sensitivity: {np.mean(sensitivities)}')
print(f'Average specificity: {np.mean(specificities)}')
print(f'Average accuracy: {np.mean(accuracies)}')

r_pls_da = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'PLS-DA'})
r_pls_da

Average sensitivity: 0.4197802197802198
Average specificity: 0.7258239654610622
Average accuracy: 0.7176536076120035


Unnamed: 0,sen,spe,acc,model
0,0.357143,0.725806,0.715686,PLS-DA
1,0.285714,0.731855,0.719608,PLS-DA
2,0.571429,0.711694,0.707843,PLS-DA
3,0.615385,0.737903,0.734774,PLS-DA
4,0.307692,0.725806,0.715128,PLS-DA
5,0.384615,0.770161,0.760314,PLS-DA
6,0.461538,0.768145,0.760314,PLS-DA
7,0.5,0.70303,0.697446,PLS-DA
8,0.285714,0.707071,0.695481,PLS-DA
9,0.428571,0.676768,0.669941,PLS-DA


Ridge regression

In [46]:
ridge = RidgeClassifier()

cv = StratifiedKFold(n_splits=10, shuffle=True)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_majority = X_train[y_train == 0]
    X_train_minority = X_train[y_train == 1]
    y_train_majority = y_train[y_train == 0]
    
    X_train_majority_downsampled = resample(X_train_majority,
                                            replace=False,
                                            n_samples=len(X_train_minority),
                                            random_state=42)
    
    y_train_downsampled = np.array([0]*len(X_train_minority) + [1]*len(X_train_minority))
    
    X_train_downsampled = np.vstack([X_train_majority_downsampled, X_train_minority])
    
    # fit Ridge
    ridge.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = ridge.predict(X_test)
    
    # convert continuous values to class labels via a threshold
    y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f'Average sensitivity: {np.mean(sensitivities)}')
print(f'Average specificity: {np.mean(specificities)}')
print(f'Average accuracy: {np.mean(accuracies)}')

r_ridge = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'Ridge'})
r_ridge

Average sensitivity: 0.4758241758241758
Average specificity: 0.7403755294884328
Average accuracy: 0.7333730112870296


Unnamed: 0,sen,spe,acc,model
0,0.571429,0.691532,0.688235,Ridge
1,0.428571,0.71371,0.705882,Ridge
2,0.5,0.743952,0.737255,Ridge
3,0.307692,0.788306,0.776031,Ridge
4,0.384615,0.766129,0.756385,Ridge
5,0.461538,0.685484,0.679764,Ridge
6,0.461538,0.752016,0.744597,Ridge
7,0.571429,0.777778,0.772102,Ridge
8,0.5,0.713131,0.707269,Ridge
9,0.571429,0.771717,0.766208,Ridge


Lasso regression

In [48]:
lasso = Lasso(alpha=0.2) 

cv = StratifiedKFold(n_splits=10, shuffle=True)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_majority = X_train[y_train == 0]
    X_train_minority = X_train[y_train == 1]
    y_train_majority = y_train[y_train == 0]
    
    X_train_majority_downsampled = resample(X_train_majority,
                                            replace=False,
                                            n_samples=len(X_train_minority),
                                            random_state=42)
    
    y_train_downsampled = np.array([0]*len(X_train_minority) + [1]*len(X_train_minority))
    
    X_train_downsampled = np.vstack([X_train_majority_downsampled, X_train_minority])
    
    # fit Lasso
    lasso.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = lasso.predict(X_test)
    
    # convert continuous values to class labels via a threshold
    y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f'Average sensitivity: {np.mean(sensitivities)}')
print(f'Average specificity: {np.mean(specificities)}')
print(f'Average accuracy: {np.mean(accuracies)}')

r_lasso = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'Lasso'})
r_lasso

Average sensitivity: 0.44230769230769224
Average specificity: 0.7548985826001955
Average accuracy: 0.7465017912862592


Unnamed: 0,sen,spe,acc,model
0,0.428571,0.794355,0.784314,Lasso
1,0.142857,0.774194,0.756863,Lasso
2,0.5,0.772177,0.764706,Lasso
3,0.307692,0.733871,0.722986,Lasso
4,0.461538,0.703629,0.697446,Lasso
5,0.538462,0.709677,0.705305,Lasso
6,0.615385,0.764113,0.760314,Lasso
7,0.642857,0.763636,0.760314,Lasso
8,0.428571,0.783838,0.774067,Lasso
9,0.357143,0.749495,0.738703,Lasso


Elastic Net

In [49]:
en = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_majority = X_train[y_train == 0]
    X_train_minority = X_train[y_train == 1]
    y_train_majority = y_train[y_train == 0]
    
    X_train_majority_downsampled = resample(X_train_majority, 
                                            replace=False,
                                            n_samples=len(X_train_minority),
                                            random_state=42)
    
    y_train_downsampled = np.array([0]*len(X_train_minority) + [1]*len(X_train_minority))
    
    X_train_downsampled = np.vstack([X_train_majority_downsampled, X_train_minority])
    
    # fit Elastic Net
    en.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = en.predict(X_test)
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f'Average sensitivity: {np.mean(sensitivities)}')
print(f'Average specificity: {np.mean(specificities)}')
print(f'Average accuracy: {np.mean(accuracies)}')

r_Elastic_Net = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'EN'})
r_Elastic_Net



Average sensitivity: 0.47087912087912087
Average specificity: 0.7193756109481917
Average accuracy: 0.7127443275935129




Unnamed: 0,sen,spe,acc,model
0,0.285714,0.745968,0.733333,EN
1,0.357143,0.731855,0.721569,EN
2,0.642857,0.677419,0.676471,EN
3,0.307692,0.802419,0.789784,EN
4,0.461538,0.707661,0.701375,EN
5,0.461538,0.729839,0.722986,EN
6,0.692308,0.695565,0.695481,EN
7,0.571429,0.678788,0.675835,EN
8,0.357143,0.652525,0.644401,EN
9,0.571429,0.771717,0.766208,EN


Random forest

In [50]:
rf = RandomForestClassifier(max_depth=10, n_estimators=100, random_state=42, class_weight='balanced')

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_1 = X_train[y_train == 1]
    X_train_0 = X_train[y_train == 0]
    y_train_1 = y_train[y_train == 1]
    y_train_0 = y_train[y_train == 0]

    X_train_0_downsampled = resample(X_train_0,
                                     replace=False,
                                     n_samples=len(X_train_1),
                                     random_state=42)
    
    y_train_0_downsampled = np.zeros(len(X_train_1))

    X_train_downsampled = np.vstack([X_train_0_downsampled, X_train_1])
    y_train_downsampled = np.hstack([y_train_0_downsampled, y_train_1])
    
    # fit Random forest
    rf.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = rf.predict(X_test)
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f"Average Sensitivity: {np.mean(sensitivities)}")
print(f"Average Specificity: {np.mean(specificities)}")
print(f"Average Accuracy: {np.mean(accuracies)}")

r_RF = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'RF'})
r_RF

Average Sensitivity: 0.6093406593406593
Average Specificity: 0.6090501792114694
Average Accuracy: 0.6090708424823761


Unnamed: 0,sen,spe,acc,model
0,0.857143,0.582661,0.590196,RF
1,0.357143,0.647177,0.639216,RF
2,0.642857,0.59879,0.6,RF
3,0.692308,0.554435,0.557957,RF
4,0.384615,0.622984,0.616896,RF
5,0.538462,0.643145,0.640472,RF
6,0.692308,0.552419,0.555992,RF
7,0.5,0.626263,0.62279,RF
8,0.5,0.616162,0.612967,RF
9,0.928571,0.646465,0.654224,RF


XGBoost

In [51]:
xgboost = XGBClassifier(max_depth=10, n_estimators=100, use_label_encoder=False, eval_metric='logloss', learning_rate=0.01)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_1 = X_train[y_train == 1]
    X_train_0 = X_train[y_train == 0]
    y_train_1 = y_train[y_train == 1]
    y_train_0 = y_train[y_train == 0]

    X_train_0_downsampled = resample(X_train_0,
                                     replace=False,
                                     n_samples=len(X_train_1),
                                     random_state=42)
    
    y_train_0_downsampled = np.zeros(len(X_train_1))

    X_train_downsampled = np.vstack([X_train_0_downsampled, X_train_1])
    y_train_downsampled = np.hstack([y_train_0_downsampled, y_train_1])
    
    # fit XGBoost
    xgboost.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = xgboost.predict(X_test)
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f"Average Sensitivity: {np.mean(sensitivities)}")
print(f"Average Specificity: {np.mean(specificities)}")
print(f"Average Accuracy: {np.mean(accuracies)}")

r_XGBoost = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'XGBoost'})
r_XGBoost

Average Sensitivity: 0.6087912087912087
Average Specificity: 0.6080335614206582
Average Accuracy: 0.6080808197542279


Unnamed: 0,sen,spe,acc,model
0,0.642857,0.578629,0.580392,XGBoost
1,0.285714,0.637097,0.627451,XGBoost
2,0.714286,0.659274,0.660784,XGBoost
3,0.615385,0.627016,0.626719,XGBoost
4,0.461538,0.546371,0.544204,XGBoost
5,0.538462,0.604839,0.603143,XGBoost
6,0.615385,0.580645,0.581532,XGBoost
7,0.642857,0.630303,0.630648,XGBoost
8,0.785714,0.59798,0.603143,XGBoost
9,0.785714,0.618182,0.62279,XGBoost


### MLP

In [52]:
class MLPClassifierCustom(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, hidden_layer_sizes=(100,), activation='relu', epochs=100, batch_size=32):
        self.input_shape = input_shape 
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(Dense(self.hidden_layer_sizes[0], activation=self.activation, input_shape=(self.input_shape,)))
        
        for layer_size in self.hidden_layer_sizes[1:]:
            model.add(Dense(layer_size, activation=self.activation))
            
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self
    
    def predict_proba(self, X):
        return self.model.predict(X)
    
    def predict(self, X):
        y_pred = self.model.predict(X)
        return (y_pred > 0.5).astype(int).flatten()

In [53]:
mlp = MLPClassifierCustom(input_shape=X.shape[1], hidden_layer_sizes=(64, 128, 64), epochs=200) 

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_1 = X_train[y_train == 1]
    X_train_0 = X_train[y_train == 0]
    y_train_1 = y_train[y_train == 1]
    y_train_0 = y_train[y_train == 0]

    X_train_0_downsampled = resample(X_train_0,
                                     replace=False,
                                     n_samples=len(X_train_1),
                                     random_state=42)
    
    y_train_0_downsampled = np.zeros(len(X_train_1))

    X_train_downsampled = np.vstack([X_train_0_downsampled, X_train_1])
    y_train_downsampled = np.hstack([y_train_0_downsampled, y_train_1])
    
    # fit MLP
    mlp.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = mlp.predict(X_test)
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f"Average Sensitivity: {np.mean(sensitivities)}")
print(f"Average Specificity: {np.mean(specificities)}")
print(f"Average Accuracy: {np.mean(accuracies)}")

r_ANN = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'MLP'})
r_ANN

Average Sensitivity: 0.6313186813186814
Average Specificity: 0.7570711143695015
Average Accuracy: 0.7537224084132671


Unnamed: 0,sen,spe,acc,model
0,0.407143,0.878629,0.865686,MLP
1,0.478571,0.610484,0.606863,MLP
2,0.692857,0.731452,0.730392,MLP
3,0.511538,0.874597,0.865324,MLP
4,0.434615,0.810081,0.800491,MLP
5,0.819231,0.685081,0.688507,MLP
6,0.819231,0.757661,0.759234,MLP
7,0.692857,0.680303,0.680648,MLP
8,0.692857,0.732828,0.731729,MLP
9,0.764286,0.809596,0.80835,MLP


### CNN

In [54]:
class CNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, epochs=100, batch_size=32):
        self.input_shape = input_shape  
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(Conv1D(8, kernel_size=3, activation='relu', input_shape=self.input_shape)) # 16
        model.add(Conv1D(12, kernel_size=3, activation='relu')) 
        model.add(Flatten())
        model.add(Dense(16, activation='relu')) 
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self
    
    def predict_proba(self, X):
        return self.model.predict(X)
    
    def predict(self, X):
        y_pred = self.model.predict(X)
        return (y_pred > 0.5).astype(int).flatten()

In [55]:
X_cnn = X.reshape(X.shape[0], X.shape[1], 1)

cnn = CNNClassifier(input_shape=(X_cnn.shape[1], 1), epochs=200)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X_cnn, y):
    X_train, X_test = X_cnn[train_index], X_cnn[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_1 = X_train[y_train == 1]
    X_train_0 = X_train[y_train == 0]
    y_train_1 = y_train[y_train == 1]
    y_train_0 = y_train[y_train == 0]

    X_train_0_downsampled = resample(X_train_0,
                                     replace=False,
                                     n_samples=len(X_train_1),
                                     random_state=42)
    
    y_train_0_downsampled = np.zeros(len(X_train_1))

    X_train_downsampled = np.vstack([X_train_0_downsampled, X_train_1])
    y_train_downsampled = np.hstack([y_train_0_downsampled, y_train_1])
    
    # fit CNN
    cnn.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = cnn.predict(X_test)
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f"Average Sensitivity: {np.mean(sensitivities)}")
print(f"Average Specificity: {np.mean(specificities)}")
print(f"Average Accuracy: {np.mean(accuracies)}")

r_CNN = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'CNN'})
r_CNN

Average Sensitivity: 0.6032967032967034
Average Specificity: 0.7681805962854351
Average Accuracy: 0.7637181709619014


Unnamed: 0,sen,spe,acc,model
0,0.55,0.834274,0.826471,CNN
1,0.264286,0.83629,0.820588,CNN
2,0.692857,0.677016,0.677451,CNN
3,0.511538,0.852419,0.843713,CNN
4,0.588462,0.658871,0.657073,CNN
5,0.742308,0.713306,0.714047,CNN
6,0.819231,0.783871,0.784774,CNN
7,0.692857,0.763131,0.761198,CNN
8,0.55,0.708586,0.704224,CNN
9,0.621429,0.85404,0.847642,CNN


### LSTM

In [57]:
class LSTMClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, units=20, epochs=100, batch_size=32):
        self.input_shape = input_shape 
        self.units = units  
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(LSTM(self.units, activation='relu', input_shape=self.input_shape))
        model.add(Dense(64, activation='relu')) 
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self
    
    def predict_proba(self, X):
        return self.model.predict(X)
    
    def predict(self, X):
        y_pred = self.model.predict(X)
        return (y_pred > 0.5).astype(int).flatten()

In [59]:
X_lstm = X.reshape(X.shape[0],X.shape[1], 1)

clf = LSTMClassifier(input_shape=(X_lstm.shape[1], 1), units=40, epochs=200, batch_size=32)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X_lstm, y):
    X_train, X_test = X_lstm[train_index], X_lstm[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_1 = X_train[y_train == 1]
    X_train_0 = X_train[y_train == 0]
    y_train_1 = y_train[y_train == 1]
    y_train_0 = y_train[y_train == 0]

    X_train_0_downsampled = resample(X_train_0,
                                     replace=False,
                                     n_samples=len(X_train_1),
                                     random_state=42)
    
    y_train_0_downsampled = np.zeros(len(X_train_1))

    X_train_downsampled = np.vstack([X_train_0_downsampled, X_train_1])
    y_train_downsampled = np.hstack([y_train_0_downsampled, y_train_1])
    
    # fit LSTM
    clf.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = clf.predict(X_test)
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f"Average Sensitivity: {np.mean(sensitivities)}")
print(f"Average Specificity: {np.mean(specificities)}")
print(f"Average Accuracy: {np.mean(accuracies)}")

r_LSTM = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'LSTM'})
r_LSTM

Average Sensitivity: 0.571978021978022
Average Specificity: 0.7748224177256436
Average Accuracy: 0.7694225509457222


Unnamed: 0,sen,spe,acc,model
0,0.478571,0.771774,0.763725,LSTM
1,0.407143,0.777823,0.767647,LSTM
2,0.692857,0.759677,0.757843,LSTM
3,0.511538,0.745565,0.739587,LSTM
4,0.434615,0.739516,0.731729,LSTM
5,0.511538,0.910887,0.900688,LSTM
6,0.819231,0.77379,0.774951,LSTM
7,0.621429,0.795455,0.790668,LSTM
8,0.478571,0.688384,0.682613,LSTM
9,0.764286,0.785354,0.784774,LSTM


### GRU

In [62]:
class GRUClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, units=20, epochs=100, batch_size=32):
        self.input_shape = input_shape  
        self.units = units  
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(GRU(self.units, activation='relu', input_shape=self.input_shape))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self
    
    def predict_proba(self, X):
        return self.model.predict(X)
    
    def predict(self, X):
        y_pred = self.model.predict(X)
        return (y_pred > 0.5).astype(int).flatten()

In [63]:
X_gru = X.reshape(X.shape[0], X.shape[1], 1)

gru = GRUClassifier(input_shape=(X_gru.shape[1], 1), units=40, epochs=200, batch_size=32)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

sensitivities = []
specificities = []
accuracies = []

for train_index, test_index in cv.split(X_gru, y):
    X_train, X_test = X_gru[train_index], X_gru[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # down sampling
    X_train_1 = X_train[y_train == 1]
    X_train_0 = X_train[y_train == 0]
    y_train_1 = y_train[y_train == 1]
    y_train_0 = y_train[y_train == 0]

    X_train_0_downsampled = resample(X_train_0,
                                     replace=False,
                                     n_samples=len(X_train_1),
                                     random_state=42)
    
    y_train_0_downsampled = np.zeros(len(X_train_1))

    X_train_downsampled = np.vstack([X_train_0_downsampled, X_train_1])
    y_train_downsampled = np.hstack([y_train_0_downsampled, y_train_1])
    
    # fit GRU
    gru.fit(X_train_downsampled, y_train_downsampled)
    
    # predict labels
    y_pred = gru.predict(X_test)
    
    # calculate metrics
    sensitivities.append(sensitivity(y_test, y_pred))
    specificities.append(specificity(y_test, y_pred))
    accuracies.append(accuracy_score(y_test, y_pred))

print(f"Average Sensitivity: {np.mean(sensitivities)}")
print(f"Average Specificity: {np.mean(specificities)}")
print(f"Average Accuracy: {np.mean(accuracies)}")

r_GRU = pd.DataFrame({
    'sen': sensitivities,
    'spe': specificities,
    'acc': accuracies,
    'model': 'GRU'})
r_GRU

Average Sensitivity: 0.6175824175824176
Average Specificity: 0.7524580482241773
Average Accuracy: 0.748793482029354


Unnamed: 0,sen,spe,acc,model
0,0.478571,0.818145,0.808824,GRU
1,0.335714,0.779839,0.767647,GRU
2,0.764286,0.713306,0.714706,GRU
3,0.511538,0.795968,0.788703,GRU
4,0.588462,0.779839,0.774951,GRU
5,0.742308,0.675,0.676719,GRU
6,0.819231,0.620565,0.625639,GRU
7,0.692857,0.817677,0.814244,GRU
8,0.55,0.773232,0.767092,GRU
9,0.692857,0.75101,0.749411,GRU
