In [2]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Lasso, RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
from xgboost import XGBClassifier
import shap

#### Load data

In [3]:
df = pd.read_csv('./JM006_0901_whole.csv')
df = df.dropna(subset=['milkweightlbs_sca'])
df = df.dropna(subset=['cells'])
df = df.dropna(subset=['conductivity'])
df_f = df[df['disease'] == 0]
# df_d = df[df['disease'] == 1][df['disease_in'] > 14] # 210 
# df_d = df[df['disease'] == 1][df['disease_in'] <= 14][df['disease_in'] >= 11] # 94
# df_d = df[df['disease'] == 1][df['disease_in'] <= 10][df['disease_in'] >= 8] # 136
# df_d = df[df['disease'] == 1][df['disease_in'] <= 7][df['disease_in'] >= 6] # 125
# df_d = df[df['disease'] == 1][df['disease_in'] <= 5][df['disease_in'] >= 4] # 199
# df_d = df[df['disease'] == 1][df['disease_in'] == 3] # 114
# df_d = df[df['disease'] == 1][df['disease_in'] == 2] # 117
df_d = df[df['disease'] == 1][df['disease_in'] == 1] # 147
df = pd.concat([df_f, df_d])
df = df.reset_index(drop=True)
X = df.iloc[:,1:488].values
X = X[:, ::-1]
y = df['disease']



#### PLS-DA

In [5]:
# Split health and disease group
X_0 = X[y == 0]
X_1 = X[y == 1]

# Build PLSRegression
model = PLSRegression(n_components=2)

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))

    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_pls_da = pd.DataFrame(metrics_summary)
r_pls_da['model'] = 'PLS-DA'
r_pls_da

Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.717701,0.660952,0.775714,PLS-DA
1,0.700575,0.592857,0.809524,PLS-DA
2,0.713908,0.640476,0.788095,PLS-DA
3,0.673678,0.58619,0.760952,PLS-DA
4,0.710805,0.600476,0.824762,PLS-DA
5,0.707701,0.614286,0.804762,PLS-DA
6,0.687011,0.59381,0.782381,PLS-DA
7,0.686782,0.607143,0.768571,PLS-DA
8,0.683908,0.607143,0.762857,PLS-DA
9,0.707931,0.614286,0.804286,PLS-DA


#### Lasso

In [13]:
# Split health and disease group
X_0 = X[y == 0]
X_1 = X[y == 1]

# Build Lasso
model = Lasso(alpha=0.1) 

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))

    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_lasso = pd.DataFrame(metrics_summary)
r_lasso['model'] = 'Lasso'
r_lasso

Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.663218,0.654762,0.672381,Lasso
1,0.680115,0.675714,0.687143,Lasso
2,0.690575,0.696667,0.689048,Lasso
3,0.704023,0.689048,0.721905,Lasso
4,0.676897,0.67619,0.680476,Lasso
5,0.697356,0.675238,0.721429,Lasso
6,0.703793,0.682857,0.729524,Lasso
7,0.67977,0.689048,0.672857,Lasso
8,0.682989,0.661429,0.705714,Lasso
9,0.673103,0.675714,0.672381,Lasso


#### Ridge

In [7]:
# Split health and disease group
X_0 = X[y == 0]
X_1 = X[y == 1]

# Build Ridge
model = RidgeClassifier()

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))

    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_ridge = pd.DataFrame(metrics_summary)
r_ridge['model'] = 'Ridge'
r_ridge

Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.738046,0.695238,0.781905,Ridge
1,0.768506,0.709524,0.830476,Ridge
2,0.758046,0.702857,0.815238,Ridge
3,0.741494,0.689048,0.796667,Ridge
4,0.772299,0.71,0.838571,Ridge
5,0.765287,0.689048,0.844286,Ridge
6,0.778851,0.703333,0.857619,Ridge
7,0.778736,0.736667,0.823333,Ridge
8,0.72069,0.648571,0.794762,Ridge
9,0.768966,0.69619,0.844762,Ridge


#### Elastic Net

In [8]:
# Split health and disease group
X_0 = X[y == 0]
X_1 = X[y == 1]

# Build EN
model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.6)

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))

    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_Elastic_Net = pd.DataFrame(metrics_summary)
r_Elastic_Net['model'] = 'EN'
r_Elastic_Net









Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.761839,0.736667,0.788571,EN
1,0.741609,0.73,0.754762,EN
2,0.730575,0.709524,0.755238,EN
3,0.727241,0.702381,0.752381,EN
4,0.721149,0.682381,0.760952,EN
5,0.741149,0.710476,0.776667,EN
6,0.717586,0.654762,0.781905,EN
7,0.697356,0.634286,0.762381,EN
8,0.748506,0.69619,0.803333,EN
9,0.755287,0.737143,0.777143,EN


#### Random forest

In [10]:
# Split health and disease group
X_0 = X[y == 0]
X_1 = X[y == 1]

# Build RF
model = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42, class_weight='balanced')

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))

    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_RF = pd.DataFrame(metrics_summary)
r_RF['model'] = 'RF'
r_RF

Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.667126,0.607143,0.730476,RF
1,0.70046,0.65381,0.748095,RF
2,0.676897,0.607143,0.749048,RF
3,0.704253,0.661429,0.746667,RF
4,0.649885,0.532381,0.765714,RF
5,0.666897,0.566667,0.767143,RF
6,0.690345,0.648095,0.736667,RF
7,0.645977,0.58619,0.708095,RF
8,0.683563,0.600476,0.769048,RF
9,0.697471,0.675238,0.72,RF


#### XGBoost

In [11]:
# Split health and disease group
X_0 = X[y == 0]
X_1 = X[y == 1]

# Build XGBoost
model = XGBClassifier(max_depth=5, n_estimators=100, use_label_encoder=False, eval_metric='logloss', learning_rate=0.01)

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))
    
    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_XGBoost = pd.DataFrame(metrics_summary)
r_XGBoost['model'] = 'XGBoost'
r_XGBoost

Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.71046,0.728571,0.692381,XGBoost
1,0.694483,0.675238,0.71619,XGBoost
2,0.704368,0.688095,0.720476,XGBoost
3,0.728276,0.701905,0.754762,XGBoost
4,0.707701,0.681429,0.735714,XGBoost
5,0.673448,0.661905,0.687143,XGBoost
6,0.730575,0.694762,0.767619,XGBoost
7,0.731379,0.756667,0.708095,XGBoost
8,0.700805,0.694762,0.709524,XGBoost
9,0.690115,0.667143,0.71381,XGBoost


#### MLP

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.optimizers import Adam

class MLPClassifierCustom(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, hidden_layer_sizes=(100,), activation='relu', epochs=200, batch_size=32):
        self.input_shape = input_shape  
        self.hidden_layer_sizes = hidden_layer_sizes
        self.activation = activation
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(Dense(self.hidden_layer_sizes[0], activation=self.activation, input_shape=(self.input_shape,)))
        
        for layer_size in self.hidden_layer_sizes[1:]:
            model.add(Dense(layer_size, activation=self.activation))
            
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self
    
    def predict_proba(self, X):
        return self.model.predict(X)
    
    def predict(self, X):
        y_pred = self.model.predict(X)
        return (y_pred > 0.5).astype(int).flatten()

In [528]:
# Split health and disease group
X_0 = X[y == 0]
X_1 = X[y == 1]

# Build MLP
model = MLPClassifierCustom(input_shape=X.shape[1], hidden_layer_sizes=(32, 128, 32)) 

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))
    
    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_ANN = pd.DataFrame(metrics_summary)
r_ANN['model'] = 'MLP'
r_ANN



147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147


147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147


Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.755517,0.69,0.82381,MLP
1,0.755402,0.723333,0.788571,MLP
2,0.772414,0.770476,0.775714,MLP
3,0.793103,0.755714,0.832381,MLP
4,0.812989,0.784286,0.843333,MLP
5,0.809655,0.790952,0.830952,MLP
6,0.786552,0.757619,0.818571,MLP
7,0.78,0.824762,0.73619,MLP
8,0.837241,0.776667,0.897143,MLP
9,0.874023,0.851905,0.897619,MLP


#### CNN

In [530]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.optimizers import Adam

class CNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, epochs=200, batch_size=32):
        self.input_shape = input_shape  
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(Conv1D(8, kernel_size=3, activation='relu', input_shape=self.input_shape)) # 16
        model.add(Conv1D(12, kernel_size=3, activation='relu')) 
        model.add(Flatten())
        model.add(Dense(16, activation='relu')) # 16
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self
    
    def predict_proba(self, X):
        return self.model.predict(X)
    
    def predict(self, X):
        y_pred = self.model.predict(X)
        return (y_pred > 0.5).astype(int).flatten()

In [531]:
# Split health and disease group
X_cnn = X.reshape(X.shape[0], X.shape[1], 1)
X_0 = X_cnn[y == 0]
X_1 = X_cnn[y == 1]

# Build CNN
model = CNNClassifier(input_shape=(X_cnn.shape[1], 1))

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))
    print(y_sampled.tolist().count(1))
    print(y_sampled.tolist().count(0))

    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_CNN = pd.DataFrame(metrics_summary)
r_CNN['model'] = 'CNN'
r_CNN



147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147


147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147


Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.725287,0.67,0.78381,CNN
1,0.758276,0.730476,0.791905,CNN
2,0.772184,0.76381,0.782857,CNN
3,0.782529,0.76381,0.803333,CNN
4,0.782414,0.764286,0.803333,CNN
5,0.775517,0.755714,0.795238,CNN
6,0.785977,0.777143,0.79619,CNN
7,0.758621,0.76381,0.755714,CNN
8,0.775402,0.742381,0.809048,CNN
9,0.795747,0.777619,0.817143,CNN


#### LSTM

In [533]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, MaxPooling1D
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.optimizers import Adam

class LSTMClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, units=20, epochs=200, batch_size=32):
        self.input_shape = input_shape  
        self.units = units  
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(LSTM(self.units, activation='relu', input_shape=self.input_shape))
        model.add(Dense(64, activation='relu')) 
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self
    
    def predict_proba(self, X):
        return self.model.predict(X)
    
    def predict(self, X):
        y_pred = self.model.predict(X)
        return (y_pred > 0.5).astype(int).flatten()

In [534]:
# Split health and disease group
X_rnn = X.reshape(X.shape[0],1, X.shape[1])
X_0 = X_rnn[y == 0]
X_1 = X_rnn[y == 1]

# Build LSTM
model = LSTMClassifier(input_shape=(1, X_rnn.shape[2]), units=20, epochs=200, batch_size=32)

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))

    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_LSTM = pd.DataFrame(metrics_summary)
r_LSTM['model'] = 'LSTM'
r_LSTM



147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147


147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147


Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.724713,0.683333,0.769048,LSTM
1,0.758276,0.735714,0.783333,LSTM
2,0.755057,0.750476,0.761429,LSTM
3,0.782414,0.75619,0.808571,LSTM
4,0.761954,0.755714,0.768571,LSTM
5,0.782299,0.742857,0.824286,LSTM
6,0.755057,0.762857,0.748571,LSTM
7,0.795517,0.777619,0.815714,LSTM
8,0.78931,0.764762,0.818095,LSTM
9,0.822874,0.810952,0.837619,LSTM


#### GRU

In [536]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from sklearn.base import BaseEstimator, ClassifierMixin
from tensorflow.keras.optimizers import Adam

class GRUClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, input_shape, units=20, epochs=200, batch_size=32):
        self.input_shape = input_shape  
        self.units = units  
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(GRU(self.units, activation='relu', input_shape=self.input_shape))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, X, y):
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self
    
    def predict_proba(self, X):
        return self.model.predict(X)
    
    def predict(self, X):
        y_pred = self.model.predict(X)
        return (y_pred > 0.5).astype(int).flatten()

In [537]:
# Split health and disease group
X_rnn = X.reshape(X.shape[0],1, X.shape[1])
X_0 = X_rnn[y == 0]
X_1 = X_rnn[y == 1]

# Build GRU
model = GRUClassifier(input_shape=(1, X_rnn.shape[2]), units=20, epochs=200, batch_size=32)

# Down-sample and cross-valiation
n_samples = 50  
n_splits = 10  

# Construct dataframe for evaluation metrics
metrics_summary = {
    'accuracy': [],
    'sensitivity': [],
    'specificity': []
}

# Start combined down-sample and cross-valiation procedure 
for i in range(n_samples):
    sampled_indices = np.random.choice(len(X_0), size=len(X_1), replace=True)
    X_0_sampled = X_0[sampled_indices]

    X_sampled = np.vstack((X_0_sampled, X_1))
    y_sampled = np.array([0] * len(X_0_sampled) + [1] * len(X_1))

    cv = StratifiedKFold(n_splits=n_splits)

    acc_scores = []
    sens_scores = []
    spec_scores = []

    for train_idx, test_idx in cv.split(X_sampled, y_sampled):
        X_train, X_test = X_sampled[train_idx], X_sampled[test_idx]
        y_train, y_test = y_sampled[train_idx], y_sampled[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x >= 0.5 else 0 for x in y_pred]

        acc_scores.append(accuracy_score(y_test, y_pred))
        sens_scores.append(recall_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        spec_scores.append(tn / (tn + fp))

    metrics_summary['accuracy'].append(np.mean(acc_scores))
    metrics_summary['sensitivity'].append(np.mean(sens_scores))
    metrics_summary['specificity'].append(np.mean(spec_scores))

r_GRU = pd.DataFrame(metrics_summary)
r_GRU['model'] = 'GRU'
r_GRU



147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147


147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147
147


147
147
147
147


Unnamed: 0,accuracy,sensitivity,specificity,model
0,0.766207,0.736667,0.798571,GRU
1,0.775172,0.76381,0.789048,GRU
2,0.751954,0.744286,0.761905,GRU
3,0.792529,0.744286,0.843333,GRU
4,0.775057,0.722857,0.828571,GRU
5,0.751494,0.661905,0.843333,GRU
6,0.795862,0.77,0.822381,GRU
7,0.765402,0.744762,0.789524,GRU
8,0.802414,0.777143,0.830476,GRU
9,0.813103,0.78381,0.843333,GRU
