In [1]:
# Install libraries if necessary
!pip install tensorflow scikit-learn

# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler




In [2]:
# Load Breast Cancer Dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [3]:
def calculate_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    tss = (tp / (tp + fn)) - (fp / (fp + tn))
    hss = 2 * (tp * tn - fp * fn) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))
    return {"TP": tp, "TN": tn, "FP": fp, "FN": fn, "FPR": fpr, "FNR": fnr, "TSS": tss, "HSS": hss}


In [4]:
def random_forest_model(X, y):
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    metrics_list = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        y_pred = rf_model.predict(X_test)

        metrics = calculate_metrics(y_test, y_pred)
        metrics["ROC_AUC"] = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
        metrics_list.append(metrics)

    return pd.DataFrame(metrics_list)


In [5]:
def svm_model(X, y):
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    metrics_list = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        svm_model = SVC(kernel='linear', probability=True, random_state=42)
        svm_model.fit(X_train, y_train)
        y_pred = svm_model.predict(X_test)

        metrics = calculate_metrics(y_test, y_pred)
        metrics["ROC_AUC"] = roc_auc_score(y_test, svm_model.predict_proba(X_test)[:, 1])
        metrics_list.append(metrics)

    return pd.DataFrame(metrics_list)


In [8]:
def lstm_model(X, y):
    # Reshape for LSTM input
    X_reshaped = X.reshape((X.shape[0], 1, X.shape[1]))

    # No one-hot encoding; ensure binary labels (0 or 1)
    y = y.reshape(-1, 1)

    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    metrics_list = []

    for train_index, test_index in kf.split(X_reshaped):
        X_train, X_test = X_reshaped[train_index], X_reshaped[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Build LSTM model
        model = Sequential([
            Dense(32, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
            LSTM(32, return_sequences=False, activation='relu'),
            Dense(1, activation='sigmoid')  # Output single probability value for binary classification
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        # Train model
        model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0)

        # Predict probabilities and convert to binary predictions
        y_pred_prob = model.predict(X_test)
        y_pred = (y_pred_prob > 0.5).astype(int).flatten()

        # Calculate metrics
        metrics = calculate_metrics(y_test, y_pred)
        metrics["ROC_AUC"] = roc_auc_score(y_test, y_pred_prob)
        metrics_list.append(metrics)

    return pd.DataFrame(metrics_list)


In [9]:
# Run models
rf_results = random_forest_model(X_scaled, y.to_numpy())
svm_results = svm_model(X_scaled, y.to_numpy())
lstm_results = lstm_model(X_scaled, y.to_numpy())

# Average metrics
print("Random Forest Results")
print(rf_results.mean())

print("\nSVM Results")
print(svm_results.mean())

print("\nLSTM Results")
print(lstm_results.mean())


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 481ms/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 164ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 239ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step
Random Forest Results
TP         35.000000
TN         19.800000
FP          1.400000
FN          0.700000
FPR         0.064258
FNR         0.019118
TSS         0.916625
HSS         0.919317
ROC_AUC     0.992707
dtype: float64

SVM Results
TP         34.900000
TN         20.300000
FP          0.900000
FN          0.800000
FPR         0.040278
FNR         0.021884
TSS         0.937838
HSS         0.935072
ROC_AUC     0.993692
dtype: float64

LSTM Results
TP         34.900000
TN         19.300000
FP          1.900000
FN          0.800000
FPR         0.088286
FNR         0.021506
TSS         0.890209
HSS         0.895411
ROC_AUC     0.987847
dtype: float64


In [10]:
# Combine results
rf_summary = rf_results.mean().to_frame(name="Random Forest")
svm_summary = svm_results.mean().to_frame(name="SVM")
lstm_summary = lstm_results.mean().to_frame(name="LSTM")

summary_table = pd.concat([rf_summary, svm_summary, lstm_summary], axis=1)
print(summary_table)


         Random Forest        SVM       LSTM
TP           35.000000  34.900000  34.900000
TN           19.800000  20.300000  19.300000
FP            1.400000   0.900000   1.900000
FN            0.700000   0.800000   0.800000
FPR           0.064258   0.040278   0.088286
FNR           0.019118   0.021884   0.021506
TSS           0.916625   0.937838   0.890209
HSS           0.919317   0.935072   0.895411
ROC_AUC       0.992707   0.993692   0.987847
