<a href="https://colab.research.google.com/github/hadif1999/iot_smoke_detection_ML/blob/main/smoke_detection_iot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# smoke_detection_iot project

# Preprocessing


In this section, data are loaded, inspected, and cleaned. Correlation is checked, outliers are removed, and missing values are verified before modeling.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow.keras as tfk
import os

In [None]:
import os
import random
import tensorflow as tf

# Seed Python's `random`, NumPy and TensorFlow with the same fixed value so
# repeated runs of the notebook are comparable.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes — confirm.
os.environ["PYTHONHASHSEED"] = "42"
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)


In [None]:
import requests

# Raw GitHub URL of the dataset CSV.
github_csv_url = "https://raw.githubusercontent.com/hadif1999/iot_smoke_detection_ML/main/smoke_detection_iot.csv"

# Without a timeout `requests.get` can block forever if the server never
# answers; fail after 30 seconds instead of hanging the notebook.
response = requests.get(github_csv_url, timeout=30)
response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

# Store the raw bytes next to the notebook; the next cell reads this file.
with open("smoke_detection_iot.csv", "wb") as f:
    f.write(response.content)

print("Dataset downloaded successfully to smoke_detection_iot.csv")

In [None]:

# Read the downloaded CSV into a DataFrame and display it.
data = pd.read_csv("smoke_detection_iot.csv")
data

Columns explanation :
<br>
Air Temperature
<br>
Air Humidity
<br>

TVOC: Total Volatile Organic Compounds; measured in parts per billion (Source)
<br>
eCO2: CO2 equivalent concentration; calculated from different values like TVOC
<br>
Raw H2: raw molecular hydrogen; not compensated (Bias, temperature, etc.)
<br>
Raw Ethanol: raw ethanol gas (Source)
<br>
PM 1.0 and PM 2.5: particulate matter with size < 1.0 µm (PM1.0) and 1.0 µm < size < 2.5 µm (PM2.5)
<br>
Fire Alarm: ground truth is "1" if a fire is there
<br>
CNT: Sample counter
<br>
UTC: Timestamp UTC seconds
<br>
NC0.5/NC1.0 and NC2.5: Number concentration of particulate matter. This differs from PM because NC gives the actual number of particles in the air.
<br>
The raw NC is also classified by particle size: < 0.5 µm (NC0.5); 0.5 µm < size < 1.0 µm (NC1.0); 1.0 µm < size < 2.5 µm (NC2.5)

In [None]:
# Bar chart of every column's correlation with the "Fire Alarm" target, sorted ascending.
data.corr()["Fire Alarm"].sort_values().plot(kind='bar')

In [None]:
# Columns excluded from modelling; the next cell drops them from the raw frame.
cols = [
    "Unnamed: 0",
    "PM2.5",
    "CNT",
    "UTC",
    "Raw H2",
    "PM1.0",
    "NC0.5",
    "NC1.0",
    "NC2.5",
]

In [None]:
# Working frame without the excluded columns; `data` itself is left untouched.
data2 = data.drop(columns=cols)
data2

In [None]:
# One box plot per remaining column (separate subplots) to eyeball outliers.
data2.plot(kind='box', subplots=True, layout=(8,5), figsize=(17,20))

In [None]:
def outlier_bands(df, multiplier: float = 1.5):
    """Compute IQR-based outlier limits for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data whose 25th and 75th percentiles are taken via ``df.quantile``.
    multiplier : float, default 1.5
        How many interquartile ranges to extend beyond each quartile.

    Returns
    -------
    tuple
        ``(lower_band, upper_band)`` with
        ``lower_band = Q1 - multiplier * IQR`` and
        ``upper_band = Q3 + multiplier * IQR``.
    """
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)

    IQR = Q3 - Q1
    lower_band = Q1 - multiplier * IQR
    upper_band = Q3 + multiplier * IQR

    return lower_band, upper_band




In [None]:
# Limits are computed per column on the whole of `data2`, two IQRs beyond each quartile.
low_band, up_band = outlier_bands(data2, multiplier=2)

# Flag every cell outside its column's band, then flag a row as soon as
# any one of its cells is flagged.
above_upper = data2 > up_band
below_lower = data2 < low_band
outlier_mask = (above_upper | below_lower).any(axis=1)

# Keep only the rows with no flagged cell.
data_clean = data2.copy()[~outlier_mask]

n_removed = len(data2) - len(data_clean)
print(f"Removed {n_removed} outlier rows.")

**Note on Outlier Removal**
Outliers are removed using bounds computed on the full dataset. This can introduce leakage in strict evaluation, but it is retained here to preserve consistency with the current results.


In [None]:
# Shapes after vs. before outlier removal.
data_clean.shape, data2.shape

In [None]:
# Renumber the rows after filtering; drop=True discards the old index
# instead of keeping it as an extra column.
data_clean = data_clean.reset_index(drop = True)
# Leftover from an earlier version (kept disabled):
#data_clean.drop(["level_0","index"],axis=1,inplace=True)
data_clean

In [None]:
# Summary statistics, transposed so each column of the data becomes a row.
data_clean.describe().T

In [None]:
# Target column: the fire-alarm label.
y = data_clean["Fire Alarm"]

In [None]:
# Convert the label Series to a NumPy array (later cells index it with fold indices).
y = y.values
y

In [None]:
# Profile the cleaned data: missing values per column, then class balance.
print("Missing values per column:")
missing_summary = data_clean.isnull().sum()
print(missing_summary)

print("")
print("Class balance (Fire Alarm):")

# Absolute count per label, ordered by label value.
class_counts = data_clean["Fire Alarm"].value_counts().sort_index()
# Share of each label in percent, rounded to two decimals.
total_rows = class_counts.sum()
class_perc = (class_counts / total_rows * 100).round(2)

class_balance = pd.DataFrame({"Count": class_counts, "Percent": class_perc})
class_balance


In [None]:
# Feature matrix as a NumPy array: the six sensor columns, in this fixed order.
x = data_clean[["Temperature[C]","Humidity[%]","TVOC[ppb]","eCO2[ppm]","Raw Ethanol","Pressure[hPa]"]].values

In [None]:
# Human-readable names of the six model features (used to label the SHAP plot).
feature_names = [
    "Temperature[C]",
    "Humidity[%]",
    "TVOC[ppb]",
    "eCO2[ppm]",
    "Raw Ethanol",
    "Pressure[hPa]",
]


# Neural Network (MLP)


A simple feed-forward neural network is trained and evaluated first with a holdout split, then with K-fold cross-validation.


## Holdout Training and Evaluation


In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 15% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(x,y,test_size=0.15, random_state=10)

In [None]:
# Standardise features: statistics are fitted on the training split only and
# then reused to transform the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:

def model_gen(n_HD_layer, n_neuron_list, input_shape=data_clean.shape[1:], ac_f="sigmoid"):
    """Build an uncompiled feed-forward binary classifier.

    Every hidden stage is ``Dense`` (no bias, ``glorot_normal`` initialiser)
    followed by ``BatchNormalization`` and ``Dropout(0.4)``. The output is a
    single bias-free sigmoid unit.

    Parameters
    ----------
    n_HD_layer : int
        Number of hidden stages; must equal ``len(n_neuron_list)``.
    n_neuron_list : sequence of int
        Units of each hidden ``Dense`` layer, in order.
    input_shape : tuple, optional
        Shape of one input sample. The default is evaluated once, when the
        function is defined, from ``data_clean.shape[1:]``; that frame still
        contains the "Fire Alarm" column, so callers should pass the real
        feature shape explicitly (as every call in this notebook does).
    ac_f : str, default "sigmoid"
        Activation of the hidden ``Dense`` layers.

    Returns
    -------
    tfk.Model
        The assembled model, named "model01".

    Raises
    ------
    ValueError
        If ``n_HD_layer`` differs from ``len(n_neuron_list)``.
    """
    # Fail fast: previously this was only printed and the model was built anyway.
    if n_HD_layer != len(n_neuron_list):
        raise ValueError("number of hidden layers must be equal to len of list of number of neurons in each layer ")

    input_layer = tfk.layers.Input(shape=input_shape, name="input_layer")

    # `last` always holds the output tensor of the most recently added layer.
    last = input_layer
    for i, n_units in enumerate(n_neuron_list, start=1):
        last = tfk.layers.Dense(
            n_units,
            activation=ac_f,
            use_bias=False,
            kernel_initializer="glorot_normal",
            name="hd_layer{}".format(i),
        )(last)
        last = tfk.layers.BatchNormalization(name="batch_layer{}".format(i))(last)
        last = tfk.layers.Dropout(0.4, name="drop_layer{}".format(i))(last)

    out_layer = tfk.layers.Dense(1, activation="sigmoid", use_bias=False, name="output_layer")(last)

    model = tfk.Model(inputs=input_layer, outputs=out_layer, name="model01")

    return model



In [None]:
# Shared architecture settings: two hidden stages with 40 and 20 units.
# The same dict is reused for every model built later in the notebook.
model_args = dict(n_HD_layer=2 , n_neuron_list=[40, 20])

# Input shape is taken from the scaled training matrix (features only).
model1 = model_gen(**model_args, input_shape = X_train.shape[1:])
model1.summary()

In [None]:
# Adam optimiser with binary cross-entropy loss.
# NOTE(review): later cells index evaluate() results positionally, presumably
# in the order loss, accuracy, binary_accuracy, AUC, precision, recall — confirm.
model1.compile(optimizer = "Adam", loss="binary_crossentropy",
               metrics = ["accuracy","binary_accuracy",
                          tfk.metrics.AUC(),tfk.metrics.Precision(),tfk.metrics.Recall()],)

In [None]:
# Train for 3 epochs with mini-batches of 24; validation_split=0.15 sets aside
# part of the training data for per-epoch validation. The returned history
# object feeds the learning-curve plots below.
hist1 = model1.fit(x = X_train, y = Y_train , batch_size = 24 , epochs = 3, validation_split=0.15)

## Training History Plots
Learning curves are plotted to visualize optimization progress (loss/metrics across epochs).


In [None]:
import math

# Per-epoch values recorded by `fit`; keys prefixed "val_" hold validation values.
history = hist1.history
metrics = [m for m in history.keys() if not m.startswith("val_")]

# Grid layout. These names are deliberately not `cols`/`rows`: `cols` already
# holds the list of dropped columns defined earlier in the notebook.
n_cols = 2
n_rows = math.ceil(len(metrics) / n_cols)
# squeeze=False always returns a 2-D array of axes, so it can be flattened
# regardless of the grid size.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.flatten()

# One panel per training metric, with its validation counterpart when present.
for ax, m in zip(axes, metrics):
    ax.plot(history[m], label=m)
    if f"val_{m}" in history:
        ax.plot(history[f"val_{m}"], label=f"val_{m}")
    ax.set_title(m)
    ax.set_xlabel("epoch")
    ax.set_ylabel(m)
    ax.legend()

# remove unused axes
for ax in axes[len(metrics):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()


In [None]:
# Persist the trained holdout model to disk.
model1.save("smoke_detection_iot.h5")

In [None]:
# Evaluate the holdout model on the test split.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest
# of the session; consider renaming together with the cell that reads it.
eval = model1.evaluate(X_test, Y_test)

In [None]:
# Index 1 is read as the accuracy (the K-fold cells treat index 0 as the loss).
print("final accuracy for test data is : " , eval[1])

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt

# Predicted probabilities on the test split, rounded to hard 0/1 labels.
y_hat = model1.predict(X_test)
y_hat_ = np.round(y_hat[:, 0])

# Display names for the two classes.
class_labels = ['No Alarm', 'Alarm']

cm = confusion_matrix(Y_test, y_hat_)

# Heat-map of the confusion matrix (rows: true label, columns: predicted label).
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix')
plt.show()

# Plain-text versions of the same results.
print("Confusion Matrix:\n", cm)
holdout_report = classification_report(Y_test, y_hat_, target_names=class_labels)
print("\nClassification Report:\n", holdout_report)

## ROC Curve (Neural Network)
ROC is computed from the holdout test probabilities.


In [None]:
# ROC points and area under the curve from the holdout probabilities.
fpr, tpr, _ = roc_curve(Y_test, y_hat)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot(fpr, tpr, label=f"NN ROC (AUC = {roc_auc:.4f})")
# Diagonal reference line.
roc_ax.plot([0, 1], [0, 1], "--", color="gray")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve - Neural Network (Holdout)")
roc_ax.legend(loc="lower right")
roc_ax.grid(alpha=0.3)
plt.show()


## K-Fold Cross-Validation (Neural Network)


In [None]:
# Guard for out-of-order execution: make sure `y` is a NumPy array before
# it is indexed with fold indices below.
if not isinstance(y, np.ndarray):
    y = y.values

In [None]:
from sklearn.model_selection import KFold

# 5-fold splitter with shuffling and a fixed seed; the same `kf` object is
# reused by every cross-validation loop in the notebook.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
print("KFold object 'kf' initialized successfully.")

In [None]:
# Per-fold validation metrics of the neural network.
accuracy_scores = []
auc_scores = []
precision_scores = []
recall_scores = []
loss_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(x)):
    print(f"\n--- Fold {fold+1}/{kf.n_splits} ---")
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = x[train_index], x[val_index]
    Y_train_fold, Y_val_fold = y[train_index], y[val_index]

    # Scale with statistics from the training part only. A fold-local name is
    # used so the holdout scaler `sc` is not overwritten by this loop.
    sc_fold = StandardScaler()
    X_train_fold = sc_fold.fit_transform(X_train_fold)
    X_val_fold = sc_fold.transform(X_val_fold)

    # Fresh, untrained model for every fold
    model_fold = model_gen(**model_args, input_shape=X_train_fold.shape[1:])

    model_fold.compile(optimizer="Adam", loss="binary_crossentropy",
                       metrics=["accuracy", "binary_accuracy",
                                  tfk.metrics.AUC(), tfk.metrics.Precision(), tfk.metrics.Recall()])

    # Train with per-epoch progress output (verbose=1). The return value is not
    # stored so the `history` dict used by the plotting cell stays intact.
    model_fold.fit(x=X_train_fold, y=Y_train_fold, batch_size=24, epochs=3, verbose=1)

    # Evaluate on the validation part. Positions follow the compile call:
    # 0 loss, 1 accuracy, 2 binary_accuracy, 3 AUC, 4 precision, 5 recall.
    eval_metrics = model_fold.evaluate(X_val_fold, Y_val_fold, verbose=0)
    print(f"Fold {fold+1} Evaluation: Loss = {eval_metrics[0]:.4f}, Accuracy = {eval_metrics[1]:.4f}, AUC = {eval_metrics[3]:.4f}, Precision = {eval_metrics[4]:.4f}, Recall = {eval_metrics[5]:.4f}")

    # Collect this fold's metrics
    loss_scores.append(eval_metrics[0])
    accuracy_scores.append(eval_metrics[1])
    auc_scores.append(eval_metrics[3])
    precision_scores.append(eval_metrics[4])
    recall_scores.append(eval_metrics[5])

print("\nK-fold cross-validation complete. Metrics stored.")

In [None]:
# Mean and standard deviation of every K-fold metric, printed in a fixed order.
nn_kfold_report = [
    ("Average Accuracy: {:.4f} (+/- {:.4f})", accuracy_scores),
    ("Average AUC: {:.4f} (+/- {:.4f})", auc_scores),
    ("Average Precision: {:.4f} (+/- {:.4f})", precision_scores),
    ("Average Recall: {:.4f} (+/- {:.4f})", recall_scores),
    ("Average Loss: {:.4f} (+/- {:.4f})", loss_scores),
]
for line_template, fold_values in nn_kfold_report:
    print(line_template.format(np.mean(fold_values), np.std(fold_values)))


# Support Vector Machine (SVM)


The same K-fold protocol is applied to a classical SVM baseline for comparison.


## K-Fold Cross-Validation (SVM)


In [None]:
from sklearn.svm import SVC
# Per-fold metrics (precision/recall/F1 refer to class 1).
svm_accuracy_scores, svm_precision_scores = [], []
svm_recall_scores, svm_f1_scores = [], []

# Out-of-fold labels, predictions and decision scores pooled over all folds.
all_y_true, all_y_pred_svm, all_y_score_svm = [], [], []

for fold, (train_index, val_index) in enumerate(kf.split(x)):
    print(f"\n--- SVM Fold {fold+1}/{kf.n_splits} ---")

    # Training / validation parts of this fold
    X_train_fold, Y_train_fold = x[train_index], y[train_index]
    X_val_fold, Y_val_fold = x[val_index], y[val_index]

    # Standardise using statistics from the training part only
    sc_svm = StandardScaler().fit(X_train_fold)
    X_train_fold_scaled = sc_svm.transform(X_train_fold)
    X_val_fold_scaled = sc_svm.transform(X_val_fold)

    # Fresh classifier per fold
    svm_model_fold = SVC(random_state=42, C=0.5).fit(X_train_fold_scaled, Y_train_fold)

    # Hard predictions, plus decision scores for the ROC curve
    y_pred_svm_fold = svm_model_fold.predict(X_val_fold_scaled)
    y_score_svm_fold = svm_model_fold.decision_function(X_val_fold_scaled)

    # Metrics of this fold; the '1' entry holds the positive-class figures
    report = classification_report(Y_val_fold, y_pred_svm_fold, output_dict=True)
    positive_report = report['1']
    accuracy_fold = report['accuracy']
    precision_fold = positive_report['precision']
    recall_fold = positive_report['recall']
    f1_fold = positive_report['f1-score']

    print(f"Fold {fold+1} Accuracy: {accuracy_fold:.4f}")
    print(f"Fold {fold+1} Precision (Class 1): {precision_fold:.4f}")
    print(f"Fold {fold+1} Recall (Class 1): {recall_fold:.4f}")
    print(f"Fold {fold+1} F1-Score (Class 1): {f1_fold:.4f}")

    # Store per-fold metrics
    svm_accuracy_scores.append(accuracy_fold)
    svm_precision_scores.append(precision_fold)
    svm_recall_scores.append(recall_fold)
    svm_f1_scores.append(f1_fold)

    # Pool labels, predictions and scores for the consolidated evaluation
    all_y_true.extend(Y_val_fold)
    all_y_pred_svm.extend(y_pred_svm_fold)
    all_y_score_svm.extend(y_score_svm_fold)

print("\nSVM K-fold cross-validation complete. Metrics and predictions stored.")

## SVM Evaluation Summary
A consolidated set of metrics is computed from all folds to summarize SVM performance.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Metrics computed once over the pooled out-of-fold predictions; this dict is
# reused by the comparison table.
svm_summary = {
    "Accuracy": accuracy_score(all_y_true, all_y_pred_svm),
    "Precision": precision_score(all_y_true, all_y_pred_svm),
    "Recall": recall_score(all_y_true, all_y_pred_svm),
    "F1": f1_score(all_y_true, all_y_pred_svm),
}

# Individual names kept for convenience.
svm_accuracy = svm_summary["Accuracy"]
svm_precision = svm_summary["Precision"]
svm_recall = svm_summary["Recall"]
svm_f1 = svm_summary["F1"]

print(f"SVM Accuracy:  {svm_accuracy:.4f}")
print(f"SVM Precision: {svm_precision:.4f}")
print(f"SVM Recall:    {svm_recall:.4f}")
print(f"SVM F1-Score:  {svm_f1:.4f}")


## Metrics and Confusion Matrix for SVM
A consolidated classification report and confusion matrix are produced from all folds.


In [None]:
# Display names for the two classes.
svm_class_labels = ['No Alarm', 'Alarm']

print("Consolidated Classification Report for SVM (K-fold):")
print(classification_report(all_y_true, all_y_pred_svm, target_names=svm_class_labels))

# Confusion matrix over the pooled out-of-fold predictions
cm_svm_consolidated = confusion_matrix(all_y_true, all_y_pred_svm)

# Heat-map of the matrix (rows: true label, columns: predicted label)
svm_cm_fig, svm_cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_svm_consolidated,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=svm_class_labels,
    yticklabels=svm_class_labels,
    ax=svm_cm_ax,
)
svm_cm_ax.set_xlabel('Predicted Label')
svm_cm_ax.set_ylabel('True Label')
svm_cm_ax.set_title('Consolidated Confusion Matrix for SVM (K-fold)')
plt.show()

print("Consolidated Confusion Matrix (K-fold):\n", cm_svm_consolidated)


## ROC Curve (SVM)
ROC is computed from consolidated decision scores across folds.


In [None]:
# ROC points and area under the curve from the pooled SVM decision scores.
fpr_svm, tpr_svm, _ = roc_curve(all_y_true, all_y_score_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)

svm_roc_fig, svm_roc_ax = plt.subplots(figsize=(6, 5))
svm_roc_ax.plot(fpr_svm, tpr_svm, label=f"SVM ROC (AUC = {roc_auc_svm:.4f})")
# Diagonal reference line.
svm_roc_ax.plot([0, 1], [0, 1], "--", color="gray")
svm_roc_ax.set_xlabel("False Positive Rate")
svm_roc_ax.set_ylabel("True Positive Rate")
svm_roc_ax.set_title("ROC Curve - SVM (K-Fold Consolidated)")
svm_roc_ax.legend(loc="lower right")
svm_roc_ax.grid(alpha=0.3)
plt.show()


## SHAP Value Analysis



In [None]:
# SHAP is optional: if the import fails, a hint is printed and the rest of the
# analysis is skipped.
try:
    import shap
except ImportError:
    print("SHAP is not installed. run pip install shap")
else:
    # Fit one SVM on the full (scaled) dataset purely for interpretation;
    # probability=True is needed because predict_proba is called below.
    sc_full = StandardScaler()
    X_full_scaled = sc_full.fit_transform(x)

    svm_full = SVC(random_state=42, C=0.5, probability=True)
    svm_full.fit(X_full_scaled, y)

    # Work on small random subsets to keep the SHAP computation cheap:
    # up to 200 rows to explain and up to 100 rows as background data.
    sample_size = min(200, len(X_full_scaled))
    background_size = min(100, len(X_full_scaled))

    # Seeded generator; the two draws happen in this order, so reordering
    # them would change which rows are selected.
    rng = np.random.default_rng(42)
    sample_idx = rng.choice(len(X_full_scaled), size=sample_size, replace=False)
    background_idx = rng.choice(len(X_full_scaled), size=background_size, replace=False)

    X_sample = X_full_scaled[sample_idx]
    background = X_full_scaled[background_idx]

    # Wrap predict_proba to return only the probability of the positive class (index 1)
    # This ensures explainer.shap_values returns a 2D array, which summary_plot expects directly.
    def model_predict_proba_wrapper(x_input):
        """Return the second predict_proba column of `svm_full` for each row of `x_input`."""
        return svm_full.predict_proba(x_input)[:, 1]

    explainer = shap.KernelExplainer(model_predict_proba_wrapper, background)
    shap_values = explainer.shap_values(X_sample, nsamples=100) # This will now be (num_samples, num_features)

    # Plot SHAP summary (shap_values is now directly for the positive class)
    shap.summary_plot(shap_values, X_sample, feature_names=feature_names)


## Comparison Table
A small summary table is generated to compare the neural network and SVM using the stored cross-validation metrics.


In [None]:
# Neural network (K-fold) summary
nn_precision = float(np.mean(precision_scores))
nn_recall = float(np.mean(recall_scores))
nn_f1 = (2 * nn_precision * nn_recall / (nn_precision + nn_recall)) if (nn_precision + nn_recall) > 0 else np.nan

nn_summary = {
    "Accuracy": float(np.mean(accuracy_scores)),
    "Precision": nn_precision,
    "Recall": nn_recall,
    "AUC": float(np.mean(auc_scores)),
    "F1": nn_f1,
    "Loss": float(np.mean(loss_scores)),
}

# SVM summary from consolidated predictions
# (Assumes svm_summary was created in the SVM evaluation cell)
comparison_df = pd.DataFrame([
    {"Model": "Neural Network", **nn_summary},
    {"Model": "SVM", **svm_summary, "AUC": np.nan, "Loss": np.nan},
], columns=["Model", "Accuracy", "Precision", "Recall", "AUC", "F1", "Loss"])

comparison_df


# Leakage-Free Evaluation (Strict)
This section re-evaluates the models with train-only preprocessing to reduce leakage. Outlier bounds and scaling are fitted on train data and applied to validation/test splits.


## Leakage-Free Data Setup
Features and labels are taken directly from the raw dataset (no correlation-based filtering).


In [None]:
# Columns used by the leakage-free pipeline: six features plus the label.
lf_features = [
    "Temperature[C]",
    "Humidity[%]",
    "TVOC[ppb]",
    "eCO2[ppm]",
    "Raw Ethanol",
    "Pressure[hPa]",
]
lf_columns = lf_features + ["Fire Alarm"]

# Independent copy taken from the raw frame `data` (no outlier filtering applied yet).
lf_df = data[lf_columns].copy()

# Missing values per selected column
print("Missing values (leakage-free setup):")
lf_missing = lf_df.isnull().sum()
print(lf_missing)


## Leakage-Free Outlier Utilities
Outlier bounds are computed on the training split only and then applied to validation/test.


In [None]:
def lf_outlier_bounds(train_df, features, multiplier=2.0):
    """Return per-feature IQR limits computed from ``train_df`` only.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame the quartiles are taken from.
    features : list
        Column labels to compute limits for.
    multiplier : float, default 2.0
        Number of interquartile ranges added beyond each quartile.

    Returns
    -------
    tuple
        ``(lower, upper)`` with ``lower = Q1 - multiplier * IQR`` and
        ``upper = Q3 + multiplier * IQR``, one entry per feature.
    """
    feature_frame = train_df[features]
    first_quartile = feature_frame.quantile(0.25)
    third_quartile = feature_frame.quantile(0.75)
    # Width of the margin placed outside each quartile.
    margin = multiplier * (third_quartile - first_quartile)
    return first_quartile - margin, third_quartile + margin

def lf_filter_outliers(df, features, lower, upper):
    """Drop the rows of ``df`` that fall outside the given limits.

    A row is removed when any of its ``features`` values is below the matching
    entry of ``lower`` or above the matching entry of ``upper``. All columns of
    the surviving rows are returned.
    """
    feature_values = df[features]
    too_low = feature_values < lower
    too_high = feature_values > upper
    is_outlier_row = (too_low | too_high).any(axis=1)
    return df[~is_outlier_row]


## Holdout Evaluation (Leakage-Free)


In [None]:
from sklearn.model_selection import train_test_split

# Stratified split of the raw (unfiltered) frame into train and test parts.
lf_train_df, lf_test_df = train_test_split(
    lf_df, test_size=0.15, random_state=10, stratify=lf_df["Fire Alarm"]
)

# Compute bounds from train only
lf_lower, lf_upper = lf_outlier_bounds(lf_train_df, lf_features, multiplier=2.0)

# Filter outliers in train and test using train bounds
lf_train_df = lf_filter_outliers(lf_train_df, lf_features, lf_lower, lf_upper)
lf_test_df = lf_filter_outliers(lf_test_df, lf_features, lf_lower, lf_upper)

print(f"Leakage-free holdout: train={len(lf_train_df)}, test={len(lf_test_df)}")

X_train_lf = lf_train_df[lf_features].values
y_train_lf = lf_train_df["Fire Alarm"].values
X_test_lf = lf_test_df[lf_features].values
y_test_lf = lf_test_df["Fire Alarm"].values

# Scaling statistics come from the training part only.
sc_lf = StandardScaler()
X_train_lf = sc_lf.fit_transform(X_train_lf)
X_test_lf = sc_lf.transform(X_test_lf)

model_lf = model_gen(**model_args, input_shape=X_train_lf.shape[1:])
model_lf.compile(optimizer="Adam", loss="binary_crossentropy",
                 metrics=["accuracy", "binary_accuracy",
                          tfk.metrics.AUC(), tfk.metrics.Precision(), tfk.metrics.Recall()])

hist_lf = model_lf.fit(x=X_train_lf, y=y_train_lf, batch_size=24, epochs=3, validation_split=0.15)

eval_lf = model_lf.evaluate(X_test_lf, y_test_lf)
print("Leakage-free holdout accuracy:", eval_lf[1])

# Holdout confusion matrix and report (probabilities thresholded at 0.5)
y_hat_lf = model_lf.predict(X_test_lf)
y_hat_lf_bin = (y_hat_lf.T[0] >= 0.5).astype(int)

cm_lf = confusion_matrix(y_test_lf, y_hat_lf_bin)
print("\nConfusion Matrix (Leakage-Free Holdout):\n", cm_lf)
print("\nClassification Report (Leakage-Free Holdout):\n", classification_report(y_test_lf, y_hat_lf_bin, target_names=['No Alarm', 'Alarm']))


## K-Fold Cross-Validation (Leakage-Free NN)


In [None]:
# Per-fold validation metrics of the leakage-free neural network.
lf_nn_accuracy_scores = []
lf_nn_auc_scores = []
lf_nn_precision_scores = []
lf_nn_recall_scores = []
lf_nn_loss_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(lf_df)):
    train_df = lf_df.iloc[train_index]
    val_df = lf_df.iloc[val_index]

    # Train-only outlier bounds, applied to both parts of the fold
    lower, upper = lf_outlier_bounds(train_df, lf_features, multiplier=2.0)
    train_df = lf_filter_outliers(train_df, lf_features, lower, upper)
    val_df = lf_filter_outliers(val_df, lf_features, lower, upper)

    # Fold-local names, so the holdout `X_train` defined earlier is not overwritten.
    X_train_fold = train_df[lf_features].values
    y_train_fold = train_df["Fire Alarm"].values
    X_val_fold = val_df[lf_features].values
    y_val_fold = val_df["Fire Alarm"].values

    # Scaling statistics come from the training part only.
    sc_fold = StandardScaler()
    X_train_fold = sc_fold.fit_transform(X_train_fold)
    X_val_fold = sc_fold.transform(X_val_fold)

    model_fold = model_gen(**model_args, input_shape=X_train_fold.shape[1:])
    model_fold.compile(optimizer="Adam", loss="binary_crossentropy",
                       metrics=["accuracy", "binary_accuracy",
                                tfk.metrics.AUC(), tfk.metrics.Precision(), tfk.metrics.Recall()])

    model_fold.fit(x=X_train_fold, y=y_train_fold, batch_size=24, epochs=3, verbose=0)

    # Positions follow the compile call:
    # 0 loss, 1 accuracy, 2 binary_accuracy, 3 AUC, 4 precision, 5 recall.
    eval_metrics = model_fold.evaluate(X_val_fold, y_val_fold, verbose=0)

    lf_nn_loss_scores.append(eval_metrics[0])
    lf_nn_accuracy_scores.append(eval_metrics[1])
    lf_nn_auc_scores.append(eval_metrics[3])
    lf_nn_precision_scores.append(eval_metrics[4])
    lf_nn_recall_scores.append(eval_metrics[5])

print("\nLeakage-free NN K-fold summary:")
print("Accuracy:  {:.4f} (+/- {:.4f})".format(np.mean(lf_nn_accuracy_scores), np.std(lf_nn_accuracy_scores)))
print("AUC:       {:.4f} (+/- {:.4f})".format(np.mean(lf_nn_auc_scores), np.std(lf_nn_auc_scores)))
print("Precision: {:.4f} (+/- {:.4f})".format(np.mean(lf_nn_precision_scores), np.std(lf_nn_precision_scores)))
print("Recall:    {:.4f} (+/- {:.4f})".format(np.mean(lf_nn_recall_scores), np.std(lf_nn_recall_scores)))
print("Loss:      {:.4f} (+/- {:.4f})".format(np.mean(lf_nn_loss_scores), np.std(lf_nn_loss_scores)))


## K-Fold Cross-Validation (Leakage-Free SVM)


In [None]:
# Per-fold SVM metrics (precision/recall/F1 refer to class 1) and pooled
# out-of-fold labels, predictions and decision scores.
lf_svm_accuracy_scores = []
lf_svm_precision_scores = []
lf_svm_recall_scores = []
lf_svm_f1_scores = []
lf_all_y_true = []
lf_all_y_pred = []
lf_all_y_score = []

for fold, (train_index, val_index) in enumerate(kf.split(lf_df)):
    train_df = lf_df.iloc[train_index]
    val_df = lf_df.iloc[val_index]

    # Train-only outlier bounds, applied to both parts of the fold
    lower, upper = lf_outlier_bounds(train_df, lf_features, multiplier=2.0)
    train_df = lf_filter_outliers(train_df, lf_features, lower, upper)
    val_df = lf_filter_outliers(val_df, lf_features, lower, upper)

    # Fold-local names, so the holdout `X_train` defined earlier is not overwritten.
    X_train_fold = train_df[lf_features].values
    y_train_fold = train_df["Fire Alarm"].values
    X_val_fold = val_df[lf_features].values
    y_val_fold = val_df["Fire Alarm"].values

    # Scaling statistics come from the training part only.
    sc_fold = StandardScaler()
    X_train_fold = sc_fold.fit_transform(X_train_fold)
    X_val_fold = sc_fold.transform(X_val_fold)

    svm_model = SVC(random_state=42, C=0.5)
    svm_model.fit(X_train_fold, y_train_fold)

    y_pred = svm_model.predict(X_val_fold)
    y_score = svm_model.decision_function(X_val_fold)

    # The '1' entry of the report holds the positive-class figures.
    report = classification_report(y_val_fold, y_pred, output_dict=True)
    lf_svm_accuracy_scores.append(report['accuracy'])
    lf_svm_precision_scores.append(report['1']['precision'])
    lf_svm_recall_scores.append(report['1']['recall'])
    lf_svm_f1_scores.append(report['1']['f1-score'])

    lf_all_y_true.extend(y_val_fold)
    lf_all_y_pred.extend(y_pred)
    lf_all_y_score.extend(y_score)

print("\nLeakage-free SVM K-fold summary:")
print("Accuracy:  {:.4f} (+/- {:.4f})".format(np.mean(lf_svm_accuracy_scores), np.std(lf_svm_accuracy_scores)))
print("Precision: {:.4f} (+/- {:.4f})".format(np.mean(lf_svm_precision_scores), np.std(lf_svm_precision_scores)))
print("Recall:    {:.4f} (+/- {:.4f})".format(np.mean(lf_svm_recall_scores), np.std(lf_svm_recall_scores)))
print("F1:        {:.4f} (+/- {:.4f})".format(np.mean(lf_svm_f1_scores), np.std(lf_svm_f1_scores)))


## Leakage-Free Comparison Table


In [None]:
# Neural network: averages of the leakage-free per-fold metrics.
lf_nn_precision = float(np.mean(lf_nn_precision_scores))
lf_nn_recall = float(np.mean(lf_nn_recall_scores))

# F1 derived from the averaged precision and recall; NaN when both are zero.
if (lf_nn_precision + lf_nn_recall) > 0:
    lf_nn_f1 = 2 * lf_nn_precision * lf_nn_recall / (lf_nn_precision + lf_nn_recall)
else:
    lf_nn_f1 = np.nan

lf_nn_summary = dict(
    Accuracy=float(np.mean(lf_nn_accuracy_scores)),
    Precision=lf_nn_precision,
    Recall=lf_nn_recall,
    AUC=float(np.mean(lf_nn_auc_scores)),
    F1=lf_nn_f1,
    Loss=float(np.mean(lf_nn_loss_scores)),
)

# SVM: averages of the leakage-free per-fold metrics (AUC and Loss are left empty).
lf_svm_summary = dict(
    Accuracy=float(np.mean(lf_svm_accuracy_scores)),
    Precision=float(np.mean(lf_svm_precision_scores)),
    Recall=float(np.mean(lf_svm_recall_scores)),
    F1=float(np.mean(lf_svm_f1_scores)),
)

lf_nn_row = {"Model": "Neural Network", **lf_nn_summary}
lf_svm_row = {"Model": "SVM", **lf_svm_summary, "AUC": np.nan, "Loss": np.nan}

leakage_free_comparison = pd.DataFrame(
    [lf_nn_row, lf_svm_row],
    columns=["Model", "Accuracy", "Precision", "Recall", "AUC", "F1", "Loss"],
)

leakage_free_comparison
