In [2]:
#%% [code]
import os
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, matthews_corrcoef, confusion_matrix
)
import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import Callback
from pandas.plotting import parallel_coordinates

In [3]:


# Ensure the "plot" output directory exists before any figure is saved into it;
# exist_ok=True makes re-running this cell a no-op when the directory is already there.
os.makedirs("plot", exist_ok=True)

#---------------------------
# Custom EarlyStopping
#---------------------------
class ConsecutiveEarlyStopping(Callback):
    """Stop training once a metric exceeds a threshold for N consecutive epochs.

    Args:
        monitor: Key looked up in the per-epoch ``logs`` dict (default
            ``'accuracy'``, i.e. the training accuracy reported by Keras).
        threshold: The monitored value must be strictly greater than this
            for an epoch to count towards the streak.
        patience: Number of consecutive qualifying epochs required before
            ``model.stop_training`` is set.
        verbose: If truthy, print a message when training is stopped.
    """

    def __init__(self, monitor='accuracy', threshold=0.99, patience=5, verbose=1):
        super().__init__()
        self.monitor = monitor
        self.threshold = threshold
        self.patience = patience
        self.verbose = verbose
        self.counter = 0

    def on_train_begin(self, logs=None):
        """Reset the streak so a reused instance starts every fit() from zero."""
        self.counter = 0

    def on_epoch_end(self, epoch, logs=None):
        """Update the streak counter and request a stop when patience is reached."""
        # `logs` defaults to None; treat that like an empty dict instead of
        # failing with AttributeError on `.get`.
        val = (logs or {}).get(self.monitor)
        if val is None:
            # Metric not reported this epoch: leave the streak untouched.
            return
        if val > self.threshold:
            self.counter += 1
            if self.counter >= self.patience:
                if self.verbose:
                    print(f"\nEpoch {epoch+1}: {self.monitor} > {self.threshold}"
                          f" for {self.patience} consecutive epochs. Stopping.")
                self.model.stop_training = True
        else:
            # Any epoch at or below the threshold breaks the streak.
            self.counter = 0

#---------------------------
# Build Models
#---------------------------
def build_cnn_model(input_shape=(100, 6), learning_rate=0.005):
    """Build and compile a 1-D convolutional binary classifier.

    Architecture: Conv1D(32) -> Conv1D(64) -> MaxPooling1D(2) -> Conv1D(128)
    -> GlobalAveragePooling1D -> Dense(64) -> Dense(1, sigmoid).

    Args:
        input_shape: ``(timesteps, features)`` of one sample. Defaults to
            ``(100, 6)``, the shape previously hard-coded here.
        learning_rate: Adam learning rate (default 0.005).

    Returns:
        A compiled Keras model named ``"CNN"`` using binary cross-entropy
        loss and reporting accuracy.
    """
    inputs = tf.keras.Input(shape=input_shape)
    x = layers.Conv1D(32, 3, activation="relu")(inputs)
    x = layers.Conv1D(64, 3, activation="relu")(x)
    x = layers.MaxPooling1D(pool_size=2)(x)
    x = layers.Conv1D(128, 3, activation="relu")(x)
    # Collapse the time axis so the dense head sees one vector per sample.
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(64, activation="relu")(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = models.Model(inputs=inputs, outputs=outputs, name="CNN")
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss="binary_crossentropy", metrics=["accuracy"])
    return model


def build_lstm_model(input_shape=(100, 6), learning_rate=0.005):
    """Build and compile a stacked-LSTM binary classifier.

    Architecture: LSTM(64, return_sequences=True) -> LSTM(32) -> Dense(32)
    -> Dense(1, sigmoid).

    Args:
        input_shape: ``(timesteps, features)`` of one sample. Defaults to
            ``(100, 6)``, the shape previously hard-coded here.
        learning_rate: Adam learning rate (default 0.005).

    Returns:
        A compiled Keras model named ``"LSTM"`` using binary cross-entropy
        loss and reporting accuracy.
    """
    inputs = tf.keras.Input(shape=input_shape)
    # First LSTM keeps the full sequence so the second LSTM can consume it.
    x = layers.LSTM(64, return_sequences=True)(inputs)
    x = layers.LSTM(32)(x)
    x = layers.Dense(32, activation="relu")(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = models.Model(inputs=inputs, outputs=outputs, name="LSTM")
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss="binary_crossentropy", metrics=["accuracy"])
    return model


def build_cnn_lstm_model(input_shape=(100, 6), learning_rate=0.005):
    """Build and compile a hybrid Conv1D + LSTM binary classifier.

    Architecture: Conv1D(32) -> MaxPooling1D(2) -> Conv1D(64)
    -> MaxPooling1D(2) -> LSTM(32) -> Dense(32) -> Dense(1, sigmoid).

    Args:
        input_shape: ``(timesteps, features)`` of one sample. Defaults to
            ``(100, 6)``, the shape previously hard-coded here.
        learning_rate: Adam learning rate (default 0.005).

    Returns:
        A compiled Keras model named ``"CNN_LSTM"`` using binary
        cross-entropy loss and reporting accuracy.
    """
    inputs = tf.keras.Input(shape=input_shape)
    x = layers.Conv1D(32, 3, activation="relu")(inputs)
    x = layers.MaxPooling1D(pool_size=2)(x)
    x = layers.Conv1D(64, 3, activation="relu")(x)
    x = layers.MaxPooling1D(pool_size=2)(x)
    # The LSTM consumes the shortened, convolved sequence and returns its last state.
    x = layers.LSTM(32)(x)
    x = layers.Dense(32, activation="relu")(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = models.Model(inputs=inputs, outputs=outputs, name="CNN_LSTM")
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss="binary_crossentropy", metrics=["accuracy"])
    return model

# Registry of (display name, zero-argument builder) pairs. main() iterates this
# list, calls each builder to obtain a compiled model, and uses the name as the
# 'model' key in the recorded metrics.
model_builders = [
    ("CNN", build_cnn_model),
    ("LSTM", build_lstm_model),
    ("CNN_LSTM", build_cnn_lstm_model)
]

#---------------------------
# Load and Prepare Data
#---------------------------
def load_and_prepare(feature_path="final_f.csv", label_path="final_l.csv", rows_per=100):
    """Load the feature/label CSVs and reshape features into 3-D samples.

    The feature file is expected to hold ``rows_per`` consecutive rows per
    sample, and the label file one row per sample; the label is read from
    the first column of the label file.

    Missing feature values are filled with their column mean. Samples whose
    label is missing are dropped together with their feature rows: mean-filling
    a classification target would produce fractional labels that are not valid
    classes.

    Args:
        feature_path: CSV with the per-timestep features.
        label_path: CSV with one label row per sample.
        rows_per: Number of feature rows that make up one sample.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(n_samples, rows_per, n_features)`` and ``y`` has shape ``(n_samples,)``.

    Raises:
        ValueError: If the feature row count is not ``rows_per`` times the
            label row count.
    """
    df_feat = pd.read_csv(feature_path)
    df_lab = pd.read_csv(label_path)

    # Impute missing feature values with the column mean (non-mutating form).
    if df_feat.isnull().values.any():
        df_feat = df_feat.fillna(df_feat.mean())

    n_feat = df_feat.shape[1]
    n_samples = df_lab.shape[0]
    expected_rows = n_samples * rows_per
    if df_feat.shape[0] != expected_rows:
        # Fail with an actionable message instead of a bare reshape error.
        raise ValueError(
            f"Feature file has {df_feat.shape[0]} rows but {n_samples} labels x "
            f"{rows_per} rows per sample requires {expected_rows}."
        )

    X = df_feat.values.reshape(n_samples, rows_per, n_feat)
    labels = df_lab.iloc[:, 0]

    # Drop samples with a missing label, keeping X and y aligned.
    has_label = labels.notna().values
    if not has_label.all():
        print(f"Dropping {int((~has_label).sum())} sample(s) with missing labels.")
        X = X[has_label]
        labels = labels[has_label]

    y = labels.values
    return X, y

#---------------------------
# Scale Data
#---------------------------
def scale_data(X, scaler_path="scaler1.json"):
    """Min-max scale every feature to [0, 1] and save the scaler parameters.

    The 3-D array is flattened to ``(n_samples * rows, n_feat)`` so that each
    feature column is scaled with a single min/max across all samples and
    timesteps, then reshaped back to its original shape.

    Args:
        X: Array of shape ``(n_samples, rows, n_feat)``.
        scaler_path: Destination JSON file for the fitted scaler attributes
            (defaults to ``"scaler1.json"``, the path previously hard-coded).

    Returns:
        Scaled array with the same shape as ``X``.
    """
    # NOTE(review): the scaler is fitted on whatever is passed in; the caller
    # currently passes the full dataset before splitting, so test-set min/max
    # influence the scaling. Consider fitting on the training split only.
    n_samples, rows, n_feat = X.shape
    scaler = MinMaxScaler((0, 1))
    X_flat = X.reshape(-1, n_feat)
    Xs_flat = scaler.fit_transform(X_flat)

    # Persist the fitted parameters so the same scaling can be re-applied later.
    scaler_params = {
        "min_": scaler.min_.tolist(),
        "scale_": scaler.scale_.tolist(),
        "data_min_": scaler.data_min_.tolist(),
        "data_max_": scaler.data_max_.tolist(),
        "data_range_": scaler.data_range_.tolist()
    }
    with open(scaler_path, "w", encoding="utf-8") as f:
        json.dump(scaler_params, f, indent=4)
    return Xs_flat.reshape(n_samples, rows, n_feat)

#---------------------------
# Main Workflow
#---------------------------
def _evaluate_predictions(name, y_true, prob):
    """Compute the per-run metric record for one model evaluation.

    Args:
        name: Model name stored under the ``'model'`` key.
        y_true: True labels of the test split (assumed to be 0/1, matching
            the sigmoid / binary cross-entropy models — TODO confirm).
        prob: Predicted probabilities; flattened to 1-D before use.

    Returns:
        Dict with the model name and all evaluation metrics.
    """
    prob = np.asarray(prob).ravel()
    pred = (prob > 0.5).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even when a split contains only one
    # class; otherwise the 4-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, pred, labels=[0, 1]).ravel()
    both_classes = len(np.unique(y_true)) > 1
    return {
        'model': name,
        'accuracy': accuracy_score(y_true, pred),
        'precision': precision_score(y_true, pred, zero_division=0),
        'recall': recall_score(y_true, pred, zero_division=0),
        'f1_score': f1_score(y_true, pred, zero_division=0),
        'roc_auc': roc_auc_score(y_true, prob) if both_classes else np.nan,
        'mcc': matthews_corrcoef(y_true, pred),
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else np.nan,
        # Explicit labels so log_loss also works on a single-class split.
        'log_loss': log_loss(y_true, prob, labels=[0, 1])
    }


def _plot_metric_summaries(agg, labels):
    """Save heatmap, radar, parallel-coordinates and grouped-bar charts under plot/."""
    # Heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(agg.set_index('model')[labels], annot=True, cmap='YlGnBu', fmt='.3f')
    plt.title('Heatmap of Average Metrics per Model')
    plt.savefig('plot/heatmap_avg.png')
    plt.close()

    # Radar chart: repeat the first angle/value to close each polygon.
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    for _, row in agg.iterrows():
        vals = row[labels].tolist()
        vals += vals[:1]
        ax.plot(angles, vals, label=row['model'])
        ax.fill(angles, vals, alpha=0.1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)
    ax.legend(bbox_to_anchor=(1.1, 1.1))
    plt.savefig('plot/radar_avg.png')
    plt.close()

    # Parallel coordinates
    pc = agg.copy()
    pc['model_label'] = pc['model']
    plt.figure(figsize=(8, 6))
    parallel_coordinates(pc[['model_label'] + labels], 'model_label')
    plt.title('Parallel Coordinates of Average Metrics')
    plt.savefig('plot/parallel_avg.png')
    plt.close()

    # Grouped bar
    melt = agg.melt(id_vars='model', value_vars=labels, var_name='metric', value_name='value')
    plt.figure(figsize=(10, 6))
    sns.barplot(data=melt, x='metric', y='value', hue='model')
    plt.title('Grouped Bar Chart of Average Metrics')
    plt.savefig('plot/grouped_bar_avg.png')
    plt.close()


def main():
    """Train every registered model over several random splits and report metrics.

    For each (name, builder) in ``model_builders`` and each iteration, the
    data is split 80/10/10 into train/validation/test, a fresh model is
    trained, and test-set metrics are recorded. Per-model means are written
    to ``eval.csv`` and four comparison charts are saved under ``plot/``.
    """
    # Load & scale
    # NOTE(review): scaling happens before the split, so the scaler sees
    # test-set values; consider fitting it on the training split only.
    X, y = load_and_prepare()
    Xs = scale_data(X)

    # Training settings
    iterations = 3
    records = []
    total = len(model_builders) * iterations

    # Context manager guarantees the bar is closed even if something raises.
    with tqdm(total=total, desc="Overall Progress") as pbar:
        for name, builder in model_builders:
            for it in range(iterations):
                try:
                    # Train / validation / test split (80 / 10 / 10), seeded per iteration
                    X_tr, X_temp, y_tr, y_temp = train_test_split(
                        Xs, y, test_size=0.2, random_state=it)
                    X_val, X_te, y_val, y_te = train_test_split(
                        X_temp, y_temp, test_size=0.5, random_state=it)

                    # Build a fresh model for every iteration. Reusing one model
                    # across iterations would carry trained weights into a new
                    # split whose test samples the weights may already have
                    # been trained on, inflating the reported metrics.
                    tf.keras.backend.clear_session()
                    model = builder()
                    model.fit(
                        X_tr, y_tr,
                        epochs=100,
                        batch_size=16,
                        validation_data=(X_val, y_val),
                        callbacks=[ConsecutiveEarlyStopping()],
                        verbose=0
                    )

                    # Evaluate on the held-out test split
                    prob = model.predict(X_te)
                    records.append(_evaluate_predictions(name, y_te, prob))
                except Exception as e:
                    # Best effort: report the failed run and continue with the rest.
                    print(f"{name} iteration {it} failed: {e}")
                pbar.update(1)

    if not records:
        # Without any record the groupby below would fail on a missing column.
        print("No successful training runs; skipping aggregation and plots.")
        return

    # Aggregate and save
    df = pd.DataFrame(records)
    metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'mcc', 'specificity', 'log_loss']
    agg = df.groupby('model')[metrics].mean().reset_index()
    agg.to_csv('eval.csv', index=False)

    # Plotting aggregated: map log loss to 1/(1+log_loss) so that, like the
    # other plotted metrics, a larger value means a better model.
    agg['inv_log_loss'] = 1 / (1 + agg['log_loss'])
    labels = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'mcc', 'specificity', 'inv_log_loss']
    _plot_metric_summaries(agg, labels)

# Run the full workflow only when this file is executed as the top-level
# module, so importing it does not start training.
if __name__ == '__main__':
    main()

# -- End of Notebook Cell --

# What this code does:
# - Loads `final_f.csv` and `final_l.csv`, fills missing values with column means.
# - Reshapes data into (samples, 100, features) and scales features with MinMaxScaler.
# - Initializes three models: CNN, LSTM, CNN_LSTM.
# - Trains each model for 3 iterations (up to 100 epochs each; training stops early once training accuracy exceeds 0.99 for 5 consecutive epochs), tracking overall progress with a progress bar.
# - Collects accuracy, precision, recall, F1, ROC AUC, MCC, specificity, and log loss per iteration.
# - Aggregates metrics by model (averaged over iterations), saves `eval.csv`.
# - Plots four comparison charts (heatmap, radar, parallel coordinates, grouped bar) from the averaged metrics.


Overall Progress:   0%|                                                                          | 0/9 [00:00<?, ?it/s]

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step  


Overall Progress:  11%|███████▏                                                         | 1/9 [05:04<40:39, 304.89s/it]

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


Overall Progress:  22%|██████████████▍                                                  | 2/9 [09:46<33:57, 291.14s/it]

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


Overall Progress:  33%|█████████████████████▋                                           | 3/9 [14:17<28:11, 281.95s/it]

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step  


Overall Progress:  44%|███████████████████████████▌                                  | 4/9 [54:48<1:34:12, 1130.41s/it]

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


Overall Progress:  56%|█████████████████████████████████▎                          | 5/9 [1:34:28<1:45:23, 1580.94s/it]

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step


Overall Progress:  67%|████████████████████████████████████████                    | 6/9 [2:14:00<1:32:29, 1849.96s/it]

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step  


Overall Progress:  78%|████████████████████████████████████████████████▏             | 7/9 [2:21:10<46:11, 1385.60s/it]

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


Overall Progress:  89%|███████████████████████████████████████████████████████       | 8/9 [2:28:32<18:05, 1085.37s/it]

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


Overall Progress: 100%|██████████████████████████████████████████████████████████████| 9/9 [2:35:40<00:00, 1037.86s/it]
