Connected to .venv (Python 3.12.10)

In [1]:
import tensorflow as tf
# Show which GPU devices TensorFlow detects (an empty list means none were found).
tf.config.list_physical_devices('GPU')

[]

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    confusion_matrix as sk_confusion_matrix,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import StandardScaler
import logging
import time
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from catboost import CatBoostClassifier
from tensorflow.keras.utils import to_categorical 
import tensorflow as tf 
import logging
import gc 
import matplotlib.pyplot as plt

In [3]:
# Configure the root logger: INFO level, lines formatted as "timestamp - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [4]:
# Per-model weights applied to the three CNNs' predicted probabilities, in the
# order the predictions are passed to weighted_ensemble_predictions.
ENSEMBLE_WEIGHTS = [0.4, 0.3, 0.3]
# Number of target classes; sets the softmax width and the one-hot encoding size.
num_classes = 2
# NOTE(review): epochs, batch_size and learning_rate below are not read by the
# code visible in this notebook -- training takes its hyperparameters from the
# params dict built in the main cell. Confirm before relying on these values.
epochs = 50
batch_size = 16
learning_rate = 0.1
# Fixed label set [0, ..., num_classes - 1], passed as `labels=` to the confusion matrix.
all_possible_labels = list(range(num_classes))

In [5]:
def create_cnn_model_1(input_shape, num_classes):
    """Build the first ensemble member: Conv1D x2 -> MaxPooling1D -> Flatten -> Dense(8) -> softmax.

    Args:
        input_shape: Shape of a single sample handed to ``Input`` (the training
            function in this notebook passes ``(n_features, 1)``).
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model``.
    """
    input_layer = Input(shape=input_shape)
    conv1 = Conv1D(filters=10, kernel_size=6, activation='relu', padding='same')(input_layer)
    conv2 = Conv1D(filters=10, kernel_size=6, activation='relu', padding='same')(conv1)
    pool = MaxPooling1D(pool_size=2, padding='same')(conv2)
    flatten = Flatten()(pool)
    dense1 = Dense(units=8, activation='relu')(flatten)
    # The output layer was previously named 'ecn2_output', duplicating model 2's
    # name; it now follows the ecn<N>_output pattern used by models 2 and 3.
    output_layer = Dense(units=num_classes, activation='softmax', name='ecn1_output')(dense1)
    return Model(inputs=input_layer, outputs=output_layer)


def create_cnn_model_2(input_shape, num_classes):
    """Build the second ensemble member: like model 1 but with Dropout(0.5) before the output.

    Args:
        input_shape: Shape of a single sample handed to ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model`` whose output layer is named 'ecn2_output'.
    """
    conv_kwargs = dict(filters=10, kernel_size=6, activation='relu', padding='same')

    inputs = Input(shape=input_shape)
    x = inputs
    # Two identical convolution layers stacked back to back.
    for _ in range(2):
        x = Conv1D(**conv_kwargs)(x)
    x = MaxPooling1D(pool_size=2, padding='same')(x)
    x = Flatten()(x)
    x = Dense(units=8, activation='relu')(x)
    x = Dropout(0.5)(x)
    outputs = Dense(units=num_classes, activation='softmax', name='ecn2_output')(x)
    return Model(inputs=inputs, outputs=outputs)


def create_cnn_model_3(input_shape, num_classes):
    """Build the third ensemble member: Conv1D x2 -> MaxPooling1D -> Flatten -> softmax.

    Unlike the other two builders there is no hidden dense layer; the flattened
    features feed the softmax layer directly.

    Args:
        input_shape: Shape of a single sample handed to ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model`` whose output layer is named 'ecn3_output'.
    """
    inputs = Input(shape=input_shape)
    layer_stack = [
        Conv1D(filters=10, kernel_size=6, activation='relu', padding='same'),
        Conv1D(filters=10, kernel_size=6, activation='relu', padding='same'),
        MaxPooling1D(pool_size=2, padding='same'),
        Flatten(),
        Dense(units=num_classes, activation='softmax', name='ecn3_output'),
    ]
    tensor = inputs
    # Thread the input through each layer in declaration order.
    for layer in layer_stack:
        tensor = layer(tensor)
    return Model(inputs=inputs, outputs=tensor)

In [6]:
def weighted_ensemble_predictions(predictions, weights):
    """Combine per-model class scores with fixed weights and return the winning class per row.

    Args:
        predictions: Sequence of arrays, one per model, each indexed as
            (sample, class).
        weights: Sequence of scalars, one per model, in the same order.

    Returns:
        1-D array with the argmax class index of the weighted score sum for
        every sample.

    Raises:
        ValueError: If the number of prediction arrays and weights differ.
    """
    n_models = len(predictions)
    if n_models != len(weights):
        raise ValueError("Number of models must match number of weights")

    # Scale each model's scores by its weight, then add the models together.
    scaled = np.array([scores * weight for scores, weight in zip(predictions, weights)])
    combined = scaled.sum(axis=0)
    return np.argmax(combined, axis=1)

In [7]:
def _specificity_sensitivity(cm):
    """Return (specificity, sensitivity) from the first two classes of a confusion matrix.

    Rows are true labels and columns are predicted labels, so row 0 holds
    TN/FP and row 1 holds FN/TP. A class with no true samples yields 0.0,
    mirroring the ``zero_division=0`` used for precision/recall.
    """
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    negatives = tn + fp
    positives = fn + tp
    specificity = float(tn / negatives) if negatives else 0.0
    sensitivity = float(tp / positives) if positives else 0.0
    return specificity, sensitivity


def train_and_evaluate_cnn_ensemble(X_train, y_train, X_val, y_val, params):
    """Train the three CNNs on one split and score their weighted ensemble.

    Features are standardised with statistics fitted on the training split
    only, reshaped to (samples, features, 1) for Conv1D, and the labels are
    one-hot encoded with the module-level ``num_classes``. Each model is
    trained independently; their predicted probabilities are combined with
    the module-level ``ENSEMBLE_WEIGHTS``.

    Args:
        X_train, X_val: Feature tables (DataFrame or array-like), samples x features.
        y_train, y_val: Integer-convertible class labels (Series or array-like).
        params: Dict with keys 'learning_rate', 'epochs' and 'batch_size'.

    Returns:
        Tuple ``(metrics, training_time, prediction_time)``. ``metrics`` is a
        dict (accuracy, precision, recall, f1_score, confusion_matrix,
        roc_auc, specificity, sensitivity) or ``None`` if metric computation
        failed. If the training data has no feature columns the function
        returns ``(None, 0.0, 0.0)`` without training.

    Notes:
        roc_auc, specificity and sensitivity assume a binary problem with the
        positive class encoded as 1.
    """
    start_time = time.time()
    logging.info("Training CNN Ensemble...")

    X_train_np = X_train.values if isinstance(X_train, pd.DataFrame) else np.array(X_train)
    y_train_np = y_train.values if isinstance(y_train, pd.Series) else np.array(y_train)
    X_val_np = X_val.values if isinstance(X_val, pd.DataFrame) else np.array(X_val)
    y_val_np = y_val.values if isinstance(y_val, pd.Series) else np.array(y_val)

    y_train_np = y_train_np.astype(int)
    y_val_np = y_val_np.astype(int)

    # Check before scaling: StandardScaler itself raises on a zero-feature
    # matrix, so a check placed after it could never be reached. Return a
    # 3-tuple so callers that unpack the result keep working.
    if X_train_np.ndim == 2 and X_train_np.shape[1] == 0:
        logging.error("Error: X_train has 0 features after preprocessing/selection.")
        return None, 0.0, 0.0

    # Fit the scaler on the training split only to avoid leaking validation statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_np)
    X_val_scaled = scaler.transform(X_val_np)

    # Conv1D expects (samples, steps, channels); use a single channel.
    X_train_reshaped = np.expand_dims(X_train_scaled, axis=-1).astype(np.float32)
    X_val_reshaped = np.expand_dims(X_val_scaled, axis=-1).astype(np.float32)
    input_shape = (X_train_reshaped.shape[1], 1)

    y_train_categorical = to_categorical(y_train_np, num_classes=num_classes)
    y_val_categorical = to_categorical(y_val_np, num_classes=num_classes)

    builders = (create_cnn_model_1, create_cnn_model_2, create_cnn_model_3)
    models = [build(input_shape, num_classes) for build in builders]

    # Softmax outputs with one-hot targets pair with categorical cross-entropy.
    # For two classes this matches the previous binary_crossentropy objective,
    # and it stays correct if num_classes is ever raised.
    loss_function = 'categorical_crossentropy'
    for cnn in models:
        cnn.compile(optimizer=Adam(learning_rate=params['learning_rate']),
                    loss=loss_function, metrics=['accuracy'])

    train_start_time = time.time()
    logging.info("Fitting CNN Models...")
    for cnn in models:
        cnn.fit(X_train_reshaped, y_train_categorical, epochs=params['epochs'],
                batch_size=params['batch_size'], verbose=0,
                validation_data=(X_val_reshaped, y_val_categorical))
    training_time = time.time() - train_start_time
    logging.info(f"CNN Ensemble trained in {training_time:.2f} seconds.")

    predict_start_time = time.time()
    logging.info("Predicting with CNN Models...")
    cnn_predictions = [
        cnn.predict(X_val_reshaped, batch_size=params['batch_size'], verbose=0)
        for cnn in models
    ]
    ensemble_predictions = weighted_ensemble_predictions(cnn_predictions, ENSEMBLE_WEIGHTS)
    prediction_time = time.time() - predict_start_time
    logging.info(f"CNN Ensemble predicted in {prediction_time:.2f} seconds.")

    # Weighted class scores, kept so ROC AUC is ranked on continuous scores
    # rather than on already-thresholded labels.
    ensemble_scores = np.sum(
        [scores * weight for scores, weight in zip(cnn_predictions, ENSEMBLE_WEIGHTS)], axis=0)

    logging.info("Calculating Evaluation Metrics...")
    try:
        # Compute the matrix once, with a pinned label set so it is always
        # num_classes x num_classes even if a class is absent from this split.
        cm = sk_confusion_matrix(y_val_np, ensemble_predictions, labels=all_possible_labels)
        specificity, sensitivity = _specificity_sensitivity(cm)
        metrics = {
            'accuracy': accuracy_score(y_val_np, ensemble_predictions),
            'precision': precision_score(y_val_np, ensemble_predictions, average='macro', zero_division=0),
            'recall': recall_score(y_val_np, ensemble_predictions, average='macro', zero_division=0),
            'f1_score': f1_score(y_val_np, ensemble_predictions, average='macro', zero_division=0),
            'confusion_matrix': cm.tolist(),
            'roc_auc': roc_auc_score(y_val_np, ensemble_scores[:, 1]),
            'specificity': specificity,
            'sensitivity': sensitivity,
        }
        logging.info(f"Evaluation metrics: {metrics}")
    except Exception as e:
        # Best effort: a metric failure (e.g. a single-class split for ROC AUC)
        # must not discard the timings of a completed training run.
        logging.error(f"Error calculating metrics: {e}")
        metrics = None

    logging.info("Cleaning up models...")
    # Drop every reference (including the loop variable) before clearing the
    # Keras session so the models can actually be garbage-collected.
    del cnn, models, cnn_predictions, ensemble_scores
    tf.keras.backend.clear_session()
    gc.collect()

    logging.info(f"CNN Ensemble trained and evaluated in {time.time() - start_time:.2f} seconds.")

    return metrics, training_time, prediction_time

In [8]:
def load_and_preprocess_data_and_split(data_path, target_column='target'):
    """Load a CSV, mean-impute missing numeric values and make a stratified 80/20 split.

    Args:
        data_path: Path of the CSV file to read.
        target_column: Name of the label column.

    Returns:
        Tuple ``(data, X_train, y_train, X_val, y_val)`` where ``data`` is the
        full cleaned frame (features and target).

    Raises:
        KeyError: If ``target_column`` is not a column of the CSV.

    Notes:
        Imputation uses column means computed over the whole file, i.e. before
        any train/validation split -- NOTE(review): this lets validation rows
        influence the fill values; confirm that is acceptable.
    """
    logging.info(f"Loading data from: {data_path}")

    data = pd.read_csv(data_path)
    # Drop the index column that pandas writes when a frame is saved with its index.
    data = data.drop(columns='Unnamed: 0', errors='ignore')

    if target_column not in data.columns:
        raise KeyError(f"Target column '{target_column}' not found in {data_path}")

    # A missing label must not be mean-imputed: the fill value would be a
    # fractional number that is not a valid class.
    missing_target = data[target_column].isna()
    if missing_target.any():
        logging.warning(
            f"Dropping {int(missing_target.sum())} rows with a missing '{target_column}' value.")
        data = data.loc[~missing_target]

    # numeric_only=True keeps this working when the file has non-numeric
    # columns, for which DataFrame.mean() raises in pandas >= 2.0.
    data = data.fillna(data.mean(numeric_only=True))
    print(data.shape)
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    logging.info("Data loaded and preprocessed.")
    logging.info(f"Training set shape: {X_train.shape}, Validation set shape: {X_val.shape}")
    return data, X_train, y_train, X_val, y_val

In [9]:
def reduce_dimensionality(X, y, top_n_features=50, save_path='reduced_data.pkl'):
    start_time = time.time()
    logging.info("Starting dimensionality reduction...")

    if save_path and os.path.exists(save_path):
        logging.info(f"Loading reduced data from: {save_path}")
        saved_data = pd.read_pickle(save_path)
        X_reduced = saved_data['X']
        selected_features = saved_data['features']
        logging.info(f"Loaded reduced data from {save_path} in {time.time() - start_time:.2f} seconds.")
        return X_reduced, selected_features

   
    from catboost import CatBoostClassifier
    model = CatBoostClassifier(iterations=200, 
                               depth=4, 
                               learning_rate=0.1,
                               loss_function='Logloss',
                               verbose=1, 
                               random_seed=42,
                               early_stopping_rounds=200,
                               task_type='GPU')
    model.fit(X, y, verbose=1)
    importance_df = pd.DataFrame({'feature': X.columns,
                                  'importance': model.get_feature_importance()})
    top_features_catboost = importance_df.nlargest(top_n_features,
                                                 'importance')['feature'].tolist()

    import shap
    logging.info("Calculating SHAP values...")
    top_features_shap = None 
    try:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)

        if isinstance(shap_values, list):
            logging.info("SHAP values appear to be multiclass (list).") 
    
            if not shap_values:
                 raise ValueError("SHAP explainer returned an empty list.")
            shap_values = [np.array(vals) for vals in shap_values]

            shapes_in_list = [vals.shape for vals in shap_values]
            logging.info(f"Shapes within SHAP values list: {shapes_in_list}")

            shap_class_importance = [np.abs(vals).mean(axis=0) for vals in shap_values]
            shap_importance = np.mean(shap_class_importance, axis=0)
        elif isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
            logging.info(f"SHAP values: Multiclass (3D Array), Shape: {shap_values.shape}")

            shap_importance = np.abs(shap_values).mean(axis=(0, 2)) 
        else:            
            if not isinstance(shap_values, np.ndarray):
                 raise TypeError(f"Expected shap_values to be list or numpy array, got {type(shap_values)}")

            logging.info(f"SHAP values appear to be binary/regression (shape: {shap_values.shape}).") 
            shap_importance = np.abs(shap_values).mean(axis=0)

        logging.info(f"Type of shap_importance: {type(shap_importance)}")
        logging.info(f"Shape of shap_importance: {np.array(shap_importance).shape}")
        logging.info(f"Type of X.columns: {type(X.columns)}")
        logging.info(f"Shape of X.columns: {X.columns.shape}")
        logging.info(f"Length comparison: len(X.columns)={len(X.columns)}, len(shap_importance)={len(shap_importance)}")
   
        if np.isnan(shap_importance).any() or np.isinf(shap_importance).any():
             logging.warning("NaN or Inf detected in shap_importance values.")
             shap_importance = np.nan_to_num(shap_importance, nan=0.0, posinf=0.0, neginf=0.0)


        shap_importance_df = pd.DataFrame({'feature': X.columns,
                                           'importance': shap_importance}) 

        top_features_shap = shap_importance_df.nlargest(top_n_features, 'importance')['feature'].tolist()
        logging.info("SHAP values calculated.")
        

    except Exception as e:
        logging.error(f"Error calculating SHAP values: {e}") 
        logging.warning("Using only CatBoost features due to SHAP error.")
        top_features_shap = top_features_catboost 


    selected_features = list(set(top_features_catboost) & set(top_features_shap))
    X_reduced = X[selected_features]
    


    if save_path:
        data_to_save = {'X': X_reduced, 'features': selected_features}
        pd.to_pickle(data_to_save, save_path)
        logging.info(f"Saved reduced data to {save_path}")

    logging.info(f"Dimensionality reduction completed in {time.time() - start_time:.2f} seconds.")
    return X_reduced, selected_features, top_features_catboost, top_features_shap

In [10]:
from sklearn.model_selection import StratifiedKFold
import seaborn as sns

if __name__ == '__main__':
    # Driver: 10-fold stratified cross-validation of the CNN ensemble, with
    # feature selection redone inside every fold on that fold's training rows.
    total_start_time = time.time()
    data_path = 'C:/Class/HK6/IPrj/Datasets/merged_3374_data.csv'

    # The loader returns a hold-out split, but cross-validation runs over all
    # rows, so the two halves are stitched back together here.
    _, X_train, y_train, X_test, y_test = load_and_preprocess_data_and_split(data_path)

    X = pd.concat([X_train, X_test], axis=0)
    y = pd.concat([y_train, y_test], axis=0)

    params_cnn = {
        'learning_rate': 0.001,
        'epochs': 50,
        'batch_size': 32,
    }

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    completed_folds = []
    all_metrics = []
    all_training_times = []
    all_prediction_times = []

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        logging.info(f"Fold {fold + 1}/{kf.get_n_splits()}")

        X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
        y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

        # Only the first two return values are needed; indexing (rather than
        # unpacking a fixed arity) also works when a cached result is shorter.
        reduction = reduce_dimensionality(
            X_fold_train.copy(), y_fold_train.copy(),
            save_path=f'cnn_ensemble_reduced_data_fold_{fold}.pkl'
        )
        X_fold_train_reduced, selected_features = reduction[0], reduction[1]
        X_fold_val_reduced = X_fold_val[selected_features]

        result = train_and_evaluate_cnn_ensemble(
            X_fold_train_reduced, y_fold_train, X_fold_val_reduced, y_fold_val,
            params_cnn
        )
        # The training function signals failure with None (either as the whole
        # result or as the metrics element); skip such folds instead of
        # crashing later while averaging.
        if result is None or result[0] is None:
            logging.warning(f"Fold {fold + 1}: no metrics produced; fold excluded from the averages.")
            continue

        metrics_cnn, training_time, prediction_time = result
        completed_folds.append(fold + 1)
        all_metrics.append(metrics_cnn)
        all_training_times.append(training_time)
        all_prediction_times.append(prediction_time)

    avg_metrics = {}
    if all_metrics:
        for metric in all_metrics[0]:
            if metric != 'confusion_matrix':
                avg_metrics[metric] = np.mean([fold_metrics[metric] for fold_metrics in all_metrics])
            else:
                # Confusion matrices are summed (not averaged) across folds.
                avg_cm = np.sum([np.array(fold_metrics[metric]) for fold_metrics in all_metrics], axis=0)
                avg_metrics[metric] = avg_cm.tolist()
    else:
        logging.error("No fold produced metrics; nothing to average.")

    logging.info("--- Cross-Validation Completed ---")
    logging.info(f"Average CNN Ensemble Metrics: {avg_metrics}")

    for fold, train_t, pred_t in zip(completed_folds, all_training_times, all_prediction_times):
        print(f"Fold {fold}: Training time = {train_t:.2f} s, Prediction time = {pred_t:.2f} s")

    if completed_folds:
        print(f"\nAverage training time: {np.mean(all_training_times):.2f} s")
        print(f"Average prediction time: {np.mean(all_prediction_times):.2f} s")

    total_end_time = time.time()
    total_duration = total_end_time - total_start_time
    logging.info(f"Total cross-validation execution time: {total_duration:.2f} seconds")

2025-07-18 10:40:47,200 - INFO - Loading data from: C:/Class/HK6/IPrj/Datasets/merged_3374_data.csv


(3374, 44755)


2025-07-18 10:45:35,474 - INFO - Data loaded and preprocessed.
2025-07-18 10:45:35,474 - INFO - Training set shape: (2699, 44754), Validation set shape: (675, 44754)
2025-07-18 10:45:39,098 - INFO - Fold 1/10
2025-07-18 10:45:48,136 - INFO - Starting dimensionality reduction...


0:	learn: 0.4878172	total: 297ms	remaining: 59s
1:	learn: 0.3294566	total: 577ms	remaining: 57.1s
2:	learn: 0.2392160	total: 3.23s	remaining: 3m 31s
3:	learn: 0.1804072	total: 3.5s	remaining: 2m 51s
4:	learn: 0.1351421	total: 3.78s	remaining: 2m 27s
5:	learn: 0.1115542	total: 6.43s	remaining: 3m 27s
6:	learn: 0.0911198	total: 6.7s	remaining: 3m 4s
7:	learn: 0.0757133	total: 6.96s	remaining: 2m 47s
8:	learn: 0.0628936	total: 7.23s	remaining: 2m 33s
9:	learn: 0.0556788	total: 9.86s	remaining: 3m 7s
10:	learn: 0.0497294	total: 10.1s	remaining: 2m 53s
11:	learn: 0.0461480	total: 10.4s	remaining: 2m 42s
12:	learn: 0.0421591	total: 10.7s	remaining: 2m 33s
13:	learn: 0.0380767	total: 13.3s	remaining: 2m 56s
14:	learn: 0.0349028	total: 13.5s	remaining: 2m 46s
15:	learn: 0.0313259	total: 13.8s	remaining: 2m 38s
16:	learn: 0.0276910	total: 14.1s	remaining: 2m 31s
17:	learn: 0.0257083	total: 16.7s	remaining: 2m 49s
18:	learn: 0.0243686	total: 17s	remaining: 2m 41s
19:	learn: 0.0225466	total: 17.2

  from .autonotebook import tqdm as notebook_tqdm
2025-07-18 10:49:00,080 - INFO - Calculating SHAP values...
2025-07-18 10:49:33,744 - INFO - SHAP values appear to be binary/regression (shape: (3036, 44754)).
2025-07-18 10:49:34,324 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:49:34,324 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:49:34,324 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:49:34,324 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:49:34,324 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:49:34,324 - INFO - SHAP values calculated.
2025-07-18 10:49:34,345 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_0.pkl
2025-07-18 10:49:34,346 - INFO - Dimensionality reduction completed in 226.21 seconds.
2025-07-18 10:49:34,348 - INFO - Training CNN Ensemble...
2025-07-18 10:49:34,580 - INFO - Fitting CNN Models...
2025-07-18 10:51:15,824 - INFO - CN





2025-07-18 10:51:22,133 - INFO - CNN Ensemble trained and evaluated in 107.78 seconds.
2025-07-18 10:51:22,135 - INFO - Fold 2/10
2025-07-18 10:51:29,717 - INFO - Starting dimensionality reduction...


0:	learn: 0.5159912	total: 250ms	remaining: 49.8s
1:	learn: 0.3681818	total: 497ms	remaining: 49.2s
2:	learn: 0.2648510	total: 747ms	remaining: 49.1s
3:	learn: 0.2018491	total: 2.39s	remaining: 1m 57s
4:	learn: 0.1455406	total: 2.64s	remaining: 1m 43s
5:	learn: 0.1167807	total: 2.91s	remaining: 1m 34s
6:	learn: 0.0941282	total: 3.19s	remaining: 1m 28s
7:	learn: 0.0788792	total: 5.18s	remaining: 2m 4s
8:	learn: 0.0686376	total: 5.4s	remaining: 1m 54s
9:	learn: 0.0620943	total: 5.64s	remaining: 1m 47s
10:	learn: 0.0541207	total: 5.87s	remaining: 1m 40s
11:	learn: 0.0471537	total: 7.75s	remaining: 2m 1s
12:	learn: 0.0427132	total: 7.96s	remaining: 1m 54s
13:	learn: 0.0382321	total: 8.18s	remaining: 1m 48s
14:	learn: 0.0340942	total: 8.39s	remaining: 1m 43s
15:	learn: 0.0308115	total: 8.61s	remaining: 1m 38s
16:	learn: 0.0288154	total: 10.4s	remaining: 1m 51s
17:	learn: 0.0263765	total: 10.6s	remaining: 1m 46s
18:	learn: 0.0242370	total: 10.8s	remaining: 1m 43s
19:	learn: 0.0225850	total: 

2025-07-18 10:52:46,020 - INFO - Calculating SHAP values...
2025-07-18 10:53:03,874 - INFO - SHAP values appear to be binary/regression (shape: (3036, 44754)).
2025-07-18 10:53:04,280 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:53:04,282 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:53:04,282 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:53:04,283 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:53:04,283 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:53:04,289 - INFO - SHAP values calculated.
2025-07-18 10:53:04,292 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_1.pkl
2025-07-18 10:53:04,293 - INFO - Dimensionality reduction completed in 94.58 seconds.
2025-07-18 10:53:04,512 - INFO - Training CNN Ensemble...
2025-07-18 10:53:04,590 - INFO - Fitting CNN Models...
2025-07-18 10:55:29,255 - INFO - CNN Ensemble trained in 144.66 seconds.
2025-07-18 10

0:	learn: 0.4951476	total: 445ms	remaining: 1m 28s
1:	learn: 0.3457989	total: 719ms	remaining: 1m 11s
2:	learn: 0.2551226	total: 956ms	remaining: 1m 2s
3:	learn: 0.1914008	total: 1.15s	remaining: 56.2s
4:	learn: 0.1398764	total: 1.34s	remaining: 52.2s
5:	learn: 0.1115756	total: 1.53s	remaining: 49.5s
6:	learn: 0.0936625	total: 1.73s	remaining: 47.7s
7:	learn: 0.0793864	total: 1.92s	remaining: 46s
8:	learn: 0.0707729	total: 2.1s	remaining: 44.6s
9:	learn: 0.0597125	total: 2.28s	remaining: 43.3s
10:	learn: 0.0524479	total: 2.48s	remaining: 42.5s
11:	learn: 0.0447361	total: 2.66s	remaining: 41.7s
12:	learn: 0.0402628	total: 2.84s	remaining: 40.9s
13:	learn: 0.0366277	total: 3.04s	remaining: 40.3s
14:	learn: 0.0337526	total: 3.23s	remaining: 39.9s
15:	learn: 0.0309158	total: 3.43s	remaining: 39.5s
16:	learn: 0.0295542	total: 3.62s	remaining: 39s
17:	learn: 0.0274506	total: 3.81s	remaining: 38.6s
18:	learn: 0.0254136	total: 4s	remaining: 38.1s
19:	learn: 0.0230975	total: 4.18s	remaining: 37

2025-07-18 10:56:34,843 - INFO - Calculating SHAP values...
2025-07-18 10:56:50,827 - INFO - SHAP values appear to be binary/regression (shape: (3036, 44754)).
2025-07-18 10:56:51,207 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:56:51,209 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:56:51,209 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:56:51,209 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:56:51,209 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:56:51,214 - INFO - SHAP values calculated.
2025-07-18 10:56:51,214 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_2.pkl
2025-07-18 10:56:51,214 - INFO - Dimensionality reduction completed in 71.93 seconds.
2025-07-18 10:56:51,388 - INFO - Training CNN Ensemble...
2025-07-18 10:56:51,467 - INFO - Fitting CNN Models...
2025-07-18 10:58:34,731 - INFO - CNN Ensemble trained in 103.26 seconds.
2025-07-18 10

0:	learn: 0.4890783	total: 371ms	remaining: 1m 13s
1:	learn: 0.3342391	total: 561ms	remaining: 55.5s
2:	learn: 0.2368283	total: 743ms	remaining: 48.8s
3:	learn: 0.1788971	total: 929ms	remaining: 45.5s
4:	learn: 0.1277930	total: 1.12s	remaining: 43.7s
5:	learn: 0.0976945	total: 1.33s	remaining: 42.9s
6:	learn: 0.0812907	total: 1.53s	remaining: 42.1s
7:	learn: 0.0700338	total: 1.73s	remaining: 41.5s
8:	learn: 0.0607785	total: 1.92s	remaining: 40.8s
9:	learn: 0.0550846	total: 2.12s	remaining: 40.2s
10:	learn: 0.0493406	total: 2.3s	remaining: 39.6s
11:	learn: 0.0451952	total: 2.5s	remaining: 39.1s
12:	learn: 0.0412090	total: 2.68s	remaining: 38.6s
13:	learn: 0.0373375	total: 2.87s	remaining: 38.1s
14:	learn: 0.0345083	total: 3.05s	remaining: 37.6s
15:	learn: 0.0316360	total: 3.23s	remaining: 37.1s
16:	learn: 0.0294512	total: 3.42s	remaining: 36.8s
17:	learn: 0.0259679	total: 3.62s	remaining: 36.6s
18:	learn: 0.0230734	total: 3.81s	remaining: 36.3s
19:	learn: 0.0214097	total: 3.99s	remainin

2025-07-18 10:59:22,860 - INFO - Calculating SHAP values...
2025-07-18 10:59:37,536 - INFO - SHAP values appear to be binary/regression (shape: (3036, 44754)).
2025-07-18 10:59:38,062 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:59:38,062 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:59:38,066 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:59:38,067 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:59:38,068 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:59:38,078 - INFO - SHAP values calculated.
2025-07-18 10:59:38,081 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_3.pkl
2025-07-18 10:59:38,081 - INFO - Dimensionality reduction completed in 58.56 seconds.
2025-07-18 10:59:38,345 - INFO - Training CNN Ensemble...
2025-07-18 10:59:38,511 - INFO - Fitting CNN Models...
2025-07-18 11:01:32,751 - INFO - CNN Ensemble trained in 114.24 seconds.
2025-07-18 11

0:	learn: 0.4763791	total: 458ms	remaining: 1m 31s
1:	learn: 0.3333178	total: 679ms	remaining: 1m 7s
2:	learn: 0.2486264	total: 897ms	remaining: 58.9s
3:	learn: 0.1846759	total: 1.11s	remaining: 54.4s
4:	learn: 0.1422757	total: 1.33s	remaining: 51.7s
5:	learn: 0.1114990	total: 1.54s	remaining: 49.9s
6:	learn: 0.0917384	total: 1.76s	remaining: 48.5s
7:	learn: 0.0731718	total: 1.98s	remaining: 47.5s
8:	learn: 0.0625680	total: 2.19s	remaining: 46.6s
9:	learn: 0.0528602	total: 2.41s	remaining: 45.8s
10:	learn: 0.0466759	total: 2.62s	remaining: 45.1s
11:	learn: 0.0427000	total: 2.83s	remaining: 44.4s
12:	learn: 0.0387658	total: 3.05s	remaining: 43.8s
13:	learn: 0.0349138	total: 3.26s	remaining: 43.4s
14:	learn: 0.0329327	total: 3.48s	remaining: 42.9s
15:	learn: 0.0304728	total: 3.69s	remaining: 42.5s
16:	learn: 0.0284821	total: 3.9s	remaining: 42s
17:	learn: 0.0268783	total: 4.12s	remaining: 41.7s
18:	learn: 0.0250280	total: 4.34s	remaining: 41.3s
19:	learn: 0.0230618	total: 4.55s	remaining

2025-07-18 11:02:38,783 - INFO - Calculating SHAP values...
2025-07-18 11:03:11,897 - INFO - SHAP values appear to be binary/regression (shape: (3037, 44754)).
2025-07-18 11:03:12,592 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 11:03:12,592 - INFO - Shape of shap_importance: (44754,)
2025-07-18 11:03:12,592 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 11:03:12,592 - INFO - Shape of X.columns: (44754,)
2025-07-18 11:03:12,592 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 11:03:12,608 - INFO - SHAP values calculated.
2025-07-18 11:03:12,608 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_4.pkl
2025-07-18 11:03:12,608 - INFO - Dimensionality reduction completed in 91.16 seconds.
2025-07-18 11:03:12,909 - INFO - Training CNN Ensemble...
2025-07-18 11:03:13,069 - INFO - Fitting CNN Models...
2025-07-18 11:05:34,225 - INFO - CNN Ensemble trained in 141.16 seconds.
2025-07-18 11

0:	learn: 0.4935212	total: 232ms	remaining: 46.2s
1:	learn: 0.3387461	total: 455ms	remaining: 45.1s
2:	learn: 0.2483694	total: 700ms	remaining: 46s
3:	learn: 0.1809107	total: 938ms	remaining: 46s
4:	learn: 0.1418665	total: 3.68s	remaining: 2m 23s
5:	learn: 0.1088018	total: 3.92s	remaining: 2m 6s
6:	learn: 0.0931420	total: 4.15s	remaining: 1m 54s
7:	learn: 0.0752948	total: 4.39s	remaining: 1m 45s
8:	learn: 0.0619456	total: 7.27s	remaining: 2m 34s
9:	learn: 0.0550095	total: 7.57s	remaining: 2m 23s
10:	learn: 0.0499049	total: 7.91s	remaining: 2m 15s
11:	learn: 0.0455867	total: 10.8s	remaining: 2m 48s
12:	learn: 0.0410493	total: 11s	remaining: 2m 38s
13:	learn: 0.0359031	total: 11.3s	remaining: 2m 30s
14:	learn: 0.0324876	total: 11.6s	remaining: 2m 23s
15:	learn: 0.0296961	total: 14.4s	remaining: 2m 45s
16:	learn: 0.0283042	total: 14.6s	remaining: 2m 37s
17:	learn: 0.0269316	total: 14.9s	remaining: 2m 30s
18:	learn: 0.0249321	total: 15.1s	remaining: 2m 23s
19:	learn: 0.0221007	total: 15.3s

2025-07-18 11:09:01,952 - INFO - Calculating SHAP values...
2025-07-18 11:09:41,244 - INFO - SHAP values appear to be binary/regression (shape: (3037, 44754)).
2025-07-18 11:09:41,889 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 11:09:41,889 - INFO - Shape of shap_importance: (44754,)
2025-07-18 11:09:41,889 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 11:09:41,889 - INFO - Shape of X.columns: (44754,)
2025-07-18 11:09:41,889 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 11:09:41,889 - INFO - SHAP values calculated.
2025-07-18 11:09:41,905 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_5.pkl
2025-07-18 11:09:41,905 - INFO - Dimensionality reduction completed in 236.54 seconds.
2025-07-18 11:09:42,141 - INFO - Training CNN Ensemble...
2025-07-18 11:09:42,361 - INFO - Fitting CNN Models...
2025-07-18 11:11:39,529 - INFO - CNN Ensemble trained in 117.17 seconds.
2025-07-18 1

0:	learn: 0.4867330	total: 220ms	remaining: 43.8s
1:	learn: 0.3442705	total: 429ms	remaining: 42.5s
2:	learn: 0.2516065	total: 2.97s	remaining: 3m 15s
3:	learn: 0.1936998	total: 3.19s	remaining: 2m 36s
4:	learn: 0.1438140	total: 3.4s	remaining: 2m 12s
5:	learn: 0.1168810	total: 3.62s	remaining: 1m 57s
6:	learn: 0.0942434	total: 6.22s	remaining: 2m 51s
7:	learn: 0.0733261	total: 6.43s	remaining: 2m 34s
8:	learn: 0.0600183	total: 6.64s	remaining: 2m 20s
9:	learn: 0.0526408	total: 6.85s	remaining: 2m 10s
10:	learn: 0.0463510	total: 7.06s	remaining: 2m 1s
11:	learn: 0.0394077	total: 9.71s	remaining: 2m 32s
12:	learn: 0.0366196	total: 9.91s	remaining: 2m 22s
13:	learn: 0.0331056	total: 10.1s	remaining: 2m 14s
14:	learn: 0.0299857	total: 10.3s	remaining: 2m 7s
15:	learn: 0.0280755	total: 10.6s	remaining: 2m 1s
16:	learn: 0.0260079	total: 13.2s	remaining: 2m 21s
17:	learn: 0.0233894	total: 13.4s	remaining: 2m 15s
18:	learn: 0.0206631	total: 13.6s	remaining: 2m 9s
19:	learn: 0.0196211	total: 1

2025-07-18 11:13:31,735 - INFO - Calculating SHAP values...
2025-07-18 11:14:04,986 - INFO - SHAP values appear to be binary/regression (shape: (3037, 44754)).
2025-07-18 11:14:05,508 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 11:14:05,508 - INFO - Shape of shap_importance: (44754,)
2025-07-18 11:14:05,508 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 11:14:05,508 - INFO - Shape of X.columns: (44754,)
2025-07-18 11:14:05,508 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 11:14:05,524 - INFO - SHAP values calculated.
2025-07-18 11:14:05,540 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_6.pkl
2025-07-18 11:14:05,540 - INFO - Dimensionality reduction completed in 137.12 seconds.
2025-07-18 11:14:05,825 - INFO - Training CNN Ensemble...
2025-07-18 11:14:06,001 - INFO - Fitting CNN Models...
2025-07-18 11:16:10,661 - INFO - CNN Ensemble trained in 124.66 seconds.
2025-07-18 1

0:	learn: 0.4895717	total: 398ms	remaining: 1m 19s
1:	learn: 0.3457973	total: 590ms	remaining: 58.4s
2:	learn: 0.2545460	total: 792ms	remaining: 52s
3:	learn: 0.1884798	total: 1.01s	remaining: 49.4s
4:	learn: 0.1450466	total: 1.2s	remaining: 46.8s
5:	learn: 0.1149338	total: 1.4s	remaining: 45.4s
6:	learn: 0.0933028	total: 1.59s	remaining: 44s
7:	learn: 0.0769289	total: 1.78s	remaining: 42.8s
8:	learn: 0.0667301	total: 1.97s	remaining: 41.9s
9:	learn: 0.0559543	total: 2.17s	remaining: 41.2s
10:	learn: 0.0495417	total: 2.36s	remaining: 40.6s
11:	learn: 0.0449514	total: 2.55s	remaining: 40s
12:	learn: 0.0423064	total: 2.74s	remaining: 39.4s
13:	learn: 0.0393403	total: 2.93s	remaining: 38.9s
14:	learn: 0.0365648	total: 3.11s	remaining: 38.3s
15:	learn: 0.0335230	total: 3.29s	remaining: 37.9s
16:	learn: 0.0312299	total: 3.48s	remaining: 37.5s
17:	learn: 0.0289189	total: 3.67s	remaining: 37.1s
18:	learn: 0.0268020	total: 3.86s	remaining: 36.8s
19:	learn: 0.0253473	total: 4.04s	remaining: 36.

2025-07-18 11:17:00,161 - INFO - Calculating SHAP values...
2025-07-18 11:17:15,369 - INFO - SHAP values appear to be binary/regression (shape: (3037, 44754)).
2025-07-18 11:17:15,869 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 11:17:15,869 - INFO - Shape of shap_importance: (44754,)
2025-07-18 11:17:15,869 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 11:17:15,869 - INFO - Shape of X.columns: (44754,)
2025-07-18 11:17:15,869 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 11:17:15,877 - INFO - SHAP values calculated.
2025-07-18 11:17:15,885 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_7.pkl
2025-07-18 11:17:15,886 - INFO - Dimensionality reduction completed in 59.97 seconds.
2025-07-18 11:17:16,055 - INFO - Training CNN Ensemble...
2025-07-18 11:17:16,135 - INFO - Fitting CNN Models...
2025-07-18 11:19:02,699 - INFO - CNN Ensemble trained in 106.56 seconds.
2025-07-18 11

0:	learn: 0.4729150	total: 488ms	remaining: 1m 37s
1:	learn: 0.3043730	total: 705ms	remaining: 1m 9s
2:	learn: 0.2308962	total: 924ms	remaining: 1m
3:	learn: 0.1724635	total: 1.14s	remaining: 55.9s
4:	learn: 0.1337870	total: 1.35s	remaining: 52.8s
5:	learn: 0.1037084	total: 1.57s	remaining: 50.6s
6:	learn: 0.0877794	total: 1.78s	remaining: 49.2s
7:	learn: 0.0770721	total: 2s	remaining: 48s
8:	learn: 0.0674856	total: 2.21s	remaining: 47s
9:	learn: 0.0584791	total: 2.43s	remaining: 46.3s
10:	learn: 0.0531049	total: 2.65s	remaining: 45.5s
11:	learn: 0.0465897	total: 2.87s	remaining: 45s
12:	learn: 0.0420545	total: 3.08s	remaining: 44.4s
13:	learn: 0.0381824	total: 3.29s	remaining: 43.8s
14:	learn: 0.0343771	total: 3.51s	remaining: 43.3s
15:	learn: 0.0308688	total: 3.73s	remaining: 42.9s
16:	learn: 0.0290251	total: 3.96s	remaining: 42.7s
17:	learn: 0.0263685	total: 4.19s	remaining: 42.3s
18:	learn: 0.0232120	total: 4.45s	remaining: 42.4s
19:	learn: 0.0223170	total: 4.7s	remaining: 42.3s
20

2025-07-18 11:20:03,237 - INFO - Calculating SHAP values...
2025-07-18 11:20:18,803 - INFO - SHAP values appear to be binary/regression (shape: (3037, 44754)).
2025-07-18 11:20:19,221 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 11:20:19,222 - INFO - Shape of shap_importance: (44754,)
2025-07-18 11:20:19,222 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 11:20:19,222 - INFO - Shape of X.columns: (44754,)
2025-07-18 11:20:19,222 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 11:20:19,227 - INFO - SHAP values calculated.
2025-07-18 11:20:19,227 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_8.pkl
2025-07-18 11:20:19,233 - INFO - Dimensionality reduction completed in 67.91 seconds.
2025-07-18 11:20:19,402 - INFO - Training CNN Ensemble...
2025-07-18 11:20:19,492 - INFO - Fitting CNN Models...
2025-07-18 11:22:04,133 - INFO - CNN Ensemble trained in 104.64 seconds.
2025-07-18 11

0:	learn: 0.5002450	total: 207ms	remaining: 41.2s
1:	learn: 0.3534442	total: 414ms	remaining: 41s
2:	learn: 0.2567063	total: 627ms	remaining: 41.2s
3:	learn: 0.1784751	total: 848ms	remaining: 41.5s
4:	learn: 0.1429671	total: 3.47s	remaining: 2m 15s
5:	learn: 0.1166317	total: 3.64s	remaining: 1m 57s
6:	learn: 0.0957286	total: 3.81s	remaining: 1m 45s
7:	learn: 0.0802093	total: 3.98s	remaining: 1m 35s
8:	learn: 0.0661870	total: 4.16s	remaining: 1m 28s
9:	learn: 0.0566937	total: 4.33s	remaining: 1m 22s
10:	learn: 0.0494016	total: 6.89s	remaining: 1m 58s
11:	learn: 0.0436243	total: 7.09s	remaining: 1m 51s
12:	learn: 0.0403446	total: 7.29s	remaining: 1m 44s
13:	learn: 0.0374289	total: 7.5s	remaining: 1m 39s
14:	learn: 0.0350064	total: 7.69s	remaining: 1m 34s
15:	learn: 0.0322327	total: 10.2s	remaining: 1m 57s
16:	learn: 0.0300120	total: 10.4s	remaining: 1m 51s
17:	learn: 0.0274143	total: 10.6s	remaining: 1m 46s
18:	learn: 0.0250130	total: 10.7s	remaining: 1m 42s
19:	learn: 0.0237026	total: 1

2025-07-18 11:24:46,533 - INFO - Calculating SHAP values...
2025-07-18 11:25:24,905 - INFO - SHAP values appear to be binary/regression (shape: (3037, 44754)).
2025-07-18 11:25:25,535 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 11:25:25,536 - INFO - Shape of shap_importance: (44754,)
2025-07-18 11:25:25,537 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 11:25:25,540 - INFO - Shape of X.columns: (44754,)
2025-07-18 11:25:25,541 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 11:25:25,549 - INFO - SHAP values calculated.
2025-07-18 11:25:25,558 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_9.pkl
2025-07-18 11:25:25,560 - INFO - Dimensionality reduction completed in 196.68 seconds.
2025-07-18 11:25:25,819 - INFO - Training CNN Ensemble...
2025-07-18 11:25:26,040 - INFO - Fitting CNN Models...
2025-07-18 11:26:35,308 - INFO - CNN Ensemble trained in 69.27 seconds.
2025-07-18 11

Fold 1: Training time = 101.24 s, Prediction time = 1.06 s
Fold 2: Training time = 144.66 s, Prediction time = 1.32 s
Fold 3: Training time = 103.26 s, Prediction time = 0.60 s
Fold 4: Training time = 114.24 s, Prediction time = 1.26 s
Fold 5: Training time = 141.16 s, Prediction time = 1.46 s
Fold 6: Training time = 117.17 s, Prediction time = 1.13 s
Fold 7: Training time = 124.66 s, Prediction time = 0.59 s
Fold 8: Training time = 106.56 s, Prediction time = 1.23 s
Fold 9: Training time = 104.64 s, Prediction time = 0.58 s
Fold 10: Training time = 69.27 s, Prediction time = 0.56 s

Average training time: 112.69 s
Average prediction time: 0.98 s
