Connected to .venv (Python 3.12.10)

In [1]:
# Ask TensorFlow which GPU devices it can see; an empty list means it found none.
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[]

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    confusion_matrix as sk_confusion_matrix,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import StandardScaler
import logging
import time
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from catboost import CatBoostClassifier
from tensorflow.keras.utils import to_categorical 
import tensorflow as tf 
import logging
import gc 
import matplotlib.pyplot as plt

In [3]:
# Configure the root logger: INFO level, lines formatted as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [4]:
# Per-model weights handed to weighted_ensemble_predictions (one entry per CNN, in model order).
ENSEMBLE_WEIGHTS = [0.4, 0.3, 0.3]
# Number of target classes; read as a global by the CNN training function.
num_classes = 2
# NOTE(review): epochs, batch_size and learning_rate below are not read by the
# training code visible in this notebook - the cross-validation cell passes its
# own params_cnn dict with different values. Confirm which set is intended.
epochs = 50
batch_size = 16
learning_rate = 0.1
# Fixed label list so the confusion matrix always has one row/column per class.
all_possible_labels = list(range(num_classes))

In [5]:
def create_cnn_model_1(input_shape, num_classes):
    """Build the first ensemble member: two Conv1D layers, max-pooling and a small dense head.

    Args:
        input_shape: Shape of one sample without the batch axis; the caller in
            this notebook passes ``(n_features, 1)``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model``.
    """
    input_layer = Input(shape=input_shape)
    conv1 = Conv1D(filters=10, kernel_size=6, activation='relu', padding='same')(input_layer)
    conv2 = Conv1D(filters=10, kernel_size=6, activation='relu', padding='same')(conv1)
    pool = MaxPooling1D(pool_size=2, padding='same')(conv2)
    flatten = Flatten()(pool)
    dense1 = Dense(units=8, activation='relu')(flatten)
    # The output layer used to be named 'ecn2_output', copied from model 2.
    # Each builder now carries its own index, matching 'ecn2_output' / 'ecn3_output'.
    output_layer = Dense(units=num_classes, activation='softmax', name='ecn1_output')(dense1)
    return Model(inputs=input_layer, outputs=output_layer)


def create_cnn_model_2(input_shape, num_classes):
    """Build the second ensemble member: same trunk as model 1 plus dropout before the output.

    Args:
        input_shape: Shape of one sample without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model``.
    """
    # Both convolution layers share the same hyper-parameters.
    conv_kwargs = dict(filters=10, kernel_size=6, activation='relu', padding='same')

    inputs = Input(shape=input_shape)
    x = Conv1D(**conv_kwargs)(inputs)
    x = Conv1D(**conv_kwargs)(x)
    x = MaxPooling1D(pool_size=2, padding='same')(x)
    x = Flatten()(x)
    x = Dense(units=8, activation='relu')(x)
    x = Dropout(0.5)(x)
    outputs = Dense(units=num_classes, activation='softmax', name='ecn2_output')(x)
    return Model(inputs=inputs, outputs=outputs)


def create_cnn_model_3(input_shape, num_classes):
    """Build the third ensemble member: conv trunk wired straight into the softmax output.

    Args:
        input_shape: Shape of one sample without the batch axis.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model``.
    """
    # Both convolution layers share the same hyper-parameters.
    conv_kwargs = dict(filters=10, kernel_size=6, activation='relu', padding='same')

    inputs = Input(shape=input_shape)
    x = Conv1D(**conv_kwargs)(inputs)
    x = Conv1D(**conv_kwargs)(x)
    x = MaxPooling1D(pool_size=2, padding='same')(x)
    x = Flatten()(x)
    outputs = Dense(units=num_classes, activation='softmax', name='ecn3_output')(x)
    return Model(inputs=inputs, outputs=outputs)

In [6]:
def weighted_ensemble_predictions(predictions, weights):
    """Blend per-model class scores with fixed weights and return the winning class per row.

    Args:
        predictions: Sequence with one score array per model; each array is
            multiplied by its weight and the results are summed element-wise.
        weights: Sequence with one scalar weight per model.

    Returns:
        1-D array holding, for every row, the index of the largest weighted score.

    Raises:
        ValueError: If ``predictions`` and ``weights`` differ in length.
    """
    if len(predictions) != len(weights):
        raise ValueError("Number of models must match number of weights")

    scaled = np.array([scores * weight for scores, weight in zip(predictions, weights)])
    blended = scaled.sum(axis=0)
    return np.argmax(blended, axis=1)

In [7]:
def train_and_evaluate_cnn_ensemble(X_train, y_train, X_val, y_val, params):
    """Train the three CNN ensemble members on one split and score the weighted ensemble.

    Features are standardised with statistics fitted on the training split only,
    given a trailing channel axis, and fed to the three models built by
    ``create_cnn_model_1/2/3``. Relies on the module-level ``num_classes``,
    ``all_possible_labels`` and ``ENSEMBLE_WEIGHTS``.

    Args:
        X_train: Training features (DataFrame or array-like, samples x features).
        y_train: Training labels (Series or array-like), castable to int.
        X_val: Validation features, same columns as ``X_train``.
        y_val: Validation labels.
        params: Dict with keys ``'learning_rate'``, ``'epochs'`` and ``'batch_size'``.

    Returns:
        Tuple ``(metrics, training_time, prediction_time)``. ``metrics`` is a dict
        of validation scores, or ``None`` when the input has no feature columns
        (both timings are then 0.0) or when metric computation raised.
    """
    start_time = time.time()
    logging.info("Training CNN Ensemble...")

    X_train_np = X_train.values if isinstance(X_train, pd.DataFrame) else np.array(X_train)
    y_train_np = y_train.values if isinstance(y_train, pd.Series) else np.array(y_train)
    X_val_np = X_val.values if isinstance(X_val, pd.DataFrame) else np.array(X_val)
    y_val_np = y_val.values if isinstance(y_val, pd.Series) else np.array(y_val)

    y_train_np = y_train_np.astype(int)
    y_val_np = y_val_np.astype(int)

    # Check before scaling: StandardScaler itself rejects zero-column input, so the
    # guard was unreachable where it used to sit. Return a 3-tuple so callers that
    # unpack three values keep working on this path too.
    if X_train_np.ndim == 2 and X_train_np.shape[1] == 0:
        logging.error("Error: X_train has 0 features after preprocessing/selection.")
        return None, 0.0, 0.0

    # Fit the scaler on the training split only to avoid leaking validation statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_np)
    X_val_scaled = scaler.transform(X_val_np)

    # Conv1D expects (samples, steps, channels); every feature becomes one step.
    X_train_reshaped = np.expand_dims(X_train_scaled, axis=-1).astype(np.float32)
    X_val_reshaped = np.expand_dims(X_val_scaled, axis=-1).astype(np.float32)
    input_shape = (X_train_reshaped.shape[1], 1)

    y_train_categorical = to_categorical(y_train_np, num_classes=num_classes)
    y_val_categorical = to_categorical(y_val_np, num_classes=num_classes)

    # Build all models first, then compile, then fit - same order as before.
    models = [
        create_cnn_model_1(input_shape, num_classes),
        create_cnn_model_2(input_shape, num_classes),
        create_cnn_model_3(input_shape, num_classes),
    ]

    # One-hot targets with a softmax output pair with categorical cross-entropy.
    loss_function = 'categorical_crossentropy'
    for model in models:
        model.compile(optimizer=Adam(learning_rate=params['learning_rate']),
                      loss=loss_function, metrics=['accuracy'])

    train_start_time = time.time()
    logging.info("Fitting CNN Models...")
    for model in models:
        model.fit(X_train_reshaped, y_train_categorical, epochs=params['epochs'],
                  batch_size=params['batch_size'], verbose=0,
                  validation_data=(X_val_reshaped, y_val_categorical))
    training_time = time.time() - train_start_time
    logging.info(f"CNN Ensemble trained in {training_time:.2f} seconds.")

    predict_start_time = time.time()
    logging.info("Predicting with CNN Models...")
    cnn_predictions = [
        model.predict(X_val_reshaped, batch_size=params['batch_size'], verbose=0)
        for model in models
    ]
    ensemble_predictions = weighted_ensemble_predictions(cnn_predictions, ENSEMBLE_WEIGHTS)
    prediction_time = time.time() - predict_start_time
    logging.info(f"CNN Ensemble predicted in {prediction_time:.2f} seconds.")

    logging.info("Calculating Evaluation Metrics...")
    try:
        # Weighted class scores. ROC AUC needs a continuous score, not the argmax
        # labels, otherwise the curve collapses to a single operating point.
        ensemble_scores = np.sum(
            [scores * weight for scores, weight in zip(cnn_predictions, ENSEMBLE_WEIGHTS)],
            axis=0,
        )

        # Compute the confusion matrix once, with a fixed label set so it is always
        # num_classes x num_classes even if a class is missing from this fold.
        cm = sk_confusion_matrix(y_val_np, ensemble_predictions, labels=all_possible_labels)
        # Binary layout: row 0 = actual negatives, row 1 = actual positives.
        tn, fp = cm[0, 0], cm[0, 1]
        fn, tp = cm[1, 0], cm[1, 1]

        metrics = {
            'accuracy': accuracy_score(y_val_np, ensemble_predictions),
            'precision': precision_score(y_val_np, ensemble_predictions, average='macro', zero_division=0),
            'recall': recall_score(y_val_np, ensemble_predictions, average='macro', zero_division=0),
            'f1_score': f1_score(y_val_np, ensemble_predictions, average='macro', zero_division=0),
            # Column 1 is the positive-class score (binary setup).
            'roc_auc': roc_auc_score(y_val_np, ensemble_scores[:, 1]),
            'confusion_matrix': cm.tolist(),
            'specificity': float(tn / (tn + fp)) if (tn + fp) else float('nan'),
            'sensitivity': float(tp / (fn + tp)) if (fn + tp) else float('nan'),
        }
        logging.info(f"Evaluation metrics: {metrics}")
    except Exception as e:
        logging.error(f"Error calculating metrics: {e}")
        metrics = None

    logging.info("Cleaning up models...")
    # Drop every reference (including the loop variable) before clearing the session.
    del model, models, cnn_predictions
    tf.keras.backend.clear_session()
    gc.collect()

    logging.info(f"CNN Ensemble trained and evaluated in {time.time() - start_time:.2f} seconds.")

    return metrics, training_time, prediction_time

In [8]:
def load_and_preprocess_data_and_split(data_path, target_column='target'):
    """Read the CSV, impute missing feature values and make a stratified 80/20 split.

    Args:
        data_path: Path of the CSV file to load.
        target_column: Name of the label column.

    Returns:
        Tuple ``(data, X_train, y_train, X_val, y_val)`` where ``data`` is the
        full preprocessed frame (features plus target).

    Raises:
        KeyError: If ``target_column`` is not a column of the loaded file.
    """
    logging.info(f"Loading data from: {data_path}")

    data = pd.read_csv(data_path)
    # Drop the index column left behind when a frame is saved with its index.
    data = data.drop(columns='Unnamed: 0', errors='ignore')

    if target_column not in data.columns:
        raise KeyError(f"Target column '{target_column}' not found in {data_path}")

    # A missing label must not be imputed with the column mean: that would create
    # a label value that is not a real class. Drop such rows instead.
    missing_target = data[target_column].isna()
    if missing_target.any():
        logging.warning(f"Dropping {int(missing_target.sum())} rows with missing '{target_column}'.")
        data = data.loc[~missing_target]

    # numeric_only=True keeps this working when the file has non-numeric columns
    # (DataFrame.mean raises on them in pandas 2.x).
    # NOTE(review): the means are computed over all rows before the split, so the
    # validation rows influence the imputed training values.
    data = data.fillna(data.mean(numeric_only=True))
    print(data.shape)
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    logging.info("Data loaded and preprocessed.")
    logging.info(f"Training set shape: {X_train.shape}, Validation set shape: {X_val.shape}")
    return data, X_train, y_train, X_val, y_val

In [9]:
def reduce_dimensionality(X, y, top_n_features=50, save_path='reduced_data.pkl', task_type='GPU'):
    """Select features ranked highly by both CatBoost importance and mean |SHAP|.

    A CatBoost classifier is fitted on ``(X, y)``; the ``top_n_features`` columns
    by CatBoost importance and by mean absolute SHAP value are intersected. The
    result is cached at ``save_path`` and reused when the cached frame matches
    the current input.

    Args:
        X: Feature DataFrame.
        y: Labels aligned with ``X``.
        top_n_features: Size of each ranking before the intersection.
        save_path: Pickle cache path; a falsy value disables caching.
        task_type: CatBoost ``task_type``; defaults to ``'GPU'`` as before, pass
            ``'CPU'`` on machines without a usable GPU.

    Returns:
        Tuple ``(X_reduced, selected_features, top_features_catboost,
        top_features_shap)`` on both the cached and the computed path.
    """
    start_time = time.time()
    logging.info("Starting dimensionality reduction...")

    if save_path and os.path.exists(save_path):
        logging.info(f"Loading reduced data from: {save_path}")
        # NOTE: unpickling can execute arbitrary code - only load cache files that
        # this notebook wrote itself.
        saved_data = pd.read_pickle(save_path)
        X_reduced = saved_data['X']
        selected_features = saved_data['features']

        # Reuse the cache only if it was built from the same rows and settings;
        # otherwise a stale file would silently return mismatched data.
        cache_matches = (
            hasattr(X_reduced, 'index')
            and X_reduced.index.equals(X.index)
            and set(selected_features).issubset(X.columns)
            and saved_data.get('top_n_features', top_n_features) == top_n_features
        )
        if cache_matches:
            # Caches written before the rankings were stored lack these keys; fall
            # back to the selected features so the return shape stays 4-tuple.
            top_features_catboost = saved_data.get('top_features_catboost', selected_features)
            top_features_shap = saved_data.get('top_features_shap', selected_features)
            logging.info(f"Loaded reduced data from {save_path} in {time.time() - start_time:.2f} seconds.")
            return X_reduced, selected_features, top_features_catboost, top_features_shap
        logging.warning(f"Cached reduced data in {save_path} does not match the current input; recomputing.")

    from catboost import CatBoostClassifier
    # NOTE(review): fit() gets no eval_set, so early_stopping_rounds presumably
    # has no effect here - confirm against the CatBoost documentation.
    model = CatBoostClassifier(iterations=200,
                               depth=4,
                               learning_rate=0.1,
                               loss_function='Logloss',
                               verbose=1,
                               random_seed=42,
                               early_stopping_rounds=200,
                               task_type=task_type)
    model.fit(X, y, verbose=1)
    importance_df = pd.DataFrame({'feature': X.columns,
                                  'importance': model.get_feature_importance()})
    top_features_catboost = importance_df.nlargest(top_n_features,
                                                   'importance')['feature'].tolist()

    import shap
    logging.info("Calculating SHAP values...")
    top_features_shap = None
    try:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)

        if isinstance(shap_values, list):
            # One array per class: average |SHAP| per class, then across classes.
            logging.info("SHAP values appear to be multiclass (list).")

            if not shap_values:
                raise ValueError("SHAP explainer returned an empty list.")

            shap_values = [np.array(vals) for vals in shap_values]

            shapes_in_list = [vals.shape for vals in shap_values]
            logging.info(f"Shapes within SHAP values list: {shapes_in_list}")

            shap_class_importance = [np.abs(vals).mean(axis=0) for vals in shap_values]
            shap_importance = np.mean(shap_class_importance, axis=0)
        elif isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
            logging.info(f"SHAP values: Multiclass (3D Array), Shape: {shap_values.shape}")

            # Averages over axes 0 and 2, keeping axis 1 as the feature axis.
            shap_importance = np.abs(shap_values).mean(axis=(0, 2))
        else:
            if not isinstance(shap_values, np.ndarray):
                raise TypeError(f"Expected shap_values to be list or numpy array, got {type(shap_values)}")

            logging.info(f"SHAP values appear to be binary/regression (shape: {shap_values.shape}).")
            shap_importance = np.abs(shap_values).mean(axis=0)

        logging.info(f"Type of shap_importance: {type(shap_importance)}")
        logging.info(f"Shape of shap_importance: {np.array(shap_importance).shape}")
        logging.info(f"Type of X.columns: {type(X.columns)}")
        logging.info(f"Shape of X.columns: {X.columns.shape}")
        logging.info(f"Length comparison: len(X.columns)={len(X.columns)}, len(shap_importance)={len(shap_importance)}")

        if np.isnan(shap_importance).any() or np.isinf(shap_importance).any():
            logging.warning("NaN or Inf detected in shap_importance values.")
            shap_importance = np.nan_to_num(shap_importance, nan=0.0, posinf=0.0, neginf=0.0)

        shap_importance_df = pd.DataFrame({'feature': X.columns,
                                           'importance': shap_importance})

        top_features_shap = shap_importance_df.nlargest(top_n_features, 'importance')['feature'].tolist()
        logging.info("SHAP values calculated.")

    except Exception as e:
        # Best-effort: a SHAP failure degrades to the CatBoost ranking alone.
        logging.error(f"Error calculating SHAP values: {e}")
        logging.warning("Using only CatBoost features due to SHAP error.")
        top_features_shap = top_features_catboost

    # Intersect while keeping the CatBoost ranking order. list(set & set) made the
    # column order depend on string hashing, which differs between kernel sessions
    # and changes which features end up adjacent in the Conv1D input.
    shap_feature_set = set(top_features_shap)
    selected_features = [name for name in top_features_catboost if name in shap_feature_set]
    if not selected_features:
        logging.warning("CatBoost and SHAP rankings share no features; the reduced data has no columns.")
    X_reduced = X[selected_features]

    if save_path:
        data_to_save = {
            'X': X_reduced,
            'features': selected_features,
            'top_features_catboost': top_features_catboost,
            'top_features_shap': top_features_shap,
            'top_n_features': top_n_features,
        }
        pd.to_pickle(data_to_save, save_path)
        logging.info(f"Saved reduced data to {save_path}")

    logging.info(f"Dimensionality reduction completed in {time.time() - start_time:.2f} seconds.")
    return X_reduced, selected_features, top_features_catboost, top_features_shap

In [10]:
from sklearn.model_selection import StratifiedKFold
if __name__ == '__main__':
    # Cross-validation driver: per fold, select features on the training part
    # only, train the CNN ensemble, and collect metrics and timings.
    total_start_time = time.time()

    # Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
    # batch shuffling are repeatable across kernel restarts.
    tf.keras.utils.set_random_seed(42)

    # NOTE(review): absolute machine-specific path; consider a configurable data directory.
    data_path = 'C:/Class/HK6/IPrj/Datasets/merged_2761_data.csv'

    data, X_train, y_train, X_test, y_test = load_and_preprocess_data_and_split(data_path)

    logging.info("--- CatBoost26 Model ---")
    filtered_features = ["234632_x_at","209603_at","230527_at","229963_at","217901_at","214719_at","219513_s_at","210789_x_at","204777_s_at","203294_s_at","230753_at","242056_at","217680_x_at","214945_at","222312_s_at","214705_at","241688_at","241611_s_at","236952_at","207636_at","243659_at","226311_at","211772_x_at","244719_at","239766_at","243272_at"]
    # NOTE(review): X_filtered is not used anywhere else in this cell.
    X_filtered = X_train[filtered_features]

    # Re-join the hold-out split so cross-validation runs over every sample.
    X = pd.concat([X_train, X_test], axis=0)
    y = pd.concat([y_train, y_test], axis=0)

    params_cnn = {
        'learning_rate': 0.001,
        'epochs': 50,
        'batch_size': 32,
    }

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    all_metrics = []
    all_training_times = []
    all_prediction_times = []
    completed_folds = []  # 1-based numbers of the folds that produced metrics

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        logging.info(f"Fold {fold + 1}/{kf.get_n_splits()}")

        X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
        y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

        # Feature selection sees the training part of the fold only.
        reduction = reduce_dimensionality(
            X_fold_train.copy(), y_fold_train.copy(),
            save_path=f'cnn_ensemble_reduced_data_fold_{fold}.pkl'
        )
        # Index instead of unpacking: a cache hit may yield only the first two items.
        X_fold_train_reduced, selected_features = reduction[0], reduction[1]
        if len(reduction) >= 4:
            top_features_catboost, top_features_shap = reduction[2], reduction[3]
        X_fold_val_reduced = X_fold_val[selected_features]

        result = train_and_evaluate_cnn_ensemble(
            X_fold_train_reduced, y_fold_train, X_fold_val_reduced, y_fold_val,
            params_cnn
        )
        # The trainer signals failure with None (whole result or just the metrics);
        # skip such folds instead of crashing during unpacking or averaging.
        if result is None or result[0] is None:
            logging.warning(f"Fold {fold + 1} produced no metrics; excluding it from the summary.")
            continue

        metrics_cnn, training_time, prediction_time = result
        all_metrics.append(metrics_cnn)
        all_training_times.append(training_time)
        all_prediction_times.append(prediction_time)
        completed_folds.append(fold + 1)

    avg_metrics = {}
    if all_metrics:
        for metric in all_metrics[0]:
            if metric != 'confusion_matrix':
                avg_metrics[metric] = np.mean([fold_metrics[metric] for fold_metrics in all_metrics])
            else:
                # Confusion matrices are summed across folds, not averaged.
                avg_cm = np.sum([np.array(fold_metrics[metric]) for fold_metrics in all_metrics], axis=0)
                avg_metrics[metric] = avg_cm.tolist()
    else:
        logging.error("No fold produced metrics; nothing to average.")

    logging.info("--- Cross-Validation Completed ---")
    logging.info(f"Average CNN Ensemble Metrics: {avg_metrics}")

    for fold, train_t, pred_t in zip(completed_folds, all_training_times, all_prediction_times):
        print(f"Fold {fold}: Training time = {train_t:.2f} s, Prediction time = {pred_t:.2f} s")

    if completed_folds:
        print(f"\nAverage training time: {np.mean(all_training_times):.2f} s")
        print(f"Average prediction time: {np.mean(all_prediction_times):.2f} s")

    total_end_time = time.time()
    total_duration = total_end_time - total_start_time
    logging.info(f"Total cross-validation execution time: {total_duration:.2f} seconds")

2025-07-18 09:52:28,129 - INFO - Loading data from: C:/Class/HK6/IPrj/Datasets/merged_2761_data.csv


(2761, 44755)


2025-07-18 09:54:26,538 - INFO - Data loaded and preprocessed.
2025-07-18 09:54:26,539 - INFO - Training set shape: (2208, 44754), Validation set shape: (553, 44754)
2025-07-18 09:54:26,569 - INFO - --- CatBoost26 Model ---
2025-07-18 09:54:28,726 - INFO - Fold 1/10
2025-07-18 09:54:32,552 - INFO - Starting dimensionality reduction...


0:	learn: 0.4121261	total: 264ms	remaining: 52.5s
1:	learn: 0.2503581	total: 476ms	remaining: 47.1s
2:	learn: 0.1598928	total: 687ms	remaining: 45.1s
3:	learn: 0.1043497	total: 938ms	remaining: 46s
4:	learn: 0.0679422	total: 1.16s	remaining: 45.2s
5:	learn: 0.0445775	total: 1.41s	remaining: 45.5s
6:	learn: 0.0295810	total: 1.63s	remaining: 44.9s
7:	learn: 0.0209288	total: 1.86s	remaining: 44.6s
8:	learn: 0.0155250	total: 2.11s	remaining: 44.8s
9:	learn: 0.0124280	total: 2.36s	remaining: 44.9s
10:	learn: 0.0093586	total: 2.62s	remaining: 45s
11:	learn: 0.0074504	total: 2.9s	remaining: 45.4s
12:	learn: 0.0061211	total: 3.17s	remaining: 45.6s
13:	learn: 0.0049983	total: 3.49s	remaining: 46.4s
14:	learn: 0.0041718	total: 3.8s	remaining: 46.9s
15:	learn: 0.0035488	total: 4.1s	remaining: 47.2s
16:	learn: 0.0029733	total: 4.41s	remaining: 47.5s
17:	learn: 0.0026428	total: 4.73s	remaining: 47.8s
18:	learn: 0.0023449	total: 5.06s	remaining: 48.2s
19:	learn: 0.0020507	total: 5.38s	remaining: 48.

  from .autonotebook import tqdm as notebook_tqdm
2025-07-18 09:56:27,323 - INFO - Calculating SHAP values...
2025-07-18 09:56:40,887 - INFO - SHAP values appear to be binary/regression (shape: (2484, 44754)).
2025-07-18 09:56:41,401 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 09:56:41,401 - INFO - Shape of shap_importance: (44754,)
2025-07-18 09:56:41,401 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 09:56:41,401 - INFO - Shape of X.columns: (44754,)
2025-07-18 09:56:41,401 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 09:56:41,429 - INFO - SHAP values calculated.
2025-07-18 09:56:41,491 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_0.pkl
2025-07-18 09:56:41,491 - INFO - Dimensionality reduction completed in 128.94 seconds.
2025-07-18 09:56:41,574 - INFO - Training CNN Ensemble...
2025-07-18 09:56:42,572 - INFO - Fitting CNN Models...
2025-07-18 09:57:35,596 - INFO - CN





2025-07-18 09:57:39,777 - INFO - CNN Ensemble trained and evaluated in 58.20 seconds.
2025-07-18 09:57:39,795 - INFO - Fold 2/10
2025-07-18 09:57:51,112 - INFO - Starting dimensionality reduction...


0:	learn: 0.4187072	total: 371ms	remaining: 1m 13s
1:	learn: 0.2508680	total: 592ms	remaining: 58.6s
2:	learn: 0.1607090	total: 812ms	remaining: 53.3s
3:	learn: 0.0998795	total: 1.04s	remaining: 51s
4:	learn: 0.0675090	total: 1.26s	remaining: 49.3s
5:	learn: 0.0500384	total: 1.49s	remaining: 48.1s
6:	learn: 0.0336332	total: 1.71s	remaining: 47.3s
7:	learn: 0.0239184	total: 1.93s	remaining: 46.4s
8:	learn: 0.0173618	total: 2.16s	remaining: 45.9s
9:	learn: 0.0128144	total: 2.38s	remaining: 45.2s
10:	learn: 0.0101563	total: 2.6s	remaining: 44.6s
11:	learn: 0.0078088	total: 2.82s	remaining: 44.2s
12:	learn: 0.0061619	total: 3.04s	remaining: 43.7s
13:	learn: 0.0050223	total: 3.26s	remaining: 43.3s
14:	learn: 0.0042021	total: 3.49s	remaining: 43s
15:	learn: 0.0036459	total: 3.71s	remaining: 42.7s
16:	learn: 0.0032449	total: 3.94s	remaining: 42.4s
17:	learn: 0.0027459	total: 4.16s	remaining: 42s
18:	learn: 0.0023848	total: 4.38s	remaining: 41.7s
19:	learn: 0.0021510	total: 4.6s	remaining: 41.

2025-07-18 09:58:59,539 - INFO - Calculating SHAP values...
2025-07-18 09:59:15,219 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44754)).
2025-07-18 09:59:15,832 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 09:59:15,832 - INFO - Shape of shap_importance: (44754,)
2025-07-18 09:59:15,832 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 09:59:15,848 - INFO - Shape of X.columns: (44754,)
2025-07-18 09:59:15,848 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 09:59:15,853 - INFO - SHAP values calculated.
2025-07-18 09:59:15,853 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_1.pkl
2025-07-18 09:59:15,862 - INFO - Dimensionality reduction completed in 84.75 seconds.
2025-07-18 09:59:16,082 - INFO - Training CNN Ensemble...
2025-07-18 09:59:16,223 - INFO - Fitting CNN Models...
2025-07-18 10:00:13,602 - INFO - CNN Ensemble trained in 57.38 seconds.
2025-07-18 10:

0:	learn: 0.4186691	total: 492ms	remaining: 1m 37s
1:	learn: 0.2571288	total: 771ms	remaining: 1m 16s
2:	learn: 0.1567663	total: 994ms	remaining: 1m 5s
3:	learn: 0.0965017	total: 1.22s	remaining: 59.7s
4:	learn: 0.0646409	total: 1.45s	remaining: 56.7s
5:	learn: 0.0425190	total: 1.76s	remaining: 56.9s
6:	learn: 0.0281977	total: 2s	remaining: 55.3s
7:	learn: 0.0194450	total: 2.22s	remaining: 53.3s
8:	learn: 0.0138955	total: 2.44s	remaining: 51.8s
9:	learn: 0.0110493	total: 2.66s	remaining: 50.5s
10:	learn: 0.0084392	total: 2.88s	remaining: 49.5s
11:	learn: 0.0064513	total: 3.11s	remaining: 48.7s
12:	learn: 0.0053085	total: 3.33s	remaining: 47.8s
13:	learn: 0.0043312	total: 3.55s	remaining: 47.1s
14:	learn: 0.0034878	total: 3.77s	remaining: 46.5s
15:	learn: 0.0030080	total: 3.99s	remaining: 45.9s
16:	learn: 0.0026499	total: 4.2s	remaining: 45.2s
17:	learn: 0.0023336	total: 4.42s	remaining: 44.7s
18:	learn: 0.0021212	total: 4.65s	remaining: 44.3s
19:	learn: 0.0018990	total: 4.88s	remaining

2025-07-18 10:01:29,491 - INFO - Calculating SHAP values...
2025-07-18 10:02:01,606 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44754)).
2025-07-18 10:02:02,157 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:02:02,159 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:02:02,161 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:02:02,163 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:02:02,165 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:02:02,179 - INFO - SHAP values calculated.
2025-07-18 10:02:02,189 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_2.pkl
2025-07-18 10:02:02,192 - INFO - Dimensionality reduction completed in 101.92 seconds.
2025-07-18 10:02:02,461 - INFO - Training CNN Ensemble...
2025-07-18 10:02:02,716 - INFO - Fitting CNN Models...
2025-07-18 10:03:47,252 - INFO - CNN Ensemble trained in 104.54 seconds.
2025-07-18 1

0:	learn: 0.4065876	total: 467ms	remaining: 1m 32s
1:	learn: 0.2611917	total: 809ms	remaining: 1m 20s
2:	learn: 0.1673556	total: 1.11s	remaining: 1m 12s
3:	learn: 0.1063465	total: 1.46s	remaining: 1m 11s
4:	learn: 0.0683579	total: 1.81s	remaining: 1m 10s
5:	learn: 0.0438323	total: 2.13s	remaining: 1m 8s
6:	learn: 0.0299944	total: 2.46s	remaining: 1m 7s
7:	learn: 0.0219904	total: 2.76s	remaining: 1m 6s
8:	learn: 0.0156840	total: 3.12s	remaining: 1m 6s
9:	learn: 0.0119473	total: 3.42s	remaining: 1m 5s
10:	learn: 0.0095025	total: 3.85s	remaining: 1m 6s
11:	learn: 0.0075643	total: 4.16s	remaining: 1m 5s
12:	learn: 0.0062929	total: 4.47s	remaining: 1m 4s
13:	learn: 0.0051596	total: 4.8s	remaining: 1m 3s
14:	learn: 0.0042253	total: 5.09s	remaining: 1m 2s
15:	learn: 0.0036319	total: 5.42s	remaining: 1m 2s
16:	learn: 0.0030199	total: 5.72s	remaining: 1m 1s
17:	learn: 0.0026359	total: 6.04s	remaining: 1m 1s
18:	learn: 0.0023016	total: 6.37s	remaining: 1m
19:	learn: 0.0020315	total: 6.67s	remain

2025-07-18 10:05:07,381 - INFO - Calculating SHAP values...
2025-07-18 10:05:22,815 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44754)).
2025-07-18 10:05:23,338 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:05:23,338 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:05:23,338 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:05:23,338 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:05:23,338 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:05:23,357 - INFO - SHAP values calculated.
2025-07-18 10:05:23,361 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_3.pkl
2025-07-18 10:05:23,361 - INFO - Dimensionality reduction completed in 81.80 seconds.
2025-07-18 10:05:23,494 - INFO - Training CNN Ensemble...
2025-07-18 10:05:23,630 - INFO - Fitting CNN Models...
2025-07-18 10:06:22,200 - INFO - CNN Ensemble trained in 58.57 seconds.
2025-07-18 10:

0:	learn: 0.4192687	total: 446ms	remaining: 1m 28s
1:	learn: 0.2544386	total: 686ms	remaining: 1m 7s
2:	learn: 0.1622469	total: 914ms	remaining: 1m
3:	learn: 0.1049365	total: 1.15s	remaining: 56.1s
4:	learn: 0.0668045	total: 1.37s	remaining: 53.6s
5:	learn: 0.0474354	total: 1.61s	remaining: 52.2s
6:	learn: 0.0321326	total: 1.83s	remaining: 50.5s
7:	learn: 0.0236031	total: 2.07s	remaining: 49.7s
8:	learn: 0.0168566	total: 2.29s	remaining: 48.7s
9:	learn: 0.0123229	total: 2.53s	remaining: 48s
10:	learn: 0.0097439	total: 2.75s	remaining: 47.3s
11:	learn: 0.0080490	total: 3s	remaining: 46.9s
12:	learn: 0.0067663	total: 3.24s	remaining: 46.6s
13:	learn: 0.0052415	total: 3.47s	remaining: 46.2s
14:	learn: 0.0045422	total: 3.71s	remaining: 45.8s
15:	learn: 0.0038722	total: 3.94s	remaining: 45.4s
16:	learn: 0.0032329	total: 4.18s	remaining: 45s
17:	learn: 0.0028456	total: 4.44s	remaining: 44.9s
18:	learn: 0.0024877	total: 4.71s	remaining: 44.9s
19:	learn: 0.0021596	total: 4.97s	remaining: 44.7s

2025-07-18 10:07:25,227 - INFO - Calculating SHAP values...
2025-07-18 10:07:39,479 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44754)).
2025-07-18 10:07:39,893 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:07:39,893 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:07:39,893 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:07:39,893 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:07:39,893 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:07:39,909 - INFO - SHAP values calculated.
2025-07-18 10:07:39,916 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_4.pkl
2025-07-18 10:07:39,917 - INFO - Dimensionality reduction completed in 72.26 seconds.
2025-07-18 10:07:40,049 - INFO - Training CNN Ensemble...
2025-07-18 10:07:40,215 - INFO - Fitting CNN Models...
2025-07-18 10:09:02,559 - INFO - CNN Ensemble trained in 82.34 seconds.
2025-07-18 10:

0:	learn: 0.4190564	total: 490ms	remaining: 1m 37s
1:	learn: 0.2539895	total: 756ms	remaining: 1m 14s
2:	learn: 0.1502876	total: 1.02s	remaining: 1m 7s
3:	learn: 0.0942504	total: 1.26s	remaining: 1m 1s
4:	learn: 0.0626660	total: 1.5s	remaining: 58.3s
5:	learn: 0.0427279	total: 1.73s	remaining: 56s
6:	learn: 0.0292367	total: 1.98s	remaining: 54.5s
7:	learn: 0.0202876	total: 2.21s	remaining: 52.9s
8:	learn: 0.0152295	total: 2.45s	remaining: 51.9s
9:	learn: 0.0117045	total: 2.68s	remaining: 51s
10:	learn: 0.0090453	total: 2.9s	remaining: 49.9s
11:	learn: 0.0068903	total: 3.13s	remaining: 49s
12:	learn: 0.0057723	total: 3.4s	remaining: 48.9s
13:	learn: 0.0046826	total: 3.63s	remaining: 48.3s
14:	learn: 0.0039019	total: 3.86s	remaining: 47.6s
15:	learn: 0.0032648	total: 4.1s	remaining: 47.1s
16:	learn: 0.0028149	total: 4.32s	remaining: 46.5s
17:	learn: 0.0024307	total: 4.55s	remaining: 46s
18:	learn: 0.0021687	total: 4.86s	remaining: 46.3s
19:	learn: 0.0019547	total: 5.13s	remaining: 46.2s


2025-07-18 10:10:18,536 - INFO - Calculating SHAP values...
2025-07-18 10:10:46,537 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44754)).
2025-07-18 10:10:47,074 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:10:47,074 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:10:47,074 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:10:47,074 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:10:47,074 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:10:47,081 - INFO - SHAP values calculated.
2025-07-18 10:10:47,083 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_5.pkl
2025-07-18 10:10:47,083 - INFO - Dimensionality reduction completed in 93.17 seconds.
2025-07-18 10:10:47,196 - INFO - Training CNN Ensemble...
2025-07-18 10:10:47,340 - INFO - Fitting CNN Models...
2025-07-18 10:11:43,707 - INFO - CNN Ensemble trained in 56.37 seconds.
2025-07-18 10:

0:	learn: 0.4116177	total: 2.52s	remaining: 8m 21s
1:	learn: 0.2497692	total: 2.74s	remaining: 4m 30s
2:	learn: 0.1587365	total: 2.94s	remaining: 3m 13s
3:	learn: 0.1007561	total: 3.15s	remaining: 2m 34s
4:	learn: 0.0642599	total: 3.35s	remaining: 2m 10s
5:	learn: 0.0424360	total: 5.8s	remaining: 3m 7s
6:	learn: 0.0279836	total: 6s	remaining: 2m 45s
7:	learn: 0.0194965	total: 6.22s	remaining: 2m 29s
8:	learn: 0.0149634	total: 6.42s	remaining: 2m 16s
9:	learn: 0.0108209	total: 6.62s	remaining: 2m 5s
10:	learn: 0.0087464	total: 9.13s	remaining: 2m 36s
11:	learn: 0.0067225	total: 9.34s	remaining: 2m 26s
12:	learn: 0.0056555	total: 9.56s	remaining: 2m 17s
13:	learn: 0.0048931	total: 9.77s	remaining: 2m 9s
14:	learn: 0.0040188	total: 9.96s	remaining: 2m 2s
15:	learn: 0.0035478	total: 12.5s	remaining: 2m 23s
16:	learn: 0.0031424	total: 12.7s	remaining: 2m 16s
17:	learn: 0.0028039	total: 12.9s	remaining: 2m 10s
18:	learn: 0.0024199	total: 13.1s	remaining: 2m 4s
19:	learn: 0.0022009	total: 13.

2025-07-18 10:13:18,211 - INFO - Calculating SHAP values...
2025-07-18 10:13:49,927 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44754)).
2025-07-18 10:13:50,666 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:13:50,666 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:13:50,666 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:13:50,666 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:13:50,666 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:13:50,682 - INFO - SHAP values calculated.
2025-07-18 10:13:50,682 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_6.pkl
2025-07-18 10:13:50,698 - INFO - Dimensionality reduction completed in 119.87 seconds.
2025-07-18 10:13:50,923 - INFO - Training CNN Ensemble...
2025-07-18 10:13:51,317 - INFO - Fitting CNN Models...
2025-07-18 10:16:07,172 - INFO - CNN Ensemble trained in 135.85 seconds.
2025-07-18 1

0:	learn: 0.4186280	total: 323ms	remaining: 1m 4s
1:	learn: 0.2482998	total: 705ms	remaining: 1m 9s
2:	learn: 0.1563112	total: 3.81s	remaining: 4m 10s
3:	learn: 0.0945026	total: 4.1s	remaining: 3m 21s
4:	learn: 0.0616247	total: 4.41s	remaining: 2m 52s
5:	learn: 0.0411301	total: 7.57s	remaining: 4m 4s
6:	learn: 0.0296585	total: 7.88s	remaining: 3m 37s
7:	learn: 0.0210857	total: 8.17s	remaining: 3m 16s
8:	learn: 0.0151913	total: 8.47s	remaining: 2m 59s
9:	learn: 0.0116684	total: 11.5s	remaining: 3m 38s
10:	learn: 0.0091143	total: 11.8s	remaining: 3m 23s
11:	learn: 0.0073164	total: 12.2s	remaining: 3m 10s
12:	learn: 0.0060585	total: 15.2s	remaining: 3m 38s
13:	learn: 0.0047316	total: 15.4s	remaining: 3m 25s
14:	learn: 0.0038537	total: 15.7s	remaining: 3m 13s
15:	learn: 0.0032867	total: 16s	remaining: 3m 3s
16:	learn: 0.0028717	total: 18.9s	remaining: 3m 23s
17:	learn: 0.0024985	total: 19.2s	remaining: 3m 13s
18:	learn: 0.0021487	total: 19.5s	remaining: 3m 5s
19:	learn: 0.0018904	total: 22

2025-07-18 10:18:32,443 - INFO - Calculating SHAP values...
2025-07-18 10:18:46,883 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44754)).
2025-07-18 10:18:47,340 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:18:47,340 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:18:47,340 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:18:47,340 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:18:47,340 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:18:47,356 - INFO - SHAP values calculated.
2025-07-18 10:18:47,366 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_7.pkl
2025-07-18 10:18:47,367 - INFO - Dimensionality reduction completed in 148.88 seconds.
2025-07-18 10:18:47,496 - INFO - Training CNN Ensemble...
2025-07-18 10:18:47,629 - INFO - Fitting CNN Models...
2025-07-18 10:20:40,654 - INFO - CNN Ensemble trained in 113.03 seconds.
2025-07-18 1

0:	learn: 0.4277324	total: 2.7s	remaining: 8m 56s
1:	learn: 0.2623646	total: 2.95s	remaining: 4m 52s
2:	learn: 0.1688638	total: 3.18s	remaining: 3m 28s
3:	learn: 0.1141215	total: 3.4s	remaining: 2m 46s
4:	learn: 0.0732687	total: 6.14s	remaining: 3m 59s
5:	learn: 0.0467833	total: 6.37s	remaining: 3m 25s
6:	learn: 0.0329337	total: 6.63s	remaining: 3m 2s
7:	learn: 0.0229155	total: 6.87s	remaining: 2m 44s
8:	learn: 0.0158830	total: 9.57s	remaining: 3m 23s
9:	learn: 0.0116755	total: 9.8s	remaining: 3m 6s
10:	learn: 0.0091190	total: 10s	remaining: 2m 52s
11:	learn: 0.0071063	total: 10.3s	remaining: 2m 40s
12:	learn: 0.0060238	total: 10.5s	remaining: 2m 31s
13:	learn: 0.0051515	total: 13.2s	remaining: 2m 55s
14:	learn: 0.0041495	total: 13.4s	remaining: 2m 45s
15:	learn: 0.0036803	total: 13.7s	remaining: 2m 37s
16:	learn: 0.0031907	total: 13.9s	remaining: 2m 29s
17:	learn: 0.0028407	total: 16.7s	remaining: 2m 48s
18:	learn: 0.0024981	total: 16.9s	remaining: 2m 40s
19:	learn: 0.0022388	total: 1

2025-07-18 10:22:47,955 - INFO - Calculating SHAP values...
2025-07-18 10:23:19,348 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44754)).
2025-07-18 10:23:19,909 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:23:19,914 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:23:19,915 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:23:19,916 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:23:19,916 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:23:19,923 - INFO - SHAP values calculated.
2025-07-18 10:23:19,937 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_8.pkl
2025-07-18 10:23:19,937 - INFO - Dimensionality reduction completed in 147.97 seconds.
2025-07-18 10:23:20,131 - INFO - Training CNN Ensemble...
2025-07-18 10:23:20,449 - INFO - Fitting CNN Models...
2025-07-18 10:25:44,205 - INFO - CNN Ensemble trained in 143.76 seconds.
2025-07-18 1

0:	learn: 0.4021798	total: 2.61s	remaining: 8m 40s
1:	learn: 0.2443977	total: 2.83s	remaining: 4m 39s
2:	learn: 0.1549151	total: 3.04s	remaining: 3m 19s
3:	learn: 0.0917617	total: 3.25s	remaining: 2m 39s
4:	learn: 0.0613648	total: 5.91s	remaining: 3m 50s
5:	learn: 0.0402958	total: 6.13s	remaining: 3m 18s
6:	learn: 0.0292126	total: 6.34s	remaining: 2m 54s
7:	learn: 0.0221147	total: 6.55s	remaining: 2m 37s
8:	learn: 0.0152706	total: 6.76s	remaining: 2m 23s
9:	learn: 0.0116116	total: 9.44s	remaining: 2m 59s
10:	learn: 0.0086751	total: 9.65s	remaining: 2m 45s
11:	learn: 0.0067995	total: 9.86s	remaining: 2m 34s
12:	learn: 0.0054595	total: 10.1s	remaining: 2m 24s
13:	learn: 0.0045250	total: 10.3s	remaining: 2m 16s
14:	learn: 0.0036587	total: 12.9s	remaining: 2m 39s
15:	learn: 0.0032223	total: 13.1s	remaining: 2m 30s
16:	learn: 0.0027554	total: 13.3s	remaining: 2m 23s
17:	learn: 0.0024459	total: 13.5s	remaining: 2m 16s
18:	learn: 0.0020742	total: 13.7s	remaining: 2m 10s
19:	learn: 0.0018734	t

2025-07-18 10:28:05,239 - INFO - Calculating SHAP values...
2025-07-18 10:28:34,994 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44754)).
2025-07-18 10:28:35,549 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 10:28:35,549 - INFO - Shape of shap_importance: (44754,)
2025-07-18 10:28:35,549 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 10:28:35,549 - INFO - Shape of X.columns: (44754,)
2025-07-18 10:28:35,549 - INFO - Length comparison: len(X.columns)=44754, len(shap_importance)=44754
2025-07-18 10:28:35,565 - INFO - SHAP values calculated.
2025-07-18 10:28:35,597 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_9.pkl
2025-07-18 10:28:35,597 - INFO - Dimensionality reduction completed in 161.27 seconds.
2025-07-18 10:28:36,100 - INFO - Training CNN Ensemble...
2025-07-18 10:28:36,484 - INFO - Fitting CNN Models...
2025-07-18 10:31:09,398 - INFO - CNN Ensemble trained in 152.91 seconds.
2025-07-18 1

Fold 1: Training time = 53.02 s, Prediction time = 0.67 s
Fold 2: Training time = 57.38 s, Prediction time = 0.72 s
Fold 3: Training time = 104.54 s, Prediction time = 1.47 s
Fold 4: Training time = 58.57 s, Prediction time = 0.65 s
Fold 5: Training time = 82.34 s, Prediction time = 1.31 s
Fold 6: Training time = 56.37 s, Prediction time = 0.82 s
Fold 7: Training time = 135.85 s, Prediction time = 1.42 s
Fold 8: Training time = 113.03 s, Prediction time = 1.40 s
Fold 9: Training time = 143.76 s, Prediction time = 0.93 s
Fold 10: Training time = 152.91 s, Prediction time = 0.59 s

Average training time: 95.78 s
Average prediction time: 1.00 s
