Connected to .venv (Python 3.12.10)

In [1]:
# Sanity check: ask TensorFlow which GPU devices it can see.
# The recorded output below is an empty list, i.e. no GPU was visible to TensorFlow in this session.
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[]

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    confusion_matrix as sk_confusion_matrix,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import StandardScaler
import logging
import time
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from catboost import CatBoostClassifier
from tensorflow.keras.utils import to_categorical 
import tensorflow as tf 
import logging
import gc 
import matplotlib.pyplot as plt

In [3]:
# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [4]:
# --- Shared configuration for the CNN ensemble cells below ---

# Classification setup: number of target classes and the full label list
# (passed to the confusion matrix so every class gets a row/column).
num_classes = 2
all_possible_labels = [label for label in range(num_classes)]

# One weight per ensemble member, in model order (model 1, model 2, model 3).
ENSEMBLE_WEIGHTS = [0.4, 0.3, 0.3]

# Default training hyper-parameters.
learning_rate = 1e-3
batch_size = 16
epochs = 50

In [5]:
def create_cnn_model_1(input_shape, num_classes):
    """Build ensemble member 1: two Conv1D layers, max-pooling, and a small dense head.

    Architecture: Conv1D(10, k=6) -> Conv1D(10, k=6) -> MaxPooling1D(2) -> Flatten
    -> Dense(8, relu) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one sample as passed to ``Input`` (callers in this
            notebook pass ``(n_features, 1)``).
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model``.
    """
    input_layer = Input(shape=input_shape)
    conv1 = Conv1D(filters=10, kernel_size=6, activation='relu', padding='same')(input_layer)
    conv2 = Conv1D(filters=10, kernel_size=6, activation='relu', padding='same')(conv1)
    pool = MaxPooling1D(pool_size=2, padding='same')(conv2)
    flatten = Flatten()(pool)
    dense1 = Dense(units=8, activation='relu')(flatten)
    # The output layer used to be named 'ecn2_output' (copied from model 2); it is
    # named after this model so the three members follow the same ecn<N>_output scheme.
    output_layer = Dense(units=num_classes, activation='softmax', name='ecn1_output')(dense1)
    return Model(inputs=input_layer, outputs=output_layer)


def create_cnn_model_2(input_shape, num_classes):
    """Build ensemble member 2: same layout as member 1 plus dropout before the output.

    Architecture: Conv1D(10, k=6) x2 -> MaxPooling1D(2) -> Flatten -> Dense(8, relu)
    -> Dropout(0.5) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one sample as passed to ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model``.
    """
    inputs = Input(shape=input_shape)

    x = inputs
    # Two stacked, identically configured convolutions.
    for _ in range(2):
        x = Conv1D(filters=10, kernel_size=6, activation='relu', padding='same')(x)

    x = MaxPooling1D(pool_size=2, padding='same')(x)
    x = Flatten()(x)
    x = Dense(units=8, activation='relu')(x)
    x = Dropout(0.5)(x)

    outputs = Dense(units=num_classes, activation='softmax', name='ecn2_output')(x)
    return Model(inputs=inputs, outputs=outputs)


def create_cnn_model_3(input_shape, num_classes):
    """Build ensemble member 3: the convolutional stack feeding the softmax directly.

    Architecture: Conv1D(10, k=6) x2 -> MaxPooling1D(2) -> Flatten
    -> Dense(num_classes, softmax). There is no hidden dense layer.

    Args:
        input_shape: Shape of one sample as passed to ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model``.
    """
    conv_settings = dict(filters=10, kernel_size=6, activation='relu', padding='same')

    inputs = Input(shape=input_shape)
    features = Conv1D(**conv_settings)(inputs)
    features = Conv1D(**conv_settings)(features)
    features = MaxPooling1D(pool_size=2, padding='same')(features)
    features = Flatten()(features)

    outputs = Dense(units=num_classes, activation='softmax', name='ecn3_output')(features)
    return Model(inputs=inputs, outputs=outputs)

In [6]:
def weighted_ensemble_predictions(predictions, weights):
    """Combine per-model class scores into hard labels via a weighted sum.

    Each model's score matrix is multiplied by its weight, the weighted matrices
    are summed element-wise, and the arg-max over axis 1 is taken per row.

    Args:
        predictions: Sequence with one entry per model. Each entry is a 2-D
            array-like of scores (rows are samples, columns are classes); all
            entries must share the same shape. Nested lists are accepted.
        weights: Sequence of scalar weights, one per model, in the same order.

    Returns:
        1-D ``numpy.ndarray`` holding the winning column index for every row.

    Raises:
        ValueError: If the number of models and weights differ, or if no
            predictions are supplied at all.
    """
    if len(predictions) != len(weights):
        raise ValueError("Number of models must match number of weights")
    if len(predictions) == 0:
        # Without this guard the arg-max below fails with an opaque axis error.
        raise ValueError("At least one model prediction is required")

    # np.asarray lets callers pass plain (nested) lists; ndarrays pass through
    # untouched, so their dtype and the arithmetic are unchanged.
    weighted_predictions = np.array(
        [np.asarray(model_scores) * weight
         for model_scores, weight in zip(predictions, weights)]
    )
    ensemble_scores = np.sum(weighted_predictions, axis=0)
    return np.argmax(ensemble_scores, axis=1)

In [7]:
def train_and_evaluate_cnn_ensemble(X_train, y_train, X_val, y_val, params):
    """Train the three CNN variants on one split and score their weighted ensemble.

    Features are standardised with a ``StandardScaler`` fitted on the training
    data only, reshaped to ``(n_samples, n_features, 1)`` and fed to the models
    built by ``create_cnn_model_1/2/3``. The softmax outputs are combined with
    ``weighted_ensemble_predictions`` using the module-level ``ENSEMBLE_WEIGHTS``.
    The module-level ``num_classes`` and ``all_possible_labels`` are also read.

    Note: the validation data is passed to ``fit`` as ``validation_data`` and is
    also the data the final metrics are computed on.

    Args:
        X_train: Training features (``DataFrame`` or array-like, 2-D).
        y_train: Training labels (``Series`` or array-like); cast to ``int``.
        X_val: Validation features with the same columns as ``X_train``.
        y_val: Validation labels; cast to ``int``.
        params: Dict with keys ``'learning_rate'``, ``'epochs'`` and ``'batch_size'``.

    Returns:
        Tuple ``(metrics, training_time, prediction_time, histories)``:
        ``metrics`` is a dict (accuracy, macro precision/recall/F1, confusion
        matrix as nested lists) or ``None`` if metric computation failed or the
        input had no features; the two times are seconds; ``histories`` is the
        list of Keras ``History`` objects, one per model (empty when training
        was skipped).
    """
    logging.info("Training CNN Ensemble...")

    X_train_np = X_train.values if isinstance(X_train, pd.DataFrame) else np.array(X_train)
    y_train_np = y_train.values if isinstance(y_train, pd.Series) else np.array(y_train)
    X_val_np = X_val.values if isinstance(X_val, pd.DataFrame) else np.array(X_val)
    y_val_np = y_val.values if isinstance(y_val, pd.Series) else np.array(y_val)

    y_train_np = y_train_np.astype(int)
    y_val_np = y_val_np.astype(int)

    # Check for an empty feature set BEFORE scaling: the scaler itself rejects
    # zero-feature input, so the guard could never fire when it came afterwards.
    # The same 4-tuple shape as the normal path is returned so callers that
    # unpack the result keep working.
    if X_train_np.ndim == 2 and X_train_np.shape[1] == 0:
        logging.error("Error: X_train has 0 features after preprocessing/selection.")
        return None, 0.0, 0.0, []

    # Fit the scaler on the training split only, then apply it to validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_np)
    X_val_scaled = scaler.transform(X_val_np)

    # Conv1D expects a trailing channel axis: (samples, features, 1).
    X_train_reshaped = np.expand_dims(X_train_scaled, axis=-1).astype(np.float32)
    X_val_reshaped = np.expand_dims(X_val_scaled, axis=-1).astype(np.float32)
    input_shape = (X_train_reshaped.shape[1], 1)

    y_train_categorical = to_categorical(y_train_np, num_classes=num_classes)
    y_val_categorical = to_categorical(y_val_np, num_classes=num_classes)

    # Build and compile the three ensemble members; the order must match ENSEMBLE_WEIGHTS.
    model_builders = (create_cnn_model_1, create_cnn_model_2, create_cnn_model_3)
    models = [build(input_shape, num_classes) for build in model_builders]
    loss_function = 'categorical_crossentropy'
    for model in models:
        model.compile(optimizer=Adam(learning_rate=params['learning_rate']),
                      loss=loss_function, metrics=['accuracy'])

    train_start_time = time.time()
    logging.info("Fitting CNN Models...")
    histories = [
        model.fit(X_train_reshaped, y_train_categorical, epochs=params['epochs'],
                  batch_size=params['batch_size'], verbose=0,
                  validation_data=(X_val_reshaped, y_val_categorical))
        for model in models
    ]
    training_time = time.time() - train_start_time
    logging.info(f"CNN Ensemble trained in {training_time:.2f} seconds.")

    predict_start_time = time.time()
    logging.info("Predicting with CNN Models...")
    member_predictions = [
        model.predict(X_val_reshaped, batch_size=params['batch_size'], verbose=0)
        for model in models
    ]
    ensemble_predictions = weighted_ensemble_predictions(member_predictions, ENSEMBLE_WEIGHTS)
    prediction_time = time.time() - predict_start_time
    logging.info(f"CNN Ensemble predicted in {prediction_time:.2f} seconds.")

    logging.info("Calculating Evaluation Metrics...")
    try:
        metrics = {
            'accuracy': accuracy_score(y_val_np, ensemble_predictions),
            'precision': precision_score(y_val_np, ensemble_predictions, average='macro', zero_division=0),
            'recall': recall_score(y_val_np, ensemble_predictions, average='macro', zero_division=0),
            'f1_score': f1_score(y_val_np, ensemble_predictions, average='macro', zero_division=0),
            'confusion_matrix': sk_confusion_matrix(y_val_np, ensemble_predictions, labels=all_possible_labels).tolist(),
        }
        logging.info(f"Evaluation metrics: {metrics}")
    except Exception as e:
        # Best effort: a metrics failure should not discard the timings/histories.
        logging.error(f"Error calculating metrics: {e}")
        metrics = None

    # Drop references to models and predictions and reset the Keras session so
    # memory does not accumulate across cross-validation folds.
    logging.info("Cleaning up models...")
    del model, models, member_predictions
    tf.keras.backend.clear_session()
    gc.collect()

    return metrics, training_time, prediction_time, histories

In [8]:
def load_and_preprocess_data_and_split(data_path, target_column='target'):
    """Load a CSV, mean-impute missing values and make a stratified 80/20 split.

    Steps: read ``data_path`` with pandas, drop an ``'Unnamed: 0'`` column if
    present, replace NaNs in numeric columns with that column's mean, print the
    resulting shape, separate ``target_column`` from the features and split
    with ``train_test_split(test_size=0.2, random_state=42, stratify=y)``.

    Args:
        data_path: Path of the CSV file to read.
        target_column: Name of the label column (default ``'target'``).

    Returns:
        Tuple ``(data, X_train, y_train, X_val, y_val)`` where ``data`` is the
        full imputed frame including the target column.

    Raises:
        KeyError: If ``target_column`` is not a column of the loaded data.
    """
    logging.info(f"Loading data from: {data_path}")

    data = pd.read_csv(data_path)
    data = data.drop(columns='Unnamed: 0', errors='ignore')

    # numeric_only=True: without it, DataFrame.mean() raises on any non-numeric
    # column in pandas >= 2.0. Results for all-numeric frames are unchanged.
    # NOTE(review): the means are computed on the full data set before the
    # split, and the target column is imputed as well - confirm both are intended.
    data = data.fillna(data.mean(numeric_only=True))
    print(data.shape)

    X = data.drop(target_column, axis=1)
    y = data[target_column]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    logging.info("Data loaded and preprocessed.")
    logging.info(f"Training set shape: {X_train.shape}, Validation set shape: {X_val.shape}")
    return data, X_train, y_train, X_val, y_val

In [9]:
def _mean_abs_shap_importance(shap_values):
    """Collapse raw SHAP output into one mean-|SHAP| score per feature.

    Handles the three layouts the explainer output is checked for: a list of
    per-class 2-D arrays, a single 3-D array (averaged over axes 0 and 2), and
    a single 2-D array (averaged over axis 0).
    """
    if isinstance(shap_values, list):
        logging.info("SHAP values appear to be multiclass (list).")
        if not shap_values:
            raise ValueError("SHAP explainer returned an empty list.")

        shap_values = [np.array(vals) for vals in shap_values]
        shapes_in_list = [vals.shape for vals in shap_values]
        logging.info(f"Shapes within SHAP values list: {shapes_in_list}")

        shap_class_importance = [np.abs(vals).mean(axis=0) for vals in shap_values]
        return np.mean(shap_class_importance, axis=0)

    if not isinstance(shap_values, np.ndarray):
        raise TypeError(f"Expected shap_values to be list or numpy array, got {type(shap_values)}")

    if shap_values.ndim == 3:
        logging.info(f"SHAP values: Multiclass (3D Array), Shape: {shap_values.shape}")
        return np.abs(shap_values).mean(axis=(0, 2))

    logging.info(f"SHAP values appear to be binary/regression (shape: {shap_values.shape}).")
    return np.abs(shap_values).mean(axis=0)


def reduce_dimensionality(X, y, top_n_features=50, save_path='reduced_data.pkl'):
    """Select features ranked highly by both CatBoost importance and SHAP.

    A ``CatBoostClassifier`` (configured with ``task_type='GPU'``) is fitted on
    ``X``/``y``. The ``top_n_features`` by CatBoost feature importance and the
    ``top_n_features`` by mean absolute SHAP value are computed, and their
    intersection is kept. If the SHAP step fails, the CatBoost ranking is used
    for both lists. Results are cached in ``save_path`` as a pickle.

    NOTE(review): the cache is keyed by file name only. If ``X``/``y`` change
    while the file exists, the stale cached frame is returned - delete the file
    to force recomputation.

    Args:
        X: Feature ``DataFrame``.
        y: Labels aligned with ``X``.
        top_n_features: Size of each of the two rankings before intersecting.
        save_path: Pickle path for the cache; a falsy value disables caching.

    Returns:
        Tuple ``(X_reduced, selected_features, top_features_catboost,
        top_features_shap)``. The same four values are returned whether the
        result is freshly computed or loaded from the cache. For cache files
        written before the two ranked lists were stored, ``selected_features``
        is returned in their place.
    """
    start_time = time.time()
    logging.info("Starting dimensionality reduction...")

    if save_path and os.path.exists(save_path):
        logging.info(f"Loading reduced data from: {save_path}")
        # SECURITY: unpickling executes arbitrary code; only load cache files
        # that this notebook wrote itself.
        saved_data = pd.read_pickle(save_path)
        X_reduced = saved_data['X']
        selected_features = saved_data['features']
        # Older cache files only hold 'X' and 'features'; fall back to the
        # selected features so the return shape is always a 4-tuple.
        top_features_catboost = saved_data.get('top_features_catboost', selected_features)
        top_features_shap = saved_data.get('top_features_shap', selected_features)
        logging.info(f"Loaded reduced data from {save_path} in {time.time() - start_time:.2f} seconds.")
        return X_reduced, selected_features, top_features_catboost, top_features_shap

    from catboost import CatBoostClassifier
    model = CatBoostClassifier(iterations=200,
                               depth=4,
                               learning_rate=0.1,
                               loss_function='Logloss',
                               verbose=1,
                               random_seed=42,
                               early_stopping_rounds=200,
                               task_type='GPU')
    model.fit(X, y, verbose=1)
    importance_df = pd.DataFrame({'feature': X.columns,
                                  'importance': model.get_feature_importance()})
    top_features_catboost = importance_df.nlargest(top_n_features,
                                                   'importance')['feature'].tolist()

    import shap
    logging.info("Calculating SHAP values...")
    top_features_shap = None
    try:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)
        shap_importance = _mean_abs_shap_importance(shap_values)

        logging.info(f"Type of shap_importance: {type(shap_importance)}")
        logging.info(f"Shape of shap_importance: {np.array(shap_importance).shape}")
        logging.info(f"Type of X.columns: {type(X.columns)}")
        logging.info(f"Shape of X.columns: {X.columns.shape}")
        logging.info(f"Length comparison: len(X.columns)={len(X.columns)}, len(shap_importance)={len(shap_importance)}")

        if np.isnan(shap_importance).any() or np.isinf(shap_importance).any():
            logging.warning("NaN or Inf detected in shap_importance values.")
            shap_importance = np.nan_to_num(shap_importance, nan=0.0, posinf=0.0, neginf=0.0)

        shap_importance_df = pd.DataFrame({'feature': X.columns,
                                           'importance': shap_importance})
        top_features_shap = shap_importance_df.nlargest(top_n_features, 'importance')['feature'].tolist()
        logging.info("SHAP values calculated.")

    except Exception as e:
        # Deliberate best effort: SHAP is a refinement, CatBoost ranking is the fallback.
        logging.error(f"Error calculating SHAP values: {e}")
        logging.warning("Using only CatBoost features due to SHAP error.")
        top_features_shap = top_features_catboost

    # Intersect the two rankings while keeping CatBoost rank order. A plain
    # set intersection has no stable order for strings across interpreter runs,
    # and column order matters downstream (it is the Conv1D sequence axis).
    shap_feature_set = set(top_features_shap)
    selected_features = [name for name in top_features_catboost if name in shap_feature_set]
    X_reduced = X[selected_features]

    if save_path:
        data_to_save = {
            'X': X_reduced,
            'features': selected_features,
            'top_features_catboost': top_features_catboost,
            'top_features_shap': top_features_shap,
        }
        pd.to_pickle(data_to_save, save_path)
        logging.info(f"Saved reduced data to {save_path}")

    logging.info(f"Dimensionality reduction completed in {time.time() - start_time:.2f} seconds.")

    return X_reduced, selected_features, top_features_catboost, top_features_shap

In [10]:
from sklearn.model_selection import StratifiedKFold
import seaborn as sns

if __name__ == '__main__':
    # Driver: load the data, run 10-fold stratified cross-validation of the CNN
    # ensemble (with per-fold feature selection) and report aggregated metrics.
    total_start_time = time.time()
    # NOTE(review): absolute, machine-specific path - consider a configurable data directory.
    data_path = 'C:/Class/HK6/IPrj/Datasets/merged_2761_with_age_data.csv'

    data, X_train, y_train, X_test, y_test = load_and_preprocess_data_and_split(data_path)
    
    logging.info("--- CatBoost26 Model ---")
    # These two names are not used in this cell; they are kept because they are
    # notebook globals that other cells may read.
    filtered_features = ["234632_x_at","209603_at","230527_at","229963_at","217901_at","214719_at","219513_s_at","210789_x_at","204777_s_at","203294_s_at","230753_at","242056_at","217680_x_at","214945_at","222312_s_at","214705_at","241688_at","241611_s_at","236952_at","207636_at","243659_at","226311_at","211772_x_at","244719_at","239766_at","243272_at"]
    X_filtered = X_train[filtered_features]

    # Cross-validation runs over the full data set, so the train/test parts
    # returned by the loader are stitched back together.
    X = pd.concat([X_train, X_test], axis=0)
    y = pd.concat([y_train, y_test], axis=0)

    params_cnn = {
        'learning_rate': 0.001,
        'epochs': 50,
        'batch_size': 32,
    }

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    all_metrics = []
    all_training_times = []
    all_prediction_times = []
    all_histories = []

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        logging.info(f"Fold {fold + 1}/{kf.get_n_splits()}")

        X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
        y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

        # Feature selection is fitted on the training part of the fold only;
        # the validation part just takes the same columns.
        X_fold_train_reduced, selected_features, top_features_catboost, top_features_shap = reduce_dimensionality(
            X_fold_train.copy(), y_fold_train.copy(),
            save_path=f'cnn_ensemble_reduced_data_fold_{fold}.pkl'
        )
        X_fold_val_reduced = X_fold_val[selected_features]

        metrics_cnn, training_time, prediction_time, histories = train_and_evaluate_cnn_ensemble(
            X_fold_train_reduced, y_fold_train, X_fold_val_reduced, y_fold_val,
            params_cnn
        )
        all_metrics.append(metrics_cnn)
        all_training_times.append(training_time)
        all_prediction_times.append(prediction_time)
        all_histories.append(histories)

    # A fold reports metrics=None when its metric computation failed; leave such
    # folds out instead of crashing the whole aggregation on them.
    valid_metrics = [fold_metrics for fold_metrics in all_metrics if fold_metrics is not None]
    skipped_folds = len(all_metrics) - len(valid_metrics)
    if skipped_folds:
        logging.warning(f"{skipped_folds} fold(s) produced no metrics and are excluded from the averages.")

    avg_metrics = {}
    if valid_metrics:
        for metric in valid_metrics[0]:
            if metric != 'confusion_matrix':
                avg_metrics[metric] = np.mean([fold_metrics[metric] for fold_metrics in valid_metrics])
            else:
                # Confusion matrices are summed across folds (not averaged).
                avg_cm = np.sum([np.array(fold_metrics[metric]) for fold_metrics in valid_metrics], axis=0)
                avg_metrics[metric] = avg_cm.tolist()
    else:
        logging.error("No fold produced metrics; nothing to average.")

    logging.info("--- Cross-Validation Completed ---")
    logging.info(f"Average CNN Ensemble Metrics: {avg_metrics}")

    for fold, (train_t, pred_t) in enumerate(zip(all_training_times, all_prediction_times), 1):
        print(f"Fold {fold}: Training time = {train_t:.2f} s, Prediction time = {pred_t:.2f} s")
    
    print(f"\nAverage training time: {np.mean(all_training_times):.2f} s")
    print(f"Average prediction time: {np.mean(all_prediction_times):.2f} s")


    total_end_time = time.time()    
    total_duration = total_end_time - total_start_time
    logging.info(f"Total cross-validation execution time: {total_duration:.2f} seconds")

2025-07-18 11:58:42,797 - INFO - Loading data from: C:/Class/HK6/IPrj/Datasets/merged_2761_with_age_data.csv


(2761, 44756)


2025-07-18 12:05:09,549 - INFO - Data loaded and preprocessed.
2025-07-18 12:05:09,549 - INFO - Training set shape: (2208, 44755), Validation set shape: (553, 44755)
2025-07-18 12:05:09,680 - INFO - --- CatBoost26 Model ---
2025-07-18 12:05:17,174 - INFO - Fold 1/10
2025-07-18 12:05:33,257 - INFO - Starting dimensionality reduction...


0:	learn: 0.4121261	total: 795ms	remaining: 2m 38s
1:	learn: 0.2503581	total: 1.53s	remaining: 2m 31s
2:	learn: 0.1598928	total: 2.15s	remaining: 2m 21s
3:	learn: 0.1043497	total: 2.99s	remaining: 2m 26s
4:	learn: 0.0679422	total: 3.76s	remaining: 2m 26s
5:	learn: 0.0445775	total: 4.47s	remaining: 2m 24s
6:	learn: 0.0295810	total: 5.14s	remaining: 2m 21s
7:	learn: 0.0209288	total: 5.85s	remaining: 2m 20s
8:	learn: 0.0155250	total: 6.63s	remaining: 2m 20s
9:	learn: 0.0124280	total: 7.26s	remaining: 2m 17s
10:	learn: 0.0093586	total: 7.85s	remaining: 2m 14s
11:	learn: 0.0074504	total: 8.46s	remaining: 2m 12s
12:	learn: 0.0061211	total: 9.05s	remaining: 2m 10s
13:	learn: 0.0049983	total: 9.61s	remaining: 2m 7s
14:	learn: 0.0041718	total: 10.2s	remaining: 2m 5s
15:	learn: 0.0035488	total: 10.9s	remaining: 2m 4s
16:	learn: 0.0029733	total: 11.4s	remaining: 2m 3s
17:	learn: 0.0026428	total: 12s	remaining: 2m 1s
18:	learn: 0.0023449	total: 12.7s	remaining: 2m 1s
19:	learn: 0.0020507	total: 13

  from .autonotebook import tqdm as notebook_tqdm
2025-07-18 12:08:20,562 - INFO - Calculating SHAP values...
2025-07-18 12:08:49,802 - INFO - SHAP values appear to be binary/regression (shape: (2484, 44755)).
2025-07-18 12:08:50,300 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:08:50,306 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:08:50,307 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:08:50,307 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:08:50,307 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:08:50,314 - INFO - SHAP values calculated.
2025-07-18 12:08:50,328 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_0.pkl
2025-07-18 12:08:50,328 - INFO - Dimensionality reduction completed in 197.07 seconds.
2025-07-18 12:08:50,334 - INFO - Training CNN Ensemble...
2025-07-18 12:08:50,689 - INFO - Fitting CNN Models...
2025-07-18 12:11:13,229 - INFO - CN





2025-07-18 12:11:18,949 - INFO - Fold 2/10
2025-07-18 12:11:25,647 - INFO - Starting dimensionality reduction...


0:	learn: 0.4187073	total: 486ms	remaining: 1m 36s
1:	learn: 0.2508680	total: 783ms	remaining: 1m 17s
2:	learn: 0.1607090	total: 1.09s	remaining: 1m 11s
3:	learn: 0.0998795	total: 1.36s	remaining: 1m 6s
4:	learn: 0.0675090	total: 1.63s	remaining: 1m 3s
5:	learn: 0.0500384	total: 1.91s	remaining: 1m 1s
6:	learn: 0.0336332	total: 2.22s	remaining: 1m 1s
7:	learn: 0.0239184	total: 2.58s	remaining: 1m 1s
8:	learn: 0.0173618	total: 2.85s	remaining: 1m
9:	learn: 0.0128144	total: 3.15s	remaining: 59.8s
10:	learn: 0.0101563	total: 3.44s	remaining: 59s
11:	learn: 0.0078088	total: 3.71s	remaining: 58.1s
12:	learn: 0.0061619	total: 3.98s	remaining: 57.2s
13:	learn: 0.0050223	total: 4.24s	remaining: 56.3s
14:	learn: 0.0042021	total: 4.46s	remaining: 55s
15:	learn: 0.0036459	total: 4.67s	remaining: 53.6s
16:	learn: 0.0032449	total: 4.87s	remaining: 52.4s
17:	learn: 0.0027459	total: 5.07s	remaining: 51.3s
18:	learn: 0.0023848	total: 5.27s	remaining: 50.2s
19:	learn: 0.0021510	total: 5.47s	remaining: 

2025-07-18 12:12:22,080 - INFO - Calculating SHAP values...
2025-07-18 12:12:36,836 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44755)).
2025-07-18 12:12:37,254 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:12:37,260 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:12:37,261 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:12:37,261 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:12:37,261 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:12:37,267 - INFO - SHAP values calculated.
2025-07-18 12:12:37,267 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_1.pkl
2025-07-18 12:12:37,274 - INFO - Dimensionality reduction completed in 71.63 seconds.
2025-07-18 12:12:37,434 - INFO - Training CNN Ensemble...
2025-07-18 12:12:37,538 - INFO - Fitting CNN Models...
2025-07-18 12:13:35,661 - INFO - CNN Ensemble trained in 58.12 seconds.
2025-07-18 12:

0:	learn: 0.4186691	total: 341ms	remaining: 1m 7s
1:	learn: 0.2571288	total: 551ms	remaining: 54.6s
2:	learn: 0.1567663	total: 782ms	remaining: 51.4s
3:	learn: 0.0965017	total: 988ms	remaining: 48.4s
4:	learn: 0.0646409	total: 1.19s	remaining: 46.5s
5:	learn: 0.0425190	total: 1.4s	remaining: 45.1s
6:	learn: 0.0281977	total: 1.6s	remaining: 44s
7:	learn: 0.0194450	total: 1.79s	remaining: 43.1s
8:	learn: 0.0138955	total: 1.99s	remaining: 42.3s
9:	learn: 0.0110493	total: 2.2s	remaining: 41.7s
10:	learn: 0.0084392	total: 2.41s	remaining: 41.4s
11:	learn: 0.0064513	total: 2.62s	remaining: 41s
12:	learn: 0.0053085	total: 2.82s	remaining: 40.6s
13:	learn: 0.0043312	total: 3.03s	remaining: 40.3s
14:	learn: 0.0034878	total: 3.25s	remaining: 40.1s
15:	learn: 0.0030080	total: 3.46s	remaining: 39.8s
16:	learn: 0.0026499	total: 3.66s	remaining: 39.4s
17:	learn: 0.0023336	total: 3.87s	remaining: 39.1s
18:	learn: 0.0021212	total: 4.07s	remaining: 38.8s
19:	learn: 0.0018990	total: 4.27s	remaining: 38.

2025-07-18 12:14:29,736 - INFO - Calculating SHAP values...
2025-07-18 12:14:43,354 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44755)).
2025-07-18 12:14:43,761 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:14:43,761 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:14:43,761 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:14:43,761 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:14:43,761 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:14:43,777 - INFO - SHAP values calculated.
2025-07-18 12:14:43,786 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_2.pkl
2025-07-18 12:14:43,787 - INFO - Dimensionality reduction completed in 62.80 seconds.
2025-07-18 12:14:43,918 - INFO - Training CNN Ensemble...
2025-07-18 12:14:44,013 - INFO - Fitting CNN Models...
2025-07-18 12:16:05,384 - INFO - CNN Ensemble trained in 81.37 seconds.
2025-07-18 12:

0:	learn: 0.4065876	total: 369ms	remaining: 1m 13s
1:	learn: 0.2611917	total: 587ms	remaining: 58.1s
2:	learn: 0.1673556	total: 801ms	remaining: 52.6s
3:	learn: 0.1063465	total: 1.02s	remaining: 50.2s
4:	learn: 0.0683579	total: 1.24s	remaining: 48.3s
5:	learn: 0.0438323	total: 1.45s	remaining: 47s
6:	learn: 0.0299944	total: 1.68s	remaining: 46.3s
7:	learn: 0.0219904	total: 1.9s	remaining: 45.7s
8:	learn: 0.0156840	total: 2.14s	remaining: 45.5s
9:	learn: 0.0119473	total: 2.35s	remaining: 44.8s
10:	learn: 0.0095025	total: 2.56s	remaining: 44.1s
11:	learn: 0.0075643	total: 2.79s	remaining: 43.8s
12:	learn: 0.0062929	total: 3s	remaining: 43.2s
13:	learn: 0.0051596	total: 3.21s	remaining: 42.6s
14:	learn: 0.0042253	total: 3.41s	remaining: 42.1s
15:	learn: 0.0036319	total: 3.63s	remaining: 41.7s
16:	learn: 0.0030199	total: 3.84s	remaining: 41.3s
17:	learn: 0.0026359	total: 4.05s	remaining: 40.9s
18:	learn: 0.0023016	total: 4.26s	remaining: 40.6s
19:	learn: 0.0020315	total: 4.48s	remaining: 4

2025-07-18 12:17:02,677 - INFO - Calculating SHAP values...
2025-07-18 12:17:35,674 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44755)).
2025-07-18 12:17:36,721 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:17:36,724 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:17:36,727 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:17:36,729 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:17:36,730 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:17:36,772 - INFO - SHAP values calculated.
2025-07-18 12:17:36,800 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_3.pkl
2025-07-18 12:17:36,803 - INFO - Dimensionality reduction completed in 86.57 seconds.
2025-07-18 12:17:37,063 - INFO - Training CNN Ensemble...
2025-07-18 12:17:37,552 - INFO - Fitting CNN Models...
2025-07-18 12:19:32,363 - INFO - CNN Ensemble trained in 114.81 seconds.
2025-07-18 12

0:	learn: 0.4192687	total: 346ms	remaining: 1m 8s
1:	learn: 0.2544386	total: 553ms	remaining: 54.8s
2:	learn: 0.1622469	total: 767ms	remaining: 50.4s
3:	learn: 0.1049365	total: 979ms	remaining: 48s
4:	learn: 0.0668045	total: 1.19s	remaining: 46.4s
5:	learn: 0.0474354	total: 1.41s	remaining: 45.7s
6:	learn: 0.0321326	total: 1.61s	remaining: 44.4s
7:	learn: 0.0236031	total: 1.81s	remaining: 43.6s
8:	learn: 0.0168566	total: 2.02s	remaining: 43s
9:	learn: 0.0123229	total: 2.22s	remaining: 42.3s
10:	learn: 0.0097440	total: 2.43s	remaining: 41.8s
11:	learn: 0.0080490	total: 2.67s	remaining: 41.8s
12:	learn: 0.0067663	total: 2.95s	remaining: 42.4s
13:	learn: 0.0052415	total: 3.22s	remaining: 42.8s
14:	learn: 0.0045422	total: 3.48s	remaining: 43s
15:	learn: 0.0038722	total: 3.74s	remaining: 43s
16:	learn: 0.0032329	total: 3.99s	remaining: 43s
17:	learn: 0.0028456	total: 4.24s	remaining: 42.9s
18:	learn: 0.0024877	total: 4.48s	remaining: 42.7s
19:	learn: 0.0021596	total: 4.68s	remaining: 42.1s


2025-07-18 12:20:33,239 - INFO - Calculating SHAP values...
2025-07-18 12:20:48,132 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44755)).
2025-07-18 12:20:48,570 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:20:48,571 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:20:48,571 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:20:48,572 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:20:48,572 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:20:48,578 - INFO - SHAP values calculated.
2025-07-18 12:20:48,583 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_4.pkl
2025-07-18 12:20:48,584 - INFO - Dimensionality reduction completed in 69.50 seconds.
2025-07-18 12:20:48,746 - INFO - Training CNN Ensemble...
2025-07-18 12:20:48,840 - INFO - Fitting CNN Models...
2025-07-18 12:21:39,721 - INFO - CNN Ensemble trained in 50.88 seconds.
2025-07-18 12:

0:	learn: 0.4190564	total: 385ms	remaining: 1m 16s
1:	learn: 0.2539895	total: 610ms	remaining: 1m
2:	learn: 0.1502876	total: 827ms	remaining: 54.3s
3:	learn: 0.0942504	total: 1.05s	remaining: 51.4s
4:	learn: 0.0626660	total: 1.27s	remaining: 49.5s
5:	learn: 0.0427279	total: 1.49s	remaining: 48.3s
6:	learn: 0.0292367	total: 1.72s	remaining: 47.5s
7:	learn: 0.0202876	total: 1.95s	remaining: 46.9s
8:	learn: 0.0152295	total: 2.18s	remaining: 46.2s
9:	learn: 0.0117045	total: 2.39s	remaining: 45.4s
10:	learn: 0.0090453	total: 2.61s	remaining: 44.9s
11:	learn: 0.0068903	total: 2.83s	remaining: 44.4s
12:	learn: 0.0057723	total: 3.06s	remaining: 44s
13:	learn: 0.0046826	total: 3.27s	remaining: 43.5s
14:	learn: 0.0039019	total: 3.48s	remaining: 43s
15:	learn: 0.0032648	total: 3.71s	remaining: 42.7s
16:	learn: 0.0028149	total: 3.95s	remaining: 42.5s
17:	learn: 0.0024307	total: 4.17s	remaining: 42.2s
18:	learn: 0.0021687	total: 4.39s	remaining: 41.9s
19:	learn: 0.0019547	total: 4.61s	remaining: 41

2025-07-18 12:22:37,390 - INFO - Calculating SHAP values...
2025-07-18 12:22:51,249 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44755)).
2025-07-18 12:22:51,948 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:22:51,950 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:22:51,951 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:22:51,954 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:22:51,957 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:22:52,013 - INFO - SHAP values calculated.
2025-07-18 12:22:52,020 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_5.pkl
2025-07-18 12:22:52,021 - INFO - Dimensionality reduction completed in 67.00 seconds.
2025-07-18 12:22:52,361 - INFO - Training CNN Ensemble...
2025-07-18 12:22:52,704 - INFO - Fitting CNN Models...
2025-07-18 12:25:49,428 - INFO - CNN Ensemble trained in 176.72 seconds.
2025-07-18 12

0:	learn: 0.4116177	total: 942ms	remaining: 3m 7s
1:	learn: 0.2497691	total: 1.76s	remaining: 2m 54s
2:	learn: 0.1587365	total: 2.63s	remaining: 2m 52s
3:	learn: 0.1007561	total: 3.41s	remaining: 2m 47s
4:	learn: 0.0642599	total: 4.19s	remaining: 2m 43s
5:	learn: 0.0424360	total: 5s	remaining: 2m 41s
6:	learn: 0.0279836	total: 5.89s	remaining: 2m 42s
7:	learn: 0.0194965	total: 6.73s	remaining: 2m 41s
8:	learn: 0.0149634	total: 7.47s	remaining: 2m 38s
9:	learn: 0.0108209	total: 8.23s	remaining: 2m 36s
10:	learn: 0.0087464	total: 9.02s	remaining: 2m 35s
11:	learn: 0.0067225	total: 9.83s	remaining: 2m 33s
12:	learn: 0.0056555	total: 10.6s	remaining: 2m 32s
13:	learn: 0.0048931	total: 11.3s	remaining: 2m 30s
14:	learn: 0.0040188	total: 12s	remaining: 2m 28s
15:	learn: 0.0035478	total: 12.8s	remaining: 2m 27s
16:	learn: 0.0031424	total: 13.6s	remaining: 2m 26s
17:	learn: 0.0028039	total: 14.5s	remaining: 2m 26s
18:	learn: 0.0024199	total: 15.2s	remaining: 2m 24s
19:	learn: 0.0022009	total: 

2025-07-18 12:29:08,065 - INFO - Calculating SHAP values...
2025-07-18 12:29:57,101 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44755)).
2025-07-18 12:29:58,890 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:29:58,892 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:29:58,894 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:29:58,895 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:29:58,897 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:29:58,915 - INFO - SHAP values calculated.
2025-07-18 12:29:58,934 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_6.pkl
2025-07-18 12:29:58,939 - INFO - Dimensionality reduction completed in 230.65 seconds.
2025-07-18 12:29:59,490 - INFO - Training CNN Ensemble...
2025-07-18 12:29:59,962 - INFO - Fitting CNN Models...
2025-07-18 12:33:37,118 - INFO - CNN Ensemble trained in 217.16 seconds.
2025-07-18 1

0:	learn: 0.4186280	total: 812ms	remaining: 2m 41s
1:	learn: 0.2482998	total: 1.55s	remaining: 2m 33s
2:	learn: 0.1563112	total: 2.38s	remaining: 2m 36s
3:	learn: 0.0945026	total: 3.19s	remaining: 2m 36s
4:	learn: 0.0616247	total: 3.83s	remaining: 2m 29s
5:	learn: 0.0411301	total: 4.44s	remaining: 2m 23s
6:	learn: 0.0296585	total: 5.05s	remaining: 2m 19s
7:	learn: 0.0210857	total: 5.71s	remaining: 2m 16s
8:	learn: 0.0151913	total: 6.38s	remaining: 2m 15s
9:	learn: 0.0116684	total: 7.04s	remaining: 2m 13s
10:	learn: 0.0091143	total: 7.63s	remaining: 2m 11s
11:	learn: 0.0073164	total: 8.23s	remaining: 2m 8s
12:	learn: 0.0060585	total: 8.87s	remaining: 2m 7s
13:	learn: 0.0047316	total: 9.62s	remaining: 2m 7s
14:	learn: 0.0038537	total: 10.3s	remaining: 2m 7s
15:	learn: 0.0032867	total: 11.1s	remaining: 2m 7s
16:	learn: 0.0028717	total: 11.8s	remaining: 2m 6s
17:	learn: 0.0024985	total: 12.5s	remaining: 2m 6s
18:	learn: 0.0021487	total: 13.3s	remaining: 2m 6s
19:	learn: 0.0018904	total: 14

2025-07-18 12:38:42,026 - INFO - Calculating SHAP values...
2025-07-18 12:39:18,254 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44755)).
2025-07-18 12:39:20,111 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:39:20,115 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:39:20,116 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:39:20,117 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:39:20,118 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:39:20,164 - INFO - SHAP values calculated.
2025-07-18 12:39:20,207 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_7.pkl
2025-07-18 12:39:20,209 - INFO - Dimensionality reduction completed in 322.85 seconds.
2025-07-18 12:39:20,977 - INFO - Training CNN Ensemble...
2025-07-18 12:39:21,521 - INFO - Fitting CNN Models...
2025-07-18 12:42:30,215 - INFO - CNN Ensemble trained in 188.69 seconds.
2025-07-18 1

0:	learn: 0.4277324	total: 380ms	remaining: 1m 15s
1:	learn: 0.2623646	total: 589ms	remaining: 58.3s
2:	learn: 0.1688638	total: 806ms	remaining: 52.9s
3:	learn: 0.1141215	total: 1.04s	remaining: 50.9s
4:	learn: 0.0732687	total: 1.3s	remaining: 50.7s
5:	learn: 0.0467833	total: 1.52s	remaining: 49.2s
6:	learn: 0.0329337	total: 1.74s	remaining: 47.9s
7:	learn: 0.0229155	total: 1.95s	remaining: 46.7s
8:	learn: 0.0158830	total: 2.17s	remaining: 46s
9:	learn: 0.0116755	total: 2.37s	remaining: 45.1s
10:	learn: 0.0091190	total: 2.58s	remaining: 44.3s
11:	learn: 0.0071063	total: 2.79s	remaining: 43.6s
12:	learn: 0.0060238	total: 2.99s	remaining: 43s
13:	learn: 0.0051515	total: 3.19s	remaining: 42.4s
14:	learn: 0.0041495	total: 3.4s	remaining: 42s
15:	learn: 0.0036803	total: 3.63s	remaining: 41.8s
16:	learn: 0.0031907	total: 3.89s	remaining: 41.9s
17:	learn: 0.0028407	total: 4.09s	remaining: 41.3s
18:	learn: 0.0024981	total: 4.3s	remaining: 41s
19:	learn: 0.0022388	total: 4.5s	remaining: 40.5s
2

2025-07-18 12:43:58,139 - INFO - Calculating SHAP values...
2025-07-18 12:44:13,284 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44755)).
2025-07-18 12:44:13,914 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:44:13,916 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:44:13,918 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:44:13,919 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:44:13,920 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:44:13,932 - INFO - SHAP values calculated.
2025-07-18 12:44:13,944 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_8.pkl
2025-07-18 12:44:13,946 - INFO - Dimensionality reduction completed in 81.54 seconds.
2025-07-18 12:44:14,190 - INFO - Training CNN Ensemble...
2025-07-18 12:44:14,408 - INFO - Fitting CNN Models...
2025-07-18 12:45:37,136 - INFO - CNN Ensemble trained in 82.73 seconds.
2025-07-18 12:

0:	learn: 0.4021798	total: 393ms	remaining: 1m 18s
1:	learn: 0.2443977	total: 647ms	remaining: 1m 4s
2:	learn: 0.1549151	total: 891ms	remaining: 58.5s
3:	learn: 0.0917617	total: 1.14s	remaining: 56.1s
4:	learn: 0.0613648	total: 1.38s	remaining: 53.7s
5:	learn: 0.0402958	total: 1.62s	remaining: 52.5s
6:	learn: 0.0292126	total: 1.86s	remaining: 51.4s
7:	learn: 0.0221147	total: 2.11s	remaining: 50.6s
8:	learn: 0.0152706	total: 2.34s	remaining: 49.7s
9:	learn: 0.0116116	total: 2.63s	remaining: 50s
10:	learn: 0.0086751	total: 2.88s	remaining: 49.5s
11:	learn: 0.0067995	total: 3.1s	remaining: 48.6s
12:	learn: 0.0054595	total: 3.35s	remaining: 48.2s
13:	learn: 0.0045250	total: 3.6s	remaining: 47.8s
14:	learn: 0.0036587	total: 3.86s	remaining: 47.6s
15:	learn: 0.0032223	total: 4.09s	remaining: 47s
16:	learn: 0.0027554	total: 4.36s	remaining: 46.9s
17:	learn: 0.0024459	total: 4.59s	remaining: 46.5s
18:	learn: 0.0020742	total: 4.84s	remaining: 46.1s
19:	learn: 0.0018734	total: 5.09s	remaining: 4

2025-07-18 12:46:44,254 - INFO - Calculating SHAP values...
2025-07-18 12:46:57,931 - INFO - SHAP values appear to be binary/regression (shape: (2485, 44755)).
2025-07-18 12:46:58,271 - INFO - Type of shap_importance: <class 'numpy.ndarray'>
2025-07-18 12:46:58,272 - INFO - Shape of shap_importance: (44755,)
2025-07-18 12:46:58,273 - INFO - Type of X.columns: <class 'pandas.core.indexes.base.Index'>
2025-07-18 12:46:58,274 - INFO - Shape of X.columns: (44755,)
2025-07-18 12:46:58,276 - INFO - Length comparison: len(X.columns)=44755, len(shap_importance)=44755
2025-07-18 12:46:58,283 - INFO - SHAP values calculated.
2025-07-18 12:46:58,287 - INFO - Saved reduced data to cnn_ensemble_reduced_data_fold_9.pkl
2025-07-18 12:46:58,288 - INFO - Dimensionality reduction completed in 68.31 seconds.
2025-07-18 12:46:58,420 - INFO - Training CNN Ensemble...
2025-07-18 12:46:58,572 - INFO - Fitting CNN Models...
2025-07-18 12:47:52,290 - INFO - CNN Ensemble trained in 53.72 seconds.
2025-07-18 12:

Fold 1: Training time = 142.54 s, Prediction time = 1.56 s
Fold 2: Training time = 58.12 s, Prediction time = 0.79 s
Fold 3: Training time = 81.37 s, Prediction time = 0.59 s
Fold 4: Training time = 114.81 s, Prediction time = 0.63 s
Fold 5: Training time = 50.88 s, Prediction time = 0.63 s
Fold 6: Training time = 176.72 s, Prediction time = 2.05 s
Fold 7: Training time = 217.16 s, Prediction time = 2.21 s
Fold 8: Training time = 188.69 s, Prediction time = 2.46 s
Fold 9: Training time = 82.73 s, Prediction time = 2.48 s
Fold 10: Training time = 53.72 s, Prediction time = 0.72 s

Average training time: 116.67 s
Average prediction time: 1.41 s
