In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from imblearn.combine import SMOTETomek
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool, cv
import time
import os
import shap

In [37]:
QUICK_TEST = False
SAMPLE_SIZE = 1000
MAX_ITERATIONS = 100
EARLY_STOPPING = 200
GPU_DEVICE = 0

In [38]:
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [39]:
def load_and_preprocess_data_and_split(data_path, target_column='target'):

    logging.info(f"Loading data from: {data_path}")

    data = pd.read_csv(data_path)
    try:
        data = data.drop(columns = 'Unnamed: 0')
    except:
        pass

    data = data.fillna(data.mean())

    X = data.drop(target_column, axis=1)
    y = data[target_column]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )  

    logging.info("Data loaded and preprocessed.")
    logging.info(f"Training set shape: {X_train.shape}, Validation set shape: {X_val.shape}") 
    return data, X_train, y_train, X_val, y_val

In [40]:
def reduce_dimensionality(X_train, y_train, X_val, top_n_features=100, save_path='reduced_data.pkl'):
    start_time = time.time()
    logging.info("Starting dimensionality reduction...")

    if save_path and os.path.exists(save_path):
        logging.info(f"Loading reduced data from: {save_path}")
        saved_data = pd.read_pickle(save_path)
        X_train_reduced = saved_data['X_train']
        X_val_reduced = saved_data['X_val']
        selected_features = saved_data['features']
        logging.info(f"Loaded reduced data from {save_path} in {time.time() - start_time:.2f} seconds.")
        logging.info(f"selected_features: {selected_features}")
        logging.info(f"X_train_reduced: {X_train_reduced}")
        logging.info(f"X_val_reduced: {X_val_reduced}")
        return X_train_reduced, X_val_reduced, selected_features

    model_params = {
        'iterations': MAX_ITERATIONS,
        'depth': 6,
        'learning_rate': 0.1,
        'loss_function': 'Logloss',
        'verbose': True,
        'random_seed': 42,
        'early_stopping_rounds': EARLY_STOPPING,
    }

    if GPU_DEVICE >= 0:
        model_params['task_type'] = 'GPU'
        model_params['devices'] = [GPU_DEVICE] 

    logging.info(f"Catboost model params: {model_params}")
    model = CatBoostClassifier(**model_params)

    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=0)

    importance_df = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.get_feature_importance(),
    })
    top_features_catboost = importance_df.nlargest(
        top_n_features, 'importance'
    )['feature'].tolist()
    logging.info(f"top_features_catboost: {top_features_catboost}")
    
    
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)
    shap_importance = np.abs(shap_values).mean(axis=1)  
    shap_importance_df = pd.DataFrame({
        'feature': X_train.columns,
        'importance': shap_importance.mean(axis=0),  
    })
    top_features_shap = shap_importance_df.nlargest(
        top_n_features, 'importance'
    )['feature'].tolist()
    logging.info(f"top_features_shap: {top_features_shap}")
    selected_features = list(set(top_features_catboost) & set(top_features_shap))

    X_train_reduced = X_train[selected_features]
    X_val_reduced = X_val[selected_features]

    if save_path:
        data_to_save = {'X_train': X_train_reduced, 'X_val': X_val_reduced, 'features': selected_features}
        pd.to_pickle(data_to_save, save_path)
        logging.info(f"Saved reduced data to {save_path}")

    logging.info(f"Dimensionality reduction completed in {time.time() - start_time:.2f} seconds.")
    logging.info(f"selected_features: {selected_features}")
    return X_train_reduced, X_val_reduced, selected_features

In [41]:
def train_and_evaluate_catboost(X, y, X_val, y_val, params, cv_folds=10):
    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    metrics_per_fold = []
    train_loss_history = []
    val_loss_history = []

    training_times = []
    prediction_times = []

    fold = 1
    for train_index, test_index in skf.split(X, y):
        logging.info(f"Fold {fold}")

        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        model = CatBoostClassifier(**params, verbose=0)

        train_start = time.time()
        model.fit(X_train_fold, y_train_fold, eval_set=(X_test_fold, y_test_fold), use_best_model=True)
        train_end = time.time()

        predict_start = time.time()
        y_pred = model.predict(X_test_fold)
        y_proba = model.predict_proba(X_test_fold)[:, 1]
        predict_end = time.time()

        training_time = train_end - train_start
        prediction_time = predict_end - predict_start

        training_times.append(training_time)
        prediction_times.append(prediction_time)

        acc = accuracy_score(y_test_fold, y_pred)
        precision = precision_score(y_test_fold, y_pred)
        recall = recall_score(y_test_fold, y_pred)
        f1 = f1_score(y_test_fold, y_pred)
        roc_auc = roc_auc_score(y_test_fold, y_proba)

        tn, fp, fn, tp = confusion_matrix(y_test_fold, y_pred).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)

        metrics_per_fold.append({
            'Accuracy': acc,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'ROC AUC': roc_auc,
            'Specificity': specificity,
            'Sensitivity': sensitivity
        })

        fold += 1

    # Average metrics
    df_metrics = pd.DataFrame(metrics_per_fold)
    avg_metrics = df_metrics.mean().to_dict()

    return model, avg_metrics, {'train_loss': train_loss_history, 'val_loss': val_loss_history}, training_times, prediction_times


In [42]:
if __name__ == '__main__':
    total_start_time = time.time()
    data_path = 'C:/Class/HK6/IPrj/Datasets/merged_2761_with_age_data.csv'

    data, X_train, y_train, X_val, y_val = load_and_preprocess_data_and_split(data_path)

    filtered_features = [
        "234632_x_at", "209603_at", "230527_at", "229963_at", "217901_at", "214719_at",
        "219513_s_at", "210789_x_at", "204777_s_at", "203294_s_at", "230753_at", "242056_at",
        "217680_x_at", "214945_at", "222312_s_at", "214705_at", "241688_at", "241611_s_at",
        "236952_at", "207636_at", "243659_at", "226311_at", "211772_x_at", "244719_at",
        "239766_at", "243272_at"
    ]
    X_filtered = X_train[filtered_features]
    X_val_filtered = X_val[filtered_features]

    params_26 = {
        'iterations': 1000,
        'depth': 11,
        'learning_rate': 0.1,
    }

    logging.info("--- CatBoost26 Model ---")

    model_26, metrics_26, history_fold, training_times, prediction_times = train_and_evaluate_catboost(
        X_filtered, y_train, X_val_filtered, y_val, params_26, cv_folds=10
    )

    avg_training_time = np.mean(training_times)
    avg_prediction_time = np.mean(prediction_times)

    print('CatBoost26 Metrics:', metrics_26)
    logging.info(f"CatBoost26 Metrics: {metrics_26}")

    print(f"\nAverage Training Time per Fold: {avg_training_time:.2f} seconds")
    print(f"Average Prediction Time per Fold: {avg_prediction_time:.2f} seconds")

    total_end_time = time.time()
    total_duration = total_end_time - total_start_time
    logging.info(f"Total execution time: {total_duration:.2f} seconds")

    logging.info("--- Main Execution Completed ---")


2025-07-18 11:49:06,227 - INFO - Loading data from: C:/Class/HK6/IPrj/Datasets/merged_2761_with_age_data.csv
2025-07-18 11:51:26,598 - INFO - Data loaded and preprocessed.
2025-07-18 11:51:26,598 - INFO - Training set shape: (2208, 44755), Validation set shape: (553, 44755)
2025-07-18 11:51:27,089 - INFO - --- CatBoost26 Model ---
2025-07-18 11:51:27,089 - INFO - Fold 1
2025-07-18 11:52:54,822 - INFO - Fold 2
2025-07-18 11:54:21,692 - INFO - Fold 3
2025-07-18 11:55:49,237 - INFO - Fold 4
2025-07-18 11:57:13,264 - INFO - Fold 5
2025-07-18 11:58:37,921 - INFO - Fold 6
2025-07-18 12:00:03,061 - INFO - Fold 7
2025-07-18 12:01:32,936 - INFO - Fold 8
2025-07-18 12:03:09,476 - INFO - Fold 9
2025-07-18 12:05:28,784 - INFO - Fold 10
2025-07-18 12:08:00,093 - INFO - CatBoost26 Metrics: {'Accuracy': 0.9995454545454546, 'Precision': 0.999438202247191, 'Recall': 1.0, 'F1 Score': 0.9997183098591549, 'ROC AUC': 0.9999868611220603, 'Specificity': 0.9976744186046511, 'Sensitivity': 1.0}
2025-07-18 12:0

CatBoost26 Metrics: {'Accuracy': 0.9995454545454546, 'Precision': 0.999438202247191, 'Recall': 1.0, 'F1 Score': 0.9997183098591549, 'ROC AUC': 0.9999868611220603, 'Specificity': 0.9976744186046511, 'Sensitivity': 1.0}

Average Training Time per Fold: 99.27 seconds
Average Prediction Time per Fold: 0.01 seconds
