In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from itertools import product
import random
from sklearn.naive_bayes import GaussianNB
from scipy.optimize import minimize
from kalman_filter.kalman_filter import (
    ConstantVelocityKalmanFilter, FinancialModelKalmanFilter, optimize_kalman_hyperparameters
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pywt  # Ensure you have pywavelets installed for wavelet transforms
# from sklearn.metrics import mean_squared_error, accuracy_score
# from sklearn.model_selection import ParameterGrid
# from joblib import Parallel, delayed

In [2]:
# -----------------
# Hyperparameter Configurations
# -----------------

RANDOM_STATE = 42
WINDOW_SIZE = 10

LASSO_PARAM_GRID = {"logisticregression__C": np.logspace(-3, 2, 10)}
RF_PARAM_GRID = {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}
XGB_PARAM_GRID = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0]
}
NN_PARAM_GRID = {
    "hidden_size": [32, 64, 128],
    "learning_rate": [0.001, 0.01],
    "num_epochs": [50, 100]
}
LSTM_PARAM_GRID = {
    "hidden_size": [32, 64, 128],
    "num_layers": [1, 2],
    "learning_rate": [0.001, 0.01],
    "num_epochs": [50, 100]
}

# Kalman Filter Hyperparameters
CVKF_PARAM_GRID = [
    {"initial_state": np.array([0.0]), "Q_diag": [q], "R_diag": [r]}
    for q in [0.01, 0.1, 1.0, 10.0]
    for r in [0.01, 0.1, 1.0, 10.0]
]
FMKF_PARAM_GRID = [
    {"initial_state": np.array([0.0]), "Q_diag": [q], "R_diag": [r], "alpha": [a], "beta": [b]}
    for q in [0.01, 0.1, 1.0, 10.0]
    for r in [0.01, 0.1, 1.0, 10.0]
    for a in [0.4, 0.6, 0.8, 1.0]
    for b in [0.05, 0.1, 0.2, 0.4]
]


# -----------------
# Utility Functions
# -----------------


def five_way_split(X, y, train_size=0.5, val1_size=0.15, val2_size=0.1, kalman_size=0.1, test_size=0.15):
    """Split data into five subsets."""
    total_len = len(X)

    train_len = round(total_len * train_size)
    val1_len = round(total_len * val1_size)
    val2_len = round(total_len * val2_size)
    kalman_len = round(total_len * kalman_size)
    test_len = total_len - train_len - val1_len - val2_len - kalman_len

    train_idx = range(0, train_len)
    val1_idx = range(train_len, train_len + val1_len)
    val2_idx = range(train_len + val1_len, train_len + val1_len + val2_len)
    kalman_idx = range(train_len + val1_len + val2_len, train_len + val1_len + val2_len + kalman_len)
    test_idx = range(train_len + val1_len + val2_len + kalman_len, total_len)

    return (
        X.iloc[train_idx], X.iloc[val1_idx], X.iloc[val2_idx], X.iloc[kalman_idx], X.iloc[test_idx],
        y.iloc[train_idx], y.iloc[val1_idx], y.iloc[val2_idx], y.iloc[kalman_idx], y.iloc[test_idx]
    )


def optimize_model_hyperparameters(model_fn, param_grid, X_train, y_train, validation_data, n_jobs=1):
    """
    Performs hyperparameter optimization using GridSearchCV.

    Args:
        model_fn: A callable that returns an instance of the model.
        param_grid: Dictionary of hyperparameters to search.
        X_train: Training features.
        y_train: Training labels.
        validation_data: Tuple (X_val, y_val) for validation.
        n_jobs: Number of parallel jobs for GridSearchCV.

    Returns:
        best_model: The best model after GridSearchCV.
        best_params: The best parameters from the search.
    """
    model = model_fn()
    grid_search = GridSearchCV(
        model,
        param_grid,
        scoring='roc_auc',
        cv=5,
        n_jobs=n_jobs,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_


def calculate_classification_metrics(y_true, y_pred, y_pred_proba=None):
    """
    Calculate classification metrics including Accuracy, Precision, Recall, F1, and AUC.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.
        y_pred_proba (array-like, optional): Predicted probabilities for the positive class.

    Returns:
        dict: Dictionary of calculated metrics.
    """
    metrics = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1": f1_score(y_true, y_pred, zero_division=0)
    }

    if y_pred_proba is not None:
        metrics["AUC"] = roc_auc_score(y_true, y_pred_proba)

    return metrics

from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed


def preprocess_data_with_advanced_features(data_frame, target_column, lag_steps=None, rolling_window=10):
    """
    Preprocess data for time series modeling with advanced feature engineering.
    Ensures no data leakage by strictly using past and current data for feature generation.

    Args:
        data_frame (str): Variable name of loaded pandas data frame.
        target_column (str): Target column name.
        lag_steps (list): List of lag steps for feature engineering.
        rolling_window (int): Window size for rolling features.

    Returns:
        tuple: Feature DataFrame (X) and target series (y).
    """
    # Load data and parse dates
    data = data_frame
    data.index = pd.to_datetime(data.index, errors='coerce')  # Ensure index is datetime
    assert data.index.is_monotonic_increasing, "Dataset is not sorted by time."

    # Fill missing values in the target column
    data[target_column] = data[target_column].interpolate(method='linear').bfill()

    # Initialize feature storage
    features = []
    indices = []

    for end_idx in range(rolling_window, len(data)):
        # Define the current window
        window = data.iloc[end_idx - rolling_window:end_idx]

        # Compute features for the current timestamp
        current_features = {}

        # Rolling statistics
        signal_cols = [col for col in data.columns if col not in ['patient', 'newtest', 'target', 'event1', 'event2', 'event3', 'event4', 'sleepstage']]  # sleepstage excluded as categorical variable
        for col in signal_cols:
            current_features[f'{col}_roll_mean'] = window[col].mean()
            current_features[f'{col}_roll_std'] = window[col].std()

        # Lagged features
        if lag_steps:
            for lag in lag_steps:
                if end_idx - lag >= 0:
                    current_features[f'{target_column}_lag{lag}'] = data[target_column].iloc[end_idx - lag]

        # Fourier Transform Features
        for col in signal_cols:
            fourier_transform = np.abs(np.fft.fft(window[col].fillna(0)))
            current_features[f'{col}_fft_max'] = np.max(fourier_transform)
            current_features[f'{col}_fft_mean'] = np.mean(fourier_transform)

        # Wavelet Transform Features
        for col in signal_cols:
            coeffs = pywt.wavedec(window[col].fillna(0), 'db1', level=3)
            current_features[f'{col}_wavelet_approx'] = coeffs[0].mean()
            current_features[f'{col}_wavelet_detail1'] = coeffs[1].mean()
            current_features[f'{col}_wavelet_detail2'] = coeffs[2].mean()

        # Add features and corresponding index
        features.append(current_features)
        indices.append(data.index[end_idx])

    # Convert features to DataFrame
    feature_df = pd.DataFrame(features, index=indices)

    # Align target values
    y = data.loc[feature_df.index, target_column]

    return feature_df, y

In [7]:
# -----------------
# Load and Preprocess Data
# -----------------

def load_and_preprocess_data(dataframe):
    # Load and preprocess data with advanced features
    X, y = preprocess_data_with_advanced_features(
        data_frame=dataframe,
        target_column='target',
        lag_steps=[1, 2, 3],
        rolling_window=10
    )

    # Perform five-way split
    X_train, X_val1, X_val2, X_kalman, X_test, y_train, y_val1, y_val2, y_kalman, y_test = five_way_split(
        X, y, train_size=0.5, val1_size=0.15, val2_size=0.05, kalman_size=0.1, test_size=0.2
    )
    
    return X_train, X_val1, X_val2, X_kalman, X_test, y_train, y_val1, y_val2, y_kalman, y_test

# Test with transitions from state 0 → 1 only

In [3]:
# Load data
master_df = pd.read_stata('./data/processed-data/combined-patient-data-1_00.dta')

In [4]:
# Group master dataframe by 'patient' and 'newtest' pairs (i.e. by each unique patient data)
# Access or initialize each dataframe like: group_dict[('pid100100', 0)]
group_dict = {
    (val1, val2): data
    for (val1, val2), data in master_df.groupby(['patient', 'newtest'])
}

In [5]:
# Preprocess dataframes
for group_key, subset_df in group_dict.items():
    subset_df['target'] = subset_df[['event1', 'event2', 'event3', 'event4']].apply(lambda x: 1 if 'Hypopnea' in x.values or 'Apnea Obstructive' in x.values or 'Apnea Central' in x.values or 'Apnea Mixed' in x.values else 0, axis=1)

    cols = list(subset_df.columns)
    cols.remove('target')
    cols.insert(3, 'target')
    subset_df = subset_df[cols]

    subset_df.set_index('timess', inplace=True)
    
    group_dict[group_key] = subset_df

In [17]:
sample = group_dict[('pid100816', 0)].copy()

In [19]:
sample['target'].value_counts()

target
0    270274
1     20110
Name: count, dtype: int64

In [20]:
y = sample['target']

In [21]:
transitions = pd.crosstab(y.shift(1), y, normalize='index')

In [22]:
transitions

target,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.999793,0.000207
1.0,0.002785,0.997215


In [25]:
# Create a lagged target variable
sample['target_lag1'] = sample['target'].shift(1)

# move target_lag1 column to the front
cols = list(sample.columns)
cols.remove('target_lag1')
cols.insert(3, 'target_lag1')
sample = sample[cols]

In [26]:
sample

Unnamed: 0_level_0,patient,newtest,target,target_lag1,event1,event2,event3,event4,chin,chin60sma,...,heartratecu,heartratecu60sma,spo2,spo260sma,positioncu,positioncu60sma,snorecu,snorecu60sma,nasalpressure,nasalpressure60sma
timess,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-06-28 22:15:00.100,pid100816,0.0,0,,,,,,-0.000016,,...,22.39,,98.0,,3.204,,0.001434,,-0.44410,
2023-06-28 22:15:00.200,pid100816,0.0,0,0.0,,,,,-0.000019,,...,22.39,,98.0,,2.838,,-0.006409,,-0.27970,
2023-06-28 22:15:00.300,pid100816,0.0,0,0.0,,,,,-0.000049,,...,22.39,,98.0,,2.853,,0.007279,,-0.05683,
2023-06-28 22:15:00.400,pid100816,0.0,0,0.0,,,,,-0.000001,,...,22.39,,98.0,,2.609,,-0.002930,,0.14230,
2023-06-28 22:15:00.500,pid100816,0.0,0,0.0,,,,,0.000003,,...,22.39,,98.0,,2.533,,-0.001022,,0.21020,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-29 06:19:59.400,pid100816,0.0,0,0.0,Spontaneous Arousal,,,,-0.000047,-0.000009,...,74.07,73.429501,98.0,96.856905,95.170,95.225191,0.000061,-5.615641e-07,0.20000,-0.001838
2023-06-29 06:19:59.500,pid100816,0.0,0,0.0,Spontaneous Arousal,,,,-0.000041,-0.000009,...,74.07,73.435973,98.0,96.858569,95.260,95.225491,0.000702,-2.316140e-07,0.15880,-0.001416
2023-06-29 06:19:59.600,pid100816,0.0,0,0.0,Spontaneous Arousal,,,,-0.000040,-0.000010,...,74.07,73.442446,98.0,96.860233,95.200,95.225774,-0.000244,-3.050416e-06,0.11200,-0.000908
2023-06-29 06:19:59.700,pid100816,0.0,0,0.0,Spontaneous Arousal,,,,-0.000033,-0.000010,...,74.07,73.446123,98.0,96.861897,95.190,95.225774,0.001129,1.229617e-07,0.09409,-0.000283


In [30]:
# Filter the dataset to only observations where previous state of target was 0
transition_sample = sample[sample['target_lag1'] == 0]
transition_sample


Unnamed: 0_level_0,patient,newtest,target,target_lag1,event1,event2,event3,event4,chin,chin60sma,...,heartratecu,heartratecu60sma,spo2,spo260sma,positioncu,positioncu60sma,snorecu,snorecu60sma,nasalpressure,nasalpressure60sma
timess,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-06-28 22:15:00.200,pid100816,0.0,0,0.0,,,,,-0.000019,,...,22.39,,98.0,,2.838,,-0.006409,,-0.27970,
2023-06-28 22:15:00.300,pid100816,0.0,0,0.0,,,,,-0.000049,,...,22.39,,98.0,,2.853,,0.007279,,-0.05683,
2023-06-28 22:15:00.400,pid100816,0.0,0,0.0,,,,,-0.000001,,...,22.39,,98.0,,2.609,,-0.002930,,0.14230,
2023-06-28 22:15:00.500,pid100816,0.0,0,0.0,,,,,0.000003,,...,22.39,,98.0,,2.533,,-0.001022,,0.21020,
2023-06-28 22:15:00.600,pid100816,0.0,0,0.0,,,,,0.000014,,...,22.39,,98.0,,2.106,,0.001328,,0.27380,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-29 06:19:59.400,pid100816,0.0,0,0.0,Spontaneous Arousal,,,,-0.000047,-0.000009,...,74.07,73.429501,98.0,96.856905,95.170,95.225191,0.000061,-5.615641e-07,0.20000,-0.001838
2023-06-29 06:19:59.500,pid100816,0.0,0,0.0,Spontaneous Arousal,,,,-0.000041,-0.000009,...,74.07,73.435973,98.0,96.858569,95.260,95.225491,0.000702,-2.316140e-07,0.15880,-0.001416
2023-06-29 06:19:59.600,pid100816,0.0,0,0.0,Spontaneous Arousal,,,,-0.000040,-0.000010,...,74.07,73.442446,98.0,96.860233,95.200,95.225774,-0.000244,-3.050416e-06,0.11200,-0.000908
2023-06-29 06:19:59.700,pid100816,0.0,0,0.0,Spontaneous Arousal,,,,-0.000033,-0.000010,...,74.07,73.446123,98.0,96.861897,95.190,95.225774,0.001129,1.229617e-07,0.09409,-0.000283


In [31]:
transition_sample['target'].value_counts()

target
0    270217
1        56
Name: count, dtype: int64

In [35]:
sample['target'].value_counts()

target
0    270274
1     20110
Name: count, dtype: int64

In [36]:
# drop column target_lag1 from transition_sample
transition_sample.drop('target_lag1', axis=1, inplace=True)
transition_sample

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transition_sample.drop('target_lag1', axis=1, inplace=True)


Unnamed: 0_level_0,patient,newtest,target,event1,event2,event3,event4,chin,chin60sma,sleepstage,heartratecu,heartratecu60sma,spo2,spo260sma,positioncu,positioncu60sma,snorecu,snorecu60sma,nasalpressure,nasalpressure60sma
timess,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2023-06-28 22:15:00.200,pid100816,0.0,0,,,,,-0.000019,,W,22.39,,98.0,,2.838,,-0.006409,,-0.27970,
2023-06-28 22:15:00.300,pid100816,0.0,0,,,,,-0.000049,,W,22.39,,98.0,,2.853,,0.007279,,-0.05683,
2023-06-28 22:15:00.400,pid100816,0.0,0,,,,,-0.000001,,W,22.39,,98.0,,2.609,,-0.002930,,0.14230,
2023-06-28 22:15:00.500,pid100816,0.0,0,,,,,0.000003,,W,22.39,,98.0,,2.533,,-0.001022,,0.21020,
2023-06-28 22:15:00.600,pid100816,0.0,0,,,,,0.000014,,W,22.39,,98.0,,2.106,,0.001328,,0.27380,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-29 06:19:59.400,pid100816,0.0,0,Spontaneous Arousal,,,,-0.000047,-0.000009,N2,74.07,73.429501,98.0,96.856905,95.170,95.225191,0.000061,-5.615641e-07,0.20000,-0.001838
2023-06-29 06:19:59.500,pid100816,0.0,0,Spontaneous Arousal,,,,-0.000041,-0.000009,N2,74.07,73.435973,98.0,96.858569,95.260,95.225491,0.000702,-2.316140e-07,0.15880,-0.001416
2023-06-29 06:19:59.600,pid100816,0.0,0,Spontaneous Arousal,,,,-0.000040,-0.000010,N2,74.07,73.442446,98.0,96.860233,95.200,95.225774,-0.000244,-3.050416e-06,0.11200,-0.000908
2023-06-29 06:19:59.700,pid100816,0.0,0,Spontaneous Arousal,,,,-0.000033,-0.000010,N2,74.07,73.446123,98.0,96.861897,95.190,95.225774,0.001129,1.229617e-07,0.09409,-0.000283


In [37]:
Xy_before_dropnan = load_and_preprocess_data(transition_sample)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[target_column] = data[target_column].interpolate(method='linear').bfill()


In [38]:
# Dropping NaN from Xs and corresponding rows from ys
def drop_nan_from_Xy(X_train, X_val1, X_val2, X_kalman, X_test, y_train, y_val1, y_val2, y_kalman, y_test):
    # Drop rows with NaN from X
    X_train_cleaned = X_train.dropna()
    X_val1_cleaned = X_val1.dropna()
    X_val2_cleaned = X_val2.dropna()
    X_kalman_cleaned = X_kalman.dropna()
    X_test_cleaned = X_test.dropna()
    
    # Drop corresponding rows from y
    y_train_cleaned = y_train[X_train_cleaned.index]
    y_val1_cleaned = y_val1[X_val1_cleaned.index]
    y_val2_cleaned = y_val2[X_val2_cleaned.index]
    y_kalman_cleaned = y_kalman[X_kalman_cleaned.index]
    y_test_cleaned = y_test[X_test_cleaned.index]
    
    return (
        X_train_cleaned, X_val1_cleaned, X_val2_cleaned, X_kalman_cleaned, X_test_cleaned,
        y_train_cleaned, y_val1_cleaned, y_val2_cleaned, y_kalman_cleaned, y_test_cleaned
    )

In [39]:
X_train, X_val1, X_val2, X_kalman, X_test, y_train, y_val1, y_val2, y_kalman, y_test = drop_nan_from_Xy(*Xy_before_dropnan)

In [40]:
# -----------------
# Logistic Regression
# -----------------
log_reg_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logisticregression', LogisticRegression(class_weight='balanced', max_iter=2000))  # Handle class imbalance
])
log_reg_grid = GridSearchCV(log_reg_pipeline, LASSO_PARAM_GRID, cv=5, scoring='roc_auc')
log_reg_grid.fit(X_train, y_train)
log_reg_model = log_reg_grid.best_estimator_

log_reg_preds = log_reg_model.predict(X_test)
log_reg_metrics = {
    "Accuracy": accuracy_score(y_test, log_reg_preds),
    "Precision": precision_score(y_test, log_reg_preds, zero_division=0),  # Avoid warning
    "Recall": recall_score(y_test, log_reg_preds, zero_division=0),
    "F1": f1_score(y_test, log_reg_preds, zero_division=0),
    "AUC": roc_auc_score(y_test, log_reg_model.predict_proba(X_test)[:, 1])
}
print("Logistic Regression Metrics:", log_reg_metrics)

Logistic Regression Metrics: {'Accuracy': 0.7588847982535659, 'Precision': 0.0, 'Recall': 0.0, 'F1': 0.0, 'AUC': np.float64(0.3924733570159858)}


In [44]:
np.unique(log_reg_preds, return_counts=True)

(array([0, 1]), array([41025, 13028]))

In [43]:
# -----------------
# Random Forest
# -----------------
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('randomforest', RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced'))  # Handle class imbalance
])
rf_param_grid = {
    "randomforest__n_estimators": [50, 100, 200],  # Prefixed by 'randomforest__'
    "randomforest__max_depth": [None, 10, 20]
}
rf_grid = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, scoring='roc_auc')
rf_grid.fit(X_train, y_train)
rf_model = rf_grid.best_estimator_

rf_preds = rf_model.predict(X_test)
rf_metrics = {
    "Accuracy": accuracy_score(y_test, rf_preds),
    "Precision": precision_score(y_test, rf_preds),
    "Recall": recall_score(y_test, rf_preds),
    "F1": f1_score(y_test, rf_preds),
    "AUC": roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
}
print("Random Forest Metrics:", rf_metrics)

Random Forest Metrics: {'Accuracy': 0.9999074981962148, 'Precision': 0.0, 'Recall': 0.0, 'F1': 0.0, 'AUC': np.float64(0.44565756364712844)}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
np.unique(rf_preds, return_counts=True)

(array([0]), array([54053]))

In [42]:
# -----------------
# XGBoost
# -----------------
# Define the pipeline
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(random_state=RANDOM_STATE))
])

# Prefix all hyperparameters for 'xgb' step with 'xgb__'
xgb_param_grid = {
    "xgb__n_estimators": [50, 100, 200],
    "xgb__max_depth": [3, 5, 7],
    "xgb__learning_rate": [0.01, 0.1, 0.2],
    "xgb__subsample": [0.6, 0.8, 1.0]
}

# Perform GridSearchCV
xgb_grid = GridSearchCV(xgb_pipeline, xgb_param_grid, cv=5, scoring='roc_auc')
xgb_grid.fit(X_train, y_train)
xgb_model = xgb_grid.best_estimator_

# Predictions and metrics
xgb_preds = xgb_model.predict(X_test)
xgb_metrics = {
    "Accuracy": accuracy_score(y_test, xgb_preds),
    "Precision": precision_score(y_test, xgb_preds, zero_division=0),
    "Recall": recall_score(y_test, xgb_preds, zero_division=0),
    "F1": f1_score(y_test, xgb_preds, zero_division=0),
    "AUC": roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])
}
print("XGBoost Metrics:", xgb_metrics)

XGBoost Metrics: {'Accuracy': 0.9999074981962148, 'Precision': 0.0, 'Recall': 0.0, 'F1': 0.0, 'AUC': np.float64(0.4754662522202487)}


In [46]:
np.unique(xgb_preds, return_counts=True)

(array([0]), array([54053]))

# Processing batch2 (without 18 new format data)

In [7]:
# Load data
master_df = pd.read_stata('./data/processed-data/combined-patient-data-2_00.dta')

In [9]:
# Group master dataframe by 'patient' and 'newtest' pairs (i.e. by each unique patient data)
# Access or initialize each dataframe like: group_dict[('pid100100', 0)]
group_dict = {
    (val1, val2): data
    for (val1, val2), data in master_df.groupby(['patient', 'newtest'])
}

In [10]:
# Preprocess dataframes
for group_key, subset_df in group_dict.items():
    subset_df['target'] = subset_df[['event1', 'event2', 'event3', 'event4']].apply(lambda x: 1 if 'Hypopnea' in x.values or 'Apnea Obstructive' in x.values or 'Apnea Central' in x.values or 'Apnea Mixed' in x.values else 0, axis=1)

    cols = list(subset_df.columns)
    cols.remove('target')
    cols.insert(3, 'target')
    subset_df = subset_df[cols]

    subset_df.set_index('timess', inplace=True)
    
    group_dict[group_key] = subset_df

In [11]:
# Function to check for ideal patient data
def is_ideal_patient(patient_df):
    X, y = preprocess_data_with_advanced_features(
        data_frame=patient_df,
        target_column='target',
        lag_steps=[1, 2, 3],
        rolling_window=10
    )

    # Perform five-way split
    X_train, X_val1, X_val2, X_kalman, X_test, y_train, y_val1, y_val2, y_kalman, y_test = five_way_split(
        X, y, train_size=0.5, val1_size=0.15, val2_size=0.05, kalman_size=0.1, test_size=0.2
    )
    
    # Check if all y split components have OSA event occurance (i.e. target = 1)
    if y_train.sum() > 0 and y_val1.sum() > 0 and y_val2.sum() > 0 and y_kalman.sum() > 0 and y_test.sum() > 0:
        return True
    else:
        return False

In [13]:
len(group_dict)

155

In [12]:
# Initialize list of ideal patients
ideal_patients = []

In [14]:
# Check for ideal patients in batches (start, end inclusive range)
start = 0
end = 1

for i, (group_key, subset_df) in enumerate(group_dict.items()):
    if start <= i <= end:
        if is_ideal_patient(subset_df):
            print(f"#{i} Ideal patient data: {group_key}")
            ideal_patients.append(group_key)
        else:
            print(f"#{i} Not ideal: {group_key}")

#0 Ideal patient data: ('pid101262', np.int32(0))
#1 Ideal patient data: ('pid103779', np.int32(0))


In [15]:
# Check for ideal patients in batches (start, end inclusive range)
start = 2
end = 10

for i, (group_key, subset_df) in enumerate(group_dict.items()):
    if start <= i <= end:
        if is_ideal_patient(subset_df):
            print(f"#{i} Ideal patient data: {group_key}")
            ideal_patients.append(group_key)
        else:
            print(f"#{i} Not ideal: {group_key}")

#2 Ideal patient data: ('pid103834', np.int32(0))
#3 Not ideal: ('pid104223', np.int32(0))
#4 Ideal patient data: ('pid105066', np.int32(0))
#5 Not ideal: ('pid106588', np.int32(0))
#6 Ideal patient data: ('pid109223', np.int32(0))
#7 Not ideal: ('pid109592', np.int32(0))
#8 Ideal patient data: ('pid109678', np.int32(0))
#9 Not ideal: ('pid110546', np.int32(0))
#10 Not ideal: ('pid112297', np.int32(0))


In [16]:
# Check for ideal patients in batches (start, end inclusive range)
start = 11
end = 30

for i, (group_key, subset_df) in enumerate(group_dict.items()):
    if start <= i <= end:
        if is_ideal_patient(subset_df):
            print(f"#{i} Ideal patient data: {group_key}")
            ideal_patients.append(group_key)
        else:
            print(f"#{i} Not ideal: {group_key}")

#11 Ideal patient data: ('pid114718', np.int32(0))
#12 Ideal patient data: ('pid120645', np.int32(0))
#13 Not ideal: ('pid125847', np.int32(0))
#14 Ideal patient data: ('pid126099', np.int32(0))
#15 Ideal patient data: ('pid134464', np.int32(0))
#16 Not ideal: ('pid137169', np.int32(0))
#17 Ideal patient data: ('pid137677', np.int32(0))
#18 Ideal patient data: ('pid137677', np.int32(1))
#19 Ideal patient data: ('pid150632', np.int32(0))
#20 Ideal patient data: ('pid158450', np.int32(0))
#21 Ideal patient data: ('pid160007', np.int32(0))
#22 Ideal patient data: ('pid164094', np.int32(0))
#23 Ideal patient data: ('pid175945', np.int32(0))
#24 Ideal patient data: ('pid176527', np.int32(0))
#25 Ideal patient data: ('pid191990', np.int32(0))
#26 Ideal patient data: ('pid195939', np.int32(0))
#27 Ideal patient data: ('pid206054', np.int32(0))
#28 Ideal patient data: ('pid206255', np.int32(0))
#29 Ideal patient data: ('pid217117', np.int32(0))
#30 Ideal patient data: ('pid219574', np.int32(0)

In [17]:
# Check for ideal patients in batches (start, end inclusive range)
start = 31
end = 40

for i, (group_key, subset_df) in enumerate(group_dict.items()):
    if start <= i <= end:
        if is_ideal_patient(subset_df):
            print(f"#{i} Ideal patient data: {group_key}")
            ideal_patients.append(group_key)
        else:
            print(f"#{i} Not ideal: {group_key}")

#31 Ideal patient data: ('pid219977', np.int32(0))
#32 Ideal patient data: ('pid222375', np.int32(0))
#33 Not ideal: ('pid227439', np.int32(0))
#34 Ideal patient data: ('pid236528', np.int32(0))
#35 Ideal patient data: ('pid237530', np.int32(0))
#36 Ideal patient data: ('pid249760', np.int32(0))
#37 Not ideal: ('pid253204', np.int32(1))
#38 Not ideal: ('pid253204', np.int32(2))
#39 Ideal patient data: ('pid258053', np.int32(0))
#40 Ideal patient data: ('pid261581', np.int32(0))


In [18]:
# Check for ideal patients in batches (start, end inclusive range)
start = 41
end = 60

for i, (group_key, subset_df) in enumerate(group_dict.items()):
    if start <= i <= end:
        if is_ideal_patient(subset_df):
            print(f"#{i} Ideal patient data: {group_key}")
            ideal_patients.append(group_key)
        else:
            print(f"#{i} Not ideal: {group_key}")

#41 Ideal patient data: ('pid262080', np.int32(0))
#42 Ideal patient data: ('pid262752', np.int32(0))
#43 Not ideal: ('pid272598', np.int32(1))
#44 Ideal patient data: ('pid274397', np.int32(0))
#45 Not ideal: ('pid277709', np.int32(0))
#46 Not ideal: ('pid277709', np.int32(1))
#47 Ideal patient data: ('pid284565', np.int32(0))
#48 Not ideal: ('pid300325', np.int32(0))
#49 Ideal patient data: ('pid301854', np.int32(0))
#50 Not ideal: ('pid309011', np.int32(0))
#51 Ideal patient data: ('pid309416', np.int32(0))
#52 Not ideal: ('pid311293', np.int32(0))
#53 Ideal patient data: ('pid318031', np.int32(0))
#54 Ideal patient data: ('pid318257', np.int32(0))
#55 Ideal patient data: ('pid322199', np.int32(0))
#56 Not ideal: ('pid322272', np.int32(0))
#57 Ideal patient data: ('pid322712', np.int32(0))
#58 Ideal patient data: ('pid343238', np.int32(0))
#59 Ideal patient data: ('pid347862', np.int32(0))
#60 Not ideal: ('pid364065', np.int32(0))


In [19]:
# Check for ideal patients in batches (start, end inclusive range)
start = 61
end = 100

for i, (group_key, subset_df) in enumerate(group_dict.items()):
    if start <= i <= end:
        if is_ideal_patient(subset_df):
            print(f"#{i} Ideal patient data: {group_key}")
            ideal_patients.append(group_key)
        else:
            print(f"#{i} Not ideal: {group_key}")

#61 Not ideal: ('pid364822', np.int32(0))
#62 Ideal patient data: ('pid376450', np.int32(0))
#63 Ideal patient data: ('pid380683', np.int32(0))
#64 Ideal patient data: ('pid385394', np.int32(0))
#65 Not ideal: ('pid390398', np.int32(0))
#66 Not ideal: ('pid390398', np.int32(1))
#67 Ideal patient data: ('pid397357', np.int32(0))
#68 Ideal patient data: ('pid399459', np.int32(0))
#69 Ideal patient data: ('pid399599', np.int32(0))
#70 Ideal patient data: ('pid402744', np.int32(0))
#71 Ideal patient data: ('pid405345', np.int32(0))
#72 Ideal patient data: ('pid411960', np.int32(0))
#73 Not ideal: ('pid412118', np.int32(0))
#74 Ideal patient data: ('pid413509', np.int32(0))
#75 Not ideal: ('pid414678', np.int32(0))
#76 Ideal patient data: ('pid423368', np.int32(0))
#77 Ideal patient data: ('pid437817', np.int32(0))
#78 Ideal patient data: ('pid438399', np.int32(0))
#79 Ideal patient data: ('pid438399', np.int32(1))
#80 Ideal patient data: ('pid446552', np.int32(0))
#81 Not ideal: ('pid45305

In [20]:
# Check for ideal patients in batches (start, end inclusive range)
start = 101
end = 155

for i, (group_key, subset_df) in enumerate(group_dict.items()):
    if start <= i <= end:
        if is_ideal_patient(subset_df):
            print(f"#{i} Ideal patient data: {group_key}")
            ideal_patients.append(group_key)
        else:
            print(f"#{i} Not ideal: {group_key}")

#101 Ideal patient data: ('pid533327', np.int32(0))
#102 Ideal patient data: ('pid534289', np.int32(0))
#103 Ideal patient data: ('pid535238', np.int32(0))
#104 Not ideal: ('pid538608', np.int32(0))
#105 Not ideal: ('pid544735', np.int32(0))
#106 Ideal patient data: ('pid545397', np.int32(0))
#107 Ideal patient data: ('pid549202', np.int32(0))
#108 Ideal patient data: ('pid552164', np.int32(0))
#109 Not ideal: ('pid553262', np.int32(0))
#110 Ideal patient data: ('pid553961', np.int32(0))
#111 Ideal patient data: ('pid557844', np.int32(0))
#112 Ideal patient data: ('pid557944', np.int32(0))
#113 Ideal patient data: ('pid563193', np.int32(0))
#114 Ideal patient data: ('pid576344', np.int32(0))
#115 Ideal patient data: ('pid577183', np.int32(0))
#116 Not ideal: ('pid578645', np.int32(0))
#117 Ideal patient data: ('pid584940', np.int32(0))
#118 Not ideal: ('pid585119', np.int32(0))
#119 Ideal patient data: ('pid585927', np.int32(0))
#120 Ideal patient data: ('pid587374', np.int32(0))
#121 

In [25]:
ideal_patients

[('pid101262', 0),
 ('pid103779', 0),
 ('pid103834', 0),
 ('pid105066', 0),
 ('pid109223', 0),
 ('pid109678', 0),
 ('pid114718', 0),
 ('pid120645', 0),
 ('pid126099', 0),
 ('pid134464', 0),
 ('pid137677', 0),
 ('pid137677', 1),
 ('pid150632', 0),
 ('pid158450', 0),
 ('pid160007', 0),
 ('pid164094', 0),
 ('pid175945', 0),
 ('pid176527', 0),
 ('pid191990', 0),
 ('pid195939', 0),
 ('pid206054', 0),
 ('pid206255', 0),
 ('pid217117', 0),
 ('pid219574', 0),
 ('pid219977', 0),
 ('pid222375', 0),
 ('pid236528', 0),
 ('pid237530', 0),
 ('pid249760', 0),
 ('pid258053', 0),
 ('pid261581', 0),
 ('pid262080', 0),
 ('pid262752', 0),
 ('pid274397', 0),
 ('pid284565', 0),
 ('pid301854', 0),
 ('pid309416', 0),
 ('pid318031', 0),
 ('pid318257', 0),
 ('pid322199', 0),
 ('pid322712', 0),
 ('pid343238', 0),
 ('pid347862', 0),
 ('pid376450', 0),
 ('pid380683', 0),
 ('pid385394', 0),
 ('pid397357', 0),
 ('pid399459', 0),
 ('pid399599', 0),
 ('pid402744', 0),
 ('pid405345', 0),
 ('pid411960', 0),
 ('pid413509

In [21]:
print(len(ideal_patients))
print(len(ideal_patients) / 155)

109
0.7032258064516129


In [22]:
df_ideal = pd.DataFrame(ideal_patients, columns=['patient', 'newtest'])
df_ideal.to_csv('./data/ideal_patients_2.csv', index=False)

In [24]:
# Create ideal_group_dict from group_dict using ideal_patients
ideal_patients = pd.read_csv('./data/ideal_patients_2.csv')
ideal_patients = list(ideal_patients.itertuples(index=False, name=None))

ideal_group_dict = {
    (val1, val2): group_dict[(val1, val2)]
    for (val1, val2) in ideal_patients
}

### Test with ideal pid101262

In [27]:
Xy_before_dropnan = load_and_preprocess_data(ideal_group_dict[('pid101262', 0)])

In [28]:
# Dropping NaN from Xs and corresponding rows from ys
def drop_nan_from_Xy(X_train, X_val1, X_val2, X_kalman, X_test, y_train, y_val1, y_val2, y_kalman, y_test):
    # Drop rows with NaN from X
    X_train_cleaned = X_train.dropna()
    X_val1_cleaned = X_val1.dropna()
    X_val2_cleaned = X_val2.dropna()
    X_kalman_cleaned = X_kalman.dropna()
    X_test_cleaned = X_test.dropna()
    
    # Drop corresponding rows from y
    y_train_cleaned = y_train[X_train_cleaned.index]
    y_val1_cleaned = y_val1[X_val1_cleaned.index]
    y_val2_cleaned = y_val2[X_val2_cleaned.index]
    y_kalman_cleaned = y_kalman[X_kalman_cleaned.index]
    y_test_cleaned = y_test[X_test_cleaned.index]
    
    return (
        X_train_cleaned, X_val1_cleaned, X_val2_cleaned, X_kalman_cleaned, X_test_cleaned,
        y_train_cleaned, y_val1_cleaned, y_val2_cleaned, y_kalman_cleaned, y_test_cleaned
    )

In [29]:
X_train, X_val1, X_val2, X_kalman, X_test, y_train, y_val1, y_val2, y_kalman, y_test = drop_nan_from_Xy(*Xy_before_dropnan)

In [30]:
# -----------------
# Baselines
# -----------------

# T-1 Baseline
y_t1_baseline = X_test["target_lag1"].astype(int)
t1_metrics = {
    "Accuracy": accuracy_score(y_test, y_t1_baseline),
    "Precision": precision_score(y_test, y_t1_baseline, zero_division=0),
    "Recall": recall_score(y_test, y_t1_baseline, zero_division=0),
    "F1": f1_score(y_test, y_t1_baseline, zero_division=0),
    "AUC": roc_auc_score(y_test, y_t1_baseline)
}
print("T-1 Baseline Metrics:", t1_metrics)

# Random Classifier Baseline
def random_classifier(y_true, seed=42):
    random.seed(seed)
    return pd.Series([random.choice([0, 1]) for _ in range(len(y_true))], index=y_true.index)

y_random = random_classifier(y_test)
random_metrics = {
    "Accuracy": accuracy_score(y_test, y_random),
    "Precision": precision_score(y_test, y_random, zero_division=0),
    "Recall": recall_score(y_test, y_random, zero_division=0),
    "F1": f1_score(y_test, y_random, zero_division=0),
    "AUC": roc_auc_score(y_test, y_random)
}
print("Random Classifier Metrics:", random_metrics)

# Rolling Naive Bayes Baseline
def rolling_naive_bayes(train_series, test_series, window_size):
    predictions = []
    rolling_buffer = train_series.tail(window_size)

    for test_point in test_series:
        # Fit Naive Bayes on the rolling buffer
        X_train = np.arange(len(rolling_buffer)).reshape(-1, 1)  # Sequential indices as features
        y_train = rolling_buffer.values  # Targets

        model = GaussianNB()
        model.fit(X_train, y_train)

        # Predict the test point
        X_test = np.array([[len(rolling_buffer)]]).reshape(-1, 1)
        prediction = model.predict(X_test)
        predictions.append(prediction[0])

        # Update rolling buffer
        rolling_buffer = pd.concat([rolling_buffer, pd.Series([test_point])], ignore_index=True)
        if len(rolling_buffer) > window_size:
            rolling_buffer = rolling_buffer.iloc[1:]

    return pd.Series(predictions, index=test_series.index)


y_rolling_nb = rolling_naive_bayes(pd.concat([y_train, y_val1, y_val2]), y_test, WINDOW_SIZE)
rolling_nb_metrics = {
    "Accuracy": accuracy_score(y_test, y_rolling_nb),
    "Precision": precision_score(y_test, y_rolling_nb, zero_division=0),
    "Recall": recall_score(y_test, y_rolling_nb, zero_division=0),
    "F1": f1_score(y_test, y_rolling_nb, zero_division=0),
    "AUC": roc_auc_score(y_test, y_rolling_nb)
}
print("Rolling Naive Bayes Metrics:", rolling_nb_metrics)

T-1 Baseline Metrics: {'Accuracy': 0.9995236548662986, 'Precision': 0.996769456681351, 'Recall': 0.996769456681351, 'F1': 0.996769456681351, 'AUC': np.float64(0.9982561635907924)}
Random Classifier Metrics: {'Accuracy': 0.49749918804806753, 'Precision': 0.07348696963170363, 'Recall': 0.5010279001468428, 'F1': 0.12817430503380917, 'AUC': np.float64(0.4991231132337768)}
Rolling Naive Bayes Metrics: {'Accuracy': 0.9985709645988957, 'Precision': 0.9903083700440528, 'Recall': 0.9903083700440528, 'F1': 0.9903083700440528, 'AUC': np.float64(0.9947684907723772)}


In [None]:
# -----------------
# Base Models
# -----------------
# -----------------
# Logistic Regression
# -----------------
log_reg_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logisticregression', LogisticRegression(class_weight='balanced'))  # Handle class imbalance
])
log_reg_grid = GridSearchCV(log_reg_pipeline, LASSO_PARAM_GRID, cv=5, scoring='roc_auc')
log_reg_grid.fit(X_train, y_train)
log_reg_model = log_reg_grid.best_estimator_

log_reg_preds = log_reg_model.predict(X_test)
log_reg_metrics = {
    "Accuracy": accuracy_score(y_test, log_reg_preds),
    "Precision": precision_score(y_test, log_reg_preds, zero_division=0),  # Avoid warning
    "Recall": recall_score(y_test, log_reg_preds, zero_division=0),
    "F1": f1_score(y_test, log_reg_preds, zero_division=0),
    "AUC": roc_auc_score(y_test, log_reg_model.predict_proba(X_test)[:, 1])
}
print("Logistic Regression Metrics:", log_reg_metrics)

# -----------------
# Random Forest
# -----------------
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('randomforest', RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced'))  # Handle class imbalance
])
rf_param_grid = {
    "randomforest__n_estimators": [50, 100, 200],  # Prefixed by 'randomforest__'
    "randomforest__max_depth": [None, 10, 20]
}
rf_grid = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, scoring='roc_auc')
rf_grid.fit(X_train, y_train)
rf_model = rf_grid.best_estimator_

rf_preds = rf_model.predict(X_test)
rf_metrics = {
    "Accuracy": accuracy_score(y_test, rf_preds),
    "Precision": precision_score(y_test, rf_preds),
    "Recall": recall_score(y_test, rf_preds),
    "F1": f1_score(y_test, rf_preds),
    "AUC": roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
}
print("Random Forest Metrics:", rf_metrics)

# -----------------
# XGBoost
# -----------------
# Define the pipeline
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(random_state=RANDOM_STATE))
])

# Prefix all hyperparameters for 'xgb' step with 'xgb__'
xgb_param_grid = {
    "xgb__n_estimators": [50, 100, 200],
    "xgb__max_depth": [3, 5, 7],
    "xgb__learning_rate": [0.01, 0.1, 0.2],
    "xgb__subsample": [0.6, 0.8, 1.0]
}

# Perform GridSearchCV
xgb_grid = GridSearchCV(xgb_pipeline, xgb_param_grid, cv=5, scoring='roc_auc')
xgb_grid.fit(X_train, y_train)
xgb_model = xgb_grid.best_estimator_

# Predictions and metrics
xgb_preds = xgb_model.predict(X_test)
xgb_metrics = {
    "Accuracy": accuracy_score(y_test, xgb_preds),
    "Precision": precision_score(y_test, xgb_preds, zero_division=0),
    "Recall": recall_score(y_test, xgb_preds, zero_division=0),
    "F1": f1_score(y_test, xgb_preds, zero_division=0),
    "AUC": roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])
}
print("XGBoost Metrics:", xgb_metrics)


# -----------------
# Lasso (Base)
# -----------------
lasso_base_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scaling for consistent input
    ('lasso', LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear'))  # L1 Regularization
])
lasso_base_param_grid = {"lasso__C": np.logspace(-3, 2, 10)}  # Regularization strength

lasso_base_grid = GridSearchCV(lasso_base_pipeline, lasso_base_param_grid, cv=5, scoring='roc_auc')
lasso_base_grid.fit(X_train, y_train)
lasso_base_model = lasso_base_grid.best_estimator_  # Capture the best Lasso model

# Predictions and Metrics
lasso_base_preds_proba = lasso_base_model.predict_proba(X_test)[:, 1]
lasso_base_preds = (lasso_base_preds_proba > 0.5).astype(int)

lasso_base_metrics = {
    "Accuracy": accuracy_score(y_test, lasso_base_preds),
    "Precision": precision_score(y_test, lasso_base_preds, zero_division=0),
    "Recall": recall_score(y_test, lasso_base_preds, zero_division=0),
    "F1": f1_score(y_test, lasso_base_preds, zero_division=0),
    "AUC": roc_auc_score(y_test, lasso_base_preds_proba)
}
print("Lasso (Base) Metrics:", lasso_base_metrics)


# -----------------
# Neural Network Models
# -----------------
class BinaryNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(BinaryNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

def train_nn_with_hyperparams(X_train, y_train, X_val1, y_val1, param_grid):
    """Grid search for PyTorch NN."""
    best_params = None
    best_auc = 0
    best_model = None

    for params in product(*param_grid.values()):
        hidden_size, lr, num_epochs = params
        model = BinaryNN(input_size=X_train.shape[1], hidden_size=hidden_size)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Convert data to PyTorch tensors
        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
        X_val1_tensor = torch.tensor(X_val1.values, dtype=torch.float32)
        y_val1_tensor = torch.tensor(y_val1.values, dtype=torch.float32).view(-1, 1)

        # Train
        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()

        # Validate
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val1_tensor).flatten().numpy()
        auc = roc_auc_score(y_val1, val_outputs)

        if auc > best_auc:
            best_auc = auc
            best_params = params
            best_model = model

    return best_model, {"AUC": best_auc, "Best Params": best_params}

# Train and evaluate NN
nn_model, nn_metrics = train_nn_with_hyperparams(X_train, y_train, X_val1, y_val1, NN_PARAM_GRID)

# Predictions and Metrics for Test Set
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
nn_model.eval()
with torch.no_grad():
    nn_outputs = nn_model(X_test_tensor).flatten().numpy()
nn_preds = (nn_outputs > 0.5).astype(int)

nn_test_metrics = {
    "Accuracy": accuracy_score(y_test, nn_preds),
    "Precision": precision_score(y_test, nn_preds, zero_division=0),
    "Recall": recall_score(y_test, nn_preds, zero_division=0),
    "F1": f1_score(y_test, nn_preds, zero_division=0),
    "AUC": roc_auc_score(y_test, nn_outputs)
}
print("Neural Network Test Metrics:", nn_test_metrics)

# -----------------
# LSTM Model for Classification
# -----------------
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        x = self.sigmoid(self.fc(hidden[-1]))
        return x

def train_lstm_with_hyperparams(X_train, y_train, X_val1, y_val1, param_grid):
    """Grid search for LSTM."""
    best_params = None
    best_auc = 0
    best_model = None

    for params in product(*param_grid.values()):
        hidden_size, num_layers, lr, num_epochs = params
        model = LSTMClassifier(input_size=X_train.shape[1], hidden_size=hidden_size, num_layers=num_layers)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Convert data to PyTorch tensors
        X_train_seq = torch.tensor(X_train.values.reshape(-1, 1, X_train.shape[1]), dtype=torch.float32)
        y_train_seq = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
        X_val1_seq = torch.tensor(X_val1.values.reshape(-1, 1, X_val1.shape[1]), dtype=torch.float32)
        y_val1_seq = torch.tensor(y_val1.values, dtype=torch.float32).view(-1, 1)

        # Train
        for epoch in range(num_epochs):
            model.train()
            optimizer.zero_grad()
            outputs = model(X_train_seq)
            loss = criterion(outputs, y_train_seq)
            loss.backward()
            optimizer.step()

        # Validate
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val1_seq).flatten().numpy()
        auc = roc_auc_score(y_val1, val_outputs)

        if auc > best_auc:
            best_auc = auc
            best_params = params
            best_model = model

    return best_model, {"AUC": best_auc, "Best Params": best_params}

# Train and evaluate LSTM
lstm_model, lstm_metrics = train_lstm_with_hyperparams(X_train, y_train, X_val1, y_val1, LSTM_PARAM_GRID)

# Predictions and Metrics for Test Set
X_test_seq = torch.tensor(X_test.values.reshape(-1, 1, X_test.shape[1]), dtype=torch.float32)
lstm_model.eval()
with torch.no_grad():
    lstm_outputs = lstm_model(X_test_seq).flatten().numpy()
lstm_preds = (lstm_outputs > 0.5).astype(int)

lstm_test_metrics = {
    "Accuracy": accuracy_score(y_test, lstm_preds),
    "Precision": precision_score(y_test, lstm_preds, zero_division=0),
    "Recall": recall_score(y_test, lstm_preds, zero_division=0),
    "F1": f1_score(y_test, lstm_preds, zero_division=0),
    "AUC": roc_auc_score(y_test, lstm_outputs)
}
print("LSTM Test Metrics:", lstm_test_metrics)

Logistic Regression Metrics: {'Accuracy': 0.999263830247916, 'Precision': 0.9967580312407899, 'Recall': 0.9932452276064611, 'F1': 0.9949985289791115, 'AUC': np.float64(0.9987709614947629)}


KeyboardInterrupt: 