# Programming environment

In [None]:
# Seed handed to the training helpers in this notebook for reproducibility.
# NOTE(review): the right-hand side is plain integer subtraction
# (2026 minus 2 minus 23), so the seed is 2001 -- it is not a date value.
seed = 2026 - 2 - 23

In [None]:
# Load existing functions from libraries
from sklearn.metrics import mean_squared_error, make_scorer, precision_score, r2_score
import torch

In [None]:
# Load custom functions from utils.py
from utils import train_and_estimate_sample_size, train_and_save_model, load_and_predict, compute_shap_values, MultiOutputMLP, train_and_estimate_sample_size_torch, train_and_save_model_torch, load_and_predict_torch, compute_shap_values_torch

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression, BayesianRidge
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
import numpy as np

def _spec(estimator, **grid):
    """Pair an estimator with the hyperparameter grid to search for it.

    Keyword arguments become the entries of ``param_grid`` (insertion order
    is kept); calling with no keywords yields an empty grid.
    """
    return {'model': estimator, 'param_grid': grid}


# Registry of candidate models, keyed by the name the notebook cells use to
# select one (``models_and_params[model_name]``). Every entry carries a
# 'model' and a 'param_grid'; the PyTorch entry additionally carries the
# network class under 'model_class'.
models_and_params = {
    # ----- Classification models -----
    'LogisticRegression': _spec(
        LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000),
        C=np.logspace(-4, 4, 10),  # regularization strength candidates
    ),
    # Gaussian Naive Bayes: nothing to tune, hence the empty grid.
    'NaiveBayes': _spec(GaussianNB()),
    'KNN': _spec(
        KNeighborsClassifier(),
        n_neighbors=[3, 5, 7, 9],  # neighbourhood sizes
        weights=['uniform', 'distance'],  # how neighbours are weighted
        metric=['euclidean', 'manhattan'],  # distance metric
    ),
    'DecisionTree': _spec(
        DecisionTreeClassifier(min_samples_split=20),
        max_depth=[3, 5, 10],  # maximum tree depth
    ),
    'RandomForest': _spec(
        RandomForestClassifier(min_samples_split=20, max_features=None),
        n_estimators=[100, 200, 300],  # number of trees
        max_depth=[3, 5, 10],  # maximum tree depth
    ),
    'GradientBoosting': _spec(
        GradientBoostingClassifier(learning_rate=0.01),
        n_estimators=[100, 200, 300],  # number of boosting stages
        max_depth=[3, 5, 10],  # depth of the individual estimators
    ),
    'DeepNeuralNetwork': _spec(
        MLPClassifier(
            max_iter=10000,
            hidden_layer_sizes=(10, 10),
            activation='relu',
            solver='sgd',
            learning_rate_init=0.0001,
        ),
        alpha=np.logspace(-4, 4, 10),  # L2 penalty term
    ),

    # ----- Regression models -----
    'LinearRegression': _spec(
        LinearRegression(),
        fit_intercept=[True, False],
        positive=[True, False],
    ),
    'BayesianRegression': _spec(
        BayesianRidge(),
        alpha_1=np.logspace(-6, -1, 5),
        lambda_1=np.logspace(-6, -1, 5),
    ),
    'KNNRegression': _spec(
        KNeighborsRegressor(),
        n_neighbors=[3, 5, 7, 9],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan'],
    ),
    'DecisionTreeRegression': _spec(
        DecisionTreeRegressor(min_samples_split=20),
        max_depth=[3, 5, 10],
    ),
    'RandomForestRegression': _spec(
        RandomForestRegressor(min_samples_split=20, max_features=None),
        n_estimators=[100, 200, 300],
        max_depth=[3, 5, 10],
    ),
    'GradientBoostingRegression': _spec(
        GradientBoostingRegressor(learning_rate=0.01),
        n_estimators=[100, 200, 300],
        max_depth=[3, 5, 10],
    ),
    'DeepNeuralNetworkRegression': _spec(
        MLPRegressor(
            max_iter=10000,
            hidden_layer_sizes=(10, 10),
            activation='relu',
            solver='sgd',
            learning_rate_init=0.0001,
        ),
        alpha=np.logspace(-4, 4, 10),
    ),

    # ----- PyTorch Deep Learning (supports 1+ outputs) -----
    # Written out literally because it has an extra 'model_class' key and
    # its 'model' is a name string rather than an estimator instance.
    'PyTorchDNN': {
        'model': 'PyTorchDNN',
        'model_class': MultiOutputMLP,
        'param_grid': {
            'hidden_sizes': [(128, 64), (64, 32)],
            'dropout': [0.0, 0.3],
            'lr': [1e-3],
            'batch_size': [32],
            'epochs': [50],
            'weight_decay': [0.0, 1e-4],
        },
    },
}

In [None]:
import shap

# SHAP explainer class to use for each model, keyed by model name.
# Keys are meant to line up with the names in `models_and_params` so an
# explainer can be looked up with the same `model_name` used for training.
# NOTE(review): how this mapping is consumed is not visible in this notebook;
# confirm against the SHAP helpers in utils.py.
explainers = {
    'LogisticRegression': shap.LinearExplainer,
    'NaiveBayes': shap.KernelExplainer,
    # NOTE(review): there is no 'SVM' entry in `models_and_params`; kept in
    # case other code still refers to it.
    'SVM': shap.KernelExplainer,
    'KNN': shap.KernelExplainer,
    'DecisionTree': shap.TreeExplainer,
    'RandomForest': shap.TreeExplainer,
    'GradientBoosting': shap.TreeExplainer,
    'DeepNeuralNetwork': shap.KernelExplainer,
    # The PyTorch model is registered as 'PyTorchDNN' in `models_and_params`,
    # so expose the explainer under that name as well; without it a lookup by
    # model name raises KeyError.
    'PyTorchDNN': shap.DeepExplainer,
    # Legacy key, retained for backward compatibility.
    'PyTorchDNN_v2': shap.DeepExplainer,
}

# Predictive modeling

## Ridge Regression (L2-penalized logistic regression)

In [None]:
# L2-penalised logistic regression run: select the model spec, run the
# sample-size estimation and training helpers on the training split, then
# request probability predictions for the train / val / test splits.
model_name = 'LogisticRegression'
model_dir = 'data/outcome/feature_set/rr'
output_type = 'probability'

# Path prefixes of the three data splits.
prefix = 'data/outcome/feature_set/train'
prefix2 = 'data/outcome/feature_set/val'
prefix3 = 'data/outcome/feature_set/test'

model_spec = models_and_params[model_name]
train_and_estimate_sample_size(prefix, model_spec, model_dir, seed=seed)
train_and_save_model(prefix, model_spec, model_dir, seed=seed)

# One prediction call per split; the last call stays a bare expression so the
# notebook cell output is unchanged.
load_and_predict(prefix, output_type, model_dir, 'train')
load_and_predict(prefix2, output_type, model_dir, 'val')
load_and_predict(prefix3, output_type, model_dir, 'test')

In [None]:
# Naive Bayes run: run the sample-size estimation and training helpers on the
# training split, then request probability predictions for train and val.
model_name = 'NaiveBayes'
model_dir = 'predmod_data/death/Risk factors at-before hospital discharge/nb'
output_type = 'probability'

# Path prefixes of the training and validation splits.
prefix = 'predmod_data/death/Risk factors at-before hospital discharge/train'
prefix2 = 'predmod_data/death/Risk factors at-before hospital discharge/val'

model_spec = models_and_params[model_name]
train_and_estimate_sample_size(prefix, model_spec, model_dir, seed=seed)
train_and_save_model(prefix, model_spec, model_dir, seed=seed)

load_and_predict(prefix, output_type, model_dir, 'train')
load_and_predict(prefix2, output_type, model_dir, 'val')

In [None]:
# Naive Bayes, test split: request probability predictions using the model
# directory of the Naive Bayes run.
output_type = 'probability'
model_dir = 'predmod_data/death/Risk factors at-before hospital discharge/nb'
prefix3 = 'predmod_data/death/Risk factors at-before hospital discharge/test'

load_and_predict(prefix3, output_type, model_dir, 'test')

In [None]:
# K-nearest-neighbours run: run the sample-size estimation and training
# helpers on the training split, then request probability predictions for
# train and val.
model_name = 'KNN'
model_dir = 'predmod_data/death/Risk factors at-before hospital discharge/knn'
output_type = 'probability'

# Path prefixes of the training and validation splits.
prefix = 'predmod_data/death/Risk factors at-before hospital discharge/train'
prefix2 = 'predmod_data/death/Risk factors at-before hospital discharge/val'

model_spec = models_and_params[model_name]
train_and_estimate_sample_size(prefix, model_spec, model_dir, seed=seed)
train_and_save_model(prefix, model_spec, model_dir, seed=seed)

load_and_predict(prefix, output_type, model_dir, 'train')
load_and_predict(prefix2, output_type, model_dir, 'val')

In [None]:
# KNN, test split: request probability predictions using the model directory
# of the KNN run.
output_type = 'probability'
model_dir = 'predmod_data/death/Risk factors at-before hospital discharge/knn'
prefix3 = 'predmod_data/death/Risk factors at-before hospital discharge/test'

load_and_predict(prefix3, output_type, model_dir, 'test')

## Neural Network (torch)

In [None]:
# PyTorch network run: run the torch sample-size estimation and training
# helpers on the training split, then request probability predictions for the
# train / val / test splits.
model_name = 'PyTorchDNN'
model_dir = 'data/outcome/feature_set/dnn_torch'
output_type = "probability"

# Path prefixes of the three data splits.
prefix = 'data/outcome/feature_set/train'
prefix2 = 'data/outcome/feature_set/val'
prefix3 = 'data/outcome/feature_set/test'

# Keyword arguments shared by both torch training helpers.
torch_train_kwargs = {
    'save_dir': model_dir,
    'task': "classification",
    'seed': seed,
    'score_outcome': "outcome3",
}
model_spec = models_and_params[model_name]
train_and_estimate_sample_size_torch(prefix, model_spec, **torch_train_kwargs)
train_and_save_model_torch(prefix, model_spec, **torch_train_kwargs)

# The last call stays a bare expression so the notebook cell output is
# unchanged.
load_and_predict_torch(prefix, output_type, model_dir, 'train')
load_and_predict_torch(prefix2, output_type, model_dir, 'val')
load_and_predict_torch(prefix3, output_type, model_dir, 'test')