# Programming environment

In [1]:
# Seed passed to every training helper below for reproducibility.
# NOTE: this was previously written as `2024-9-10`, which looks like a date but
# is evaluated by Python as integer subtraction (2024 - 9 - 10 == 2005).
# The value is spelled out so the effective seed is unambiguous; it is kept at
# 2005 so results produced with the earlier expression remain reproducible.
seed = 2005

In [2]:
# Load existing functions from libraries
from sklearn.metrics import mean_squared_error, make_scorer, precision_score

In [3]:
# Load custom functions from utils.py
from utils import train_and_estimate_sample_size, train_and_save_model, load_and_predict, compute_shap_values

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np

# Registry of candidate classifiers, keyed by model name. Every entry holds an
# unfitted estimator under 'model' and a hyperparameter grid under 'param_grid'
# (presumably searched by the training helpers in utils.py - confirm there).
models_and_params = {}

# Logistic regression with an L2 penalty; C is the inverse regularization strength.
models_and_params['LogisticRegression'] = {
    'model': LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000),
    'param_grid': {'C': np.logspace(-4, 4, 10)},
}

# Gaussian Naive Bayes; the grid is intentionally empty (nothing is tuned).
models_and_params['NaiveBayes'] = {
    'model': GaussianNB(),
    'param_grid': {},
}

# Linear-kernel SVM; probability=True enables probability estimates.
models_and_params['SVM'] = {
    'model': SVC(kernel='linear', probability=True),
    'param_grid': {'C': np.logspace(-4, 4, 10)},
}

# k-nearest neighbors: neighbor count, vote weighting and distance metric.
models_and_params['KNN'] = {
    'model': KNeighborsClassifier(),
    'param_grid': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
}

# Single decision tree; only the maximum depth is varied.
models_and_params['DecisionTree'] = {
    'model': DecisionTreeClassifier(min_samples_split=20),
    'param_grid': {'max_depth': [3, 5, 10]},
}

# Random forest; max_features=None lets every split consider all features.
models_and_params['RandomForest'] = {
    'model': RandomForestClassifier(min_samples_split=20, max_features=None),
    'param_grid': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 10],
    },
}

# Gradient boosting with a fixed learning rate; stage count and depth are varied.
models_and_params['GradientBoosting'] = {
    'model': GradientBoostingClassifier(learning_rate=0.01),
    'param_grid': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 10],
    },
}

# Multi-layer perceptron with two hidden layers of 10 units, trained with SGD;
# alpha is the L2 penalty term.
models_and_params['DeepNeuralNetwork'] = {
    'model': MLPClassifier(
        max_iter=5000,
        hidden_layer_sizes=(10, 10),
        activation='relu',
        solver='sgd',
        learning_rate_init=0.0001,
    ),
    'param_grid': {'alpha': np.logspace(-4, 4, 10)},
}

In [5]:
import shap

# SHAP explainer class for each key of models_and_params, in the same order:
# LinearExplainer for the linear model, TreeExplainer for the tree-based
# models, and KernelExplainer for everything else.
explainers = dict([
    ('LogisticRegression', shap.LinearExplainer),
    ('NaiveBayes', shap.KernelExplainer),
    ('SVM', shap.KernelExplainer),
    ('KNN', shap.KernelExplainer),
    ('DecisionTree', shap.TreeExplainer),
    ('RandomForest', shap.TreeExplainer),
    ('GradientBoosting', shap.TreeExplainer),
    ('DeepNeuralNetwork', shap.KernelExplainer),
])

# Predictive modeling

## Ridge Regression (L2-penalized logistic regression)

In [6]:
# L2-penalized logistic regression pipeline; 'rr' is the model directory
# (the logged paths below show files written under inst/extdata/rr/).
prefix, model_name, model_dir = 'predmod_data', 'LogisticRegression', 'rr'
output_type = 'probability'

# Both training helpers receive the same estimator/grid entry and the same seed.
train_and_estimate_sample_size(
    prefix, models_and_params[model_name], model_dir, seed=seed
)
train_and_save_model(
    prefix, models_and_params[model_name], model_dir, seed=seed
)

# Prediction and SHAP computation are called without a seed.
load_and_predict(prefix, output_type, model_dir)
compute_shap_values(prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/rr/sample_size_estimation.csv
Scaler saved at inst/extdata/rr/scaler.joblib
Model saved at inst/extdata/rr/model.joblib
Results saved at inst/extdata/rr/prob.csv
Results saved at inst/extdata/rr/shap_values.csv


Class parallel_backend is deprecated; deprecated in 1.5 to be removed in 1.7. Use joblib.{} instead.


## Naive Bayes

In [7]:
# Gaussian Naive Bayes pipeline; 'nb' is the model directory
# (the logged paths below show files written under inst/extdata/nb/).
prefix, model_name, model_dir = 'predmod_data', 'NaiveBayes', 'nb'
output_type = 'probability'

# Both training helpers receive the same estimator/grid entry and the same seed.
train_and_estimate_sample_size(
    prefix, models_and_params[model_name], model_dir, seed=seed
)
train_and_save_model(
    prefix, models_and_params[model_name], model_dir, seed=seed
)

# Prediction and SHAP computation are called without a seed.
load_and_predict(prefix, output_type, model_dir)
compute_shap_values(prefix, explainers[model_name], model_dir)

Class parallel_backend is deprecated; deprecated in 1.5 to be removed in 1.7. Use joblib.{} instead.


Results saved at inst/extdata/nb/sample_size_estimation.csv
Scaler saved at inst/extdata/nb/scaler.joblib
Model saved at inst/extdata/nb/model.joblib
Results saved at inst/extdata/nb/prob.csv


100%|███| 281/281 [00:01<00:00, 143.49it/s]

Results saved at inst/extdata/nb/shap_values.csv





## Support Vector Machine

In [8]:
# Linear-kernel SVM pipeline; 'svm' is the model directory
# (the logged paths below show files written under inst/extdata/svm/).
prefix, model_name, model_dir = 'predmod_data', 'SVM', 'svm'
output_type = 'probability'

# Both training helpers receive the same estimator/grid entry and the same seed.
train_and_estimate_sample_size(
    prefix, models_and_params[model_name], model_dir, seed=seed
)
train_and_save_model(
    prefix, models_and_params[model_name], model_dir, seed=seed
)

# Prediction and SHAP computation are called without a seed.
load_and_predict(prefix, output_type, model_dir)
compute_shap_values(prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/svm/sample_size_estimation.csv
Scaler saved at inst/extdata/svm/scaler.joblib


Class parallel_backend is deprecated; deprecated in 1.5 to be removed in 1.7. Use joblib.{} instead.


Model saved at inst/extdata/svm/model.joblib
Results saved at inst/extdata/svm/prob.csv


100%|███| 281/281 [00:01<00:00, 143.31it/s]

Results saved at inst/extdata/svm/shap_values.csv





## k-Nearest Neighbor

In [9]:
# k-nearest neighbors pipeline; 'knn' is the model directory
# (the logged paths below show files written under inst/extdata/knn/).
prefix, model_name, model_dir = 'predmod_data', 'KNN', 'knn'
output_type = 'probability'

# Both training helpers receive the same estimator/grid entry and the same seed.
train_and_estimate_sample_size(
    prefix, models_and_params[model_name], model_dir, seed=seed
)
train_and_save_model(
    prefix, models_and_params[model_name], model_dir, seed=seed
)

# Prediction and SHAP computation are called without a seed.
load_and_predict(prefix, output_type, model_dir)
compute_shap_values(prefix, explainers[model_name], model_dir)

Class parallel_backend is deprecated; deprecated in 1.5 to be removed in 1.7. Use joblib.{} instead.


Results saved at inst/extdata/knn/sample_size_estimation.csv
Scaler saved at inst/extdata/knn/scaler.joblib
Model saved at inst/extdata/knn/model.joblib
Results saved at inst/extdata/knn/prob.csv


100%|████| 281/281 [00:05<00:00, 47.66it/s]

Results saved at inst/extdata/knn/shap_values.csv





## Decision Tree

In [10]:
# Decision tree pipeline; 'dt' is the model directory
# (the logged paths below show files written under inst/extdata/dt/).
prefix, model_name, model_dir = 'predmod_data', 'DecisionTree', 'dt'
output_type = 'probability'

# Both training helpers receive the same estimator/grid entry and the same seed.
train_and_estimate_sample_size(
    prefix, models_and_params[model_name], model_dir, seed=seed
)
train_and_save_model(
    prefix, models_and_params[model_name], model_dir, seed=seed
)

# Prediction and SHAP computation are called without a seed.
load_and_predict(prefix, output_type, model_dir)
compute_shap_values(prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/dt/sample_size_estimation.csv
Scaler saved at inst/extdata/dt/scaler.joblib
Model saved at inst/extdata/dt/model.joblib
Results saved at inst/extdata/dt/prob.csv
Results saved at inst/extdata/dt/shap_values.csv


Class parallel_backend is deprecated; deprecated in 1.5 to be removed in 1.7. Use joblib.{} instead.


## Random Forest

In [11]:
# Random forest pipeline; 'rf' is the model directory
# (the logged paths below show files written under inst/extdata/rf/).
prefix, model_name, model_dir = 'predmod_data', 'RandomForest', 'rf'
output_type = 'probability'

# Both training helpers receive the same estimator/grid entry and the same seed.
train_and_estimate_sample_size(
    prefix, models_and_params[model_name], model_dir, seed=seed
)
train_and_save_model(
    prefix, models_and_params[model_name], model_dir, seed=seed
)

# Prediction and SHAP computation are called without a seed.
load_and_predict(prefix, output_type, model_dir)
compute_shap_values(prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/rf/sample_size_estimation.csv
Scaler saved at inst/extdata/rf/scaler.joblib


Class parallel_backend is deprecated; deprecated in 1.5 to be removed in 1.7. Use joblib.{} instead.


Model saved at inst/extdata/rf/model.joblib
Results saved at inst/extdata/rf/prob.csv
Results saved at inst/extdata/rf/shap_values.csv


## Gradient Boosting Machine

In [12]:
# Gradient boosting pipeline; 'gbm' is the model directory
# (the logged paths below show files written under inst/extdata/gbm/).
prefix, model_name, model_dir = 'predmod_data', 'GradientBoosting', 'gbm'
output_type = 'probability'

# Both training helpers receive the same estimator/grid entry and the same seed.
train_and_estimate_sample_size(
    prefix, models_and_params[model_name], model_dir, seed=seed
)
train_and_save_model(
    prefix, models_and_params[model_name], model_dir, seed=seed
)

# Prediction and SHAP computation are called without a seed.
load_and_predict(prefix, output_type, model_dir)
compute_shap_values(prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/gbm/sample_size_estimation.csv
Scaler saved at inst/extdata/gbm/scaler.joblib


Class parallel_backend is deprecated; deprecated in 1.5 to be removed in 1.7. Use joblib.{} instead.


Model saved at inst/extdata/gbm/model.joblib
Results saved at inst/extdata/gbm/prob.csv
Results saved at inst/extdata/gbm/shap_values.csv


## Deep Neural Network

In [13]:
# Multi-layer perceptron pipeline; 'dnn' is the model directory
# (the logged paths below show files written under inst/extdata/dnn/).
prefix, model_name, model_dir = 'predmod_data', 'DeepNeuralNetwork', 'dnn'
output_type = 'probability'

# Both training helpers receive the same estimator/grid entry and the same seed.
train_and_estimate_sample_size(
    prefix, models_and_params[model_name], model_dir, seed=seed
)
train_and_save_model(
    prefix, models_and_params[model_name], model_dir, seed=seed
)

# Prediction and SHAP computation are called without a seed.
load_and_predict(prefix, output_type, model_dir)
compute_shap_values(prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/dnn/sample_size_estimation.csv
Scaler saved at inst/extdata/dnn/scaler.joblib


Class parallel_backend is deprecated; deprecated in 1.5 to be removed in 1.7. Use joblib.{} instead.


Model saved at inst/extdata/dnn/model.joblib
Results saved at inst/extdata/dnn/prob.csv


100%|███| 281/281 [00:01<00:00, 153.03it/s]

Results saved at inst/extdata/dnn/shap_values.csv



