# Programming environment

In [1]:
# Seed passed to every training call below for reproducibility.
# NOTE: the original literal was written as 2025-3-13, which Python evaluates as
# integer subtraction (2025 minus 3 minus 13), i.e. 2009 -- it is not a date.
# The value is kept at 2009 so previously saved results stay reproducible.
seed = 2009

In [2]:
# Load existing functions from libraries
from sklearn.metrics import mean_squared_error, make_scorer, precision_score

In [3]:
# Load custom functions from utils.py
from utils import train_and_estimate_sample_size, train_and_save_model, load_and_predict, compute_shap_values

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import numpy as np

# Candidate classifiers keyed by model name. Every entry carries the same two keys:
#   'model'      -- an unfitted scikit-learn estimator with its fixed settings
#   'param_grid' -- hyperparameter name -> candidate values (empty when nothing is tuned);
#                   presumably searched by the training helpers in utils.py -- confirm there.
models_and_params = dict(
    LogisticRegression=dict(
        model=LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000),
        # C is the inverse of the L2 regularization strength
        param_grid=dict(C=np.logspace(-4, 4, 10)),
    ),
    NaiveBayes=dict(
        model=GaussianNB(),
        # No hyperparameters are tuned for Gaussian Naive Bayes
        param_grid=dict(),
    ),
    SVM=dict(
        # probability=True so the fitted model exposes probability estimates
        model=SVC(kernel='linear', probability=True),
        # C is the penalty on the error term
        param_grid=dict(C=np.logspace(-4, 4, 10)),
    ),
    KNN=dict(
        model=KNeighborsClassifier(),
        param_grid=dict(
            n_neighbors=[3, 5, 7, 9],          # neighbors consulted per prediction
            weights=['uniform', 'distance'],   # how neighbor votes are weighted
            metric=['euclidean', 'manhattan'], # distance metric for the neighbor search
        ),
    ),
    DecisionTree=dict(
        model=DecisionTreeClassifier(min_samples_split=20),
        param_grid=dict(max_depth=[3, 5, 10]),  # maximum tree depth
    ),
    RandomForest=dict(
        model=RandomForestClassifier(min_samples_split=20, max_features=None),
        param_grid=dict(
            n_estimators=[100, 200, 300],  # number of trees
            max_depth=[3, 5, 10],          # maximum depth of each tree
        ),
    ),
    GradientBoosting=dict(
        model=GradientBoostingClassifier(learning_rate=0.01),
        param_grid=dict(
            n_estimators=[100, 200, 300],  # number of boosting stages
            max_depth=[3, 5, 10],          # maximum depth of each regression tree
        ),
    ),
    DeepNeuralNetwork=dict(
        # Two hidden layers of 10 units each, trained with plain SGD
        model=MLPClassifier(
            max_iter=5000,
            hidden_layer_sizes=(10, 10),
            activation='relu',
            solver='sgd',
            learning_rate_init=0.0001,
        ),
        param_grid=dict(alpha=np.logspace(-4, 4, 10)),  # L2 penalty strength
    ),
)

In [5]:
import shap

# SHAP explainer class to use for each key of models_and_params:
# linear model -> LinearExplainer, tree-based models -> TreeExplainer,
# everything else -> the model-agnostic KernelExplainer.
explainers = dict(
    LogisticRegression=shap.LinearExplainer,
    NaiveBayes=shap.KernelExplainer,
    SVM=shap.KernelExplainer,
    KNN=shap.KernelExplainer,
    DecisionTree=shap.TreeExplainer,
    RandomForest=shap.TreeExplainer,
    GradientBoosting=shap.TreeExplainer,
    DeepNeuralNetwork=shap.KernelExplainer,
)

# Predictive modeling

## Neonatal risk

### Ridge Regression (L2-penalized logistic regression)

In [6]:
# Neonatal risk data, 'LogisticRegression' entry of models_and_params
prefix = 'predmod_data/asd_neonatal_risk'
model_name = 'LogisticRegression'
model_dir = 'model/neonatal_risk/rr'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/neonatal_risk/rr/sample_size_estimation.csv
Scaler saved at inst/extdata/model/neonatal_risk/rr/scaler.joblib
Model saved at inst/extdata/model/neonatal_risk/rr/model.joblib
Results saved at inst/extdata/model/neonatal_risk/rr/train_prob.csv
Results saved at inst/extdata/model/neonatal_risk/rr/validation_prob.csv
Results saved at inst/extdata/model/neonatal_risk/rr/test_prob.csv
Results saved at inst/extdata/model/neonatal_risk/rr/shap_values.csv


### Naive Bayes

In [7]:
# Neonatal risk data, 'NaiveBayes' entry of models_and_params
prefix = 'predmod_data/asd_neonatal_risk'
model_name = 'NaiveBayes'
model_dir = 'model/neonatal_risk/nb'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Results saved at inst/extdata/model/neonatal_risk/nb/sample_size_estimation.csv
Scaler saved at inst/extdata/model/neonatal_risk/nb/scaler.joblib
Model saved at inst/extdata/model/neonatal_risk/nb/model.joblib
Results saved at inst/extdata/model/neonatal_risk/nb/train_prob.csv
Results saved at inst/extdata/model/neonatal_risk/nb/validation_prob.csv
Results saved at inst/extdata/model/neonatal_risk/nb/test_prob.csv


100%|█████████████████████████| 301/301 [00:08<00:00, 33.77it/s]

Results saved at inst/extdata/model/neonatal_risk/nb/shap_values.csv





### Support Vector Machine

In [8]:
# Neonatal risk data, 'SVM' entry of models_and_params
prefix = 'predmod_data/asd_neonatal_risk'
model_name = 'SVM'
model_dir = 'model/neonatal_risk/svm'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/neonatal_risk/svm/sample_size_estimation.csv
Scaler saved at inst/extdata/model/neonatal_risk/svm/scaler.joblib


Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Model saved at inst/extdata/model/neonatal_risk/svm/model.joblib
Results saved at inst/extdata/model/neonatal_risk/svm/train_prob.csv
Results saved at inst/extdata/model/neonatal_risk/svm/validation_prob.csv
Results saved at inst/extdata/model/neonatal_risk/svm/test_prob.csv


100%|█████████████████████████| 301/301 [00:09<00:00, 32.32it/s]

Results saved at inst/extdata/model/neonatal_risk/svm/shap_values.csv





### k-Nearest Neighbor

In [9]:
# Neonatal risk data, 'KNN' entry of models_and_params
prefix = 'predmod_data/asd_neonatal_risk'
model_name = 'KNN'
model_dir = 'model/neonatal_risk/knn'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Results saved at inst/extdata/model/neonatal_risk/knn/sample_size_estimation.csv
Scaler saved at inst/extdata/model/neonatal_risk/knn/scaler.joblib
Model saved at inst/extdata/model/neonatal_risk/knn/model.joblib
Results saved at inst/extdata/model/neonatal_risk/knn/train_prob.csv
Results saved at inst/extdata/model/neonatal_risk/knn/validation_prob.csv
Results saved at inst/extdata/model/neonatal_risk/knn/test_prob.csv


100%|█████████████████████████| 301/301 [00:36<00:00,  8.18it/s]

Results saved at inst/extdata/model/neonatal_risk/knn/shap_values.csv





### Decision Tree

In [10]:
# Neonatal risk data, 'DecisionTree' entry of models_and_params
prefix = 'predmod_data/asd_neonatal_risk'
model_name = 'DecisionTree'
model_dir = 'model/neonatal_risk/dt'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/neonatal_risk/dt/sample_size_estimation.csv
Scaler saved at inst/extdata/model/neonatal_risk/dt/scaler.joblib
Model saved at inst/extdata/model/neonatal_risk/dt/model.joblib
Results saved at inst/extdata/model/neonatal_risk/dt/train_prob.csv
Results saved at inst/extdata/model/neonatal_risk/dt/validation_prob.csv
Results saved at inst/extdata/model/neonatal_risk/dt/test_prob.csv
Results saved at inst/extdata/model/neonatal_risk/dt/shap_values.csv


### Random Forest

In [11]:
# Neonatal risk data, 'RandomForest' entry of models_and_params
prefix = 'predmod_data/asd_neonatal_risk'
model_name = 'RandomForest'
model_dir = 'model/neonatal_risk/rf'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/neonatal_risk/rf/sample_size_estimation.csv
Scaler saved at inst/extdata/model/neonatal_risk/rf/scaler.joblib
Model saved at inst/extdata/model/neonatal_risk/rf/model.joblib
Results saved at inst/extdata/model/neonatal_risk/rf/train_prob.csv
Results saved at inst/extdata/model/neonatal_risk/rf/validation_prob.csv
Results saved at inst/extdata/model/neonatal_risk/rf/test_prob.csv
Results saved at inst/extdata/model/neonatal_risk/rf/shap_values.csv


### Gradient Boosting Machine

In [12]:
# Neonatal risk data, 'GradientBoosting' entry of models_and_params
prefix = 'predmod_data/asd_neonatal_risk'
model_name = 'GradientBoosting'
model_dir = 'model/neonatal_risk/gbm'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/neonatal_risk/gbm/sample_size_estimation.csv
Scaler saved at inst/extdata/model/neonatal_risk/gbm/scaler.joblib
Model saved at inst/extdata/model/neonatal_risk/gbm/model.joblib
Results saved at inst/extdata/model/neonatal_risk/gbm/train_prob.csv
Results saved at inst/extdata/model/neonatal_risk/gbm/validation_prob.csv
Results saved at inst/extdata/model/neonatal_risk/gbm/test_prob.csv
Results saved at inst/extdata/model/neonatal_risk/gbm/shap_values.csv


### Deep Neural Network

In [13]:
# Neonatal risk data, 'DeepNeuralNetwork' entry of models_and_params
prefix = 'predmod_data/asd_neonatal_risk'
model_name = 'DeepNeuralNetwork'
model_dir = 'model/neonatal_risk/dnn'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/neonatal_risk/dnn/sample_size_estimation.csv
Scaler saved at inst/extdata/model/neonatal_risk/dnn/scaler.joblib


Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Model saved at inst/extdata/model/neonatal_risk/dnn/model.joblib
Results saved at inst/extdata/model/neonatal_risk/dnn/train_prob.csv
Results saved at inst/extdata/model/neonatal_risk/dnn/validation_prob.csv
Results saved at inst/extdata/model/neonatal_risk/dnn/test_prob.csv


100%|█████████████████████████| 301/301 [00:14<00:00, 21.18it/s]


Results saved at inst/extdata/model/neonatal_risk/dnn/shap_values.csv


## BSID-III

### Ridge Regression (L2-penalized logistic regression)

In [14]:
# BSID-III data, 'LogisticRegression' entry of models_and_params
prefix = 'predmod_data/asd_bsid_iii'
model_name = 'LogisticRegression'
model_dir = 'model/bsid_iii/rr'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/bsid_iii/rr/sample_size_estimation.csv
Scaler saved at inst/extdata/model/bsid_iii/rr/scaler.joblib
Model saved at inst/extdata/model/bsid_iii/rr/model.joblib
Results saved at inst/extdata/model/bsid_iii/rr/train_prob.csv
Results saved at inst/extdata/model/bsid_iii/rr/validation_prob.csv
Results saved at inst/extdata/model/bsid_iii/rr/test_prob.csv
Results saved at inst/extdata/model/bsid_iii/rr/shap_values.csv


### Naive Bayes

In [15]:
# BSID-III data, 'NaiveBayes' entry of models_and_params
prefix = 'predmod_data/asd_bsid_iii'
model_name = 'NaiveBayes'
model_dir = 'model/bsid_iii/nb'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Results saved at inst/extdata/model/bsid_iii/nb/sample_size_estimation.csv
Scaler saved at inst/extdata/model/bsid_iii/nb/scaler.joblib
Model saved at inst/extdata/model/bsid_iii/nb/model.joblib
Results saved at inst/extdata/model/bsid_iii/nb/train_prob.csv
Results saved at inst/extdata/model/bsid_iii/nb/validation_prob.csv
Results saved at inst/extdata/model/bsid_iii/nb/test_prob.csv


100%|█████████████████████████| 301/301 [01:55<00:00,  2.61it/s]

Results saved at inst/extdata/model/bsid_iii/nb/shap_values.csv





### Support Vector Machine

In [16]:
# BSID-III data, 'SVM' entry of models_and_params
prefix = 'predmod_data/asd_bsid_iii'
model_name = 'SVM'
model_dir = 'model/bsid_iii/svm'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/bsid_iii/svm/sample_size_estimation.csv
Scaler saved at inst/extdata/model/bsid_iii/svm/scaler.joblib


Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Model saved at inst/extdata/model/bsid_iii/svm/model.joblib
Results saved at inst/extdata/model/bsid_iii/svm/train_prob.csv
Results saved at inst/extdata/model/bsid_iii/svm/validation_prob.csv
Results saved at inst/extdata/model/bsid_iii/svm/test_prob.csv


100%|█████████████████████████| 301/301 [02:28<00:00,  2.03it/s]

Results saved at inst/extdata/model/bsid_iii/svm/shap_values.csv





### k-Nearest Neighbor

In [17]:
# BSID-III data, 'KNN' entry of models_and_params
prefix = 'predmod_data/asd_bsid_iii'
model_name = 'KNN'
model_dir = 'model/bsid_iii/knn'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Results saved at inst/extdata/model/bsid_iii/knn/sample_size_estimation.csv
Scaler saved at inst/extdata/model/bsid_iii/knn/scaler.joblib
Model saved at inst/extdata/model/bsid_iii/knn/model.joblib
Results saved at inst/extdata/model/bsid_iii/knn/train_prob.csv
Results saved at inst/extdata/model/bsid_iii/knn/validation_prob.csv
Results saved at inst/extdata/model/bsid_iii/knn/test_prob.csv


100%|█████████████████████████| 301/301 [04:31<00:00,  1.11it/s]

Results saved at inst/extdata/model/bsid_iii/knn/shap_values.csv





### Decision Tree

In [18]:
# BSID-III data, 'DecisionTree' entry of models_and_params
prefix = 'predmod_data/asd_bsid_iii'
model_name = 'DecisionTree'
model_dir = 'model/bsid_iii/dt'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/bsid_iii/dt/sample_size_estimation.csv
Scaler saved at inst/extdata/model/bsid_iii/dt/scaler.joblib
Model saved at inst/extdata/model/bsid_iii/dt/model.joblib
Results saved at inst/extdata/model/bsid_iii/dt/train_prob.csv
Results saved at inst/extdata/model/bsid_iii/dt/validation_prob.csv
Results saved at inst/extdata/model/bsid_iii/dt/test_prob.csv
Results saved at inst/extdata/model/bsid_iii/dt/shap_values.csv


### Random Forest

In [19]:
# BSID-III data, 'RandomForest' entry of models_and_params
prefix = 'predmod_data/asd_bsid_iii'
model_name = 'RandomForest'
model_dir = 'model/bsid_iii/rf'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/bsid_iii/rf/sample_size_estimation.csv
Scaler saved at inst/extdata/model/bsid_iii/rf/scaler.joblib
Model saved at inst/extdata/model/bsid_iii/rf/model.joblib
Results saved at inst/extdata/model/bsid_iii/rf/train_prob.csv
Results saved at inst/extdata/model/bsid_iii/rf/validation_prob.csv
Results saved at inst/extdata/model/bsid_iii/rf/test_prob.csv
Results saved at inst/extdata/model/bsid_iii/rf/shap_values.csv


### Gradient Boosting Machine

In [20]:
# BSID-III data, 'GradientBoosting' entry of models_and_params
prefix = 'predmod_data/asd_bsid_iii'
model_name = 'GradientBoosting'
model_dir = 'model/bsid_iii/gbm'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/bsid_iii/gbm/sample_size_estimation.csv
Scaler saved at inst/extdata/model/bsid_iii/gbm/scaler.joblib
Model saved at inst/extdata/model/bsid_iii/gbm/model.joblib
Results saved at inst/extdata/model/bsid_iii/gbm/train_prob.csv
Results saved at inst/extdata/model/bsid_iii/gbm/validation_prob.csv
Results saved at inst/extdata/model/bsid_iii/gbm/test_prob.csv
Results saved at inst/extdata/model/bsid_iii/gbm/shap_values.csv


### Deep Neural Network

In [21]:
# BSID-III data, 'DeepNeuralNetwork' entry of models_and_params
prefix = 'predmod_data/asd_bsid_iii'
model_name = 'DeepNeuralNetwork'
model_dir = 'model/bsid_iii/dnn'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Stochastic Optimizer: Maximum iterations (5000) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (5000) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (5000) reached and the optimization hasn't converged yet.


Results saved at inst/extdata/model/bsid_iii/dnn/sample_size_estimation.csv
Scaler saved at inst/extdata/model/bsid_iii/dnn/scaler.joblib


Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Model saved at inst/extdata/model/bsid_iii/dnn/model.joblib
Results saved at inst/extdata/model/bsid_iii/dnn/train_prob.csv
Results saved at inst/extdata/model/bsid_iii/dnn/validation_prob.csv
Results saved at inst/extdata/model/bsid_iii/dnn/test_prob.csv


100%|█████████████████████████| 301/301 [02:16<00:00,  2.21it/s]

Results saved at inst/extdata/model/bsid_iii/dnn/shap_values.csv





## M-CHAT-R

### Ridge Regression (L2-penalized logistic regression)

In [22]:
# M-CHAT-R data, 'LogisticRegression' entry of models_and_params
prefix = 'predmod_data/asd_m_chat_r'
model_name = 'LogisticRegression'
model_dir = 'model/m_chat_r/rr'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_r/rr/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_r/rr/scaler.joblib
Model saved at inst/extdata/model/m_chat_r/rr/model.joblib
Results saved at inst/extdata/model/m_chat_r/rr/train_prob.csv
Results saved at inst/extdata/model/m_chat_r/rr/validation_prob.csv
Results saved at inst/extdata/model/m_chat_r/rr/test_prob.csv
Results saved at inst/extdata/model/m_chat_r/rr/shap_values.csv


### Naive Bayes

In [23]:
# M-CHAT-R data, 'NaiveBayes' entry of models_and_params
prefix = 'predmod_data/asd_m_chat_r'
model_name = 'NaiveBayes'
model_dir = 'model/m_chat_r/nb'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Results saved at inst/extdata/model/m_chat_r/nb/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_r/nb/scaler.joblib
Model saved at inst/extdata/model/m_chat_r/nb/model.joblib
Results saved at inst/extdata/model/m_chat_r/nb/train_prob.csv
Results saved at inst/extdata/model/m_chat_r/nb/validation_prob.csv
Results saved at inst/extdata/model/m_chat_r/nb/test_prob.csv


100%|█████████████████████████| 301/301 [03:36<00:00,  1.39it/s]

Results saved at inst/extdata/model/m_chat_r/nb/shap_values.csv





### Support Vector Machine

In [24]:
# M-CHAT-R data, 'SVM' entry of models_and_params
prefix = 'predmod_data/asd_m_chat_r'
model_name = 'SVM'
model_dir = 'model/m_chat_r/svm'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_r/svm/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_r/svm/scaler.joblib


Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Model saved at inst/extdata/model/m_chat_r/svm/model.joblib
Results saved at inst/extdata/model/m_chat_r/svm/train_prob.csv
Results saved at inst/extdata/model/m_chat_r/svm/validation_prob.csv
Results saved at inst/extdata/model/m_chat_r/svm/test_prob.csv


100%|█████████████████████████| 301/301 [04:38<00:00,  1.08it/s]

Results saved at inst/extdata/model/m_chat_r/svm/shap_values.csv





### k-Nearest Neighbor

In [25]:
# M-CHAT-R data, 'KNN' entry of models_and_params
prefix = 'predmod_data/asd_m_chat_r'
model_name = 'KNN'
model_dir = 'model/m_chat_r/knn'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Results saved at inst/extdata/model/m_chat_r/knn/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_r/knn/scaler.joblib
Model saved at inst/extdata/model/m_chat_r/knn/model.joblib
Results saved at inst/extdata/model/m_chat_r/knn/train_prob.csv
Results saved at inst/extdata/model/m_chat_r/knn/validation_prob.csv
Results saved at inst/extdata/model/m_chat_r/knn/test_prob.csv


100%|█████████████████████████| 301/301 [41:07<00:00,  8.20s/it]

Results saved at inst/extdata/model/m_chat_r/knn/shap_values.csv





### Decision Tree

In [26]:
# M-CHAT-R data, 'DecisionTree' entry of models_and_params
prefix = 'predmod_data/asd_m_chat_r'
model_name = 'DecisionTree'
model_dir = 'model/m_chat_r/dt'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_r/dt/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_r/dt/scaler.joblib
Model saved at inst/extdata/model/m_chat_r/dt/model.joblib
Results saved at inst/extdata/model/m_chat_r/dt/train_prob.csv
Results saved at inst/extdata/model/m_chat_r/dt/validation_prob.csv
Results saved at inst/extdata/model/m_chat_r/dt/test_prob.csv
Results saved at inst/extdata/model/m_chat_r/dt/shap_values.csv


### Random Forest

In [27]:
# M-CHAT-R data, 'RandomForest' entry of models_and_params
prefix = 'predmod_data/asd_m_chat_r'
model_name = 'RandomForest'
model_dir = 'model/m_chat_r/rf'
train_set = f'{prefix}_train'
model_spec = models_and_params[model_name]

# Sample-size estimation, then the final fit with precision passed as the scorer
# (zero_division=0: precision is reported as 0 when nothing is predicted positive).
train_and_estimate_sample_size(train_set, model_spec, model_dir, seed=seed)
custom_precision = make_scorer(precision_score, zero_division=0)
train_and_save_model(train_set, model_spec, model_dir, scoring=custom_precision, seed=seed)

# Probability predictions for each split, then SHAP values for the training data
output_type = 'probability'
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)
compute_shap_values(train_set, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_r/rf/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_r/rf/scaler.joblib
Model saved at inst/extdata/model/m_chat_r/rf/model.joblib
Results saved at inst/extdata/model/m_chat_r/rf/train_prob.csv
Results saved at inst/extdata/model/m_chat_r/rf/validation_prob.csv
Results saved at inst/extdata/model/m_chat_r/rf/test_prob.csv
Results saved at inst/extdata/model/m_chat_r/rf/shap_values.csv


### Gradient Boosting Machine

In [28]:
# Gradient Boosting Machine on the M-CHAT-R data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
prefix = 'predmod_data/asd_m_chat_r'
model_name = 'GradientBoosting'
model_dir = 'model/m_chat_r/gbm'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_r/gbm/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_r/gbm/scaler.joblib
Model saved at inst/extdata/model/m_chat_r/gbm/model.joblib
Results saved at inst/extdata/model/m_chat_r/gbm/train_prob.csv
Results saved at inst/extdata/model/m_chat_r/gbm/validation_prob.csv
Results saved at inst/extdata/model/m_chat_r/gbm/test_prob.csv
Results saved at inst/extdata/model/m_chat_r/gbm/shap_values.csv


### Deep Neural Network

In [29]:
# Deep Neural Network on the M-CHAT-R data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
# NOTE(review): the recorded output shows "Maximum iterations (5000) reached"
# convergence warnings and a ~4 minute SHAP pass for this model.
prefix = 'predmod_data/asd_m_chat_r'
model_name = 'DeepNeuralNetwork'
model_dir = 'model/m_chat_r/dnn'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Stochastic Optimizer: Maximum iterations (5000) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (5000) reached and the optimization hasn't converged yet.


Results saved at inst/extdata/model/m_chat_r/dnn/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_r/dnn/scaler.joblib


Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Model saved at inst/extdata/model/m_chat_r/dnn/model.joblib
Results saved at inst/extdata/model/m_chat_r/dnn/train_prob.csv
Results saved at inst/extdata/model/m_chat_r/dnn/validation_prob.csv
Results saved at inst/extdata/model/m_chat_r/dnn/test_prob.csv


100%|█████████████████████████| 301/301 [03:59<00:00,  1.26it/s]

Results saved at inst/extdata/model/m_chat_r/dnn/shap_values.csv





## M-CHAT-F

### Ridge Regression

In [30]:
# Ridge (L2-penalised logistic) regression on the M-CHAT-F data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
prefix = 'predmod_data/asd_m_chat_f'
model_name = 'LogisticRegression'
model_dir = 'model/m_chat_f/rr'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_f/rr/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_f/rr/scaler.joblib
Model saved at inst/extdata/model/m_chat_f/rr/model.joblib
Results saved at inst/extdata/model/m_chat_f/rr/train_prob.csv
Results saved at inst/extdata/model/m_chat_f/rr/validation_prob.csv
Results saved at inst/extdata/model/m_chat_f/rr/test_prob.csv
Results saved at inst/extdata/model/m_chat_f/rr/shap_values.csv


### Naive Bayes

In [31]:
# Gaussian Naive Bayes on the M-CHAT-F data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
# NOTE(review): the recorded SHAP pass for this model took ~3.5 minutes.
prefix = 'predmod_data/asd_m_chat_f'
model_name = 'NaiveBayes'
model_dir = 'model/m_chat_f/nb'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Results saved at inst/extdata/model/m_chat_f/nb/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_f/nb/scaler.joblib
Model saved at inst/extdata/model/m_chat_f/nb/model.joblib
Results saved at inst/extdata/model/m_chat_f/nb/train_prob.csv
Results saved at inst/extdata/model/m_chat_f/nb/validation_prob.csv
Results saved at inst/extdata/model/m_chat_f/nb/test_prob.csv


100%|█████████████████████████| 301/301 [03:34<00:00,  1.40it/s]

Results saved at inst/extdata/model/m_chat_f/nb/shap_values.csv





### Support Vector Machine

In [32]:
# Support Vector Machine on the M-CHAT-F data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
# NOTE(review): the recorded SHAP pass for this model took ~4.5 minutes.
prefix = 'predmod_data/asd_m_chat_f'
model_name = 'SVM'
model_dir = 'model/m_chat_f/svm'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_f/svm/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_f/svm/scaler.joblib


Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Model saved at inst/extdata/model/m_chat_f/svm/model.joblib
Results saved at inst/extdata/model/m_chat_f/svm/train_prob.csv
Results saved at inst/extdata/model/m_chat_f/svm/validation_prob.csv
Results saved at inst/extdata/model/m_chat_f/svm/test_prob.csv


100%|█████████████████████████| 301/301 [04:36<00:00,  1.09it/s]

Results saved at inst/extdata/model/m_chat_f/svm/shap_values.csv





### k-Nearest Neighbor

In [33]:
# k-Nearest Neighbours on the M-CHAT-F data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
# NOTE(review): the recorded SHAP pass for this model took ~40 minutes
# (about 8 s per sample), by far the slowest cell in this section.
prefix = 'predmod_data/asd_m_chat_f'
model_name = 'KNN'
model_dir = 'model/m_chat_f/knn'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Results saved at inst/extdata/model/m_chat_f/knn/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_f/knn/scaler.joblib
Model saved at inst/extdata/model/m_chat_f/knn/model.joblib
Results saved at inst/extdata/model/m_chat_f/knn/train_prob.csv
Results saved at inst/extdata/model/m_chat_f/knn/validation_prob.csv
Results saved at inst/extdata/model/m_chat_f/knn/test_prob.csv


100%|█████████████████████████| 301/301 [40:10<00:00,  8.01s/it]

Results saved at inst/extdata/model/m_chat_f/knn/shap_values.csv





### Decision Tree

In [34]:
# Decision Tree on the M-CHAT-F data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
prefix = 'predmod_data/asd_m_chat_f'
model_name = 'DecisionTree'
model_dir = 'model/m_chat_f/dt'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_f/dt/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_f/dt/scaler.joblib
Model saved at inst/extdata/model/m_chat_f/dt/model.joblib
Results saved at inst/extdata/model/m_chat_f/dt/train_prob.csv
Results saved at inst/extdata/model/m_chat_f/dt/validation_prob.csv
Results saved at inst/extdata/model/m_chat_f/dt/test_prob.csv
Results saved at inst/extdata/model/m_chat_f/dt/shap_values.csv


### Random Forest

In [35]:
# Random Forest on the M-CHAT-F data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
prefix = 'predmod_data/asd_m_chat_f'
model_name = 'RandomForest'
model_dir = 'model/m_chat_f/rf'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_f/rf/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_f/rf/scaler.joblib
Model saved at inst/extdata/model/m_chat_f/rf/model.joblib
Results saved at inst/extdata/model/m_chat_f/rf/train_prob.csv
Results saved at inst/extdata/model/m_chat_f/rf/validation_prob.csv
Results saved at inst/extdata/model/m_chat_f/rf/test_prob.csv
Results saved at inst/extdata/model/m_chat_f/rf/shap_values.csv


### Gradient Boosting Machine

In [36]:
# Gradient Boosting Machine on the M-CHAT-F data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
prefix = 'predmod_data/asd_m_chat_f'
model_name = 'GradientBoosting'
model_dir = 'model/m_chat_f/gbm'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Results saved at inst/extdata/model/m_chat_f/gbm/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_f/gbm/scaler.joblib
Model saved at inst/extdata/model/m_chat_f/gbm/model.joblib
Results saved at inst/extdata/model/m_chat_f/gbm/train_prob.csv
Results saved at inst/extdata/model/m_chat_f/gbm/validation_prob.csv
Results saved at inst/extdata/model/m_chat_f/gbm/test_prob.csv
Results saved at inst/extdata/model/m_chat_f/gbm/shap_values.csv


### Deep Neural Network

In [None]:
# Deep Neural Network on the M-CHAT-F data.
# Steps: sample-size estimation -> model fit -> per-split predictions -> SHAP.
# NOTE(review): the saved output for this cell stops at 63% of the SHAP
# progress bar and the cell has no execution count, so shap_values.csv for
# this model may be missing or stale -- re-run to completion and confirm.
prefix = 'predmod_data/asd_m_chat_f'
model_name = 'DeepNeuralNetwork'
model_dir = 'model/m_chat_f/dnn'
output_type = 'probability'
train_prefix = f'{prefix}_train'

# Precision scorer; zero_division=0 scores a fold with no predicted positives
# as 0 rather than emitting an undefined-metric warning.
custom_precision = make_scorer(precision_score, zero_division=0)

train_and_estimate_sample_size(train_prefix, models_and_params[model_name], model_dir, seed=seed)
train_and_save_model(train_prefix, models_and_params[model_name], model_dir, scoring=custom_precision, seed=seed)

# Predict with the saved model on every data split.
for split in ('train', 'validation', 'test'):
    load_and_predict(prefix, split, output_type, model_dir)

compute_shap_values(train_prefix, explainers[model_name], model_dir)

Stochastic Optimizer: Maximum iterations (5000) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (5000) reached and the optimization hasn't converged yet.


Results saved at inst/extdata/model/m_chat_f/dnn/sample_size_estimation.csv
Scaler saved at inst/extdata/model/m_chat_f/dnn/scaler.joblib


Using 301 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Model saved at inst/extdata/model/m_chat_f/dnn/model.joblib
Results saved at inst/extdata/model/m_chat_f/dnn/train_prob.csv
Results saved at inst/extdata/model/m_chat_f/dnn/validation_prob.csv
Results saved at inst/extdata/model/m_chat_f/dnn/test_prob.csv


 63%|███████████████▊         | 190/301 [02:34<01:25,  1.29it/s]

# Model deployment

In [6]:
import os
import shutil

# Deployment settings: which trained model to promote and where to put it.
best_model = 'rf'
model_name = 'RandomForest'
prefix = 'depmod_data'
model_dir = 'best_model'
output_type = 'probability'

# Scaler and model artefacts: where they are read from and copied to.
# NOTE(review): the training cells report saving under
# inst/extdata/model/<questionnaire>/rf/, not inst/extdata/rf/ -- confirm
# this source directory is still the intended one.
source_path = f'inst/extdata/{best_model}/scaler.joblib'
destination_path = f'inst/extdata/{model_dir}/scaler.joblib'
source_path2 = f'inst/extdata/{best_model}/model.joblib'
destination_path2 = f'inst/extdata/{model_dir}/model.joblib'

# Create the destination directory if needed (no error when it exists).
os.makedirs(os.path.dirname(destination_path), exist_ok=True)

# Copy the scaler first, then the model, reporting each copy.
for src, dst in ((source_path, destination_path), (source_path2, destination_path2)):
    shutil.copy(src, dst)
    print("File copied successfully from:", src, "to:", dst)

# NOTE(review): the training cells call load_and_predict with four positional
# arguments (prefix, split, output_type, model_dir); this call passes three.
# Verify against the current signature in utils.py.
load_and_predict(prefix, output_type, model_dir)
compute_shap_values(prefix, explainers[model_name], model_dir)

File copied successfully from: inst/extdata/rf/scaler.joblib to: inst/extdata/best_model/scaler.joblib
File copied successfully from: inst/extdata/rf/model.joblib to: inst/extdata/best_model/model.joblib
Results saved at inst/extdata/best_model/prob.csv
Results saved at inst/extdata/best_model/shap_values.csv
