Skip to content

Latest commit

 

History

History
407 lines (352 loc) · 17.6 KB

README.md

File metadata and controls

407 lines (352 loc) · 17.6 KB

Tutorial on Using FasterRisk Inside R

Author: Jiachang Liu

Date: Last Compiled on November 09, 2022

1 Installation

1.1 Install the reticulate Package

install.packages("reticulate", repos = "http://cran.us.r-project.org")
library(reticulate)

1.2 Create a Python Virtual Environment

version <- "3.9.12"
install_python(version)
virtualenv_create(envname="FasterRisk-environment", version = version)
use_virtualenv("FasterRisk-environment")

1.3 Install the FasterRisk Package from PyPI

py_install("fasterrisk", pip=TRUE, envname="FasterRisk-environment")

2 Preparation before Training

fasterrisk <- import("fasterrisk")

2.1 Download Sample Data

data_dir_path <- "../tests"
train_data_file_path <- "../tests/adult_train_data.csv"
test_data_file_path <- "../tests/adult_test_data.csv"

if (!file.exists(data_dir_path)){
    # create the data folder if it doesn't exist
    dir.create(file.path(data_dir_path))
}

if (!file.exists(train_data_file_path)){
    # download sample training data if it doesn't exists
    fasterrisk$utils$download_file_from_google_drive('1nuWn0QVG8tk3AN4I4f3abWLcFEP3WPec', train_data_file_path)
}

if (!file.exists(train_data_file_path)){
    # download sample test data if it doesn't exists
    fasterrisk$utils$download_file_from_google_drive('1TyBO02LiGfHbatPWU4nzc8AndtIF-7WH', test_data_file_path)
}

2.2 Read Sample Data

np <- import("numpy", convert=FALSE)
train_df <- read.csv(train_data_file_path)
train_data <- data.matrix(train_df)
X_train <- np$array(train_data[, 2:ncol(train_data)])
y_train <- np$array(train_data[, 1], dtype=np$int) # Check your data! y label must be +1/-1.

test_df <- read.csv(test_data_file_path)
test_data <- data.matrix(test_df)
X_test <- np$array(test_data[, 2:ncol(test_data)])
y_test <- np$array(test_data[, 1], dtype=np$int) # Check your data! y label must be +1/-1.

3 Training the Model

3.1 Create A Model Class

sparsity <- as.integer(5)
parent_size <- as.integer(10)

RiskScoreOptimizer_m <- fasterrisk$fasterrisk$RiskScoreOptimizer(X = X_train, y = y_train, k = sparsity, parent_size = parent_size)

3.2 Train the Model

start_time <- Sys.time()
RiskScoreOptimizer_m$optimize()
sprintf("Optimization takes %f seconds.", Sys.time() - start_time)
## [1] "Optimization takes 13.313264 seconds."

4 Get Risk Score Models

4.1 Get Solutions from the Trained Model Class

solutions = RiskScoreOptimizer_m$get_models()
multipliers = solutions[1][[1]]
sparseDiversePool_beta0_integer = solutions[2][[1]]
sparseDiversePool_betas_integer = solutions[3][[1]]
sprintf("We generate %d risk score models from the sparse diverse pool", length(multipliers))
## [1] "We generate 50 risk score models from the sparse diverse pool"

4.2 Access the First Risk Score Model

model_index = 1 # first model
multiplier = multipliers[model_index]
intercept = sparseDiversePool_beta0_integer[model_index]
coefficients = np$array(sparseDiversePool_betas_integer[model_index, ]) # each row of sparseDiversePool_betas_integer corresponds to a solution of coefficients

4.3 Use the First Risk Score Model to Do Prediction

RiskScoreClassifier_m = fasterrisk$fasterrisk$RiskScoreClassifier(multiplier, intercept, coefficients)
y_test_pred = RiskScoreClassifier_m$predict(X_test)
print("y_test are predicted to be (first 10 values):")
## [1] "y_test are predicted to be (first 10 values):"
y_test_pred[1:10]
##  [1] -1 -1 -1 -1 -1 -1 -1  1 -1 -1
y_test_pred_prob = RiskScoreClassifier_m$predict_prob(X_test)
print("The risk probabilities of having y_test to be +1 are (first 10 values):")
## [1] "The risk probabilities of having y_test to be +1 are (first 10 values):"
y_test_pred_prob[1:10]
##  [1] 0.13308868 0.34872682 0.34872682 0.04216029 0.13308868 0.34872682
##  [7] 0.04216029 0.65127318 0.34872682 0.01246260

4.4 Print the First Model Card

X_featureNames = list(colnames(train_df)[-1])[[1]] # extract the column names from the dataframe. Feature names exclude the first column name because it is the name for the y label

RiskScoreClassifier_m$reset_featureNames(X_featureNames)
tmp_str = py_capture_output(RiskScoreClassifier_m$print_model_card(), type = c("stdout", "stderr")) # capture the python standard output and print it inside R
cat(tmp_str)
## The Risk Score is:
## 1.            Age_22_to_29     -2 point(s) |   ...
## 2.               HSDiploma     -2 point(s) | + ...
## 3.                    NoHS     -4 point(s) | + ...
## 4.                 Married      4 point(s) | + ...
## 5.         AnyCapitalGains      3 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -8.0  |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |
## RISK  |   0.1% |   0.4% |   0.7% |   1.2% |   2.3% |   4.2% |   7.6% |
## SCORE |   0.0  |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |   7.0  |
## RISK  |  13.3% |  22.3% |  34.9% |  50.0% |  65.1% |  77.7% |  92.4% |

4.5 Print Top 10 Model Cards from the Pool and Their Performance Metrics

num_models = min(10, length(multipliers))

for (model_index in 1:num_models){
    multiplier = multipliers[model_index]
    intercept = sparseDiversePool_beta0_integer[model_index]
    coefficients = np$array(sparseDiversePool_betas_integer[model_index, ]) # each row of sparseDiversePool_betas_integer corresponds to a solution of coefficients

    RiskScoreClassifier_m = fasterrisk$fasterrisk$RiskScoreClassifier(multiplier, intercept, coefficients)
    RiskScoreClassifier_m$reset_featureNames(X_featureNames)
    tmp_str = py_capture_output(RiskScoreClassifier_m$print_model_card(), type = c("stdout", "stderr")) # capture the python standard output and print it inside R
    cat(tmp_str)

    train_loss = RiskScoreClassifier_m$compute_logisticLoss(X_train, y_train)
    train_results = RiskScoreClassifier_m$get_acc_and_auc(X_train, y_train)
    train_acc = train_results[1][[1]]
    train_auc = train_results[2][[1]]
    test_results = RiskScoreClassifier_m$get_acc_and_auc(X_test, y_test)
    test_acc = test_results[1][[1]]
    test_auc = test_results[2][[1]]

    tmp_str = sprintf("The logistic loss on the training set is %f", train_loss)
    print(tmp_str)
    tmp_str = sprintf("The training accuracy and AUC are %f and %f", train_acc*100, train_auc)
    print(tmp_str)
    tmp_str = sprintf("The test accuracy and AUC are are %f and %f", test_acc*100, test_auc)
    print(tmp_str)
    cat("\n")
}
## The Risk Score is:
## 1.            Age_22_to_29     -2 point(s) |   ...
## 2.               HSDiploma     -2 point(s) | + ...
## 3.                    NoHS     -4 point(s) | + ...
## 4.                 Married      4 point(s) | + ...
## 5.         AnyCapitalGains      3 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -8.0  |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |
## RISK  |   0.1% |   0.4% |   0.7% |   1.2% |   2.3% |   4.2% |   7.6% |
## SCORE |   0.0  |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |   7.0  |
## RISK  |  13.3% |  22.3% |  34.9% |  50.0% |  65.1% |  77.7% |  92.4% |
## 
## [1] "The logistic loss on the training set is 9798.652347"
## [1] "The training accuracy and AUC are 82.575147 and 0.861817"
## [1] "The test accuracy and AUC are are 81.787469 and 0.856367"
## 
## The Risk Score is:
## 1.               HSDiploma     -2 point(s) |   ...
## 2.                    NoHS     -4 point(s) | + ...
## 3.                 Married      4 point(s) | + ...
## 4.    WorkHrsPerWeek_lt_40     -2 point(s) | + ...
## 5.         AnyCapitalGains      3 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -8.0  |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |
## RISK  |   0.1% |   0.4% |   0.7% |   1.3% |   2.5% |   4.4% |   7.9% |
## SCORE |   0.0  |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |   7.0  |
## RISK  |  13.7% |  22.7% |  35.1% |  50.0% |  64.9% |  77.3% |  92.1% |
## 
## [1] "The logistic loss on the training set is 9859.615758"
## [1] "The training accuracy and AUC are 82.333295 and 0.859661"
## [1] "The test accuracy and AUC are are 81.848894 and 0.853752"
## 
## The Risk Score is:
## 1.               HSDiploma     -3 point(s) |   ...
## 2.                    NoHS     -5 point(s) | + ...
## 3.           JobManagerial      2 point(s) | + ...
## 4.                 Married      5 point(s) | + ...
## 5.         AnyCapitalGains      3 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -8.0  |  -6.0  |  -5.0  |  -3.0  |  -2.0  |  -1.0  |   0.0  |
## RISK  |   0.2% |   0.6% |   1.0% |   2.7% |   4.4% |   7.2% |  11.4% |
## SCORE |   2.0  |   3.0  |   4.0  |   5.0  |   7.0  |   8.0  |  10.0  |
## RISK  |  26.4% |  37.5% |  50.0% |  62.5% |  82.3% |  88.6% |  95.6% |
## 
## [1] "The logistic loss on the training set is 9883.324462"
## [1] "The training accuracy and AUC are 82.268033 and 0.859570"
## [1] "The test accuracy and AUC are are 81.511057 and 0.854237"
## 
## The Risk Score is:
## 1.               HSDiploma     -3 point(s) |   ...
## 2.                    NoHS     -5 point(s) | + ...
## 3.                 Married      5 point(s) | + ...
## 4.   WorkHrsPerWeek_geq_50      1 point(s) | + ...
## 5.         AnyCapitalGains      3 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -8.0  |  -7.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |   0.0  |
## RISK  |   0.2% |   0.3% |   0.9% |   1.5% |   2.5% |   4.1% |   6.8% |  10.9% |
## SCORE |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |   6.0  |   8.0  |   9.0  |
## RISK  |  17.2% |  25.9% |  37.2% |  50.0% |  62.8% |  74.1% |  89.1% |  93.2% |
## 
## [1] "The logistic loss on the training set is 9895.728068"
## [1] "The training accuracy and AUC are 82.179738 and 0.861227"
## [1] "The test accuracy and AUC are are 81.342138 and 0.856065"
## 
## The Risk Score is:
## 1.            Age_45_to_59      1 point(s) |   ...
## 2.               HSDiploma     -2 point(s) | + ...
## 3.                    NoHS     -5 point(s) | + ...
## 4.                 Married      4 point(s) | + ...
## 5.         AnyCapitalGains      3 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -7.0  |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |   0.0  |
## RISK  |   0.2% |   0.4% |   0.7% |   1.2% |   2.0% |   3.5% |   5.9% |   9.9% |
## SCORE |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |   6.0  |   7.0  |   8.0  |
## RISK  |  16.0% |  24.9% |  36.5% |  50.0% |  63.5% |  75.1% |  84.0% |  90.1% |
## 
## [1] "The logistic loss on the training set is 9914.759742"
## [1] "The training accuracy and AUC are 80.655687 and 0.862991"
## [1] "The test accuracy and AUC are are 80.052211 and 0.856397"
## 
## The Risk Score is:
## 1.               HSDiploma     -3 point(s) |   ...
## 2.                    NoHS     -5 point(s) | + ...
## 3.                 Married      5 point(s) | + ...
## 4.         AnyCapitalGains      3 point(s) | + ...
## 5.          AnyCapitalLoss      2 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -8.0  |  -6.0  |  -5.0  |  -3.0  |  -2.0  |  -1.0  |   0.0  |
## RISK  |   0.2% |   0.5% |   0.8% |   2.3% |   3.9% |   6.4% |  10.5% |
## SCORE |   2.0  |   3.0  |   4.0  |   5.0  |   7.0  |   8.0  |  10.0  |
## RISK  |  25.5% |  36.9% |  50.0% |  63.1% |  83.3% |  89.5% |  96.1% |
## 
## [1] "The logistic loss on the training set is 9923.881690"
## [1] "The training accuracy and AUC are 82.179738 and 0.857475"
## [1] "The test accuracy and AUC are are 81.342138 and 0.852014"
## 
## The Risk Score is:
## 1.               HSDiploma     -2 point(s) |   ...
## 2.             ProfVocOrAS     -1 point(s) | + ...
## 3.                    NoHS     -4 point(s) | + ...
## 4.                 Married      3 point(s) | + ...
## 5.         AnyCapitalGains      2 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -7.0  |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |
## RISK  |   0.1% |   0.1% |   0.3% |   0.8% |   1.7% |   3.7% |   8.0% |
## SCORE |   0.0  |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |
## RISK  |  16.4% |  30.7% |  50.0% |  69.3% |  83.6% |  92.0% |
## 
## [1] "The logistic loss on the training set is 9980.639484"
## [1] "The training accuracy and AUC are 82.172060 and 0.855975"
## [1] "The test accuracy and AUC are are 81.234644 and 0.849403"
## 
## The Risk Score is:
## 1.               HSDiploma     -2 point(s) |   ...
## 2.                    NoHS     -4 point(s) | + ...
## 3.                 Married      3 point(s) | + ...
## 4.            NeverMarried     -1 point(s) | + ...
## 5.         AnyCapitalGains      2 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -7.0  |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |
## RISK  |   0.1% |   0.3% |   0.6% |   1.2% |   2.4% |   4.9% |   9.8% |
## SCORE |   0.0  |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |
## RISK  |  18.6% |  32.3% |  50.0% |  67.7% |  81.4% |  90.2% |
## 
## [1] "The logistic loss on the training set is 9988.041002"
## [1] "The training accuracy and AUC are 82.179738 and 0.855476"
## [1] "The test accuracy and AUC are are 81.342138 and 0.849000"
## 
## The Risk Score is:
## 1.               HSDiploma     -2 point(s) |   ...
## 2.                    NoHS     -4 point(s) | + ...
## 3.                 Married      4 point(s) | + ...
## 4.     DivorcedOrSeparated      1 point(s) | + ...
## 5.         AnyCapitalGains      2 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |   0.0  |
## RISK  |   0.1% |   0.3% |   0.6% |   1.2% |   2.6% |   5.1% |  10.1% |
## SCORE |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |   6.0  |   7.0  |
## RISK  |  18.9% |  32.6% |  50.0% |  67.4% |  81.1% |  89.9% |  94.9% |
## 
## [1] "The logistic loss on the training set is 10000.803139"
## [1] "The training accuracy and AUC are 82.179738 and 0.854825"
## [1] "The test accuracy and AUC are are 81.342138 and 0.848115"
## 
## The Risk Score is:
## 1.               HSDiploma     -2 point(s) |   ...
## 2.                    NoHS     -4 point(s) | + ...
## 3.              JobService     -1 point(s) | + ...
## 4.                 Married      4 point(s) | + ...
## 5.         AnyCapitalGains      3 point(s) | + ...
##                                      SCORE | =    
## SCORE |  -7.0  |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |   0.0  |
## RISK  |   0.2% |   0.4% |   0.7% |   1.3% |   2.4% |   4.3% |   7.7% |  13.4% |
## SCORE |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |   6.0  |   7.0  |
## RISK  |  22.4% |  35.0% |  50.0% |  65.0% |  77.6% |  86.6% |  92.3% |
## 
## [1] "The logistic loss on the training set is 10000.838407"
## [1] "The training accuracy and AUC are 82.175899 and 0.857572"
## [1] "The test accuracy and AUC are are 81.464988 and 0.851669"