# Register R Model with SASCTL

Excludes Preprocessing

# Data

| Name      | Model Role | Measurement Level | Description                                                            |
|:----------|:-----------|:------------------|:-----------------------------------------------------------------------|
| BAD       | Target     | Binary            | 1 = applicant defaulted on loan or delinquent, 0 = applicant paid loan |
| CLAGE     | Input      | Interval          | Age of oldest credit line in months                                    |
| CLNO      | Input      | Interval          | Number of credit lines                                                 |
| DEBTINC   | Input      | Interval          | Debt-to-income ratio                                                   |
| DELINQ    | Input      | Interval          | Number of delinquent credit lines                                      |
| DEROG     | Input      | Interval          | Number of derogatory reports                                           |
| JOB       | Input      | Nominal           | Occupational categories                                                |
| LOAN      | Input      | Interval          | Amount of loan request                                                 |
| MORTDUE   | Input      | Interval          | Amount due on existing mortgage                                        |
| NINQ      | Input      | Interval          | Number of recent credit inquiries                                      |
| REASON    | Input      | Binary            | DebtCon = debt consolidation, HomeImp = home improvement               |
| VALUE     | Input      | Interval          | Value of current property                                              |
| YOJ       | Input      | Interval          | Years at present job                                                   |

# Load Packages

In [1]:
library(sasctl)
library(pmml)
library(XML)
library(zip)

Loading required package: XML


Attaching package: 'zip'


The following objects are masked from 'package:utils':

    unzip, zip




# Load Data

In [2]:
df = read.csv(paste0(dirname(getwd()),"/Data/hmeq.csv"))

# View Data

In [3]:
dim(df)

In [4]:
head(df)

Unnamed: 0_level_0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
Unnamed: 0_level_1,<int>,<int>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<dbl>
1,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.36667,1.0,9.0,
2,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.83333,0.0,14.0,
3,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.46667,1.0,10.0,
4,1,1500,,,,,,,,,,,
5,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.33333,0.0,14.0,
6,1,1700,30548.0,40320.0,HomeImp,Other,9.0,0.0,0.0,101.466,1.0,8.0,37.11361


In [5]:
summary(df)

      BAD              LOAN          MORTDUE           VALUE       
 Min.   :0.0000   Min.   : 1100   Min.   :  2063   Min.   :  8000  
 1st Qu.:0.0000   1st Qu.:11100   1st Qu.: 46276   1st Qu.: 66076  
 Median :0.0000   Median :16300   Median : 65019   Median : 89236  
 Mean   :0.1995   Mean   :18608   Mean   : 73761   Mean   :101776  
 3rd Qu.:0.0000   3rd Qu.:23300   3rd Qu.: 91488   3rd Qu.:119824  
 Max.   :1.0000   Max.   :89900   Max.   :399550   Max.   :855909  
                                  NA's   :518      NA's   :112     
    REASON              JOB                 YOJ             DEROG        
 Length:5960        Length:5960        Min.   : 0.000   Min.   : 0.0000  
 Class :character   Class :character   1st Qu.: 3.000   1st Qu.: 0.0000  
 Mode  :character   Mode  :character   Median : 7.000   Median : 0.0000  
                                       Mean   : 8.922   Mean   : 0.2546  
                                       3rd Qu.:13.000   3rd Qu.: 0.0000  
            

In [6]:
table(df$BAD)


   0    1 
4771 1189 

In [7]:
table(df$JOB)


            Mgr  Office   Other ProfExe   Sales    Self 
    279     767     948    2388    1276     109     193 

In [8]:
table(df$REASON)


        DebtCon HomeImp 
    252    3928    1780 

# Drop Missing

In [9]:
df = df[complete.cases(df),]
dim(df)

# Get Variables

In [10]:
target = "BAD"
inputs = c("LOAN","MORTDUE","VALUE","YOJ","DEROG","DELINQ","CLAGE","NINQ","CLNO","DEBTINC")
df = df[c(target,inputs)]

# Partition Data

In [11]:
train_pct = 0.70
valid_pct = 0.20
test_pct = 0.10
nrows = nrow(df)
myseq = seq(nrows)

set.seed(802)
train_index = sample(myseq, round(nrows*train_pct))
valid_test_index = setdiff(myseq, train_index)

valid_index = sample(valid_test_index, round(nrows*valid_pct))
test_index = setdiff(valid_test_index, valid_index)

df_train = df[train_index, ]
cat("Training Data Shape =", dim(df_train), "\n")
df_valid = df[valid_index, ]
cat("Valid Data Shape =", dim(df_valid), "\n")
df_test = df[test_index, ]
cat("Test Data Shape =", dim(df_test), "\n")

Training Data Shape = 2460 11 
Valid Data Shape = 703 11 
Test Data Shape = 352 11 


# Build Model

In [12]:
lr = glm(BAD ~ ., df_train, family = "binomial")
summary(lr)
print("Odds Ratios")
exp(coef(lr))


Call:
glm(formula = BAD ~ ., family = "binomial", data = df_train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5150  -0.4028  -0.3035  -0.1996   3.5535  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -5.531e+00  5.239e-01 -10.556  < 2e-16 ***
LOAN        -1.844e-05  9.102e-06  -2.026 0.042803 *  
MORTDUE     -2.828e-06  4.099e-06  -0.690 0.490213    
VALUE        3.600e-06  3.497e-06   1.029 0.303282    
YOJ         -1.460e-02  1.164e-02  -1.255 0.209636    
DEROG        6.312e-01  1.188e-01   5.314 1.07e-07 ***
DELINQ       5.961e-01  7.901e-02   7.544 4.54e-14 ***
CLAGE       -4.051e-03  1.233e-03  -3.284 0.001023 ** 
NINQ         1.492e-01  4.267e-02   3.497 0.000471 ***
CLNO        -2.100e-02  9.549e-03  -2.199 0.027889 *  
DEBTINC      1.108e-01  1.260e-02   8.793  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance

[1] "Odds Ratios"


# Score Data

In [13]:
lr_probs_train = predict(lr, newdata = df_train, type = "response")
lr_preds_train = as.numeric(lr_probs_train > 0.5)
lr_score_train = sum(lr_preds_train==df_train[target]) / nrow(df_train)
cat("Logistic Regression Train Accuracy =", round(lr_score_train, 4), "\n")

lr_probs_valid = predict(lr, newdata = df_valid, type = "response")
lr_preds_valid = as.numeric(lr_probs_valid > 0.5)
lr_score_valid = sum(lr_preds_valid==df_valid[target]) / nrow(df_valid)
cat("Logistic Regression Valid Accuracy =", round(lr_score_valid, 4), "\n")

lr_probs_test = predict(lr, newdata = df_test, type = "response")
lr_preds_test = as.numeric(lr_probs_test > 0.5)
lr_score_test = sum(lr_preds_test==df_test[target]) / nrow(df_test)
cat("Logistic Regression Test Accuracy =", round(lr_score_test, 4), "\n")

Logistic Regression Train Accuracy = 0.9297 
Logistic Regression Valid Accuracy = 0.9289 
Logistic Regression Test Accuracy = 0.9205 


In [14]:
df_train_scored = data.frame(BAD=df_train$BAD, P_BAD1=lr_probs_train, P_BAD0=1-lr_probs_train, EM_CLASSIFICATION=lr_preds_train)
df_valid_scored = data.frame(BAD=df_valid$BAD, P_BAD1=lr_probs_valid, P_BAD0=1-lr_probs_valid, EM_CLASSIFICATION=lr_preds_valid)
df_test_scored  = data.frame(BAD=df_test$BAD, P_BAD1=lr_probs_test, P_BAD0=1-lr_probs_test, EM_CLASSIFICATION=lr_preds_test)
head(df_test_scored)

Unnamed: 0_level_0,BAD,P_BAD1,P_BAD0,EM_CLASSIFICATION
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>
27,0,0.054431441,0.9455686,0
103,0,0.007767325,0.9922327,0
126,0,0.064837052,0.9351629,0
341,0,0.006324597,0.9936754,0
416,0,0.174562803,0.8254372,0
439,0,0.007290371,0.9927096,0


# Create Metadata Directory

In [15]:
output_dir = paste0(dirname(getwd()),"/Model_Manager/Metadata")
model_name = "R_LR_Model"
data_name = "HMEQ"
zip_folder = paste0(output_dir, "/", data_name, "_", model_name)

In [16]:
if (file.exists(zip_folder)){
    unlink(zip_folder, recursive=TRUE) 
}

dir.create(zip_folder)

# Save R Model

In [17]:
saveRDS(lr, paste0(zip_folder, "/",model_name,".rds"))

# Get Sample R Score Code

In [18]:
create_scoreSample(zip_folder, openFile = FALSE)
file.show(paste0(zip_folder,"/scoreCode.R"), title=NULL)

Example file copied to C:/Users/jobake/FSBU/Model_Manager/Metadata/HMEQ_R_LR_Model/scoreCode.R



# Model Properties

In [19]:
write_ModelProperties_json(modelName = "R Logistic", 
                           modelDescription = "R model", 
                           modelFunction = "Classification",
                           trainTable = "HMEQ",
                           algorithm = "Logistic Regression",
                           numTargetCategories = 2,
                           targetEvent = "1",
                           targetVariable = "BAD",
                           eventProbVar = "P_BAD1",
                           modeler = "jobake",
                           tool = "R",
                           toolVersion = "default",
                           path = zip_folder)

[1] "File written to C:/Users/jobake/FSBU/Model_Manager/Metadata/HMEQ_R_LR_Model/ModelProperties.json"


# Model Variables

In [20]:
write_in_out_json(data=df_train[,-1], 
                  input=TRUE, 
                  path=zip_folder)

[1] "File written to C:/Users/jobake/FSBU/Model_Manager/Metadata/HMEQ_R_LR_Model/inputVar.json"


name,length,type,level,role
<chr>,<dbl>,<chr>,<chr>,<chr>
LOAN,8,decimal,interval,input
MORTDUE,8,decimal,interval,input
VALUE,8,decimal,interval,input
YOJ,8,decimal,interval,input
DEROG,8,decimal,interval,input
DELINQ,8,decimal,interval,input
CLAGE,8,decimal,interval,input
NINQ,8,decimal,interval,input
CLNO,8,decimal,interval,input
DEBTINC,8,decimal,interval,input


In [21]:
write_in_out_json(data=df_train_scored[,-1], 
                  input=FALSE, 
                  path=zip_folder)

[1] "File written to C:/Users/jobake/FSBU/Model_Manager/Metadata/HMEQ_R_LR_Model/outputVar.json"


name,length,type,level,role
<chr>,<dbl>,<chr>,<chr>,<chr>
P_BAD1,8,decimal,interval,output
P_BAD0,8,decimal,interval,output
EM_CLASSIFICATION,8,decimal,interval,output


# Write Metadata

In [22]:
write_fileMetadata_json(scoreCodeName = "scoreCode.R",
                        scoreResource = paste0(model_name,".rds"),
                        path = zip_folder)

[1] "File written to C:/Users/jobake/FSBU/Model_Manager/Metadata/HMEQ_R_LR_Model/fileMetadata.json"


# Diagnostics 

In [23]:
diag = diagnosticsJson(validadedf = df_valid_scored,
                       traindf = df_train_scored,
                       testdf = df_test_scored,
                       targetEventValue = 1,
                       targetName = "BAD",
                       path = zip_folder)

[1] "File written to C:/Users/jobake/FSBU/Model_Manager/Metadata/HMEQ_R_LR_Model/dmcas_lift.json"
[1] "File written to C:/Users/jobake/FSBU/Model_Manager/Metadata/HMEQ_R_LR_Model/dmcas_roc.json"
[1] "File written to C:/Users/jobake/FSBU/Model_Manager/Metadata/HMEQ_R_LR_Model/dmcas_fitstat.json"


# Zip Files

In [24]:
zip_files = list.files(zip_folder, full.names = T)
zipr(zipfile=paste0(zip_folder,"/",model_name,".zip"), files=zip_files)

# Register Model

In [25]:
df = read.csv(paste0(dirname(getwd()),"/password_r.txt"), header=TRUE, stringsAsFactors=FALSE)

In [26]:
sess = session(hostname=paste0("https://", strsplit(df$hostname,"/")[[1]][1]), username=df$username, password=df$password)

In [27]:
reg = register_model(
    session = sess,
    file = paste0(zip_folder,"/",model_name,".zip"),
    name = model_name,
    type = "zip",
    project = "MM_OS_Test",
    force = FALSE
)

# End Session

In [28]:
cas.session.endSession(conn)

ERROR: Error in cas.session.endSession(conn): could not find function "cas.session.endSession"
