# Register a Python Pipeline with PZMM

Includes Preprocessing

# Import Packages

In [1]:
import swat
from sasctl import Session
import sasctl.pzmm as pzmm

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import random
import os
import shutil
import sys
import pickle
import json

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  method='lar', copy_X=True, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes

# Get Key Variables

In [2]:
os.chdir("/".join(os.getcwd().split("\\")[:-1]))
from password import wd, hostname, port, username, password, protocol

# Load Data

In [3]:
df = pd.read_csv(wd+"Data/"+"hmeq.csv")

# View Data

In [4]:
df.shape

(5960, 13)

In [5]:
df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,


In [6]:
df.describe()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
count,5960.0,5960.0,5442.0,5848.0,5445.0,5252.0,5380.0,5652.0,5450.0,5738.0,4693.0
mean,0.199497,18607.969799,73760.8172,101776.048741,8.922268,0.25457,0.449442,179.766275,1.186055,21.296096,33.779915
std,0.399656,11207.480417,44457.609458,57385.775334,7.573982,0.846047,1.127266,85.810092,1.728675,10.138933,8.601746
min,0.0,1100.0,2063.0,8000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.524499
25%,0.0,11100.0,46276.0,66075.5,3.0,0.0,0.0,115.116702,0.0,15.0,29.140031
50%,0.0,16300.0,65019.0,89235.5,7.0,0.0,0.0,173.466667,1.0,20.0,34.818262
75%,0.0,23300.0,91488.0,119824.25,13.0,0.0,0.0,231.562278,2.0,26.0,39.003141
max,1.0,89900.0,399550.0,855909.0,41.0,10.0,15.0,1168.233561,17.0,71.0,203.312149


In [7]:
df["BAD"].value_counts()

0    4771
1    1189
Name: BAD, dtype: int64

In [8]:
df["JOB"].value_counts()

Other      2388
ProfExe    1276
Office      948
Mgr         767
Self        193
Sales       109
Name: JOB, dtype: int64

In [9]:
df["REASON"].value_counts()

DebtCon    3928
HomeImp    1780
Name: REASON, dtype: int64

In [10]:
df.isnull().sum(axis=0)

BAD           0
LOAN          0
MORTDUE     518
VALUE       112
REASON      252
JOB         279
YOJ         515
DEROG       708
DELINQ      580
CLAGE       308
NINQ        510
CLNO        222
DEBTINC    1267
dtype: int64

# Preprocessing Pipeline

In [11]:
num_cols = ["LOAN","MORTDUE","VALUE","YOJ","DEROG","DELINQ","CLAGE","NINQ","CLNO","DEBTINC"]
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

In [12]:
cat_cols = ["JOB","REASON"]
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)

In [14]:
target = "BAD"
inputs = num_cols + cat_cols

# Partition Data

In [15]:
train_pct = 0.70
valid_pct = 0.20
test_pct = 0.10
nrows = len(df)
myseq = list(range(nrows))

random.seed(802)
train_index = random.sample(myseq,round(nrows*train_pct))
valid_test_index = list(set(myseq) - set(train_index))

valid_index = random.sample(valid_test_index,round(nrows*valid_pct))
test_index = list(set(valid_test_index) - set(valid_index))

df_train = df.iloc[train_index]
print("Training Data Shape =",df_train.shape)
df_valid = df.iloc[valid_index]
print("Valid Data Shape     =",df_valid.shape)
df_test = df.iloc[test_index]
print("Test Data Shape     =",df_test.shape)

Training Data Shape = (4172, 13)
Valid Data Shape     = (1192, 13)
Test Data Shape     = (596, 13)


# Build Model

In [16]:
lr = LogisticRegression(random_state=802)
pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', lr)])

pipeline_lr

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [17]:
pipeline_lr.fit(df_train[num_cols+cat_cols],df_train[target])



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [18]:
coefs = pipeline_lr["classifier"].coef_.flatten()
coef = pd.DataFrame(zip(num_cols+cat_cols, coefs), columns=["feature", "coef"])
coef["abs_coef"] = coef["coef"].apply(lambda x: abs(x))
coef = coef.sort_values("abs_coef", ascending=False)
coef = coef.drop(columns='abs_coef')
coef

Unnamed: 0,feature,coef
9,DEBTINC,0.014405
6,CLAGE,-0.008076
5,DELINQ,0.005688
8,CLNO,0.005547
7,NINQ,0.003814
4,DEROG,0.003271
3,YOJ,-0.002838
11,REASON,-0.000519
10,JOB,0.000206
0,LOAN,-2.7e-05


# Score Data

In [19]:
lr_score_train = pipeline_lr.score(X=df_train[inputs], y=df_train[target])
print("Logistic Regression Train Accuracy =", round(lr_score_train,4))

lr_score_valid = pipeline_lr.score(X=df_valid[inputs], y=df_valid[target])
print("Logistic Regression Valid Accuracy =", round(lr_score_valid,4))

lr_score_test = pipeline_lr.score(X=df_test[inputs], y=df_test[target])
print("Logistic Regression Test Accuracy =", round(lr_score_test,4))

Logistic Regression Train Accuracy = 0.8058
Logistic Regression Valid Accuracy = 0.7752
Logistic Regression Test Accuracy = 0.8188


In [20]:
lr_auc_train = roc_auc_score(y_true=df_train[target], y_score=pipeline_lr.predict_proba(df_train[inputs])[:, 1])
print("Logistic Regression Train ROC =", round(lr_auc_train,4))

lr_auc_valid = roc_auc_score(y_true=df_valid[target], y_score=pipeline_lr.predict_proba(df_valid[inputs])[:, 1])
print("Logistic Regression Valid ROC =", round(lr_auc_valid,4))

lr_auc_test = roc_auc_score(y_true=df_test[target], y_score=pipeline_lr.predict_proba(df_test[inputs])[:, 1])
print("Logistic Regression test ROC =", round(lr_auc_test,4))

Logistic Regression Train ROC = 0.6734
Logistic Regression Valid ROC = 0.6342
Logistic Regression test ROC = 0.6739


# Register Model

In [21]:
# sess = Session(hostname, username, password)
# conn = sess.as_swat()

# Create Metadata Directory

In [22]:
output_dir = os.getcwd()+"\\Model_Manager\\Metadata"
model_name = "Python_Sklearn_LR_Pipeline"
data_name = "HMEQ"
zip_folder = output_dir +"\\"+ data_name +"_"+ model_name

In [23]:
if os.path.exists(zip_folder):
    shutil.rmtree(zip_folder)

os.makedirs(zip_folder)

# Define Variables

In [24]:
project_name = "HMEQ_Pipeline"
project_name = "MM_OS_Test"
metric_labels = ['EM_EVENTPROBABILITY', 'EM_CLASSIFICATION']
output_data = pd.DataFrame(columns=metric_labels, data=[[0.5, 'A']])
model_type = "Gradient Boosting"
target_event_level = 1
target_levels = 2
predict_syntax = "predict_proba"
predict_method = str('{}.')+str(predict_syntax)+str('({})')

# Create Metadata

In [25]:
pzmm.PickleModel.pickleTrainedModel(_, 
                                    trainedModel = pipeline_lr, 
                                    modelPrefix = model_name, 
                                    pPath = zip_folder)

pzmm.JSONFiles().writeVarJSON(inputData = df_train[inputs], 
                              isInput = True, 
                              jPath = zip_folder)

pzmm.JSONFiles().writeVarJSON(inputData = output_data, 
                              isInput = False, 
                              jPath = zip_folder)

Model Python_Sklearn_LR_Pipeline was successfully pickled and saved to C:\Users\jobake\FSBU\Model_Manager\Metadata\HMEQ_Python_Sklearn_LR_Pipeline\Python_Sklearn_LR_Pipeline.pickle.
inputVar.json was successfully written and saved to C:\Users\jobake\FSBU\Model_Manager\Metadata\HMEQ_Python_Sklearn_LR_Pipeline\inputVar.json
outputVar.json was successfully written and saved to C:\Users\jobake\FSBU\Model_Manager\Metadata\HMEQ_Python_Sklearn_LR_Pipeline\outputVar.json


In [26]:
train_proba = pipeline_lr.predict_proba(df_train[inputs])
df_train_obs_preds = pd.concat([df_train[target].reset_index(drop=True), pd.Series(data=train_proba[:,1])], axis=1)
df_train_obs_preds.columns = [target, "P_"+target+str(1)]

valid_proba = pipeline_lr.predict_proba(df_valid[inputs])
df_valid_obs_preds = pd.concat([df_valid[target].reset_index(drop=True), pd.Series(data=valid_proba[:,1])], axis=1)
df_valid_obs_preds.columns = [target, "P_"+target+str(1)]

test_proba = pipeline_lr.predict_proba(df_test[inputs])
df_test_obs_preds = pd.concat([df_test[target].reset_index(drop=True), pd.Series(data=test_proba[:,1])], axis=1)
df_test_obs_preds.columns = [target, "P_"+target+str(1)]

In [27]:
pzmm.JSONFiles().calculateFitStat(trainData = df_train_obs_preds, 
                                  validateData = df_valid_obs_preds, 
                                  testData = df_test_obs_preds, 
                                  jPath = zip_folder)

pzmm.JSONFiles().generateROCLiftStat(targetName = target, 
                                     targetValue = target_event_level, 
                                     swatConn = conn, 
                                     trainData = df_train_obs_preds, 
                                     validateData = df_valid_obs_preds, 
                                     testData = df_test_obs_preds, 
                                     jPath = zip_folder)

dmcas_fitstat.json was successfully written and saved to C:\Users\jobake\FSBU\Model_Manager\Metadata\HMEQ_Python_Sklearn_LR_Pipeline\dmcas_fitstat.json
NOTE: Added action set 'percentile'.
NOTE: Cloud Analytic Services made the uploaded file available as table SCOREDVALUES in caslib CASUSER(jobake).
NOTE: The table SCOREDVALUES has been created in caslib CASUSER(jobake) from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table SCOREDVALUES in caslib CASUSER(jobake).
NOTE: The table SCOREDVALUES has been created in caslib CASUSER(jobake) from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table SCOREDVALUES in caslib CASUSER(jobake).
NOTE: The table SCOREDVALUES has been created in caslib CASUSER(jobake) from binary data uploaded to Cloud Analytic Services.
dmcas_roc.json was successfully written and saved to C:\Users\jobake\FSBU\Model_Manager\Metadata\

In [28]:
pzmm.JSONFiles().writeFileMetadataJSON(modelPrefix = model_name, 
                                       jPath = zip_folder)

pzmm.JSONFiles().writeModelPropertiesJSON(modelName = model_name, 
                                          modelDesc = model_name,
                                          targetVariable = target,
                                          modelType = model_type,
                                          modelPredictors = inputs,
                                          targetEvent = target_event_level,
                                          numTargetCategories = target_levels,
                                          eventProbVar = metric_labels[0],
                                          jPath = zip_folder,
                                          modeler = username)

fileMetaData.json was successfully written and saved to C:\Users\jobake\FSBU\Model_Manager\Metadata\HMEQ_Python_Sklearn_LR_Pipeline\fileMetaData.json
ModelProperties.json was successfully written and saved to C:\Users\jobake\FSBU\Model_Manager\Metadata\HMEQ_Python_Sklearn_LR_Pipeline\ModelProperties.json


In [29]:
# pzmm.ImportModel().pzmmImportModel(zPath = zip_folder, 
#                                    modelPrefix = model_name, 
#                                    project = project_name, 
#                                    inputDF = df_train[inputs], 
#                                    targetDF = df_train[target], 
#                                    predictmethod = predict_method, 
#                                    metrics = metric_labels, 
#                                    force = True)

# End Session

In [30]:
conn.session.endSession()