In [21]:
# imports
import h2o 
from h2o.estimators.gbm import H2OGradientBoostingEstimator 
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch

In [22]:
# start and connect to h2o server
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_112"; Java(TM) SE Runtime Environment (build 1.8.0_112-b16); Java HotSpot(TM) 64-Bit Server VM (build 25.112-b16, mixed mode)
  Starting server from /Users/phall/anaconda/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmp09rz5c2a
  JVM stdout: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmp09rz5c2a/h2o_phall_started_from_python.out
  JVM stderr: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmp09rz5c2a/h2o_phall_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster version:,3.10.3.4
H2O cluster version age:,23 days
H2O cluster name:,H2O_from_python_phall_4370tc
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [24]:
# location of "dirty" file
# decision trees handle dirty data elegantly
path = '/Users/phall/workspace/GWU_data_mining/02_analytical_data_prep/data/loan.csv'

In [25]:
# define input variable measurement levels 
# strings automatically parsed as enums (nominal)
# numbers automatically parsed as numeric
col_types = {'bad_loan': 'enum'}

In [26]:
frame = h2o.import_file(path=path, col_types=col_types) # multi-threaded import

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [27]:
frame.describe()

Rows:163987
Cols:16




Unnamed: 0,id,bad_loan,loan_amnt,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,longest_credit_length,verification_status,term_length
type,int,enum,int,real,int,enum,real,enum,enum,real,int,real,int,int,enum,int
mins,10001.0,,500.0,5.42,0.0,,1896.0,,,0.0,0.0,0.0,1.0,0.0,,36.0
mean,91994.0,,13073.209220415742,13.717143207254315,5.686200649105202,,71931.19588595249,,,15.880794152061497,0.22746713629788037,54.07622244747627,24.577910646698616,14.8582097058084,,40.980679245283056
maxs,173987.0,,35000.0,26.060000000000002,10.0,,7141778.0,,,39.93,29.0,150.70000000000002,118.0,65.0,,60.0
sigma,47339.11363414683,,7992.3993793601785,4.3935679462170425,3.610039811481059,,59464.026648950334,,,7.583636421364416,0.6949139713078192,25.284135504932134,11.685003948632696,6.949793041523766,,9.732920010298912
zeros,0,,0,0,13810,,0,,,263,135210,1515,0,11,,0
missing,0,0,4992,4854,10545,2571,4983,2488,2484,5025,4997,5154,4933,4907,2426,4987
0,10001.0,0,5000.0,10.65,10.0,RENT,24000.0,credit_card,AZ,27.650000000000002,0.0,83.7,9.0,26.0,verified,36.0
1,10002.0,1,2500.0,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,12.0,verified,60.0
2,10003.0,0,2400.0,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,10.0,not verified,36.0


In [28]:
# correct MORTGAGE/mortgage problem using gsub() and trim() functions
print(frame['home_ownership'].table())

frame['home_ownership'] = frame['home_ownership'].gsub(pattern='mortgage',
                                                       replacement='MORTGAGE')
frame['home_ownership'] = frame['home_ownership'].trim()

print(frame['home_ownership'].table())

home_ownership,Count
ANY,1
MORTGAGE,74209
NONE,30
OTHER,151
OWN,13369
RENT,69416
mortgage,4240





home_ownership,Count
ANY,1
MORTGAGE,78449
NONE,30
OTHER,151
OWN,13369
RENT,69416





In [30]:
# split into 40% training, 30% validation, and 30% test
train, valid, test = frame.split_frame([0.4, 0.3])

In [32]:
# assign target and inputs
y = 'bad_loan'
X = [name for name in frame.columns if name not in ['id', '_WARN_', y]]
print(y)
print(X)

bad_loan
['loan_amnt', 'int_rate', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'longest_credit_length', 'verification_status', 'term_length']


In [34]:
# set target to factor - for binary classification
train[y] = train[y].asfactor()
valid[y] = valid[y].asfactor()
test[y] = test[y].asfactor()

In [37]:
# random forest

# initialize rf model
rf_model = H2ORandomForestEstimator(
    ntrees=500,                      # Up to 500 decision trees in the forest 
    max_depth=30,                    # trees can grow to depth of 30
    stopping_rounds=5,               # stop after validation error does not decrease for 5 iterations/new trees
    score_each_iteration=True,       # score validation error on every iteration/new tree
    model_id='rf_model')             # for easy lookup in flow

# train rf model
rf_model.train(
    x=X,
    y=y,
    training_frame=train,
    validation_frame=valid)

# print model information
rf_model

# view detailed results at http://localhost:54321/flow/index.html

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [38]:
# measure rf AUC
print(rf_model.auc(train=True))
print(rf_model.auc(valid=True))
print(rf_model.model_performance(test_data=test).auc())

0.6516259568256889
0.6698227714296694
0.6665292847716544


In [39]:
# GBM with random hyperparameter search
# train many different GBM models with random hyperparameters
# and select best model based on validation error

# define random grid search parameters
hyper_parameters = {'ntrees':list(range(0, 500, 50)),
                    'max_depth':list(range(0, 20, 2)),
                    'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

# define search strategy
search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':20,
                   'max_runtime_secs':600}

# initialize grid search
gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

# execute training w/ grid search
gsearch.train(x=X,
              y=y,
              training_frame=train,
              validation_frame=valid)

# view detailed results at http://localhost:54321/flow/index.html

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [47]:
# show grid search results
gsearch.show()

# select best model
gbm_model = gsearch.get_grid()[0]

# print model information
gbm_model

     col_sample_rate max_depth ntrees sample_rate  \
0                0.4         4    100         0.9   
1                0.8         2    300         0.6   
2                0.4         4    100         0.4   
3                0.7         2     50         0.3   
4                0.3        10    250         0.9   
5                0.5         8      0         0.4   
6                0.4         4      0         0.6   
7                0.6        10    300         0.7   
8                0.7        10    150         0.3   
9                0.3        12    200         0.9   
10               1.0        10    450         0.7   
11               0.8        14    150         0.4   
12               0.2        12    450         0.2   
13               0.5        12    250         0.8   
14               0.5        18    150         0.5   
15               0.7        14    300         0.3   
16               1.0        16    300         0.3   
17               0.7        16    250         

0,1,2,3,4
,0.0,1.0,Error,Rate
0,38428.0,14523.0,0.2743,(14523.0/52951.0)
1,5101.0,7588.0,0.402,(5101.0/12689.0)
Total,43529.0,22111.0,0.299,(19624.0/65640.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2184950,0.4360920,223.0
max f2,0.1335923,0.5861598,304.0
max f0point5,0.3332156,0.4212968,137.0
max accuracy,0.4503907,0.8140311,70.0
max precision,0.7690441,1.0,0.0
max recall,0.0331168,1.0,397.0
max specificity,0.7690441,1.0,0.0
max absolute_mcc,0.2665027,0.2763134,183.0
max min_per_class_accuracy,0.1968138,0.6627000,242.0


Gains/Lift Table: Avg response rate: 19.33 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100091,0.5253878,3.7242339,3.7242339,0.7199391,0.7199391,0.0372764,0.0372764,272.4233876,272.4233876
,2,0.0200030,0.4816920,3.0754024,3.4000652,0.5945122,0.6572734,0.0307353,0.0680117,207.5402355,240.0065196
,3,0.0300122,0.4527065,2.8108911,3.2035741,0.5433790,0.6192893,0.0281346,0.0961463,181.0891107,220.3574142
,4,0.0400061,0.4311573,2.5155214,3.0316920,0.4862805,0.5860625,0.0251399,0.1212862,151.5521414,203.1691967
,5,0.05,0.4124502,2.4287793,2.9111829,0.4695122,0.5627666,0.0242730,0.1455591,142.8779296,191.1182914
,6,0.1,0.3510026,2.1530459,2.5321144,0.4162096,0.4894881,0.1076523,0.2532114,115.3045945,153.2114430
,7,0.15,0.3090377,1.7984081,2.2875456,0.3476539,0.4422100,0.0899204,0.3431318,79.8408070,128.7545643
,8,0.2,0.2786815,1.5966585,2.1148239,0.3086533,0.4088208,0.0798329,0.4229648,59.6658523,111.4823863
,9,0.3,0.2316795,1.3350146,1.8548874,0.2580743,0.3585720,0.1335015,0.5564662,33.5014580,85.4887435




ModelMetricsBinomial: gbm
** Reported on validation data. **

MSE: 0.14438109242042815
RMSE: 0.3799751207913857
LogLoss: 0.45479666998307405
Mean Per-Class Error: 0.361804064598096
AUC: 0.6872182674977086
Gini: 0.3744365349954173
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.2061165497069541: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,27012.0,12710.0,0.32,(12710.0/39722.0)
1,3827.0,5642.0,0.4042,(3827.0/9469.0)
Total,30839.0,18352.0,0.3362,(16537.0/49191.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2061165,0.4055929,231.0
max f2,0.1183479,0.5679719,318.0
max f0point5,0.3021100,0.3663097,151.0
max accuracy,0.5679614,0.8083796,22.0
max precision,0.7490839,1.0,0.0
max recall,0.0344285,1.0,396.0
max specificity,0.7490839,1.0,0.0
max absolute_mcc,0.2073513,0.2249614,230.0
max min_per_class_accuracy,0.1928456,0.6374553,243.0


Gains/Lift Table: Avg response rate: 19.25 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100018,0.5153739,2.7030644,2.7030644,0.5203252,0.5203252,0.0270356,0.0270356,170.3064428,170.3064428
,2,0.0200037,0.4739774,2.3968579,2.5499612,0.4613821,0.4908537,0.0239730,0.0510086,139.6857911,154.9961170
,3,0.0300055,0.4469212,2.2490341,2.4496521,0.4329268,0.4715447,0.0224945,0.0735030,124.9034075,144.9652138
,4,0.0400073,0.4257645,2.2807106,2.4074168,0.4390244,0.4634146,0.0228113,0.0963143,128.0710612,140.7416757
,5,0.0500091,0.4078517,2.0695337,2.3398401,0.3983740,0.4504065,0.0206991,0.1170134,106.9533703,133.9840146
,6,0.1000183,0.3475735,1.9407158,2.1402780,0.3735772,0.4119919,0.0970535,0.2140670,94.0715789,114.0277967
,7,0.1500071,0.3087966,1.7154538,1.9987083,0.3302155,0.3847405,0.0857535,0.2998205,71.5453836,99.8708305
,8,0.2000163,0.2786260,1.5035796,1.8749135,0.2894309,0.3609107,0.0751927,0.3750132,50.3579588,87.4913545
,9,0.3000142,0.2322707,1.3042825,1.6847161,0.2510673,0.3242987,0.1304256,0.5054388,30.4282508,68.4716088



Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_lift,validation_classification_error
,2017-02-26 20:37:23,10.034 sec,0.0,0.3948955,0.4909899,0.5,1.0,0.8066880,0.3942602,0.4898221,0.5,1.0,0.8075054
,2017-02-26 20:37:23,10.122 sec,1.0,0.3928354,0.4858534,0.6596175,2.1389207,0.3577544,0.3922891,0.4849089,0.6561771,1.9504438,0.3752922
,2017-02-26 20:37:23,10.183 sec,2.0,0.3912008,0.4818656,0.6687307,2.4841747,0.3488879,0.3907561,0.4811622,0.6631341,2.0483338,0.3775691
,2017-02-26 20:37:23,10.256 sec,3.0,0.3896667,0.4781717,0.6739042,2.6686031,0.3378123,0.3893532,0.4777577,0.6679402,2.3650170,0.4245492
,2017-02-26 20:37:23,10.341 sec,4.0,0.3883678,0.4751033,0.6779161,2.8651583,0.3988422,0.3881810,0.4749732,0.6707408,2.5568904,0.3773454
,2017-02-26 20:37:24,10.454 sec,5.0,0.3873668,0.4727509,0.6797287,2.7439309,0.3939671,0.3873226,0.4729324,0.6713976,2.5291213,0.4134903
,2017-02-26 20:37:24,10.594 sec,6.0,0.3865308,0.4708421,0.6798502,2.8387411,0.4138787,0.3865470,0.4711347,0.6722546,2.7664175,0.3990161
,2017-02-26 20:37:24,10.756 sec,7.0,0.3856835,0.4688257,0.6815973,2.9132485,0.4080134,0.3858120,0.4693569,0.6732273,2.7295510,0.4015775
,2017-02-26 20:37:24,10.936 sec,8.0,0.3849485,0.4671133,0.6831737,2.9044071,0.4015539,0.3852280,0.4679569,0.6738892,2.6659693,0.3878758


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
int_rate,2056.2578125,1.0,0.3764540
addr_state,818.2465820,0.3979300,0.1498023
annual_inc,542.6047363,0.2638797,0.0993386
term_length,524.0743408,0.2548680,0.0959461
dti,353.3314514,0.1718323,0.0646870
purpose,298.6519470,0.1452405,0.0546764
revol_util,279.2208252,0.1357908,0.0511190
loan_amnt,228.4065857,0.1110788,0.0418160
total_acc,131.3474579,0.0638769,0.0240467




In [48]:
# measure gbm AUC
print(gbm_model.auc(train=True))
print(gbm_model.auc(valid=True))
print(gbm_model.model_performance(test_data=test).auc())

0.7252310460262095
0.6872182674977086
0.6845183180496265


In [16]:
# shutdown h2o
h2o.cluster().shutdown(prompt=False)

H2O session _sid_805f closed.
