In [13]:
import h2o
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator as gbm
from h2o.grid.grid_search import H2OGridSearch
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import log_loss
import shutil
import os

In [72]:
h2o.init()
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,10 mins 42 secs
H2O cluster version:,3.10.1.2
H2O cluster version age:,2 months and 5 days
H2O cluster name:,H2O_from_python_hamelhusain_a536si
H2O cluster total nodes:,1
H2O cluster free memory:,3.418 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


## Read in Data

In [73]:
df = pd.read_csv('/Users/hamelhusain/Dropbox/zidisha/Regression_Dataset_20170125.csv')
id_fields = ['borrower_id', 'category_id', 'id']
features = [c for c in df.columns if c not in id_fields]

In [74]:
columns = ['id', 'borrower_id', 'length(loans.proposal)', 'length(about_me)',
       'length(about_business)', 'length(address_instructions)',
       'missing_referred_by', 'application_time', 
       'friends_count', 'category_id', 'invited_flag', 'english_flag',
       'usd_installment_amount', 'reserve_fee_pct', 'ip_user_count',
       'parent_age_at_invite', 'parent_comment_cnt',
       'parent_avg_char_per_comment', 'parent_repay_ratio',
       'parent_usd_total_amount', 'parent_invited_count', 'peer_repay_ratio',
       'peer_usd_total_amount', 'peer_network_comments_by',
       'peer_network_avg_char_per_comment_by', 'peer_network_comments_about',
       'peer_network_avg_char_per_comment_about', 'loan_usage', 'Prior Loans', 'default_flag']

In [75]:
#[str(x) for x in df[columns].dtypes.values]

In [76]:
col_types = ['int',
 'int',
 'int',
 'real',
 'real',
 'int',
 'int',
 'real',
 'real',
 'enum',
 'int',
 'int',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'real',
 'enum',
 'int',
 'enum']

## Upload Data To H2O

In [78]:
HF = h2o.H2OFrame()
HDF = HF.from_python(df[columns], column_types = col_types)
hgbm = gbm(nfolds=5, distribution='bernoulli')
hyper_params = {'max_depth': [20], 'min_rows': [20, 40], 'ntrees':[30]}
grid = H2OGridSearch(hgbm, hyper_params, grid_id='zidisha')

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Split Train & Test

In [79]:
test, train = HDF.split_frame([.2])
print('test set shape:', test.shape)
print('train set shape:', train.shape)

test set shape: (6199, 30)
train set shape: (24941, 30)


## Train Model And Evaluate Results

In [80]:
grid.train(x=features, y='default_flag', training_frame=train)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [81]:
grid_results = grid.get_grid(sort_by='logloss', decreasing=False)
grid_results.sorted_metric_table()

Unnamed: 0,Unnamed: 1,max_depth,min_rows,ntrees,model_ids,logloss
0,,20,20.0,30,zidisha_model_0,0.540005999122871
1,,20,40.0,30,zidisha_model_1,0.5406877713183903


In [82]:
best_model = grid_results[0]
best_model.coef_norm

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  zidisha_model_0


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.08644091055272848
RMSE: 0.29400835116154184
LogLoss: 0.31400952272797483
Mean Per-Class Error: 0.07256562187802129
AUC: 0.9809309558843343
Gini: 0.9618619117686686
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.42417330765549455: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,12682.0,987.0,0.0722,(987.0/13669.0)
1,822.0,10450.0,0.0729,(822.0/11272.0)
Total,13504.0,11437.0,0.0725,(1809.0/24941.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4241733,0.9203400,219.0
max f2,0.3497008,0.9415003,248.0
max f0point5,0.5077721,0.9359630,187.0
max accuracy,0.4380500,0.9279099,214.0
max precision,0.9703898,1.0,0.0
max recall,0.1648001,1.0,338.0
max specificity,0.9703898,1.0,0.0
max absolute_mcc,0.4380500,0.8544043,214.0
max min_per_class_accuracy,0.4241733,0.9270759,219.0


Gains/Lift Table: Avg response rate: 45.19 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100237,0.9651718,2.2126508,2.2126508,1.0,1.0,0.0221789,0.0221789,121.2650816,121.2650816
,2,0.0200072,0.9599268,2.2126508,2.2126508,1.0,1.0,0.0220901,0.0442690,121.2650816,121.2650816
,3,0.0300309,0.9540751,2.2126508,2.2126508,1.0,1.0,0.0221789,0.0664478,121.2650816,121.2650816
,4,0.0400144,0.9471467,2.2126508,2.2126508,1.0,1.0,0.0220901,0.0885380,121.2650816,121.2650816
,5,0.0500381,0.9398124,2.2126508,2.2126508,1.0,1.0,0.0221789,0.1107168,121.2650816,121.2650816
,6,0.1000361,0.8989975,2.2126508,2.2126508,1.0,1.0,0.1106281,0.2213449,121.2650816,121.2650816
,7,0.1500341,0.8459147,2.2108764,2.2120595,0.9991981,0.9997328,0.1105394,0.3318843,121.0876437,121.2059515
,8,0.2000321,0.7861604,2.2091021,2.2113203,0.9983962,0.9993987,0.1104507,0.4423350,120.9102058,121.1320299
,9,0.3000281,0.6530078,2.1585323,2.1937266,0.9755413,0.9914473,0.2158446,0.6581796,115.8532252,119.3726634




ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.18222955499303123
RMSE: 0.4268835379738029
LogLoss: 0.540005999122871
Mean Per-Class Error: 0.2837686194603726
AUC: 0.7908942464392211
Gini: 0.5817884928784423
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.36263722649996155: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,8859.0,4810.0,0.3519,(4810.0/13669.0)
1,2686.0,8586.0,0.2383,(2686.0/11272.0)
Total,11545.0,13396.0,0.3005,(7496.0/24941.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3626372,0.6961245,238.0
max f2,0.1383406,0.8153885,357.0
max f0point5,0.6013751,0.7190933,138.0
max accuracy,0.5366292,0.7265547,162.0
max precision,0.9715001,1.0,0.0
max recall,0.0412492,1.0,396.0
max specificity,0.9715001,1.0,0.0
max absolute_mcc,0.5783136,0.4483149,147.0
max min_per_class_accuracy,0.4060194,0.7106991,218.0


Gains/Lift Table: Avg response rate: 45.19 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100237,0.9645713,2.2038002,2.2038002,0.996,0.996,0.0220901,0.0220901,120.3800213,120.3800213
,2,0.0200072,0.9583768,2.1859924,2.1949141,0.9879518,0.9919840,0.0218240,0.0439141,118.5992373,119.4914136
,3,0.0300309,0.9510550,2.1949496,2.1949260,0.992,0.9919893,0.0220014,0.0659155,119.4949610,119.4925977
,4,0.0400144,0.9417037,2.1415616,2.1816116,0.9678715,0.9859719,0.0213804,0.0872960,114.1561633,118.1611626
,5,0.0500381,0.9313735,2.1241448,2.1700998,0.96,0.9807692,0.0212917,0.1085877,112.4144784,117.0099839
,6,0.1000361,0.8755742,2.0227922,2.0964756,0.9141941,0.9474950,0.1011356,0.2097232,102.2792246,109.6475563
,7,0.1500341,0.8023088,1.8684213,2.0204778,0.8444266,0.9131480,0.0934173,0.3031405,86.8421259,102.0477776
,8,0.2000321,0.7311753,1.6200082,1.9203804,0.7321572,0.8679094,0.0809972,0.3841377,62.0008176,92.0380444
,9,0.3000281,0.5893090,1.4141802,1.7516696,0.6391339,0.7916611,0.1414123,0.5255500,41.4180193,75.1669576



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.6944594,0.0080561,0.685479,0.6991484,0.706278,0.7044502,0.6769416
auc,0.7911798,0.0070518,0.8082987,0.7812758,0.7948132,0.7816272,0.7898843
err,0.3055406,0.0080561,0.3145210,0.3008516,0.2937220,0.2955498,0.3230585
err_count,1524.2,43.158775,1566.0,1519.0,1441.0,1481.0,1614.0
f0point5,0.656682,0.0085493,0.6390042,0.6558023,0.6728051,0.6666944,0.6491041
f1,0.6980237,0.0074751,0.7023945,0.6868687,0.7096514,0.6840196,0.7071843
f2,0.7460976,0.0215381,0.7797468,0.7210249,0.7507674,0.7022693,0.7766797
lift_top_group,2.2046375,0.0326171,2.2219582,2.2600715,2.1756098,2.235058,2.1304903
logloss,0.5399684,0.0070965,0.5210699,0.546563,0.5383896,0.5483429,0.5454763


Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-02-19 15:59:53,32.152 sec,0.0,0.4976855,0.6885218,0.5,1.0,0.5480534
,2017-02-19 15:59:54,32.348 sec,1.0,0.4804385,0.6544471,0.8445332,2.2126508,0.2638627
,2017-02-19 15:59:54,32.490 sec,2.0,0.4652605,0.6252141,0.8659026,2.2126508,0.2351149
,2017-02-19 15:59:54,32.800 sec,3.0,0.4512631,0.5987212,0.8865311,2.2126508,0.2143860
,2017-02-19 15:59:54,32.982 sec,4.0,0.4387141,0.5752094,0.8983172,2.2126508,0.1971052
,2017-02-19 15:59:55,33.174 sec,5.0,0.4273872,0.5540918,0.9070973,2.2126508,0.1855980
,2017-02-19 15:59:55,33.400 sec,6.0,0.4170298,0.5348353,0.9140598,2.2126508,0.1832725
,2017-02-19 15:59:55,33.623 sec,7.0,0.4073610,0.5169234,0.9207208,2.2126508,0.1711639
,2017-02-19 15:59:55,33.863 sec,8.0,0.3985763,0.5006357,0.9263059,2.2126508,0.1602983


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
parent_repay_ratio,3225.5710449,1.0,0.1551331
reserve_fee_pct,2195.3669434,0.6806134,0.1055857
usd_installment_amount,1578.3438721,0.4893223,0.0759101
length(address_instructions),1357.3443604,0.4208075,0.0652812
application_time,1273.7399902,0.3948882,0.0612602
---,---,---,---
missing_referred_by,182.2612610,0.0565051,0.0087658
peer_network_comments_by,180.2558136,0.0558834,0.0086694
peer_network_avg_char_per_comment_about,177.6522522,0.0550762,0.0085441



See the whole table with table.as_data_frame()


<bound method ModelBase.coef_norm of >

In [83]:
holdout_preds = best_model.predict(test)['p1']
holdout_eval = pd.concat([holdout_preds['p1'].as_data_frame(), test['default_flag'].as_data_frame()], axis = 1)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [84]:
print('holdout log-loss:', log_loss(holdout_eval.default_flag.values, holdout_eval.p1.values))

holdout log-loss: 0.534793141864


## Save Best Model To Disk

We must save path to disk, unfortunately I cannot specify the name of the file

In [85]:
#the path the model is to be saved on
model_save_path = '/Users/hamelhusain/Dropbox/zidisha/models/'
#before we save the model, clear the directory of previous model
shutil.rmtree(model_save_path)
#save the model to the saved path
saved_model = h2o.save_model(best_model, path = model_save_path)
print('Model is saved to this path: ', saved_model)

Model is saved to this path:  /Users/hamelhusain/Dropbox/zidisha/models/zidisha_model_0


## Load Model From Disk

In [86]:
#inspect the model directory to retrieve the filename of the model object
models = os.listdir(model_save_path)

#check to make sure there is only one model object in the directory, if there 
# is no model file or there is more than one model then raise an error.  
assert len(models) == 1, \
    ('Exactly one model must be present in path: {}\n Clear this directory, retrain the model and try again'.
     format(model_save_path))

In [87]:
#Construct the model's full path
full_model_path = model_save_path + models[0]

#print the full path for verfication and load model
print('loading model from: {}'.format(full_model_path))
final_model = h2o.load_model(full_model_path)

loading model from: /Users/hamelhusain/Dropbox/zidisha/models/zidisha_model_0


## Use Model To Make Prediction

In [88]:
#Load loans to be scored into H2o
df_predict = pd.read_csv('/Users/hamelhusain/Dropbox/zidisha/for_predictions.csv')
pred_cols = columns[:-1]
pred_col_types = col_types[:-1]

HDF_predict = HF.from_python(df_predict, column_types=pred_col_types)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [89]:
#Make predictions on new loans
predictions = HDF_predict[id_fields].concat(final_model.predict(HDF_predict)['p1'].set_names(['Default_Prediction']))
predictions

gbm prediction progress: |████████████████████████████████████████████████| 100%


borrower_id,category_id,id,Default_Prediction
15031,,4863,0.197912
15031,,5687,0.15878
15057,,4895,0.227824
15057,9c,10181,0.212473
15057,9c,20720,0.184139
15057,9c,33386,0.247697
15059,,4859,0.545076




In [90]:
#Export data to csv
h2o.export_file(predictions, '/Users/hamelhusain/Dropbox/zidisha/model_predictions.csv', force = True)

Export File progress: |███████████████████████████████████████████████████| 100%


#  Part 2 Experimental - Text Features (Ignore This Section)

Extract text features from data frame and split into test vs. train set.  TODO:  use the same train/test split as above.

In [184]:
df_text = df[text_fields + ['default_flag']]
df_text.proposal_field[df.proposal_field.isnull()] = ''
df_text_train, df_text_test = tts(df_text, test_size = .2)
print('size of train', df_text_train.shape)
print('size of test', df_text_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


size of train (24912, 4)
size of test (6228, 4)


## Try two basic text vectorizers 
- Count Vectorizer (Bag of Words with frequency count)
- TFIDF

In [2]:
from sklearn.linear_model import ElasticNetCV
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from sklearn.feature_extraction.text import CountVectorizer as Cvec
enet = ElasticNetCV(l1_ratio = [.001, .01, .1, .5], cv = 5, normalize = True)
tfidf = TFIDF(stop_words = 'english', min_df=.05, max_df = .95, sublinear_tf = True, ngram_range=(1, 4))
cvec = Cvec(stop_words = 'english', min_df=.05, max_df = .95, ngram_range=(1, 4))

In [185]:
cvec_dat = cvec.fit_transform(df_text_train.proposal_field)
tfidf_data = tfidf.fit_transform(df_text_train.proposal_field)

In [186]:
enet.fit(cvec_dat, df_text_train.default_flag)
cvec_preds = enet.predict(cvec.transform(df_text_test.proposal_field))

ElasticNetCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
       l1_ratio=[0.001, 0.01, 0.1, 0.5], max_iter=1000, n_alphas=100,
       n_jobs=1, normalize=True, positive=False, precompute='auto',
       random_state=None, selection='cyclic', tol=0.0001, verbose=0)

In [189]:
print('log loss on holdout set (count vectorizer):', log_loss(df_text_test.default_flag, cvec_preds))

log loss on holdout set (count vectorizer): 0.679630228373


In [190]:
enet.fit(tfidf_data, df_text_train.default_flag)
tfidf_preds = enet.predict(tfidf.transform(df_text_test.proposal_field))

In [192]:
print('log loss on holdout set (TFIDF vectorizer):', log_loss(df_text_test.default_flag, tfidf_preds))

log loss on holdout set (TFIDF vectorizer): 0.677721258792


## Scratch Work

In [178]:
>>> from sklearn import datasets, linear_model
>>> from sklearn.cross_validation import cross_val_predict
>>> diabetes = datasets.load_diabetes()
>>> X = diabetes.data[:150]
>>> y = diabetes.target[:150]
>>> lasso = linear_model.Lasso()
>>> y_pred = cross_val_predict(lasso, X, y)

In [182]:
y_pred

array([ 174.26933996,  117.6539241 ,  164.60228641,  155.65049088,
        132.68647979,  128.49511245,  120.76146877,  141.069413  ,
        164.18904498,  182.37394949,  111.04181265,  127.94311443,
        135.0869234 ,  162.83066014,  135.3573514 ,  157.64516523,
        178.95843326,  163.3919841 ,  143.85237903,  144.29748882,
        133.58117218,  124.77928571,  132.90918003,  208.52927   ,
        153.61908967,  154.16616341,  118.95351821,  163.50467541,
        145.89406196,  168.3308101 ,  155.87411031,  123.45960148,
        185.70459144,  133.38468582,  117.2789469 ,  150.27895019,
        174.1541028 ,  160.03235091,  192.31389633,  161.58568256,
        154.2224809 ,  119.35517679,  146.15706413,  133.82056934,
        179.68118754,  137.96619936,  146.07788398,  126.77579723,
        123.32101099,  166.26710247,  146.41559964,  161.67261029,
        147.47731459,  138.44595305,  144.85421048,  113.77990664,
        185.54970402,  115.31624749,  142.23672103,  171.07792

In [193]:
!pwd

/Users/hamel_husain/Dropbox/zidisha
