In [1]:
import h2o
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator as gbm
from h2o.grid.grid_search import H2OGridSearch
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import log_loss
import shutil
import os

In [2]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_73"; Java(TM) SE Runtime Environment (build 1.8.0_73-b02); Java HotSpot(TM) 64-Bit Server VM (build 25.73-b02, mixed mode)
  Starting server from /Users/hamelhusain/anaconda/envs/drpy3/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/0w/p50zlf7n6mbbxfprr0ycxr9c0000gn/T/tmpxsowm7dn
  JVM stdout: /var/folders/0w/p50zlf7n6mbbxfprr0ycxr9c0000gn/T/tmpxsowm7dn/h2o_hamelhusain_started_from_python.out
  JVM stderr: /var/folders/0w/p50zlf7n6mbbxfprr0ycxr9c0000gn/T/tmpxsowm7dn/h2o_hamelhusain_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,03 secs
H2O cluster version:,3.10.1.2
H2O cluster version age:,2 months and 5 days
H2O cluster name:,H2O_from_python_hamelhusain_kfg0vi
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


## Read in Data

In [3]:
df = pd.read_csv('/Users/hamelhusain/Dropbox/zidisha/Regression_Dataset_20170125.csv')
text_fields =  ['about_me_field', 'about_business_field', 'proposal_field']
id_fields = ['borrower_id', 'category_id', 'id']
non_text_cols = [c for c in df.columns if c not in text_fields]
features = [c for c in non_text_cols if c not in id_fields]

## Upload Data To H2O

In [4]:
col_types = ['int','real', 'int','int', 'int','int','int','int',
             'enum','int','int','int', 'int','int','enum','int','int',
             'int','int', 'int','real','real','enum','enum','enum',
             'enum', 'real', 'real','real', 'real']

HF = h2o.H2OFrame()
HDF = HF.from_python(df[non_text_cols], 
                     column_types = col_types)

hgbm = gbm(nfolds=5, distribution='bernoulli')
hyper_params = {'max_depth': [20], 'min_rows': [20, 40], 'ntrees':[30]}
grid = H2OGridSearch(hgbm, hyper_params, grid_id='zidisha')

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Split Train & Test

In [5]:
test, train = HDF.split_frame([.2])
print('test set shape:', test.shape)
print('train set shape:', train.shape)

test set shape: (6247, 30)
train set shape: (24893, 30)


## Train Model And Evaluate Results

In [6]:
grid.train(x=features, y='default_flag', training_frame=train)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [7]:
grid_results = grid.get_grid(sort_by='logloss', decreasing=False)
grid_results.sorted_metric_table()

Unnamed: 0,Unnamed: 1,max_depth,min_rows,ntrees,model_ids,logloss
0,,20,40.0,30,zidisha_model_1,0.553947865784791
1,,20,20.0,30,zidisha_model_0,0.5547375716155066


In [8]:
best_model = grid_results[0]
best_model.coef_norm

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  zidisha_model_1


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.13914889070672892
RMSE: 0.373026662192837
LogLoss: 0.44069725766940404
Mean Per-Class Error: 0.18632800575551256
AUC: 0.903860201529126
Gini: 0.8077204030582521
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3771473810442348: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,10278.0,3308.0,0.2435,(3308.0/13586.0)
1,1579.0,9728.0,0.1396,(1579.0/11307.0)
Total,11857.0,13036.0,0.1963,(4887.0/24893.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3771474,0.7992441,241.0
max f2,0.2939534,0.8702096,286.0
max f0point5,0.5638592,0.8231700,161.0
max accuracy,0.4420354,0.8155706,211.0
max precision,0.9662436,1.0,0.0
max recall,0.1474196,1.0,364.0
max specificity,0.9662436,1.0,0.0
max absolute_mcc,0.4420354,0.6271595,211.0
max min_per_class_accuracy,0.4139015,0.8122402,224.0


Gains/Lift Table: Avg response rate: 45.42 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100028,0.9532753,2.2015566,2.2015566,1.0,1.0,0.0220218,0.0220218,120.1556558,120.1556558
,2,0.0200056,0.9459811,2.2015566,2.2015566,1.0,1.0,0.0220218,0.0440435,120.1556558,120.1556558
,3,0.0300084,0.9378321,2.2015566,2.2015566,1.0,1.0,0.0220218,0.0660653,120.1556558,120.1556558
,4,0.0400112,0.9256089,2.2015566,2.2015566,1.0,1.0,0.0220218,0.0880870,120.1556558,120.1556558
,5,0.0500141,0.9126764,2.2015566,2.2015566,1.0,1.0,0.0220218,0.1101088,120.1556558,120.1556558
,6,0.1000281,0.8430395,2.1838734,2.1927150,0.9919679,0.9959839,0.1092244,0.2193332,118.3873373,119.2714965
,7,0.1500020,0.7794020,2.1148393,2.1667703,0.9606109,0.9841993,0.1056867,0.3250199,111.4839298,116.6770313
,8,0.2000161,0.7127034,1.9946633,2.1237349,0.9060241,0.9646515,0.0997612,0.4247811,99.4663291,112.3734916
,9,0.3000040,0.5746752,1.7460316,1.9978507,0.7930896,0.9074719,0.1745821,0.5993632,74.6031597,99.7850669




ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.18752206021974627
RMSE: 0.4330381740906294
LogLoss: 0.553947865784791
Mean Per-Class Error: 0.29303055467164674
AUC: 0.7782427385496942
Gini: 0.5564854770993883
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3651621405449738: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,8506.0,5080.0,0.3739,(5080.0/13586.0)
1,2714.0,8593.0,0.24,(2714.0/11307.0)
Total,11220.0,13673.0,0.3131,(7794.0/24893.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3651621,0.6879904,245.0
max f2,0.1587507,0.8128626,360.0
max f0point5,0.5782350,0.7082803,146.0
max accuracy,0.5029966,0.7166673,176.0
max precision,0.9453697,0.9963168,9.0
max recall,0.0536858,1.0,398.0
max specificity,0.9648226,0.9999264,0.0
max absolute_mcc,0.5782350,0.4290140,146.0
max min_per_class_accuracy,0.4057367,0.7015310,222.0


Gains/Lift Table: Avg response rate: 45.42 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100028,0.9541186,2.1838734,2.1838734,0.9919679,0.9919679,0.0218449,0.0218449,118.3873373,118.3873373
,2,0.0200056,0.9455674,2.2015566,2.1927150,1.0,0.9959839,0.0220218,0.0438666,120.1556558,119.2714965
,3,0.0300084,0.9345159,2.1750318,2.1868206,0.9879518,0.9933066,0.0217564,0.0656231,117.5031780,118.6820570
,4,0.0400112,0.9194633,2.0866159,2.1617694,0.9477912,0.9819277,0.0208720,0.0864951,108.6615854,116.1769391
,5,0.0500141,0.9059360,2.0866159,2.1467387,0.9477912,0.9751004,0.0208720,0.1073671,108.6615854,114.6738684
,6,0.1000281,0.8330300,1.9539920,2.0503653,0.8875502,0.9313253,0.0977271,0.2050942,95.3991965,105.0365324
,7,0.1500020,0.7590267,1.7945164,1.9651280,0.8151125,0.8926085,0.0896790,0.2947731,79.4516358,96.5128015
,8,0.2000161,0.6896537,1.6657560,1.8902700,0.7566265,0.8586061,0.0833112,0.3780844,66.5756046,89.0269991
,9,0.3000040,0.5630829,1.3586143,1.7130751,0.6171153,0.7781200,0.1358451,0.5139294,35.8614252,71.3075142



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.6862886,0.0111664,0.6850863,0.7144285,0.688902,0.6737632,0.669263
auc,0.7783667,0.0039741,0.7772501,0.7887144,0.7782713,0.7718146,0.7757831
err,0.3137114,0.0111664,0.3149137,0.2855716,0.3110980,0.3262368,0.3307370
err_count,1561.6,54.71307,1533.0,1429.0,1581.0,1609.0,1656.0
f0point5,0.6507176,0.0132159,0.6473733,0.6835213,0.6565582,0.6341055,0.6320294
f1,0.6904634,0.0065412,0.6855385,0.7080694,0.6913918,0.6839521,0.6833652
f2,0.7358285,0.0043982,0.7284855,0.7344465,0.7301287,0.7423041,0.7437776
lift_top_group,2.184174,0.0197833,2.2147408,2.1747067,2.1764455,2.1404703,2.2145069
logloss,0.5539548,0.0030647,0.5527529,0.5469882,0.5555840,0.5603796,0.5540692


Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-02-19 14:57:30,1 min 19.628 sec,0.0,0.4979002,0.6889504,0.5,1.0,0.5457759
,2017-02-19 14:57:30,1 min 19.711 sec,1.0,0.4854953,0.6643334,0.7908059,2.2015566,0.3142651
,2017-02-19 14:57:30,1 min 19.809 sec,2.0,0.4745555,0.6429957,0.8095307,2.2015566,0.2902422
,2017-02-19 14:57:30,1 min 19.963 sec,3.0,0.4658240,0.6261026,0.8157253,2.2015566,0.2821677
,2017-02-19 14:57:30,1 min 20.149 sec,4.0,0.4585251,0.6119811,0.8209610,2.2015566,0.2842968
,2017-02-19 14:57:31,1 min 20.344 sec,5.0,0.4533085,0.6018191,0.8219468,2.2015566,0.2842968
,2017-02-19 14:57:31,1 min 20.559 sec,6.0,0.4474190,0.5902568,0.8253735,2.2015566,0.2787129
,2017-02-19 14:57:31,1 min 20.814 sec,7.0,0.4407262,0.5771864,0.8335020,2.2015566,0.2669425
,2017-02-19 14:57:31,1 min 21.066 sec,8.0,0.4371247,0.5699273,0.8344855,2.2015566,0.2689913


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
reserve_fee_pct,11942.8203125,1.0,0.3930696
peer_repay_ratio,4752.0917969,0.3979037,0.1564038
peer_network_comments_by,3054.2214355,0.2557370,0.1005224
parent_repay_ratio,2808.4978027,0.2351620,0.0924350
peer_network_avg_char_per_comment_by,1613.0695801,0.1350661,0.0530904
---,---,---,---
missing_referred_by,75.3075485,0.0063057,0.0024786
peer_network_avg_char_per_comment_about,56.1397285,0.0047007,0.0018477
parent_comment_cnt,52.1496124,0.0043666,0.0017164



See the whole table with table.as_data_frame()


<bound method ModelBase.coef_norm of >

In [9]:
holdout_preds = best_model.predict(test)['p1']
holdout_eval = pd.concat([holdout_preds['p1'].as_data_frame(), test['default_flag'].as_data_frame()], axis = 1)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [10]:
print('holdout log-loss:', log_loss(holdout_eval.default_flag.values, holdout_eval.p1.values))

holdout log-loss: 0.548885145266


## Save Best Model To Disk

We must save path to disk, unfortunately I cannot specify the name of the file

In [11]:
#the path the model is to be saved on
model_save_path = '/Users/hamelhusain/Dropbox/zidisha/models/'
#before we save the model, clear the directory of previous model
shutil.rmtree(model_save_path)
#save the model to the saved path
saved_model = h2o.save_model(best_model, path = model_save_path)
print('Model is saved to this path: ', saved_model)

Model is saved to this path:  /Users/hamelhusain/Dropbox/zidisha/models/zidisha_model_1


## Load Model From Disk

In [36]:
#inspect the model directory to retrieve the filename of the model object
models = os.listdir(model_save_path)

#check to make sure there is only one model object in the directory, if there 
# is no model file or there is more than one model then raise an error.  
assert len(models) == 1, \
    ('Exactly one model must be present in path: {}\n Clear this directory, retrain the model and try again'.
     format(model_save_path))

In [37]:
#Construct the model's full path
full_model_path = model_save_path + models[0]

#print the full path for verfication and load model
print('loading model from: {}'.format(full_model_path))
final_model = h2o.load_model(full_model_path)

loading model from: /Users/hamelhusain/Dropbox/zidisha/models/zidisha_model_1


## Use Model To Make Prediction

In [20]:
#Load loans to be scored into H2o
df_predict = pd.read_csv('/Users/hamelhusain/Dropbox/zidisha/for_predictions.csv')
HDF_predict = HF.from_python(df_predict[non_text_cols], 
                      column_types = col_types)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [31]:
#Make predictions on new loans
predictions = HDF_predict.concat(final_model.predict(HDF_predict)['p1'].set_names(['Default_Prediction']))
predictions

gbm prediction progress: |████████████████████████████████████████████████| 100%


id,borrower_id,length(loans.proposal),length(about_me),length(about_business),length(address_instructions),missing_referred_by,application_time,default_flag,friends_count,category_id,invited_flag,english_flag,usd_installment_amount,reserve_fee_pct,ip_user_count,parent_age_at_invite,parent_comment_cnt,parent_avg_char_per_comment,parent_repay_ratio,parent_usd_total_amount,parent_invited_count,peer_repay_ratio,peer_usd_total_amount,peer_network_comments_by,peer_network_avg_char_per_comment_by,peer_network_comments_about,peer_network_avg_char_per_comment_about,loan_usage,Prior Loans,Default_Prediction
4863,15031,422,573,835,16,1,,0,41.0,,0,1,,0.0,,,,,,,,,,,,,,,0,0.286845
5687,15031,748,573,835,16,1,,1,41.0,,0,1,,0.0,,,,,,,,,,,,,,,1,0.250552
4895,15057,504,1374,627,15,1,,0,,,1,1,,0.0,,80.0,9.0,492.0,1.0,100.0,1.0,1.0,700.0,9.0,3447.0,15.0,3332.0,,0,0.176774
10181,15057,627,1374,627,15,1,,0,,,1,1,,0.0,,80.0,9.0,492.0,1.0,100.0,1.0,1.0,700.0,9.0,3447.0,15.0,3332.0,,1,0.183053
20720,15057,627,1374,627,15,1,,0,,,1,1,31.08,0.0,,80.0,9.0,492.0,1.0,100.0,1.0,1.0,700.0,9.0,3447.0,15.0,3332.0,,2,0.10998
33386,15057,263,1374,627,15,1,,0,,,1,1,30.44,0.8,,80.0,9.0,492.0,1.0,100.0,1.0,1.0,700.0,9.0,3447.0,15.0,3332.0,,3,0.141729
4859,15059,441,546,922,237,1,,1,,,1,1,,0.0,,71.0,23.0,556.0,1.0,200.0,5.0,1.0,2000.0,23.0,5558.0,31.0,5130.0,,0,0.488749




In [41]:
#Export data to csv
h2o.export_file(predictions, '/Users/hamelhusain/Dropbox/zidisha/model_predictions.csv', force = True)

Export File progress: |███████████████████████████████████████████████████| 100%


#  Part 2 Experimental - Text Features (Ignore This Section)

Extract text features from data frame and split into test vs. train set.  TODO:  use the same train/test split as above.

In [184]:
df_text = df[text_fields + ['default_flag']]
df_text.proposal_field[df.proposal_field.isnull()] = ''
df_text_train, df_text_test = tts(df_text, test_size = .2)
print('size of train', df_text_train.shape)
print('size of test', df_text_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


size of train (24912, 4)
size of test (6228, 4)


## Try two basic text vectorizers 
- Count Vectorizer (Bag of Words with frequency count)
- TFIDF

In [2]:
from sklearn.linear_model import ElasticNetCV
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from sklearn.feature_extraction.text import CountVectorizer as Cvec
enet = ElasticNetCV(l1_ratio = [.001, .01, .1, .5], cv = 5, normalize = True)
tfidf = TFIDF(stop_words = 'english', min_df=.05, max_df = .95, sublinear_tf = True, ngram_range=(1, 4))
cvec = Cvec(stop_words = 'english', min_df=.05, max_df = .95, ngram_range=(1, 4))

In [185]:
cvec_dat = cvec.fit_transform(df_text_train.proposal_field)
tfidf_data = tfidf.fit_transform(df_text_train.proposal_field)

In [186]:
enet.fit(cvec_dat, df_text_train.default_flag)
cvec_preds = enet.predict(cvec.transform(df_text_test.proposal_field))

ElasticNetCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
       l1_ratio=[0.001, 0.01, 0.1, 0.5], max_iter=1000, n_alphas=100,
       n_jobs=1, normalize=True, positive=False, precompute='auto',
       random_state=None, selection='cyclic', tol=0.0001, verbose=0)

In [189]:
print('log loss on holdout set (count vectorizer):', log_loss(df_text_test.default_flag, cvec_preds))

log loss on holdout set (count vectorizer): 0.679630228373


In [190]:
enet.fit(tfidf_data, df_text_train.default_flag)
tfidf_preds = enet.predict(tfidf.transform(df_text_test.proposal_field))

In [192]:
print('log loss on holdout set (TFIDF vectorizer):', log_loss(df_text_test.default_flag, tfidf_preds))

log loss on holdout set (TFIDF vectorizer): 0.677721258792


## Scratch Work

In [178]:
>>> from sklearn import datasets, linear_model
>>> from sklearn.cross_validation import cross_val_predict
>>> diabetes = datasets.load_diabetes()
>>> X = diabetes.data[:150]
>>> y = diabetes.target[:150]
>>> lasso = linear_model.Lasso()
>>> y_pred = cross_val_predict(lasso, X, y)

In [182]:
y_pred

array([ 174.26933996,  117.6539241 ,  164.60228641,  155.65049088,
        132.68647979,  128.49511245,  120.76146877,  141.069413  ,
        164.18904498,  182.37394949,  111.04181265,  127.94311443,
        135.0869234 ,  162.83066014,  135.3573514 ,  157.64516523,
        178.95843326,  163.3919841 ,  143.85237903,  144.29748882,
        133.58117218,  124.77928571,  132.90918003,  208.52927   ,
        153.61908967,  154.16616341,  118.95351821,  163.50467541,
        145.89406196,  168.3308101 ,  155.87411031,  123.45960148,
        185.70459144,  133.38468582,  117.2789469 ,  150.27895019,
        174.1541028 ,  160.03235091,  192.31389633,  161.58568256,
        154.2224809 ,  119.35517679,  146.15706413,  133.82056934,
        179.68118754,  137.96619936,  146.07788398,  126.77579723,
        123.32101099,  166.26710247,  146.41559964,  161.67261029,
        147.47731459,  138.44595305,  144.85421048,  113.77990664,
        185.54970402,  115.31624749,  142.23672103,  171.07792

In [193]:
!pwd

/Users/hamel_husain/Dropbox/zidisha
