In [167]:
import h2o
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator as gbm
from h2o.grid.grid_search import H2OGridSearch
from sklearn.cross_validation import train_test_split as tts
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from sklearn.feature_extraction.text import CountVectorizer as Cvec
from sklearn.metrics import log_loss
from sklearn.linear_model import ElasticNetCV
enet = ElasticNetCV(l1_ratio = [.001, .01, .1, .5], cv = 5, normalize = True)
tfidf = TFIDF(stop_words = 'english', min_df=.05, max_df = .95, sublinear_tf = True, ngram_range=(1, 4))
cvec = Cvec(stop_words = 'english', min_df=.05, max_df = .95, ngram_range=(1, 4))

In [142]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,8 hours 36 mins
H2O cluster version:,3.10.0.10
H2O cluster version age:,3 months
H2O cluster name:,H2O_from_python_hamel_husain_ok0hrs
H2O cluster total nodes:,1
H2O cluster free memory:,2.748 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


## Read in Data

In [143]:
df = pd.read_csv('/Users/hamel_husain/Dropbox/zidisha/Regression_Dataset_20170125.csv')
text_fields =  ['about_me_field', 'about_business_field', 'proposal_field']
id_fields = ['borrower_id', 'category_id', 'id']
non_text_cols = [c for c in df.columns if c not in text_fields]
features = [c for c in non_text_cols if c not in id_fields]

## Upload Data To H2O

In [155]:
col_types = ['int','real', 'int','int', 'int','int','int','int',
             'enum','int','int','int', 'int','int','enum','int','int',
             'int','int', 'int','real','real','enum','enum','enum',
             'enum', 'real', 'real','real', 'real']

HF = h2o.H2OFrame()
HDF = HF.from_python(df[non_text_cols], 
                     column_types = col_types)

hgbm = gbm(nfolds=5, distribution='bernoulli')
hyper_params = {'max_depth': [20], 'min_rows': [20, 40], 'ntrees':[30]}
grid = H2OGridSearch(hgbm, hyper_params)

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Split Train & Test

In [156]:
test, train = HDF.split_frame([.2])
print('test set shape:', test.shape)
print('train set shape:', train.shape)

test set shape: (6281, 30)
train set shape: (24859, 30)


## Train Model And Evaluate Results

In [157]:
grid.train(x=features, y='default_flag', training_frame=train)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [158]:
grid_results = grid.get_grid(sort_by='logloss', decreasing=False)
grid_results.sorted_metric_table()

Unnamed: 0,Unnamed: 1,max_depth,min_rows,ntrees,model_ids,logloss
0,,20,20.0,30,Grid_GBM_py_48_sid_9794_model_python_1486535343200_5212_model_0,0.55295434725286
1,,20,40.0,30,Grid_GBM_py_48_sid_9794_model_python_1486535343200_5212_model_1,0.5540855403855385


In [159]:
best_model = grid_results[0]
best_model.coef_norm

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_48_sid_9794_model_python_1486535343200_5212_model_0
Model Summary: 


0,1,2,3,4,5,6,7,8,9
,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,30.0,30.0,474576.0,20.0,20.0,20.0,346.0,1007.0,708.43335




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.12140987717908508
RMSE: 0.34843920155327684
LogLoss: 0.3982588564436409
Mean Per-Class Error: 0.1457990714245927
AUC: 0.938126640057931
Gini: 0.876253280115862
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.39734398753212635: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,11561.0,2121.0,0.155,(2121.0/13682.0)
1,1529.0,9648.0,0.1368,(1529.0/11177.0)
Total,13090.0,11769.0,0.1468,(3650.0/24859.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3973440,0.8409309,234.0
max f2,0.3083178,0.8920149,275.0
max f0point5,0.5624460,0.8649695,164.0
max accuracy,0.4266237,0.8544189,221.0
max precision,0.9690433,1.0,0.0
max recall,0.1599995,1.0,354.0
max specificity,0.9690433,1.0,0.0
max absolute_mcc,0.3999631,0.7060912,233.0
max min_per_class_accuracy,0.4035698,0.8531647,231.0


Gains/Lift Table: Avg response rate: 44.96 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100165,0.9595026,2.2241210,2.2241210,1.0,1.0,0.0222779,0.0222779,122.4120963,122.4120963
,2,0.0200330,0.9536849,2.2241210,2.2241210,1.0,1.0,0.0222779,0.0445558,122.4120963,122.4120963
,3,0.0300093,0.9462814,2.2241210,2.2241210,1.0,1.0,0.0221884,0.0667442,122.4120963,122.4120963
,4,0.0400257,0.9381132,2.2241210,2.2241210,1.0,1.0,0.0222779,0.0890221,122.4120963,122.4120963
,5,0.0500020,0.9297010,2.2241210,2.2241210,1.0,1.0,0.0221884,0.1112105,122.4120963,122.4120963
,6,0.1000040,0.8653398,2.2241210,2.2241210,1.0,1.0,0.1112105,0.2224210,122.4120963,122.4120963
,7,0.1500060,0.7982108,2.1972812,2.2151744,0.9879324,0.9959775,0.1098685,0.3322895,119.7281209,121.5174378
,8,0.2000080,0.7247330,2.1346551,2.1950446,0.9597747,0.9869268,0.1067370,0.4390266,113.4655115,119.5044562
,9,0.3000121,0.5865825,1.9279890,2.1060260,0.8668544,0.9469027,0.1928067,0.6318332,92.7989008,110.6026044




ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.18739553808707646
RMSE: 0.43289206285987325
LogLoss: 0.55295434725286
Mean Per-Class Error: 0.2941315105648036
AUC: 0.7767939902375115
Gini: 0.553587980475023
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3531199660326569: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,8452.0,5230.0,0.3823,(5230.0/13682.0)
1,2621.0,8556.0,0.2345,(2621.0/11177.0)
Total,11073.0,13786.0,0.3158,(7851.0/24859.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3531200,0.6854945,247.0
max f2,0.1472856,0.8103441,362.0
max f0point5,0.5758792,0.7036580,144.0
max accuracy,0.4926624,0.7164005,179.0
max precision,0.9672483,1.0,0.0
max recall,0.0514200,1.0,398.0
max specificity,0.9672483,1.0,0.0
max absolute_mcc,0.5596448,0.4260518,151.0
max min_per_class_accuracy,0.3951270,0.6993130,225.0


Gains/Lift Table: Avg response rate: 44.96 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100165,0.9592377,2.1973243,2.1973243,0.9879518,0.9879518,0.0220095,0.0220095,119.7324325,119.7324325
,2,0.0200330,0.9517443,2.1883921,2.1928582,0.9839357,0.9859438,0.0219200,0.0439295,118.8392112,119.2858218
,3,0.0300093,0.9430476,2.1792798,2.1883442,0.9798387,0.9839142,0.0217411,0.0656706,117.9279814,118.8344218
,4,0.0400257,0.9323897,2.1705277,2.1838856,0.9759036,0.9819095,0.0217411,0.0874116,117.0527686,118.3885609
,5,0.0500020,0.9206131,2.1075340,2.1686521,0.9475806,0.9750603,0.0210253,0.1084370,110.7533977,116.8652137
,6,0.1000040,0.8472125,1.9879311,2.0782916,0.8938053,0.9344328,0.0994006,0.2078375,98.7931126,107.8291632
,7,0.1500060,0.7702347,1.8233140,1.9932991,0.8197908,0.8962188,0.0911694,0.2990069,82.3313967,99.3299077
,8,0.2000080,0.6941207,1.5907028,1.8926500,0.7152051,0.8509654,0.0795383,0.3785452,59.0702764,89.2649999
,9,0.3000121,0.5599633,1.3706168,1.7186389,0.6162510,0.7727273,0.1370672,0.5156124,37.0616780,71.8638926



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.6807503,0.0104317,0.6676804,0.6782069,0.7089746,0.6786148,0.6702746
auc,0.7767161,0.0062445,0.7602965,0.7793535,0.7867511,0.7802712,0.7769083
err,0.3192497,0.0104317,0.3323196,0.3217931,0.2910254,0.3213852,0.3297254
err_count,1587.0,48.651825,1639.0,1608.0,1456.0,1587.0,1645.0
f0point5,0.6419515,0.0117463,0.6245257,0.635857,0.6719037,0.6465179,0.6309532
f1,0.6871622,0.0076229,0.6676131,0.6860601,0.6983009,0.6956855,0.6881517
f2,0.7397031,0.0108138,0.7170864,0.7448702,0.7268571,0.7529470,0.7567545
lift_top_group,2.1982698,0.0451440,2.2603116,2.2468526,2.2186253,2.0818973,2.1836624
logloss,0.5529907,0.0055853,0.5670603,0.549654,0.5429523,0.5516805,0.5536061


Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-02-08 07:41:10,37.482 sec,0.0,0.4974550,0.6880614,0.5,1.0,0.5503842
,2017-02-08 07:41:10,37.610 sec,1.0,0.4882238,0.6697087,0.7236020,2.2241210,0.4952331
,2017-02-08 07:41:10,37.781 sec,2.0,0.4771886,0.6480764,0.7910987,2.2241210,0.3119192
,2017-02-08 07:41:11,38.040 sec,3.0,0.4673608,0.6290509,0.8142465,2.2241210,0.3093849
,2017-02-08 07:41:11,38.333 sec,4.0,0.4573849,0.6098163,0.8355073,2.2241210,0.2615149
,2017-02-08 07:41:11,38.634 sec,5.0,0.4488621,0.5933916,0.8449572,2.2241210,0.2571302
,2017-02-08 07:41:12,38.912 sec,6.0,0.4444117,0.5846155,0.8448200,2.2241210,0.2564866
,2017-02-08 07:41:12,39.241 sec,7.0,0.4376184,0.5713864,0.8511790,2.2241210,0.2502112
,2017-02-08 07:41:12,39.572 sec,8.0,0.4315082,0.5593732,0.8557237,2.2241210,0.2516191


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
reserve_fee_pct,14538.1152344,1.0,0.3940146
peer_repay_ratio,5030.3208008,0.3460091,0.1363327
peer_network_comments_by,3667.4553223,0.2522648,0.0993960
parent_repay_ratio,2901.9892578,0.1996125,0.0786502
peer_network_avg_char_per_comment_by,2130.2753906,0.1465304,0.0577351
---,---,---,---
parent_comment_cnt,87.1575775,0.0059951,0.0023622
missing_referred_by,81.7057571,0.0056201,0.0022144
peer_network_avg_char_per_comment_about,69.8431091,0.0048041,0.0018929



See the whole table with table.as_data_frame()


<bound method ModelBase.coef_norm of >

In [161]:
holdout_preds = best_model.predict(test)['p1']
holdout_eval = pd.concat([holdout_preds['p1'].as_data_frame(), test['default_flag'].as_data_frame()], axis = 1)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [164]:
print('holdout log-loss:', log_loss(holdout_eval.default_flag.values, holdout_eval.p1.values))

holdout log-loss: 0.547096885488


#  Part 2 Experimental - Text Features

Extract text features from data frame and split into test vs. train set.  TODO:  use the same train/test split as above.

In [184]:
df_text = df[text_fields + ['default_flag']]
df_text.proposal_field[df.proposal_field.isnull()] = ''
df_text_train, df_text_test = tts(df_text, test_size = .2)
print('size of train', df_text_train.shape)
print('size of test', df_text_test.shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


size of train (24912, 4)
size of test (6228, 4)


## Try two basic text vectorizers 
- Count Vectorizer (Bag of Words with frequency count)
- TFIDF

In [185]:
cvec_dat = cvec.fit_transform(df_text_train.proposal_field)
tfidf_data = tfidf.fit_transform(df_text_train.proposal_field)

In [186]:
enet.fit(cvec_dat, df_text_train.default_flag)
cvec_preds = enet.predict(cvec.transform(df_text_test.proposal_field))

ElasticNetCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
       l1_ratio=[0.001, 0.01, 0.1, 0.5], max_iter=1000, n_alphas=100,
       n_jobs=1, normalize=True, positive=False, precompute='auto',
       random_state=None, selection='cyclic', tol=0.0001, verbose=0)

In [189]:
print('log loss on holdout set (count vectorizer):', log_loss(df_text_test.default_flag, cvec_preds))

log loss on holdout set (count vectorizer): 0.679630228373


In [190]:
enet.fit(tfidf_data, df_text_train.default_flag)
tfidf_preds = enet.predict(tfidf.transform(df_text_test.proposal_field))

In [192]:
print('log loss on holdout set (TFIDF vectorizer):', log_loss(df_text_test.default_flag, tfidf_preds))

log loss on holdout set (TFIDF vectorizer): 0.677721258792


## Scratch Work

In [178]:
>>> from sklearn import datasets, linear_model
>>> from sklearn.cross_validation import cross_val_predict
>>> diabetes = datasets.load_diabetes()
>>> X = diabetes.data[:150]
>>> y = diabetes.target[:150]
>>> lasso = linear_model.Lasso()
>>> y_pred = cross_val_predict(lasso, X, y)

In [182]:
y_pred

array([ 174.26933996,  117.6539241 ,  164.60228641,  155.65049088,
        132.68647979,  128.49511245,  120.76146877,  141.069413  ,
        164.18904498,  182.37394949,  111.04181265,  127.94311443,
        135.0869234 ,  162.83066014,  135.3573514 ,  157.64516523,
        178.95843326,  163.3919841 ,  143.85237903,  144.29748882,
        133.58117218,  124.77928571,  132.90918003,  208.52927   ,
        153.61908967,  154.16616341,  118.95351821,  163.50467541,
        145.89406196,  168.3308101 ,  155.87411031,  123.45960148,
        185.70459144,  133.38468582,  117.2789469 ,  150.27895019,
        174.1541028 ,  160.03235091,  192.31389633,  161.58568256,
        154.2224809 ,  119.35517679,  146.15706413,  133.82056934,
        179.68118754,  137.96619936,  146.07788398,  126.77579723,
        123.32101099,  166.26710247,  146.41559964,  161.67261029,
        147.47731459,  138.44595305,  144.85421048,  113.77990664,
        185.54970402,  115.31624749,  142.23672103,  171.07792

In [193]:
!pwd

/Users/hamel_husain/Dropbox/zidisha
