java install instructions:  
https://www.digitalocean.com/community/tutorials/how-to-install-java-with-apt-on-ubuntu-18-04
or   
https://www.oracle.com/java/technologies/javase-jdk14-downloads.html

AutoML Code Examples:  
https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html#code-examples

In [1]:
import pandas as pd
import numpy as np
import h2o
from h2o.automl import H2OAutoML

In [2]:
ORD = pd.read_csv('../data/ORD.csv', index_col='Unnamed: 0')
ORD_train = ORD[ORD['year'] < 2018]

In [3]:
def lag_df(df, lag, cols):
    return df.assign(**{f"{col}-{n}": df[col].shift(n) for n in range(1, lag + 1) for col in cols})

In [4]:
lag = 3
ORD_train_lag = lag_df(ORD_train, lag=lag, cols=['seats'])

In [25]:
ORD_train_lag

Unnamed: 0,year,month,day,hour,rides,tmpf,dwpf,relh,sknt,p01i,vsby,feel,skyc,seats,airline,seats-1,seats-2,seats-3
0,2013,1,1,0,22,24.98,17.96,74.290000,9.000000,0.0,9.000000,14.780000,3.000000,547.520724,3.0,,,
1,2013,1,1,1,9,24.89,17.78,73.995000,9.000000,0.0,8.000000,14.670000,5.000000,147.962430,1.0,547.520724,,
2,2013,1,1,2,11,21.20,14.00,73.320000,10.000000,0.0,9.000000,9.420000,5.000000,,,147.962430,547.520724,
3,2013,1,1,3,3,21.14,12.14,67.723333,10.666667,0.0,9.666667,8.976667,4.333333,,,,147.962430,547.520724
4,2013,1,1,4,5,19.94,10.94,67.580000,10.000000,0.0,10.000000,7.840000,5.000000,847.150186,4.0,,,147.962430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43757,2017,12,31,19,159,12.90,-2.00,50.620000,12.769231,0.0,10.000000,-3.460000,2.923077,4652.273120,37.0,6279.424095,3469.881437,4896.180254
43758,2017,12,31,20,100,12.90,-2.90,48.500000,13.153846,0.0,10.000000,-2.900000,0.230769,2201.296388,15.0,4652.273120,6279.424095,3469.881437
43759,2017,12,31,21,62,10.90,-4.00,50.280000,12.307692,0.0,10.000000,-5.460000,0.230769,1456.971367,10.0,2201.296388,4652.273120,6279.424095
43760,2017,12,31,22,53,9.00,-4.00,54.740000,11.666667,0.0,10.000000,-5.090000,0.307692,1368.781483,9.0,1456.971367,2201.296388,4652.273120


In [8]:
ORD_train_lag.to_csv('../data/train_data.csv')

In [6]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,3 mins 06 secs
H2O_cluster_timezone:,America/Vancouver
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.4
H2O_cluster_version_age:,6 days
H2O_cluster_name:,H2O_from_python_dkruszew_tooiz8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.916 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [9]:
train = h2o.import_file('../data/train_data.csv')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
x = train.columns
y = "rides"
x.remove(y)

In [11]:
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [12]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20200607_195151,1473.43,38.3853,1473.43,27.6276,
StackedEnsemble_BestOfFamily_AutoML_20200607_195151,1492.7,38.6355,1492.7,27.8776,
GBM_4_AutoML_20200607_195151,1557.12,39.4604,1557.12,28.4064,
GBM_3_AutoML_20200607_195151,1591.8,39.8974,1591.8,28.7943,
GBM_2_AutoML_20200607_195151,1618.45,40.23,1618.45,29.1308,
GBM_5_AutoML_20200607_195151,1664.05,40.7927,1664.05,29.4696,
GBM_grid__1_AutoML_20200607_195151_model_2,1686.92,41.0722,1686.92,29.6672,
XGBoost_grid__1_AutoML_20200607_195151_model_4,1705.02,41.2919,1705.02,30.1876,
GBM_1_AutoML_20200607_195151,1706.02,41.304,1706.02,29.9832,
XGBoost_3_AutoML_20200607_195151,1804.47,42.4791,1804.47,31.0419,




In [14]:
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])

In [15]:
model_ids

['StackedEnsemble_AllModels_AutoML_20200607_195151',
 'StackedEnsemble_BestOfFamily_AutoML_20200607_195151',
 'GBM_4_AutoML_20200607_195151',
 'GBM_3_AutoML_20200607_195151',
 'GBM_2_AutoML_20200607_195151',
 'GBM_5_AutoML_20200607_195151',
 'GBM_grid__1_AutoML_20200607_195151_model_2',
 'XGBoost_grid__1_AutoML_20200607_195151_model_4',
 'GBM_1_AutoML_20200607_195151',
 'XGBoost_3_AutoML_20200607_195151',
 'GBM_grid__1_AutoML_20200607_195151_model_1',
 'XGBoost_grid__1_AutoML_20200607_195151_model_3',
 'XGBoost_grid__1_AutoML_20200607_195151_model_1',
 'XGBoost_1_AutoML_20200607_195151',
 'XGBoost_2_AutoML_20200607_195151',
 'DRF_1_AutoML_20200607_195151',
 'XRT_1_AutoML_20200607_195151',
 'XGBoost_grid__1_AutoML_20200607_195151_model_2',
 'DeepLearning_grid__1_AutoML_20200607_195151_model_1',
 'DeepLearning_1_AutoML_20200607_195151',
 'GLM_1_AutoML_20200607_195151',
 'DeepLearning_grid__2_AutoML_20200607_195151_model_1']

In [22]:
m = h2o.get_model([mid for mid in model_ids if "GBM_4" in mid][0])  

In [23]:
m

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_4_AutoML_20200607_195151


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,262.0,262.0,874253.0,10.0,10.0,10.0,75.0,682.0,261.17557




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 379.2979340605132
RMSE: 19.47557275308003
MAE: 14.156868321803845
RMSLE: NaN
Mean Residual Deviance: 379.2979340605132

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 1557.1209875473403
RMSE: 39.46037236959809
MAE: 28.406419987297827
RMSLE: NaN
Mean Residual Deviance: 1557.1209875473403

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,28.40641,0.34294885,28.308731,28.949728,28.121191,28.51528,28.13712
1,mean_residual_deviance,1557.1201,37.864216,1530.5797,1620.1691,1550.5513,1558.9414,1525.3591
2,mse,1557.1201,37.864216,1530.5797,1620.1691,1550.5513,1558.9414,1525.3591
3,r2,0.92028534,0.0018006696,0.92150784,0.9173942,0.92025614,0.9202193,0.9220492
4,residual_deviance,1557.1201,37.864216,1530.5797,1620.1691,1550.5513,1558.9414,1525.3591
5,rmse,39.458054,0.47717118,39.122623,40.251324,39.37704,39.483433,39.055847
6,rmsle,,0.0,,,,,



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2020-06-07 20:01:55,50.357 sec,0.0,139.761096,119.13281,19533.164053
1,,2020-06-07 20:01:55,50.579 sec,5.0,94.302846,79.558933,8893.026823
2,,2020-06-07 20:01:55,50.785 sec,10.0,69.790564,57.388407,4870.722823
3,,2020-06-07 20:01:55,51.001 sec,15.0,56.556167,44.81975,3198.600042
4,,2020-06-07 20:01:55,51.206 sec,20.0,49.225278,37.59424,2423.127962
5,,2020-06-07 20:01:56,51.416 sec,25.0,44.969265,33.376154,2022.234806
6,,2020-06-07 20:01:56,51.609 sec,30.0,42.018022,30.680637,1765.51417
7,,2020-06-07 20:01:56,51.779 sec,35.0,39.848679,28.888828,1587.917244
8,,2020-06-07 20:01:56,51.961 sec,40.0,38.166644,27.578965,1456.69272
9,,2020-06-07 20:01:56,52.150 sec,45.0,36.793798,26.558539,1353.783559



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,seats-2,2156654000.0,1.0,0.53045
1,C1,393928500.0,0.182657,0.096891
2,hour,388320500.0,0.180057,0.095511
3,seats-1,321825800.0,0.149225,0.079156
4,seats-3,236526700.0,0.109673,0.058176
5,month,92157850.0,0.042732,0.022667
6,day,81965910.0,0.038006,0.02016
7,seats,81759860.0,0.037911,0.02011
8,year,58647500.0,0.027194,0.014425
9,airline,48166690.0,0.022334,0.011847




In [27]:
model_ids

['StackedEnsemble_AllModels_AutoML_20200607_195151',
 'StackedEnsemble_BestOfFamily_AutoML_20200607_195151',
 'GBM_4_AutoML_20200607_195151',
 'GBM_3_AutoML_20200607_195151',
 'GBM_2_AutoML_20200607_195151',
 'GBM_5_AutoML_20200607_195151',
 'GBM_grid__1_AutoML_20200607_195151_model_2',
 'XGBoost_grid__1_AutoML_20200607_195151_model_4',
 'GBM_1_AutoML_20200607_195151',
 'XGBoost_3_AutoML_20200607_195151',
 'GBM_grid__1_AutoML_20200607_195151_model_1',
 'XGBoost_grid__1_AutoML_20200607_195151_model_3',
 'XGBoost_grid__1_AutoML_20200607_195151_model_1',
 'XGBoost_1_AutoML_20200607_195151',
 'XGBoost_2_AutoML_20200607_195151',
 'DRF_1_AutoML_20200607_195151',
 'XRT_1_AutoML_20200607_195151',
 'XGBoost_grid__1_AutoML_20200607_195151_model_2',
 'DeepLearning_grid__1_AutoML_20200607_195151_model_1',
 'DeepLearning_1_AutoML_20200607_195151',
 'GLM_1_AutoML_20200607_195151',
 'DeepLearning_grid__2_AutoML_20200607_195151_model_1']

In [30]:
model = h2o.get_model(model_ids[3])

In [57]:
model_ids[15]

'DRF_1_AutoML_20200607_195151'

In [58]:
import os
h2o.save_model(model=h2o.get_model(model_ids[2]), path=os.getcwd()+"/../data/models", force=True)
h2o.save_model(model=h2o.get_model(model_ids[7]), path=os.getcwd()+"/../data/models", force=True)
h2o.save_model(model=h2o.get_model(model_ids[15]), path=os.getcwd()+"/../data/models", force=True)
h2o.save_model(model=h2o.get_model(model_ids[16]), path=os.getcwd()+"/../data/models", force=True)
h2o.save_model(model=h2o.get_model(model_ids[20]), path=os.getcwd()+"/../data/models", force=True)

'/home/dkruszew/Repos/Ohare_taxi_demand/data/models/GLM_1_AutoML_20200607_195151'