In [29]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.automl import H2OAutoML

In [30]:
# constant random seed 
seed = 1234

# Connect to a pre-existing cluster
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,1 hour 16 mins
H2O cluster timezone:,America/Denver
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.2
H2O cluster version age:,8 days
H2O cluster name:,H2O_from_python_jpitt_rbvdxg
H2O cluster total nodes:,1
H2O cluster free memory:,3.137 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [31]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.

df = h2o.import_file(path=_locate("train.csv"))

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [32]:
df.describe()

Rows:10885
Cols:13




Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,previouscount
type,time,int,int,int,int,real,real,int,real,int,int,int,int
mins,1293843600000.0,1.0,0.0,0.0,1.0,0.82,0.76,0.0,0.0,0.0,0.0,1.0,1.0
mean,1324968241874.1387,2.5067524115755626,0.02857142857142857,0.6809370693615067,1.4184657785943962,20.2318144235186,23.655934772622874,61.88470372071658,12.800571281580158,36.02498851630684,155.5652733118971,191.59026182820395,191.5836472209463
maxs,1355958000000.0,4.0,1.0,1.0,4.0,41.0,45.455,100.0,56.9969,367.0,886.0,977.0,977.0
sigma,18186077643.01126,1.1161321529674035,0.16660627876431913,0.46613489303567057,0.6338550128938669,7.7913111304392215,8.474525070270193,19.245045237309753,8.163990511234385,49.96176892218166,151.03979036028676,181.14495703238546,181.15005452302242
zeros,0,0,10574,3473,0,0,0,22,1312,986,15,0,0
missing,0,0,0,0,0,0,0,0,0,0,0,0,0
0,2011-01-01 01:00:00,1.0,0.0,0.0,1.0,9.02,13.635,80.0,0.0,8.0,32.0,40.0,16.0
1,2011-01-01 02:00:00,1.0,0.0,0.0,1.0,9.02,13.635,80.0,0.0,5.0,27.0,32.0,40.0
2,2011-01-01 03:00:00,1.0,0.0,0.0,1.0,9.84,14.395,75.0,0.0,3.0,10.0,13.0,32.0


In [33]:
# Remove things from training frame
df_edited = df.drop(["datetime", "registered", "casual"])

# Convert fields to logical factors
df_edited['holiday'] = df_edited['holiday'].asfactor()
df_edited['workingday'] = df_edited['workingday'].asfactor()
df_edited['season'] = df_edited['season'].asfactor()
df_edited['weather'] = df_edited['weather'].asfactor()

train,test,valid = df_edited.split_frame(ratios=[.85, .1], seed = seed)

In [34]:
# See that the data is ready
train.describe()

Rows:9283
Cols:10




Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,previouscount
type,enum,enum,enum,enum,real,real,int,real,int,int
mins,,,,,0.82,0.76,0.0,0.0,1.0,1.0
mean,,,,,20.2193644296025,23.64114725842939,61.81590003231714,12.779464375740602,192.32877302596142,191.93471937951094
maxs,,,,,41.0,45.455,100.0,56.9969,977.0,977.0
sigma,,,,,7.821605504951305,8.506134753233946,19.226124595547134,8.129678467415845,182.27345686476707,181.77608056277506
zeros,,,,,0,0,22,1121,0,0
missing,0,0,0,0,0,0,0,0,0,0
0,1,0,0,1,9.02,13.635,80.0,0.0,40.0,16.0
1,1,0,0,1,9.02,13.635,80.0,0.0,32.0,40.0
2,1,0,0,1,9.84,14.395,75.0,0.0,13.0,32.0


In [35]:
# Run GBM
gbm_model = H2OGradientBoostingEstimator(distribution = "gaussian", ntrees=100, learn_rate=.01)
gbm_model.train(x=list(range(0,train.ncol - 1)), y="count", training_frame=train, validation_frame=train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [36]:
# Run GLM
glm_model = H2OGeneralizedLinearEstimator(family= "gaussian", lambda_ = 0, compute_p_values = True)
glm_model.train(x=list(range(0,train.ncol - 1)), y="count", training_frame=train, validation_frame=train)


glm Model Build progress: |███████████████████████████████████████████████| 100%


In [38]:
# Run AutoML
x = train.columns
y = "count"
x.remove(y)
aml = H2OAutoML(max_runtime_secs = 30, seed = seed)
aml.train(x = x, y = y,
          training_frame = train,
          validation_frame = valid,
          leaderboard_frame = test)

AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [56]:
# Get the metrics for the different techniques
gbm_metrics = gbm_model.model_performance(test)
glm_metrics = glm_model.model_performance(test)
lb = aml.leaderboard
aml_leader_metrics = aml.leader.model_performance(test)

p = 100 # for display
print('RMSE (root mean squared error) for each algorithm')
print(f"gbm = {int(gbm_metrics['RMSE']*p)/p}, glm = {int(glm_metrics['RMSE']*p)/p}, autoML = {int(aml_leader_metrics['RMSE']*p)/p}")
print('')
lb

RMSE (root mean squared error) for each algorithm
gbm = 146.69, glm = 147.68, autoML = 86.29



model_id,mean_residual_deviance,rmse,mae,rmsle
StackedEnsemble_BestOfFamily_0_AutoML_20180228_171307,7446.19,86.2913,58.1043,0.678415
StackedEnsemble_AllModels_0_AutoML_20180228_171307,7474.8,86.4569,58.4997,0.663193
GBM_grid_0_AutoML_20180228_171307_model_2,7493.92,86.5674,58.9048,
GBM_grid_0_AutoML_20180228_171307_model_0,7584.34,87.0881,59.6296,
GBM_grid_0_AutoML_20180228_171307_model_1,7593.25,87.1393,59.2504,0.688517
GBM_grid_0_AutoML_20180228_171307_model_4,7633.15,87.3679,59.3719,0.667084
XRT_0_AutoML_20180228_171307,7867.21,88.6973,60.4383,0.71199
GBM_grid_0_AutoML_20180228_171307_model_3,7885.1,88.7981,60.6105,
DRF_0_AutoML_20180228_171307,7940.4,89.1089,60.1474,0.704953
DeepLearning_0_AutoML_20180228_171307,8234.62,90.7448,58.6519,


