In [1]:
import numpy as np 
import h2o

# Connect to a local H2O cluster, starting one if none is running (the captured
# output below shows a new server being launched). max_mem_size is given as a
# bare integer -- presumably gigabytes, matching the ~1.8 Gb free memory reported.
h2o.init(max_mem_size = 2)           
# Remove every object currently held by the cluster so the run starts clean.
h2o.remove_all()               

from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.xgboost import H2OXGBoostEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

#help(h2o)
#help(H2ODeepLearningEstimator)
#help(h2o.import_file)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_121"; OpenJDK Runtime Environment (Zulu 8.20.0.5-macosx) (build 1.8.0_121-b15); OpenJDK 64-Bit Server VM (Zulu 8.20.0.5-macosx) (build 25.121-b15, mixed mode)
  Starting server from /anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpsmiatx1r
  JVM stdout: /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpsmiatx1r/h2o_mohsenkiskani_started_from_python.out
  JVM stderr: /var/folders/hy/j_c72d1x72g_rr58tgrlh3b40000gn/T/tmpsmiatx1r/h2o_mohsenkiskani_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.11
H2O cluster version age:,1 month and 3 days
H2O cluster name:,H2O_from_python_mohsenkiskani_ntf1lf
H2O cluster total nodes:,1
H2O cluster free memory:,1.778 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [2]:
def mape_func(labels, preds):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|preds - labels| / |labels|) * 100`` over all elements.

    Parameters
    ----------
    labels : array-like
        Ground-truth values. A zero label makes the corresponding term
        infinite/NaN, exactly as plain numpy division does.
    preds : array-like
        Predicted values.

    Returns
    -------
    float
        MAPE expressed as a percentage.

    Notes
    -----
    When both inputs are vector-like with the same number of elements but
    different shapes (e.g. an ``(n, 1)`` column and an ``(n,)`` vector), they
    are flattened first. Without that, numpy would broadcast them to an
    ``(n, n)`` matrix and silently return a meaningless score. Every other
    shape combination follows ordinary numpy broadcasting.
    """
    labels = np.asarray(labels, dtype=float)
    preds = np.asarray(preds, dtype=float)

    same_count_different_shape = (
        labels.shape != preds.shape and labels.size == preds.size
    )
    both_vector_like = np.squeeze(labels).ndim <= 1 and np.squeeze(preds).ndim <= 1
    if same_count_different_shape and both_vector_like:
        labels = labels.ravel()
        preds = preds.ravel()

    return np.mean(np.abs((preds - labels) / labels)) * 100

In [5]:
import os

# Load the training and test CSVs (resolved relative to the working directory)
# into the H2O cluster.
train_h2o_all = h2o.import_file(path = os.path.realpath("train.csv"))
test_h2o      = h2o.import_file(path = os.path.realpath("test.csv"))

# 80/20 train/validation split; the fixed seed keeps the split repeatable.
train_h2o, val_h2o = train_h2o_all.split_frame(ratios = [0.8], seed = 7)

# Target column, selected by position.
# NOTE(review): index 13 is presumably the last column of train.csv -- confirm.
y_cols = train_h2o_all.col_names[13]

# Features: every column except the first and the last. The target is also
# excluded explicitly so it cannot leak into the features if it ever stops
# being the last column.
X_cols = [col for col in train_h2o_all.col_names[1:-1] if col != y_cols]

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [None]:
# Deep network with five hidden layers of 40 units, epochs=10000.
deep_long_model_1 = H2ODeepLearningEstimator(hidden=[40,40,40,40,40], epochs = 10000)
deep_long_model_1.train(X_cols, y_cols, train_h2o)
pred_long_model_1 = deep_long_model_1.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()`, which pandas deprecated in 0.23
# and removed in 1.0.
mape_func(val_h2o[y_cols].as_data_frame().values, pred_long_model_1.values)
# Scores noted from earlier runs (presumably validation MAPE in %), by hidden layout:
#21.88915821827038     [20,20,20,20,20]
#21.812563482703755    [40,40,40,40,40]

In [None]:
# Baseline deep-learning model: default architecture, 10 epochs.
dl_model = H2ODeepLearningEstimator(epochs=10)
dl_model.train(X_cols, y_cols, train_h2o)
pred = dl_model.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
mape_func(val_h2o[y_cols].as_data_frame().values, pred.values)

# Score noted from an earlier run (presumably validation MAPE in %):
#22.15701987460977

In [None]:
# Gradient boosting machine with default hyper-parameters.
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(X_cols, y_cols, train_h2o)
pred = gbm_model.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
mape_func(val_h2o[y_cols].as_data_frame().values, pred.values)

# Score noted from an earlier run (presumably validation MAPE in %):
#22.505163722860498

In [None]:
# Distributed random forest. The 3-fold "Modulo" cross-validation with kept
# hold-out predictions matches the XGBoost model's settings; both models are
# later passed as base models to the stacked ensemble.
drf_model = H2ORandomForestEstimator(max_depth = 40,
    nfolds = 3, fold_assignment="Modulo", keep_cross_validation_predictions=True)
drf_model.train(X_cols, y_cols, train_h2o)
pred = drf_model.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
mape_func(val_h2o[y_cols].as_data_frame().values, pred.values)

# Scores noted from earlier runs (presumably validation MAPE in %):
#18.126878064811716
#18.115947532453333 max_depth = 40 

In [None]:
# Generalized linear model with default hyper-parameters.
glm_model = H2OGeneralizedLinearEstimator()
# Use the native `train(x, y, training_frame)` entry point like every other
# model in this notebook. `fit(X, y)` is H2O's scikit-learn compatibility shim,
# meant for sklearn pipelines/grid search.
glm_model.train(X_cols, y_cols, train_h2o)
# Display the fitted model as the cell output (the original `fit(...)` call
# was the last expression of the cell).
glm_model

In [None]:
# Pair each fitted baseline model with its display name, then split the pairs
# into the two parallel lists.
named_models = [
    ("Deep Learning", dl_model),
    ("Gradient Boosted Method", gbm_model),
    ("Distributed Random Forest", drf_model),
    ("Generalized Linear Model", glm_model),
]
models = [estimator for _, estimator in named_models]
m_names = [label for label, _ in named_models]

In [None]:
# Checkpointed training: each stage is built from the previous stage's model
# (passed as `checkpoint`) with a larger `epochs` value.
dl_1 = H2ODeepLearningEstimator(epochs=1)
dl_1.train(x=X_cols, y=y_cols, training_frame=train_h2o)

dl_250 = H2ODeepLearningEstimator(checkpoint=dl_1, epochs=250)
dl_250.train(x=X_cols, y=y_cols, training_frame=train_h2o)

dl_500 = H2ODeepLearningEstimator(checkpoint=dl_250, epochs=500)
dl_500.train(x=X_cols, y=y_cols, training_frame=train_h2o)

dl_750 = H2ODeepLearningEstimator(checkpoint=dl_500, epochs=750)
dl_750.train(x=X_cols, y=y_cols, training_frame=train_h2o)

models_dl = [dl_1, dl_250, dl_500, dl_750]

# Label each stage with the epoch count reported in its parameters.
m_names_dl = []
for staged_model in models_dl:
    actual_epochs = staged_model.get_params()['epochs']['actual_value']
    m_names_dl.append("DL " + str(int(actual_epochs)) + " Epochs")

In [None]:
# Score every checkpoint stage on the validation split.
pred_1   = dl_1.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
pred_250 = dl_250.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
pred_500 = dl_500.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
pred_750 = dl_750.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)

# Download the validation labels once instead of once per model.
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
val_labels = val_h2o[y_cols].as_data_frame().values

print(mape_func(val_labels, pred_1.values))
print(mape_func(val_labels, pred_250.values))
print(mape_func(val_labels, pred_500.values))
print(mape_func(val_labels, pred_750.values))

# Output noted from an earlier run, in print order (dl_1, dl_250, dl_500, dl_750):
#28.603573415730754
#22.39875197740143
#22.7802683842562
#22.04537421795538

In [None]:
# Network-topology comparison: (hidden layer sizes, epochs) for each candidate.
# The last network is given 1000 epochs, the others 500.
network_specs = [
    ([1000], 500),
    ([200,200], 500),
    ([42,42,42], 500),
    ([11,13,17,19], 1000),
]

# Build and train the networks one after another, in the listed order.
models_network = []
for hidden_layers, n_epochs in network_specs:
    network = H2ODeepLearningEstimator(hidden=hidden_layers, epochs=n_epochs)
    network.train(X_cols, y_cols, train_h2o)
    models_network.append(network)

# Individual names used by the evaluation cell.
dl_l1, dl_l2, dl_l3, dl_l4 = models_network

# Display names such as "200 x 200", derived from the layer sizes.
m_names_network = [" x ".join(str(width) for width in hidden_layers)
                   for hidden_layers, _ in network_specs]


# ### Activation Functions
# Next, we compare different activation functions, including one with
# 50% dropout regularization in the hidden layers.

activation_methods = ["Tanh","Maxout","Rectifier","RectifierWithDropout"]

models_act = []
m_names_act = []
for method in activation_methods:
    # Same 100 x 100 architecture and epoch setting for every activation.
    act_model = H2ODeepLearningEstimator(activation=method, hidden=[100,100], epochs=1000)
    act_model.train(X_cols, y_cols, train_h2o)
    models_act.append(act_model)
    m_names_act.append("DL "+ method + " Activation")

In [None]:
# Score the topology models on the validation split.
pred_l1   = dl_l1.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
pred_l2   = dl_l2.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
pred_l3   = dl_l3.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
pred_l4   = dl_l4.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)

# Download the validation labels once instead of once per model.
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
val_labels = val_h2o[y_cols].as_data_frame().values

print(mape_func(val_labels, pred_l1.values))
print(mape_func(val_labels, pred_l2.values))
print(mape_func(val_labels, pred_l3.values))
print(mape_func(val_labels, pred_l4.values))

# Score the activation-function models; iterating the two parallel lists
# avoids hard-coding how many models there are.
for act_name, act_model in zip(m_names_act, models_act):
    pred = act_model.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
    print(act_name, " score = " , mape_func(val_labels, pred.values))


# Output noted from an earlier run, in print order:
#24.898107487754242
#23.07006928025099
#22.494132169347903
#22.454395504979217
#DL Tanh Activation  score =  24.994449602375052
#DL Maxout Activation  score =  25.168343688143718
#DL Rectifier Activation  score =  23.07230430482452
#DL RectifierWithDropout Activation  score =  42.88493826250886

In [None]:
# Two hidden layers of 400 units, epochs=1000.
dl_l5 = H2ODeepLearningEstimator(hidden=[400,400], epochs=1000)
dl_l5.train(X_cols, y_cols, train_h2o)
pred_l5   = dl_l5.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
print(mape_func(val_h2o[y_cols].as_data_frame().values, pred_l5.values))
# Count the negative predictions.
np.sum(pred_l5.values<0)
# Output noted from an earlier run (printed score, then negative-prediction count):
# 22.50300980342744
# 5

In [None]:
# Three hidden layers of 40 units, epochs=1000.
dl_l6 = H2ODeepLearningEstimator(hidden=[40,40,40], epochs=1000)
dl_l6.train(X_cols, y_cols, train_h2o)
pred_l6 = dl_l6.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
print(mape_func(val_h2o[y_cols].as_data_frame().values, pred_l6.values))
# Count the negative predictions.
np.sum(pred_l6.values<0)
# Output noted from an earlier run (printed score, then negative-prediction count):
#22.41066233203548
#0

In [None]:
# Four hidden layers of 10 units, epochs=1000.
dl_l7 = H2ODeepLearningEstimator(hidden=[10,10,10,10], epochs=1000)
dl_l7.train(X_cols, y_cols, train_h2o)
pred_l7 = dl_l7.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
print(mape_func(val_h2o[y_cols].as_data_frame().values, pred_l7.values))
# Count the negative predictions.
np.sum(pred_l7.values<0)
# Output noted from an earlier run (printed score, then negative-prediction count):
#23.002932001739524
#0

In [None]:
# One-epoch deep-learning model with variable importances enabled, trained
# with the validation split attached as the validation frame.
model_d1 = H2ODeepLearningEstimator(
    model_id="model_d1",
    epochs=1,
    variable_importances=True,
)
model_d1.train(
    x=X_cols,
    y=y_cols,
    training_frame=train_h2o,
    validation_frame=val_h2o,
)

In [None]:
# XGBoost with 3-fold "Modulo" cross-validation; the hold-out predictions are
# kept, matching the random forest's settings so both can feed the stacked ensemble.
model_d2 = H2OXGBoostEstimator(nfolds=3, fold_assignment="Modulo", keep_cross_validation_predictions=True)
model_d2.train(X_cols, y_cols, train_h2o)
pred_d2 = model_d2.predict(val_h2o).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
print(mape_func(val_h2o[y_cols].as_data_frame().values, pred_d2.values))
# Scores noted from earlier runs; the trailing label presumably names the
# categorical-encoding setting that was tried -- TODO confirm.
# 20.443528867009622    None 
# 20.443176634720867   'sort_by_response'

In [None]:
# Stacked ensemble over the XGBoost and random-forest base models, which were
# both trained with identical cross-validation settings.
stack = H2OStackedEnsembleEstimator(model_id="my_ensemble", training_frame=train_h2o, validation_frame=val_h2o,
                                    base_models= [model_d2.model_id, drf_model.model_id])
stack.train(X_cols, y_cols, train_h2o)
pred_d3 = stack.predict(val_h2o).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
print(mape_func(val_h2o[y_cols].as_data_frame().values, pred_d3.values))
# NOTE(review): called without a frame, so this presumably reports metrics on
# the training data rather than on val_h2o -- confirm that is intended.
stack.model_performance()

In [None]:
# Shut the cluster down without an interactive prompt. This uses the
# `h2o.cluster().shutdown(...)` form (the same one as the notebook's final
# cell); module-level `h2o.shutdown` is the deprecated spelling.
# Any cell that runs after this one needs `h2o.init()` again first.
h2o.cluster().shutdown(prompt=False)

In [None]:
# Using updated data

# The preceding cell shuts the H2O cluster down, so (re)connect before touching
# any frames. h2o.init() attaches to a running local cluster if there is one
# and otherwise starts a new server, so it is safe to call either way.
h2o.init(max_mem_size = 2)

train_h2o_all = h2o.import_file(path = os.path.realpath("dataFrames/train_updated_June27.csv"))
test_h2o      = h2o.import_file(path = os.path.realpath("dataFrames/test_updated_June27.csv"))

# 80/20 train/validation split; the fixed seed keeps the split repeatable.
train_h2o, val_h2o = train_h2o_all.split_frame(ratios = [0.8], seed = 7)

# Target column, selected by position.
# NOTE(review): index 5 is presumably the 'price' column -- confirm.
y_cols = train_h2o_all.col_names[5]

# Reduced feature set: 9 base columns plus 12 columns named after regressors
# (presumably their predictions). It is kept for reference only and is NOT
# used below; the original cell assigned it to X_cols and then immediately
# overwrote it.
X_cols_subset = ['destinationLatitude', 'destinationLongitude', 'distanceKM', 'sourceLatitude', 'sourceLongitude',
                 'taxiDurationMin', 'weight', 'source', 'destination', 'Ridge', 'KNeighborsRegressor', 'Lasso', 'Pipeline',
                 'XGBRegressor', 'GradientBoostingRegressor', 'BaggingRegressor', 'DecisionTreeRegressor', 'AdaBoostRegressor',
                 'ExtraTreesRegressor', 'RandomForestRegressor', 'LGBMRegressor']

# Features actually used: every column except the row index, the ID and the
# price, plus the target itself in case it is not 'price'. Filtering into a
# new list (rather than calling .remove() three times) keeps the cell idempotent.
excluded_cols = {'C1', 'ID', 'price', y_cols}
X_cols = [col for col in train_h2o_all.col_names if col not in excluded_cols]
len(X_cols)

In [None]:
# Three hidden layers of 40 units, epochs=1000, trained on the updated data.
deep_long_model_1 = H2ODeepLearningEstimator(hidden=[40,40,40], epochs = 1000)
deep_long_model_1.train(X_cols, y_cols, train_h2o)
pred_long_model_1 = deep_long_model_1.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
mape_func(val_h2o[y_cols].as_data_frame().values, pred_long_model_1.values)
# Scores noted from earlier runs (no score was recorded for the second configuration):
# 19.1373295048555     [40,40,40]   all features 
#    [40,40,40]   only 9 continuous and 12 predictions  

In [None]:
# Distributed random forest on the updated data, with the same 3-fold "Modulo"
# cross-validation settings as the XGBoost model so both can feed the stacked ensemble.
drf_model = H2ORandomForestEstimator(max_depth = 40,
    nfolds = 3, fold_assignment="Modulo", keep_cross_validation_predictions=True)
drf_model.train(X_cols, y_cols, train_h2o)
pred = drf_model.predict(val_h2o[X_cols]).as_data_frame(use_pandas=True)
# The score is printed explicitly: only a cell's last expression is displayed,
# so the original bare mape_func(...) call computed the value and discarded it.
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
print(mape_func(val_h2o[y_cols].as_data_frame().values, pred.values))
# Count the negative predictions.
np.sum(pred.values<0)

# NOTE(review): these figures are identical to the ones recorded for the
# random forest trained on the original data -- presumably copied, not re-measured.
#18.126878064811716
#18.115947532453333 max_depth = 40 

In [None]:
# XGBoost on the updated data with 3-fold "Modulo" cross-validation; the
# hold-out predictions are kept, matching the random forest's settings.
model_d2 = H2OXGBoostEstimator(nfolds=3, fold_assignment="Modulo", keep_cross_validation_predictions=True)
model_d2.train(X_cols, y_cols, train_h2o)
pred_d2 = model_d2.predict(val_h2o).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
print(mape_func(val_h2o[y_cols].as_data_frame().values, pred_d2.values))
# NOTE(review): these figures are identical to the ones recorded for the
# XGBoost model on the original data -- presumably copied, not re-measured.
# 20.443528867009622    None 
# 20.443176634720867   'sort_by_response'

In [None]:
# Stacked ensemble over the XGBoost and random-forest base models trained on
# the updated data with identical cross-validation settings.
stack = H2OStackedEnsembleEstimator(model_id="my_ensemble", training_frame=train_h2o, validation_frame=val_h2o,
                                    base_models= [model_d2.model_id, drf_model.model_id])
stack.train(X_cols, y_cols, train_h2o)
pred_d3 = stack.predict(val_h2o).as_data_frame(use_pandas=True)
# `.values` replaces `DataFrame.as_matrix()` (deprecated in pandas 0.23, removed in 1.0).
print(mape_func(val_h2o[y_cols].as_data_frame().values, pred_d3.values))
# NOTE(review): called without a frame, so this presumably reports metrics on
# the training data rather than on val_h2o -- confirm that is intended.
stack.model_performance()

In [None]:
# NOTE(review): the original cell read `pred_df`, which no cell in this
# notebook defines (NameError on a fresh kernel). It is assumed here to be the
# stacked ensemble's predictions for the test set -- confirm which model was
# actually used for this submission.
pred_df = stack.predict(test_h2o).as_data_frame(use_pandas=True)

tt = test_h2o['ID'].as_data_frame()
# NOTE(review): absolute, machine-specific output path; consider making it configurable.
filename = "/Users/mohsenkiskani/Downloads/Ubaar/submissions/submission76.csv"

# Take the raw ID and prediction values directly, instead of slicing
# characters [6:17] out of each row's printed representation, which only
# works while every ID is exactly 11 characters wide.
ids = tt.iloc[:, 0].values
prices = pred_df.iloc[:, 0].values
if len(ids) != len(prices):
    raise ValueError("Got %d test IDs but %d predictions" % (len(ids), len(prices)))

with open(filename, "w") as outputfile:
    outputfile.write("ID,price\n")
    for row_id, price in zip(ids, prices):
        # Prices are rounded up to the next integer, as in the original cell.
        outputfile.write(str(row_id) + "," + str(int(np.ceil(price))) + "\n")

In [None]:
# Final cleanup: shut down the H2O cluster this notebook is connected to.
h2o.cluster().shutdown()