In [7]:
! apt-get install default-jre
!java -version

Reading package lists... Done
Building dependency tree       
Reading state information... Done
default-jre is already the newest version (2:1.11-68ubuntu1~18.04.1).
default-jre set to manually installed.
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded.
openjdk version "11.0.16" 2022-07-19
OpenJDK Runtime Environment (build 11.0.16+8-post-Ubuntu-0ubuntu118.04)
OpenJDK 64-Bit Server VM (build 11.0.16+8-post-Ubuntu-0ubuntu118.04, mixed mode, sharing)


In [8]:
! pip install h2o

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting h2o
  Downloading h2o-3.38.0.1.tar.gz (177.2 MB)
[K     |████████████████████████████████| 177.2 MB 43 kB/s 
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.38.0.1-py2.py3-none-any.whl size=177276478 sha256=a971e4055f906e24c33f7770205afcbd022e057b308a542fe59fd4e999fe9559
  Stored in directory: /root/.cache/pip/wheels/a1/a1/d9/bb37df368c4635a707e7362d1088450b606041f05aeba5f173
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.38.0.1


In [9]:
import h2o
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [10]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error.

    Negative predictions are clipped to 0 before the error is computed. The
    clipping is applied to a copy, so the caller's `y_pred` is never modified.

    Args:
        y_true (array-like): n-dimensional vector of ground-truth values (all >= 0)
        y_pred (array-like): n-dimensional vector of predicted values

    Returns:
        A scalar float with the rmsle value

    Raises:
        AssertionError: if `y_true` contains negative values, or if either
            input contains NaN.
        ValueError: if `y_true` and `y_pred` have different shapes.

    Note: You can alternatively use sklearn and just do:
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    # Work on plain float arrays: this accepts lists / pandas objects and
    # avoids index alignment surprises when subtracting pandas Series.
    y_true = np.asarray(y_true, dtype=float)
    # np.maximum returns a new array, leaving the caller's data untouched.
    y_pred = np.maximum(np.asarray(y_pred, dtype=float), 0)

    assert (y_true >= 0).all(), 'Received negative y_true values'
    # After clipping, this can only fail when y_pred contains NaN.
    assert (y_pred >= 0).all(), 'Received negative y_pred values'

    # Fail loudly: mismatched shapes would otherwise broadcast (e.g. (n,) vs
    # (n, 1) -> (n, n)) and silently produce a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            'y_true and y_pred have different shapes: '
            f'{y_true.shape} vs {y_pred.shape}'
        )

    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))

In [11]:
# Connect to an H2O instance on localhost, starting a local server if none is running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.16" 2022-07-19; OpenJDK Runtime Environment (build 11.0.16+8-post-Ubuntu-0ubuntu118.04); OpenJDK 64-Bit Server VM (build 11.0.16+8-post-Ubuntu-0ubuntu118.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpe5r02z1o
  JVM stdout: /tmp/tmpe5r02z1o/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpe5r02z1o/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.1
H2O_cluster_version_age:,"21 days, 16 hours and 24 minutes"
H2O_cluster_name:,H2O_from_python_unknownUser_c3ee3f
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.172 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [13]:
# Load the training and test CSVs from GitHub into H2O frames.
# NOTE(review): both URLs embed a `token=` query parameter, i.e. an access
# credential hardcoded in the notebook. Presumably these tokens are short-lived,
# which would also break a later re-run — confirm, and consider reading the
# files from a local path or supplying the credential via the environment.
# NOTE(review): the file name `train_log_h2o.csv` suggests the training target
# is log-transformed (predictions are passed through expm1 further down) — confirm.
train = h2o.import_file("https://raw.githubusercontent.com/haakonnese/tdt-4173-revenue/main/own_data/train_log_h2o.csv?token=GHSAT0AAAAAABYAXXBLTMZVWMS7JE7LBQWWY2FDURQ")
test = h2o.import_file("https://raw.githubusercontent.com/haakonnese/tdt-4173-revenue/main/own_data/test_h2o.csv?token=GHSAT0AAAAAABYAXXBKHGBLFGZPEWIXRFEQY2FDUGQ")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [15]:
# Read the merged dataset with pandas to find the object-dtype columns;
# their names are used as the list of categorical features.
# NOTE(review): the URL embeds a `token=` credential — see if it can be
# supplied from outside the notebook instead.
pd_df = pd.read_csv("https://raw.githubusercontent.com/haakonnese/tdt-4173-revenue/main/own_data/all_merged_h2o.csv?token=GHSAT0AAAAAABYAXXBKG6V7JR3R5K5IG5IWY2FDT5A")
# "object" selects the same dtype as the `np.object0` alias, which was
# deprecated in NumPy 1.24 and removed in NumPy 2.0.
categorical_features = list(pd_df.select_dtypes(include=["object"]).columns)

In [16]:
# Convert the categorical columns to factors in both frames (train first, then test).
for frame in (train, test):
    frame[categorical_features] = frame[categorical_features].asfactor()

In [17]:
# Response column, and every other training column as a predictor.
y = "revenue"
x = list(train.columns)
del x[x.index(y)]  # raises ValueError if the response column is missing

In [None]:
# Run AutoML on the training frame: at most 15 models, fixed seed.
aml = H2OAutoML(seed=1, max_models=15)
aml.train(training_frame=train, y=y, x=x)

AutoML progress: |███

In [None]:
# Keep a handle to the AutoML run's leaderboard for inspection.
lb = aml.leaderboard

In [None]:
# Show the first rows of the leaderboard.
lb.head()

In [None]:
# Predict on the test frame with the trained AutoML object and convert the
# resulting H2O frame to a local data frame.
preds = aml.predict(test).as_data_frame()

In [None]:
# Take the "predict" column as a NumPy array and apply expm1 (the inverse of log1p).
preds = np.expm1(np.array(preds["predict"]))

In [None]:
# Score the predictions against the revenue column of the test frame.
true_revenue = test.as_data_frame()["revenue"]
rmsle(true_revenue, preds)

In [None]:
# Display the "revenue" column of the test frame.
test["revenue"]

revenue
6.853
1.733
3.721
50.238
0.26
3.169
16.69
5.417
3.349
4.389


