In [None]:
!pip install hopsworks

In [9]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import hopsworks
import numpy as np
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
import joblib
import os
from hsfs.feature import Feature

In [5]:
# Open an authenticated session with Hopsworks and keep the project handle;
# `project` is reused later to reach the model registry.
project = hopsworks.login()
# Handle to the project's feature store, used below to look up feature groups/views.
fs = project.get_feature_store()

Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated

Paste it here: ··········
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/371861
Connected. Call `.close()` to terminate connection gracefully.


In [7]:
# Look up version 1 of the "earthquakes" feature group and build a query
# that selects every feature it contains.
earthquakes_fg = fs.get_feature_group(name="earthquakes", version=1)
query = earthquakes_fg.select_all()

# Fetch (or create on first run) the matching feature view; "mag" is declared
# as the label column, the remaining selected features act as inputs.
feature_view = fs.get_or_create_feature_view(
    name="earthquakes",
    version=1,
    labels=["mag"],
    query=query,
)

In [None]:
# NOTE(review): disabled draft of a second, filtered feature view
# ("earthquakes_in_radius"). It is intentionally not executed. As written it
# assigns `earthquakes_in_radius` but then queries `earthquakes_fg`, and the
# return value of `query.filter(...)` is discarded — confirm intent before
# re-enabling, or delete this cell.
# earthquakes_in_radius = fs.get_feature_group(name="earthquakes", version=1)
# query = earthquakes_fg.select_all()
# query.filter(Feature("latitude") > 1000)
# query.filter(Feature("name").like("max%"))
# feature_view_2 = fs.get_or_create_feature_view(name="earthquakes_in_radius", version=1, labels=["mag"], query=query)

In [8]:
# Materialise the feature view as in-memory train/test features and labels.
# NOTE(review): 0.2 is presumably the test-set fraction — confirm against the
# Hopsworks `FeatureView.train_test_split` signature.
X_train, X_test, y_train, y_test = feature_view.train_test_split(0.2)

Finished: Reading data from Hopsworks, using ArrowFlight (3.61s) 




In [11]:
# Remove the 'id' and 'time' columns so they are not fed to the model.
# The frames are re-bound (no `inplace=True`) and `errors='ignore'` is passed so
# that re-running this cell after the columns are already gone is a no-op
# instead of raising KeyError.
X_train = X_train.drop(columns=['id', 'time'], errors='ignore')
X_test = X_test.drop(columns=['id', 'time'], errors='ignore')

In [12]:
# Display the training features that remain after 'id' and 'time' were dropped.
X_train

Unnamed: 0,latitude,longitude,depth,deptherror,rms,reviewed
0,37.574500,-118.846333,-0.990000,0.400000,0.02,1.0
1,38.771667,-122.710503,1.500000,0.430000,0.02,0.0
2,38.840668,-122.876167,2.610000,0.600000,0.02,0.0
3,36.546333,-89.711833,6.630000,0.450000,0.14,1.0
5,-33.977000,56.142200,10.000000,1.805000,0.58,1.0
...,...,...,...,...,...,...
130777,38.793500,-122.731167,1.470000,0.900000,0.01,1.0
130778,31.681126,-104.295363,7.802588,0.909312,0.10,1.0
130779,38.820167,-122.822998,2.030000,0.990000,0.01,0.0
130780,46.248500,-119.436167,10.470000,0.650000,0.06,1.0


In [22]:
# Randomized hyper-parameter search over a histogram-based gradient boosting
# regressor. Both the estimator and the search are seeded so that the sampled
# candidates and the resulting scores are reproducible on a fresh kernel run.
model = HistGradientBoostingRegressor(random_state=42)

# Candidate values for each hyper-parameter; 50 combinations are sampled from
# this grid (n_iter=50) rather than trying all of them exhaustively.
params = {'learning_rate': [0.1, 0.01, 0.001], 'l2_regularization': [0, 1e-5, 1e-3],
          'max_iter': [50, 100, 200, 300], 'max_leaf_nodes': [21, 31, 41, 51],
          'min_samples_leaf': [10, 15, 20, 30, 50]}
# Scoring is *negated* MSE, so higher (closer to zero) is better.
gs = RandomizedSearchCV(model, params, scoring='neg_mean_squared_error', n_iter=50,
                        random_state=42)
# Labels come back as a single-column frame; flatten to the 1-D array sklearn expects.
gs.fit(X_train, y_train.values.ravel())
gs.cv_results_

{'mean_fit_time': array([0.93143806, 4.65423975, 1.64068913, 4.62918763, 2.98268027,
        5.79108939, 3.25094347, 1.44295311, 1.27163315, 0.92659287,
        0.57852201, 1.76270075, 0.75209436, 1.28377509, 3.08196588,
        1.75933418, 1.11168599, 1.78826857, 2.52577767, 2.67684436,
        3.37164865, 0.58948159, 1.77746892, 1.10110836, 2.28323069,
        0.90869703, 1.45842595, 1.12023005, 1.12503853, 1.63103361,
        4.17787604, 1.42389398, 1.22665267, 0.7262496 , 1.1280839 ,
        0.50574279, 2.19702721, 5.10604773, 1.65151153, 3.11456933,
        1.29723935, 0.69989853, 2.99070215, 1.20179009, 1.96598067,
        1.29130692, 4.1667315 , 1.98975463, 2.23412371, 1.84835019]),
 'std_fit_time': array([0.01956515, 1.30455568, 1.04928986, 1.29100093, 1.06867665,
        1.11499903, 1.03907043, 1.06142015, 0.01567385, 0.7964309 ,
        0.01493951, 1.04650288, 0.01092066, 1.06376159, 1.02622734,
        1.03068951, 0.01232139, 1.04755153, 1.0452568 , 1.02913316,
        1.277

In [25]:
# Best mean cross-validated score found by the search. The search was configured
# with scoring='neg_mean_squared_error', so this value is the negated MSE.
gs.best_score_

-0.1361969746120307

In [23]:
# The estimator configured with the best-scoring sampled hyper-parameters.
gs.best_estimator_

In [26]:
# Predictions on the training features, for an eyeball comparison with the
# training labels shown in the next cell.
gs.predict(X_train)

array([0.47281938, 0.92054758, 0.75998156, ..., 0.65993119, 0.59031939,
       1.71996939])

In [27]:
# Training labels flattened to a 1-D array (same form that was passed to `gs.fit`).
y_train.values.ravel()

array([0.28, 0.53, 0.91, ..., 0.76, 0.19, 1.66])

In [29]:
# Evaluate on the held-out test split. `gs.score` uses the search's scorer
# ('neg_mean_squared_error'), so the sign is flipped to obtain a plain MSE.
score = -gs.score(X_test, y_test.values.ravel())
# A bare assignment displays nothing; end the cell with the value so it is shown.
score

0.13619027263253636

In [33]:
# Persist the tuned estimator locally and register it, together with its
# input/output schema and test MSE, in the Hopsworks model registry.
mr = project.get_model_registry()

# Local staging directory whose contents are uploaded as the model artifact.
# `exist_ok=True` makes the cell safe to re-run and avoids the check-then-create race.
model_dir = "earthquakes_model"
os.makedirs(model_dir, exist_ok=True)

# Serialise only the refit best estimator, not the whole search object.
joblib.dump(gs.best_estimator_, os.path.join(model_dir, "earthquakes_model.pkl"))

# Schemas are inferred from the training features and labels.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

earthquakes_model = mr.python.create_model(
    name="earthquakes_model",
    metrics={"mse" : score},
    model_schema=model_schema,
    description="Earthquake Magnitude Predictor"
)

# Upload the staging directory to the registry.
earthquakes_model.save(model_dir)

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/1103800 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/600 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/371861/models/earthquakes_model/2


Model(name: 'earthquakes_model', version: 2)