# Model Training 

In [1]:
# Model training and tuning
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Hopsworks platform and feature store
import hopsworks
from hsfs.feature import Feature
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# General-purpose libraries
import numpy as np
import joblib
import os

## Connecting to Hopsworks

In [2]:
# Open a Hopsworks session; no credentials are passed explicitly here.
project = hopsworks.login()
# Handle to the project's feature store, used below to look up the feature group and feature view.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1133716
Connected. Call `.close()` to terminate connection gracefully.


In [3]:
# Look up version 1 of the "earthquakes" feature group and build a query over all of its columns.
earthquakes_fs = fs.get_feature_group(name="earthquakes", version=1)
query = earthquakes_fs.select_all()

# Fetch the "earthquakes" feature view (creating it from the query if needed),
# with "mag" declared as the label column.
feature_view = fs.get_or_create_feature_view(
    name="earthquakes",
    version=1,
    labels=["mag"],
    query=query,
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1133716/fs/1124419/fv/earthquakes/version/1


## Train/Test Split

In [4]:
# Split the feature view into train/test frames, holding out 20% for testing;
# features come back as X_* and the label as y_*.
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (152.30s) 



In [5]:
# Remove the identifier and timestamp columns so they are not used as model inputs.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: re-running it no longer raises KeyError once the columns are already gone.
NON_FEATURE_COLUMNS = ['id', 'time']
X_train = X_train.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")
X_test = X_test.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")

In [6]:
# Inspect the training features that remain after dropping 'id' and 'time'.
X_train

Unnamed: 0,latitude,longitude,depth,deptherror,rms,reviewed
0,8.976400,126.256900,77.0290,6.848000,0.41,1.0
1,51.859500,-178.019333,3.8100,1.160000,0.18,1.0
2,31.614000,-103.995000,7.5482,0.385222,0.10,1.0
4,51.807500,-177.984500,-1.7600,1.010000,0.13,1.0
5,33.676167,-116.987000,9.8100,0.400000,0.07,1.0
...,...,...,...,...,...,...
167203,38.817167,-122.817500,2.4300,0.620000,0.02,1.0
167204,34.030167,-117.553667,11.8500,0.330000,0.14,1.0
167205,33.347667,-116.340167,13.5600,0.660000,0.20,1.0
167206,33.440000,-116.556000,10.0600,0.340000,0.13,1.0


## Hyperparameter Search and Model Fitting

In [7]:
# Fixed seed shared by the estimator and the search so that a fresh
# "Restart & Run All" samples the same candidates and fits the same models.
# Without it, every run draws different hyperparameters and reports a different best score.
RANDOM_STATE = 42

model = HistGradientBoostingRegressor(random_state=RANDOM_STATE)

# Candidate values for each hyperparameter; the search samples combinations from these lists.
params = {
    'learning_rate': [0.1, 0.01, 0.001],
    'l2_regularization': [0, 1e-5, 1e-3],
    'max_iter': [50, 100, 200, 300],
    'max_leaf_nodes': [21, 32, 41, 51],
    'min_samples_leaf': [10, 15, 20, 30, 50],
}

# Note: despite the name `gs`, this is a randomized search (50 sampled candidates),
# scored by negative MSE so that higher is better. Cross-validation is left at the library default.
gs = RandomizedSearchCV(
    model,
    params,
    scoring='neg_mean_squared_error',
    n_iter=50,
    random_state=RANDOM_STATE,
)

# The label frame is flattened to the 1-D target array the estimator expects.
gs.fit(X_train, y_train.values.ravel())
gs.cv_results_

{'mean_fit_time': array([46.6086762 , 55.38788981,  3.95370631,  1.03344259,  1.70561438,
         5.06452518, 16.76599922,  9.74602556,  3.11583352,  4.67550745,
        10.47686524, 13.14198413,  5.3938314 ,  2.14934468,  1.96487966,
        11.48361406,  7.98168588, 16.31535268,  2.23444295,  9.13930101,
         9.32534075, 12.95412493,  5.7318881 , 10.80747213, 10.6608027 ,
         2.46382508,  1.66468315,  7.44494667,  3.82358537,  8.23185687,
         3.07700071,  6.50402336,  7.02529764,  8.51303797,  2.3146594 ,
         1.9899663 ,  7.08103676,  3.51435733,  2.24665322, 12.65068235,
         3.091747  ,  9.22259827,  4.4507112 ,  5.22776108,  1.91727533,
         1.74796877,  8.10907378,  7.95662889,  4.56613946,  2.09121294]),
 'std_fit_time': array([58.20542698, 46.19474496,  0.8938125 ,  0.17063081,  0.21739717,
         1.86060773,  2.78399978,  0.95346771,  0.44637559,  0.56047455,
         1.23553299,  3.31509091,  0.83876083,  1.21442485,  0.39016854,
         1.11719

In [8]:
# Best cross-validated score from the search; the scorer is negative MSE, so values closer to 0 are better.
gs.best_score_

-0.14336991608171357

In [9]:
# Estimator configured with the best-scoring hyperparameters found by the search.
gs.best_estimator_

In [10]:
# Predictions on the training features, for a quick visual comparison with the true training labels.
gs.predict(X_train)

array([4.47425343, 1.20708176, 2.06020824, ..., 1.00536757, 0.65802333,
       4.39186953])

In [11]:
# True training labels flattened to 1-D, to compare against the model's training predictions.
y_train.values.ravel()

array([3.9 , 1.37, 2.1 , ..., 0.82, 0.93, 4.9 ])

In [12]:
# The search scores with negative MSE, so flip the sign to report a plain test-set MSE.
neg_mse_test = gs.score(X_test, y_test.values.ravel())
score = -neg_mse_test

In [13]:
# Held-out test-set mean squared error (the sign-flipped search score).
score

0.14266585298314563

## Model Registry

In [14]:
# Handle to the project's model registry.
mr = project.get_model_registry()

# Local staging directory for the model artifacts. makedirs(exist_ok=True) replaces the
# earlier "check isdir, then mkdir" sequence, which was racy and failed the idempotence
# goal less directly; this form is safe to re-run.
model_dir = "earthquakes_model"
os.makedirs(model_dir, exist_ok=True)

# Serialize the best estimator found by the search into the staging directory.
joblib.dump(gs.best_estimator_, os.path.join(model_dir, "earthquakes_model.pkl"))

# Describe the model's inputs and outputs from the training frames.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# Create the registry entry, recording the test-set MSE as its metric.
earthquakes_model = mr.python.create_model(
    name="earthquakes_model",
    metrics={"mse": score},
    model_schema=model_schema,
    description="Earthquake Magnitude Predictor",
)

# Upload everything in the staging directory to the registry.
earthquakes_model.save(model_dir)

Connected. Call `.close()` to terminate connection gracefully.


  0%|          | 0/6 [00:02<?, ?it/s]

Uploading: 0.000%|          | 0/1434088 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/637 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1133716/models/earthquakes_model/1


Model(name: 'earthquakes_model', version: 1)