# Hopsworks Machine Learning (HSML) - Quickstart Notebook

#### Install dependencies

In [1]:
#!pip install faker

Collecting faker
  Downloading Faker-12.3.0-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 10.3 MB/s eta 0:00:01
Installing collected packages: faker
Successfully installed faker-12.3.0


## Generate dataset and train a classifier

### Generate dataset for binary classification

In [2]:
# Credit to https://github.com/minimaxir/ml-data-generator

from faker import Faker
from datetime import datetime
import pandas as pd
import numpy as np
from collections import Counter

# Utils

# Numeric weight looked up for each value of the `cat1` column.
cat1_dict = {
    1: 2,
    2: -1,
    3: 4,
    4: 6,
    5: -6,
    6: 4,
    7: 9,
    8: 10,
    9: -4,
    10: 0,
}

# Numeric weight looked up for each value of the `cat2` column.
cat2_dict = {
    'a': 1,
    'b': 4,
    'c': 15,
}

def text1_transform(text):
    """Turn a sentence into a single score based on its word lengths.

    The text is split on single spaces. The word at position ``i``
    (0-based) contributes ``len(word) ** (1 + i / 100)``, and every
    contribution is divided by the number of words before being summed.

    Args:
        text: The sentence to score.

    Returns:
        The summed, position-weighted word-length score.
    """
    words = text.split(" ")
    word_count = len(words)

    return sum(
        np.power(len(word), 1 + position / 100) / word_count
        for position, word in enumerate(words)
    )

def text2_transform(text):
    """Count how many lowercase vowels (a, e, i, o, u) occur in ``text``.

    Uppercase vowels are not counted.
    """
    vowels = {'a', 'e', 'i', 'o', 'u'}
    return sum(1 for char in text if char in vowels)

def cat1_transform(key):
    """Return the weight stored in ``cat1_dict`` for a ``cat1`` category.

    Raises:
        KeyError: If ``key`` is not a key of ``cat1_dict``.
    """
    weight = cat1_dict[key]
    return weight

def cat2_transform(key):
    """Return the weight stored in ``cat2_dict`` for a ``cat2`` category.

    Raises:
        KeyError: If ``key`` is not a key of ``cat2_dict``.
    """
    weight = cat2_dict[key]
    return weight

def datetime2_transform(datetime_col):
    """Map each timestamp to the sum of three component weights.

    Per element:
      * hour: 0.1 when the hour is <= 6 or >= 20, otherwise 0.3
      * day of week: 0.2 when ``dayofweek`` >= 5, otherwise 0.5
      * year: 0.2 for 2017, otherwise 0.5

    Args:
        datetime_col: A pandas Series exposing the ``.dt`` accessor.

    Returns:
        A NumPy array with one summed weight per element.
    """
    parts = datetime_col.dt
    hours = parts.hour.to_numpy()
    weekdays = parts.dayofweek.to_numpy()
    years = parts.year.to_numpy()

    hour_tf = np.where((hours <= 6) | (hours >= 20), 0.1, 0.3)
    dayofweek_tf = np.where(weekdays >= 5, 0.2, 0.5)
    year_tf = np.where(years == 2017, 0.2, 0.5)

    return hour_tf + dayofweek_tf + year_tf

# Generate dataset

# Seed both random sources so that "Restart Kernel -> Run All" reproduces
# the exact same dataset.
SEED = 42
np.random.seed(SEED)
Faker.seed(SEED)

fake = Faker()
nrow = 100
df = pd.DataFrame()

# Dummy data

df['id'] = range(1, nrow + 1)
df['name'] = [fake.name() for _ in range(nrow)]

# Numeric data

df['num1'] = np.random.normal(0, 1, nrow)
df['num2'] = np.random.randint(1, 100, nrow)

# Text data

df['text1'] = [fake.sentence(nb_words=10, variable_nb_words=True)
               for _ in range(nrow)]
df['text2'] = [fake.sentence(nb_words=4, variable_nb_words=True)
               for _ in range(nrow)]

# Categorical data

# randint's upper bound is exclusive: use 11 so every key of cat1_dict
# (1..10) can actually be drawn.
df['cat1'] = np.random.randint(1, 11, nrow)
df['cat2'] = np.random.choice(['a', 'b', 'c'], nrow, p=[0.5, 0.45, 0.05])

# Datetime data

df['datetime1'] = [fake.date_time_between(
    start_date=datetime(2017, 1, 1),
    end_date=datetime(2018, 12, 31))
    for _ in range(nrow)]
# datetime2 = datetime1 plus a random offset of up to 72 hours (in seconds),
# drawn for all rows at once instead of row by row.
offset_seconds = np.random.randint(0, 72 * 60 * 60, nrow)
df['datetime2'] = df['datetime1'] + pd.to_timedelta(offset_seconds, unit='s')

# Process each generated field to derive a target statistic.

num1_tf = df['num1']**2
num2_tf = df['num2'] / 10 * num1_tf

text1_tf = df['text1'].apply(text1_transform)
text2_tf = df['text2'].apply(text2_transform)

# Gap between the two timestamps, expressed in units of 12 hours.
datetime1_tf = (df['datetime2'] - df['datetime1'])
datetime1_tf = datetime1_tf / np.timedelta64(1, 'h') / 12
datetime2_tf = datetime2_transform(df['datetime2'])

cat1_tf = df['cat1'].apply(cat1_transform)
cat2_tf = df['cat2'].apply(cat2_transform)

# NOTE(review): np.max reduces the summed Series to one scalar, so the
# categorical term adds the same constant to every row and cannot influence
# the median split below. Confirm whether a per-row term was intended.
df['target'] = (np.log10(num1_tf + num2_tf) +
                np.power(text1_tf * text2_tf, datetime2_tf) +
                datetime1_tf + np.max(cat1_tf + cat2_tf))

# Binarize the continuous target at its median: label 0 for the lower half,
# label 1 for the upper half.
bins = df['target'].quantile([0.0, 0.5, 1.0])
df['target'] = pd.cut(df['target'], bins=bins, labels=False, include_lowest=True).astype(int)
# Optional: persist the generated dataset.
# df.to_csv("gen_df_binary.csv", index=False)

### Train a dummy classifier

In [4]:
import numpy as np
import joblib
from sklearn.dummy import DummyClassifier

# Train on the labels generated with the dataset rather than on unrelated
# random labels, and keep the label column out of the features.
X = df.drop(columns=['target'])
y = df['target'].to_numpy()

# Baseline that always predicts the most frequent class seen in `y`.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, y)

# Accuracy on the training data; stored as a model metric when registering.
acc = dummy_clf.score(X, y)

# Serialize the fitted classifier to a local file for upload.
joblib.dump(dummy_clf, "dummy_classifier.pkl")

['dummy_classifier.pkl']

## Register and serve a model with HSML

### Create a connection with Hopsworks

In [6]:
import hsml

# Open the HSML connection used by the cells below to reach the Model
# Registry and Model Serving. No arguments are passed here.
# The connection reports that `.close()` terminates it gracefully.
connection = hsml.connection()

Connected. Call `.close()` to terminate connection gracefully.


### Register the classifier in the Model Registry

In [7]:
# Get a handle to the Model Registry through the open connection.
# `mr` is used below to create the model and to look it up again.
mr = connection.get_model_registry()

In [8]:
from hops import hdfs

# Create the target directory and upload the pickled classifier into it,
# replacing any copy that is already there.
hdfs_model_dir = "/Resources/dummy_classifier"
hdfs.mkdir(hdfs_model_dir)
hdfs.copy_to_hdfs("dummy_classifier.pkl", hdfs_model_dir, overwrite=True)

In [10]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Build the model schema from the training features and labels.
input_schema = Schema(X)
output_schema = Schema(y)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Create the registry entry with the training accuracy as a metric and the
# feature frame as the input example.
dummy_model = mr.python.create_model(
    "dummy_classifier",
    description="Dummy binary classifier",
    metrics={'accuracy': acc},
    model_schema=model_schema,
    input_example=X,
)
# Save the model from the directory the pickle was uploaded to.
dummy_model.save("/Resources/dummy_classifier")

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


Exported model dummy_classifier with version 1


<hsml.python.model.Model at 0x7f627bb51d00>

### Deploy the classifier

In [11]:
# Get a handle to Model Serving through the open connection.
# `ms` is used below to create the predictor and the deployment.
ms = connection.get_model_serving()

In [12]:
# Look up the "dummy_classifier" model in the registry by its "accuracy"
# metric with direction "max" (presumably the highest-accuracy version;
# confirm against the HSML docs). This rebinds `dummy_model`.
dummy_model = mr.get_best_model("dummy_classifier", "accuracy", "max")

In [None]:
# Create a deployment named "dummyclassifier" directly from the model object.
# NOTE(review): the cells below also build a predictor and call
# `ms.create_deployment(...)`. Confirm whether both paths are meant to run,
# or whether they are alternatives.
dummy_model.deploy(name="dummyclassifier")

In [None]:
# Build a predictor for the selected model through Model Serving.
dummy_predictor = ms.create_predictor(dummy_model)

In [None]:
# Inspect the predictor before deploying it.
dummy_predictor.describe()

In [None]:
# Feeds the predictor's own dict representation back into itself.
# NOTE(review): the purpose of this round-trip is not evident from the
# notebook. Confirm that this cell is needed.
dummy_predictor.update_from_response_json(dummy_predictor.to_dict())

In [None]:
# Create a deployment from the predictor through Model Serving.
# NOTE(review): an earlier cell already calls `dummy_model.deploy(...)`.
# Confirm the two do not conflict.
dummy_classifier = ms.create_deployment(dummy_predictor)

In [None]:
# Inspect the resulting deployment.
dummy_classifier.describe()