# Iris ML Streaming

In [3]:
from hops import jobs, hdfs, serving, featurestore
import tensorflow as tf
from functools import reduce
import time, random
import numpy as np

# Location of the monitoring job jar inside the project's HDFS resources folder
FILE_NAME = 'model-monitoring-1.0-SNAPSHOT.jar'
IRIS_RESOURCES_DIR_NAME = "Resources/Iris/"
IRIS_RESOURCES_DIR = "hdfs:///Projects/" + hdfs.project_name() + "/" + IRIS_RESOURCES_DIR_NAME
APP_PATH = IRIS_RESOURCES_DIR + FILE_NAME
# Names of the served model, the training dataset and the feature group used by the cells below
IRIS_MODEL_NAME="Iris"
IRIS_TRAIN_DATASET_NAME = "iris_train_dataset"
IRIS_FG_NAME = "iris_train_all_features"

# Structured Streaming: job name and main class of the jar
STRUCT_JOB_NAME = 'iris_ml_monitoring_struct'
STRUCT_CLASS_NAME = 'io.hops.monitoring.examples.IrisMLMonitoringStructured'

# Direct Streaming: job name and main class of the jar
DSTREAM_JOB_NAME = 'iris_ml_monitoring_dstream'
DSTREAM_CLASS_NAME = 'io.hops.monitoring.examples.IrisMLMonitoringDStream'

# Choose type of streaming job: uncomment exactly one JOB_NAME/CLASS_NAME pair.
# The structured streaming variant is the one currently selected.

# JOB_NAME = DSTREAM_JOB_NAME
# CLASS_NAME = DSTREAM_CLASS_NAME
JOB_NAME = STRUCT_JOB_NAME
CLASS_NAME = STRUCT_CLASS_NAME

## Spark streaming job

### Job config

In [4]:
def get_spark_dyn_alloc_config(dyn_alloc_enabled=True, dyn_alloc_min_exec=1, dyn_alloc_max_exec=2, dyn_alloc_init_exec=1):
    """Return the Spark dynamic-allocation properties as a dict.

    Args:
        dyn_alloc_enabled: value for "spark.dynamicAllocation.enabled".
        dyn_alloc_min_exec: value for "spark.dynamicAllocation.minExecutors".
        dyn_alloc_max_exec: value for "spark.dynamicAllocation.maxExecutors".
        dyn_alloc_init_exec: value for "spark.dynamicAllocation.initialExecutors".

    Returns:
        A new dict mapping the four property names to the given values.
    """
    dyn_alloc = {}
    dyn_alloc["spark.dynamicAllocation.enabled"] = dyn_alloc_enabled
    dyn_alloc["spark.dynamicAllocation.minExecutors"] = dyn_alloc_min_exec
    dyn_alloc["spark.dynamicAllocation.maxExecutors"] = dyn_alloc_max_exec
    dyn_alloc["spark.dynamicAllocation.initialExecutors"] = dyn_alloc_init_exec
    return dyn_alloc

def get_spark_job_config(dyn_alloc_config, exec_instances=1, exec_gpus=0, exec_cores=1, exec_mem=2048, tf_num_ps=1, black_list_enabled=False):
    """Return the Spark executor properties merged with a dynamic-allocation config.

    Args:
        dyn_alloc_config: properties merged on top of the executor settings;
            keys present in both are taken from this argument.
        exec_instances: value for "spark.executor.instances".
        exec_gpus: value for "spark.executor.gpus".
        exec_cores: value for "spark.executor.cores".
        exec_mem: value for "spark.executor.memory".
        tf_num_ps: value for "spark.tensorflow.num.ps".
        black_list_enabled: value for "spark.blacklist.enabled".

    Returns:
        A new dict with the executor properties followed by ``dyn_alloc_config``.
    """
    executor_settings = (
        ("spark.executor.instances", exec_instances),
        ("spark.executor.cores", exec_cores),
        ("spark.executor.memory", exec_mem),
        ("spark.executor.gpus", exec_gpus),
        ("spark.tensorflow.num.ps", tf_num_ps),
        ("spark.blacklist.enabled", black_list_enabled),
    )
    spark_conf = dict(executor_settings)
    spark_conf.update(dyn_alloc_config)
    return spark_conf

def get_job_config(app_path, main_class, experiment_type="EXPERIMENT", schedule=None, local_resources=None, dist_strategy="COLLECTIVE_ALL_REDUCE", spark_config=None):
    """Build the job configuration dict passed to ``jobs.create_job``.

    Args:
        app_path: value stored under 'appPath'.
        main_class: value stored under 'mainClass'.
        experiment_type: value stored under 'experimentType'.
        schedule: value stored under 'schedule'.
        local_resources: list stored under 'localResources'. When omitted
            (``None``) a fresh empty list is used for every call.
        dist_strategy: value stored under 'distributionStrategy'.
        spark_config: optional Spark properties. When truthy, the base Spark
            job settings and then these properties are merged into the result.

    Returns:
        A new dict with the job configuration.
    """
    config = {
        'appPath': app_path,
        'mainClass': main_class,
        'experimentType': experiment_type,
        'distributionStrategy': dist_strategy,
        'schedule': schedule,
        # A mutable default ([]) would be shared by every returned config;
        # create a new list per call instead.
        'localResources': [] if local_resources is None else local_resources,
    }
    if spark_config:
        base_spark_config = {'type': 'sparkJobConfiguration', 'amQueue': 'default', 'amMemory': 2048, 'amVCores': 1, 'jobType': 'SPARK',
                             'mainClass': main_class}
        config.update(base_spark_config)
        # caller-provided Spark properties win over the base settings
        config.update(spark_config)
    return config

### Create monitoring job

In [5]:
# generic job config
spk_jb_dyn_alloc_conf = get_spark_dyn_alloc_config()
spk_jb_config = get_spark_job_config(spk_jb_dyn_alloc_conf)
job_config = get_job_config(APP_PATH, CLASS_NAME, spark_config=spk_jb_config)

# check job existence
# NOTE(review): this relies on get_executions returning a truthy value when the
# job exists; how it behaves for an unknown job is not visible here - confirm.
executions = jobs.get_executions(JOB_NAME, "")
if executions:
    print("Job '{}' already exists".format(JOB_NAME))
else:
    # create streaming job
    response = jobs.create_job(JOB_NAME, job_config)
    # .get() avoids a KeyError when the response carries no 'id', so the
    # failure message below is actually reachable
    job_id = response.get('id') if response else None
    if job_id:
        print("Job created with ID", job_id)
    else:
        print("Something went wrong")

Job 'iris_ml_monitoring_struct' already exists

## Simulate requests

### Start monitoring job

In [6]:
# job arguments:
# NOTE: use integer values only (avoid doubles); they are formatted as-is into the job argument string
job_timeout = 2*60 # seconds
window_duration = 6*1000 # 6s (milliseconds)
slide_duration = 3*1000 # 3s (milliseconds)
watermark_delay = 4*1000 # 4s (milliseconds)
max_request_delay = 2 # seconds; upper bound of the random pause between simulated requests

# Kafka topic of the served model; also used later to build the log directories
kfk_topic = serving.get_kafka_topic(IRIS_MODEL_NAME)
# space-separated arguments: topic, timeout, window, slide, watermark
job_args = "{} {} {} {} {}".format(kfk_topic, job_timeout, window_duration, slide_duration, watermark_delay)

In [7]:
# check executions
executions = jobs.get_executions(JOB_NAME, "")
job_execution_id = None
active_execution = None
if executions['count'] != 0:
    # the first execution whose final status is still "UNDEFINED" is reused
    active_execution = next(
        (candidate for candidate in executions['items'] if candidate['finalStatus'] == "UNDEFINED"),
        None)

if active_execution is not None:
    job_execution_id = active_execution['id']
    print("Job '{}' already running with ID {}".format(JOB_NAME, job_execution_id))
    print("State: {} - Args: '{}'".format(active_execution['state'], active_execution['args']))

# start job if necessary
if job_execution_id is None:
    response = jobs.start_job(JOB_NAME, job_args)
    job_execution_id = response['id']
    print("Job execution started with ID", job_execution_id)
    print("State: {} - Args: '{}'".format(response['state'], response['args']))

Could not perform action on job's execution (url: /hopsworks-api/api/project/119/jobs/iris_ml_monitoring_struct/executions), server response: 
 HTTP code: 500, HTTP reason: Internal Server Error, error code: 120000, error msg: A generic error occurred., user msg: 
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/ml_monitoring/lib/python3.6/site-packages/hops/jobs.py", line 91, in start_job
    resource_url, response.status_code, response.reason, error_code, error_msg, user_msg))
hops.exceptions.RestAPIError: Could not perform action on job's execution (url: /hopsworks-api/api/project/119/jobs/iris_ml_monitoring_struct/executions), server response: 
 HTTP code: 500, HTTP reason: Internal Server Error, error code: 120000, error msg: A generic error occurred., user msg: 



In [8]:
# see all executions
response = jobs.get_executions(JOB_NAME, "")
print("All executions:", response['count'])
# NOTE(review): the start-job cell checks 'count' before reading 'items', which
# suggests 'items' can be missing when there are no executions; default to an
# empty list so this cell does not fail with a KeyError in that case.
for execution in response.get('items', []):
    print("Job execution with ID {}, State: {} - Args: {}".format(execution['id'], execution['state'], execution['args']))

All executions: 1
Job execution with ID 280, State: FINISHED - Args: Iris-inf1958 120 6000 3000 4000

### Start served model

In [9]:
# make sure the model is being served before sending requests
serving_status = serving.get_status(IRIS_MODEL_NAME)
if serving_status != 'Stopped':
    print("Model '{}' already running".format(IRIS_MODEL_NAME))
else:
    serving.start(IRIS_MODEL_NAME)
    time.sleep(10) # Let the serving startup correctly

Model 'Iris' already running

### Check train data statistics

In [10]:
def get_stats(name, store_type):
    """Fetch the feature store statistics of a feature group or training dataset.

    Args:
        name: name of the feature group or training dataset.
        store_type: 'FEATUREGROUP' or 'TRAINING_DATASET'.

    Returns:
        Whatever the matching ``featurestore.get_*_statistics`` call returns.

    Raises:
        Exception: if ``store_type`` is neither of the two supported values.
    """
    if store_type not in ('FEATUREGROUP', 'TRAINING_DATASET'):
        raise Exception('Unknown store type')
    if store_type == 'TRAINING_DATASET':
        return featurestore.get_training_dataset_statistics(name)
    return featurestore.get_featuregroup_statistics(name)

def get_clusters(name, store_type, stats=None):
    """Return the cluster assignment of every datapoint in the cluster analysis.

    Args:
        name: feature group / training dataset name, used only when ``stats`` is not given.
        store_type: store type forwarded to ``get_stats`` when ``stats`` is not given.
        stats: already fetched statistics object; fetched via ``get_stats`` when falsy.

    Returns:
        A list of ``(datapoint_name, cluster)`` tuples.
    """
    if not stats:
        stats = get_stats(name, store_type)
    assignments = []
    for point in stats.cluster_analysis.clusters:
        assignments.append((point.datapoint_name, point.cluster))
    return assignments

def get_correlation_matrix(name, store_type, stats=None):
    """Extract the feature correlation matrix from the statistics.

    The column order is taken from the first row's correlation values; the rows
    are then reordered so that row i and column i refer to the same feature.

    Args:
        name: feature group / training dataset name, used only when ``stats`` is not given.
        store_type: store type forwarded to ``get_stats`` when ``stats`` is not given.
        stats: already fetched statistics object; fetched via ``get_stats`` when falsy.

    Returns:
        A tuple ``(features, correlations)``: the list of feature names and a
        numpy array whose rows follow the order of ``features``.
    """
    if not stats:
        stats = get_stats(name, store_type)
    rows = list(stats.correlation_matrix.feature_correlations)
    row_names = [row.feature_name for row in rows]
    # column names come from the first row only
    features = [cell.feature_name for cell in rows[0].correlation_values] if rows else []
    matrix = [[cell.correlation for cell in row.correlation_values] for row in rows]
    # position of each column feature among the rows
    row_order = [row_names.index(feature) for feature in features]
    correlations = np.array(matrix)[row_order, :]
    return features, correlations

def get_descriptive_stats(name, store_type, stats=None):
    """Collect the descriptive statistics of every feature into nested dicts.

    Args:
        name: feature group / training dataset name, used only when ``stats`` is not given.
        store_type: store type forwarded to ``get_stats`` when ``stats`` is not given.
        stats: already fetched statistics object; fetched via ``get_stats`` when falsy.

    Returns:
        A dict ``{feature_name: {metric_name: value}}``. A feature without any
        metric values maps to an empty dict (the previous ``reduce``-based
        merge raised ``TypeError`` on an empty sequence).
    """
    stats = stats or get_stats(name, store_type)
    return {
        st.feature_name: {mv.metric_name: mv.value for mv in st.metric_values}
        for st in stats.descriptive_stats.descriptive_stats
    }

def get_feature_histograms(name, store_type, stats=None):
    """Return the frequency distribution of every feature as plain dicts.

    Args:
        name: feature group / training dataset name, used only when ``stats`` is not given.
        store_type: store type forwarded to ``get_stats`` when ``stats`` is not given.
        stats: already fetched statistics object; fetched via ``get_stats`` when falsy.

    Returns:
        A dict mapping each feature name to a list with the ``vars()`` of each
        entry of its frequency distribution.
    """
    if not stats:
        stats = get_stats(name, store_type)
    return {
        distribution.feature_name: [vars(bucket) for bucket in distribution.frequency_distribution]
        for distribution in stats.feature_histograms.feature_distributions
    }

In [11]:
# Fetch the statistics of the feature group and of the training dataset once;
# the cells below pass them via `stats=` so the helpers do not fetch them again.
fg_stats = featurestore.get_featuregroup_statistics(IRIS_FG_NAME)
td_stats = featurestore.get_training_dataset_statistics(IRIS_TRAIN_DATASET_NAME)

In [12]:
# clusters: lists of (datapoint_name, cluster) tuples for the training dataset and the feature group
td_clusters = get_clusters(IRIS_TRAIN_DATASET_NAME, 'TRAINING_DATASET', stats=td_stats)
fg_clusters = get_clusters(IRIS_FG_NAME, 'FEATUREGROUP', stats=fg_stats)

In [13]:
# correlation matrix: feature names plus a numpy array of correlations for each source
td_features, td_correlations = get_correlation_matrix(IRIS_TRAIN_DATASET_NAME, 'TRAINING_DATASET', stats=td_stats)
fg_features, fg_correlations = get_correlation_matrix(IRIS_FG_NAME, 'FEATUREGROUP', stats=fg_stats)

In [14]:
# descriptive statistics: {feature_name: {metric_name: value}} for each source
td_desc_stats = get_descriptive_stats(IRIS_TRAIN_DATASET_NAME, 'TRAINING_DATASET', stats=td_stats)
fg_desc_stats = get_descriptive_stats(IRIS_FG_NAME, 'FEATUREGROUP', stats=fg_stats)

In [15]:
# feature histograms: {feature_name: [frequency distribution entries as dicts]} for each source
td_feature_hist = get_feature_histograms(IRIS_TRAIN_DATASET_NAME, 'TRAINING_DATASET', stats=td_stats)
fg_feature_hist = get_feature_histograms(IRIS_FG_NAME, 'FEATUREGROUP', stats=fg_stats)

In [16]:
# statistics per feature
# NOTE(review): this dict is never filled or read in the visible cells - looks like a placeholder; confirm.
feature_stats = {}

In [17]:
# Show the descriptive statistics of the training dataset (count, mean, stddev, min, max per feature)
print(td_desc_stats)

{'species': {'count': 120.0, 'mean': 1.0, 'stddev': 0.84016806, 'min': 0.0, 'max': 2.0}, 'petal_width': {'count': 120.0, 'mean': 1.1966667, 'stddev': 0.7820393, 'min': 0.1, 'max': 2.5}, 'petal_length': {'count': 120.0, 'mean': 3.7391667, 'stddev': 1.8221004, 'min': 1.0, 'max': 6.9}, 'sepal_width': {'count': 120.0, 'mean': 3.065, 'stddev': 0.42715594, 'min': 2.0, 'max': 4.4}, 'sepal_length': {'count': 120.0, 'mean': 5.845, 'stddev': 0.86857843, 'min': 4.4, 'max': 7.9}}

### Send requests

In [18]:
def generate_instance():
    """Generate one random Iris-like instance and print it.

    Each value is drawn uniformly from a fixed range and rounded to one decimal.

    Returns:
        A list ``[sepal_length, sepal_width, petal_length, petal_width]``.
    """
    # (low, high) bounds in the order sepal length, sepal width, petal length, petal width
    bounds = ((3, 9), (1, 6), (0.1, 8), (0.1, 3.5))
    instance = [round(np.random.uniform(low, high), 1) for low, high in bounds]
    print("Request: ", instance)
    return instance

In [19]:
def send_request(n_instances, signature_name):
    """Send one inference request with randomly generated instances to the Iris model.

    Args:
        n_instances: number of instances to generate for the request.
        signature_name: value sent as "signature_name" in the request payload.

    Returns:
        The 'predictions' entry of the inference response.
    """
    payload_instances = []
    for _ in range(n_instances):
        payload_instances.append(generate_instance())
    payload = {"signature_name": signature_name, "instances": payload_instances}
    return serving.make_inference_request(IRIS_MODEL_NAME, payload)['predictions']

In [20]:
# start simulation: send N_REQUESTS inference requests with random pauses in between
N_REQUESTS = 12

time.sleep(20) # Let the job initiate completely

for i in range(N_REQUESTS):
    # wait a random delay between 0 and max_request_delay seconds, rounded to one decimal
    time.sleep(round(np.random.uniform(0, max_request_delay), 1))
    # choose the API (serving signature) randomly
    signature = random.choice([tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, 'predict_instances'])
    # choose the number of instances randomly (1 to 10, both included)
    n_instances = random.randint(1, 10)
    # send request; the predictions are not used afterwards
    preds = send_request(n_instances, signature)
#     print("Requests sent: {}".format(i+1), end='\r')

Request:  [8.5, 3.7, 6.3, 2.1]
Request:  [8.5, 4.5, 1.9, 3.2]
Request:  [8.3, 1.5, 7.5, 3.1]
Request:  [7.9, 2.8, 1.9, 3.3]
Request:  [6.3, 1.2, 7.0, 0.5]
Request:  [4.2, 5.1, 2.4, 2.0]
Request:  [7.0, 4.6, 7.3, 0.4]
Request:  [4.6, 2.8, 5.5, 1.4]
Request:  [8.6, 2.6, 0.3, 2.3]
Request:  [5.8, 5.7, 7.9, 1.5]
Request:  [6.9, 2.8, 6.4, 0.9]
Request:  [3.8, 5.7, 5.1, 1.5]
Request:  [7.9, 3.8, 5.5, 1.9]
Request:  [5.4, 3.5, 1.6, 0.6]
Request:  [7.2, 4.0, 0.9, 2.2]
Request:  [6.7, 1.3, 7.3, 2.3]
Request:  [3.3, 2.7, 5.3, 0.5]
Request:  [6.3, 1.7, 3.2, 0.4]
Request:  [8.3, 5.2, 2.2, 1.8]
Request:  [7.8, 4.7, 2.8, 2.7]
Request:  [4.2, 1.2, 2.4, 2.3]
Request:  [3.6, 5.2, 6.5, 2.0]
Request:  [8.7, 2.6, 5.1, 3.4]
Request:  [4.7, 3.8, 4.5, 2.0]
Request:  [7.3, 4.4, 1.2, 1.9]
Request:  [6.6, 3.5, 3.5, 1.9]
Request:  [6.0, 2.9, 1.9, 3.4]
Request:  [3.1, 4.4, 2.9, 1.1]
Request:  [7.7, 3.8, 3.9, 3.3]
Request:  [6.8, 2.2, 4.0, 2.0]
Request:  [3.3, 4.3, 1.5, 1.3]
Request:  [3.0, 2.4, 4.5, 1.9]
Request:

### Check logs

#### Statistics

In [229]:
import pyarrow.parquet as pq
from hops import hdfs

# Directory with the statistics logs in parquet format: <resources dir>/<kafka topic>-parquet/
# NOTE(review): `spark` is not created in any visible cell - presumably provided by the kernel; confirm.
LOGS_STATS_DIR = IRIS_RESOURCES_DIR + kfk_topic + "-parquet/"
logs_stats_parquet_file = spark.read.parquet(LOGS_STATS_DIR + "*.parquet")

In [230]:
# Register the statistics logs as a temporary view and select every row through Spark SQL
logs_stats_parquet_file.createOrReplaceTempView("logs_stats_parquet_file")
stats_df = spark.sql("SELECT * FROM logs_stats_parquet_file")

In [231]:
# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output
stats_df.show(20, truncate=False)

+------------------------------------------+------------+-----------+------------+-----------+------+
|window                                    |sepal_length|sepal_width|petal_length|petal_width|stat  |
+------------------------------------------+------------+-----------+------------+-----------+------+
|[2020-03-04 19:41:39, 2020-03-04 19:41:45]|3.0         |0.2        |0.2         |1.3        |min   |
|[2020-03-04 19:41:39, 2020-03-04 19:41:45]|9.0         |3.4        |7.4         |5.6        |max   |
|[2020-03-04 19:41:39, 2020-03-04 19:41:45]|0.15789473  |0.08421053 |0.18947369  |0.1131579  |mean  |
|[2020-03-04 19:41:39, 2020-03-04 19:41:45]|5.7131586   |1.6157894  |3.352632    |3.5131578  |avg   |
|[2020-03-04 19:41:39, 2020-03-04 19:41:45]|38.0        |38.0       |38.0        |38.0       |count |
|[2020-03-04 19:41:39, 2020-03-04 19:41:45]|0.47459388  |0.035475567|0.18949154  |0.31019497 |stddev|
|[2020-03-04 19:41:39, 2020-03-04 19:41:45]|217.10002   |61.399998  |127.40002   |

#### Alerts

In [232]:
import pyarrow.parquet as pq
from hops import hdfs

# Directory with the alerts logs in parquet format: <resources dir>/<kafka topic>-alerts-parquet/
# The read raises AnalysisException ("Path does not exist") when that directory
# has no parquet files, as in the output recorded below.
LOGS_ALERTS_DIR = IRIS_RESOURCES_DIR + kfk_topic + "-alerts-parquet/"
logs_alerts_parquet_file = spark.read.parquet(LOGS_ALERTS_DIR + "*.parquet")

'Path does not exist: hdfs://10.0.2.15:8020/Projects/ml_monitoring/Resources/Iris/Iris-inf1958-alerts-parquet/*.parquet;'
Traceback (most recent call last):
  File "/srv/hops/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 316, in parquet
    return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
  File "/srv/hops/spark/python/lib/py4j-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/srv/hops/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 69, in deco
    raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: 'Path does not exist: hdfs://10.0.2.15:8020/Projects/ml_monitoring/Resources/Iris/Iris-inf1958-alerts-parquet/*.parquet;'



In [233]:
# Register the alerts logs as a temporary view and select every row through Spark SQL.
# Requires `logs_alerts_parquet_file` from the alerts read cell; if that read failed,
# this cell fails with NameError.
logs_alerts_parquet_file.createOrReplaceTempView("logs_alerts_parquet_file")
alerts_df = spark.sql("SELECT * FROM logs_alerts_parquet_file")

name 'logs_alerts_parquet_file' is not defined
Traceback (most recent call last):
NameError: name 'logs_alerts_parquet_file' is not defined



In [234]:
# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output
alerts_df.show(20, truncate=False)

name 'alerts_df' is not defined
Traceback (most recent call last):
NameError: name 'alerts_df' is not defined



#### WindowStreamResolver logs

In [None]:
import pyarrow.parquet as pq
from hops import hdfs

# Directory with the window resolver logs in parquet format: <resources dir>/<kafka topic>-window-resolver-parquet/
LOGS_WINDOWRESOLVER_DIR = IRIS_RESOURCES_DIR + kfk_topic + "-window-resolver-parquet/"
logs_window_resolver_parquet_file = spark.read.parquet(LOGS_WINDOWRESOLVER_DIR + "*.parquet")

In [None]:
# Register the window resolver logs as a temporary view and select every row through Spark SQL
logs_window_resolver_parquet_file.createOrReplaceTempView("logs_window_resolver_parquet_file")
window_resolver_df = spark.sql("SELECT * FROM logs_window_resolver_parquet_file")

In [None]:
# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output
window_resolver_df.show(20, truncate=False)

### Stop served model

In [None]:
# stop the model unless it is already stopped
serving_status = serving.get_status(IRIS_MODEL_NAME)
if serving_status == 'Stopped':
    print("Model '{}' already stopped".format(IRIS_MODEL_NAME))
else:
    serving.stop(IRIS_MODEL_NAME)
    print("Model '{}' stopped".format(IRIS_MODEL_NAME))


### Stop monitoring job

> **WARNING**: The `stop_job` method currently does not work: the URL is not built properly, the request data is not serialized, and the content-type header is not added.

> The URL "/hopsworks-api/api/project/119/jobs/iris_ml_monitoring_dstream/executions/status" is missing the execution ID.

> It should be "/hopsworks-api/api/project/119/jobs/iris_ml_monitoring_dstream/executions/<EXECUTION_ID>/status".

In [None]:
# NOT WORKING

# stop job
# response = jobs.stop_job(JOB_NAME)
# print(response)

In [None]:
# Source: https://github.com/logicalclocks/hops-util-py/blob/7804a0d6734fe6e8a23c2598547316d40776e94c/hops/jobs.py#L96

# Modification of stop_job method
from hops import constants, util, hdfs
from hops.exceptions import RestAPIError
import json
def stop_job(name, execution_id):
    """
    Stop one execution of a job by sending a PUT request to its status resource.

    Args:
        name: name of the job, used as a path segment of the URL.
        execution_id: id of the execution to stop; converted with str() for the URL.
    Returns:
        The JSON-decoded body of the response.
    Raises:
        RestAPIError: if the response status code is 400 or higher.
    """
    slash = constants.DELIMITERS.SLASH_DELIMITER
    # path segments of .../project/<project id>/jobs/<name>/executions/<execution id>/status
    url_segments = [
        constants.REST_CONFIG.HOPSWORKS_REST_RESOURCE,
        constants.REST_CONFIG.HOPSWORKS_PROJECT_RESOURCE,
        hdfs.project_id(),
        constants.REST_CONFIG.HOPSWORKS_JOBS_RESOURCE,
        name,
        constants.REST_CONFIG.HOPSWORKS_EXECUTIONS_RESOURCE,
        str(execution_id),
        "status",
    ]
    resource_url = slash + slash.join(url_segments)

    # the new status is sent as a JSON body with a JSON content-type header
    payload = json.dumps({"status":"stopped"})
    json_headers = {constants.HTTP_CONFIG.HTTP_CONTENT_TYPE: constants.HTTP_CONFIG.HTTP_APPLICATION_JSON}
    response = util.send_request(constants.HTTP_CONFIG.HTTP_PUT, resource_url, data=payload, headers=json_headers)
    response_object = response.json()
    if response.status_code < 400:
        return response_object

    error_code, error_msg, user_msg = util._parse_rest_error(response_object)
    raise RestAPIError("Could not perform action on job's execution (url: {}), server response: \n "
                       "HTTP code: {}, HTTP reason: {}, error code: {}, error msg: {}, user msg: {}".format(
        resource_url, response.status_code, response.reason, error_code, error_msg, user_msg))

In [None]:
# stop the job
executions = jobs.get_executions(JOB_NAME, "")
for item in executions['items']:
    if item['id'] == job_execution_id and item['finalStatus'] == 'UNDEFINED':
        response = stop_job(JOB_NAME, job_execution_id)
        print("JOB execution with ID {} stopped when: \n - Duration: {} - Progress: {}".format(job_execution_id, response['duration'], response['progress']))
    else:
        print("JOB execution with ID {} already stopped: \n - Duration: {} - Progress: {} - Final status: {} - State: {}".format(job_execution_id, item['duration'], item['progress'], item['finalStatus'], item['state']))