# Production Infrastructure

### Loading Libraries

In [1]:
%pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Requests
import requests

# Math
import math

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Date & Time
from datetime import datetime
from dataclasses import dataclass

# Machine Learning Flow API
import mlflow
import mlflow.sklearn
from mlflow.tracking.client import MlflowClient
from mlflow.models.signature import infer_signature

# Hyperparameter Tuning
import optuna

# Collections
from collections import namedtuple

# Scikit-Learn
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler

# Apache Spark
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import when
from pyspark.sql.functions import udf

In [3]:
class DataIngest:
  """Download a CSV, land it in DBFS, enrich it and register it as a Delta table.

  Relies on the Databricks notebook globals ``spark`` and ``dbutils``.

  Args:
    url: HTTP(S) address of the raw CSV file.
    local: Path of the downloaded file, as passed to ``dbutils.fs.mv``.
    source: Location the file is moved to and then read from.
    sink: Location the Delta data is written to.
    schema: Schema handed to ``spark.read.csv``.
    database: Database that holds the registered table.
    table_name: Name of the registered table.
  """

  # Seconds to wait on the HTTP request before giving up, so that a stalled
  # server cannot hang the notebook indefinitely.
  REQUEST_TIMEOUT_SECONDS = 60
  # Size of each chunk streamed from the HTTP response to disk.
  DOWNLOAD_CHUNK_BYTES = 1024 * 1024

  def __init__(self, url, local, source, sink, schema, database, table_name):
    self.url = url
    self.local = local
    self.source = source
    self.sink = sink
    self.schema = schema
    self.database = database
    self.table_name = table_name

  def _acquire_raw(self):
    """Stream the file at ``self.url`` into the working directory.

    The file is named after the last path segment of the URL.

    Raises:
      requests.HTTPError: If the server answers with an error status. The
        check happens before the file is opened, so an error page is never
        written to disk in place of the data.
    """
    response = requests.get(self.url, stream=True, timeout=self.REQUEST_TIMEOUT_SECONDS)
    try:
      response.raise_for_status()
      with open(self.url.split("/")[-1], "wb") as data:
        for chunk in response.iter_content(chunk_size=self.DOWNLOAD_CHUNK_BYTES):
          data.write(chunk)
    finally:
      # Release the connection even when the download or the write fails.
      response.close()

  def _transfer_local(self):
    """Move the downloaded file from ``self.local`` to ``self.source``."""
    dbutils.fs.mv(self.local, self.source)

  def _read_source(self):
    """Read the CSV at ``self.source`` with the explicit schema (no inference)."""
    return spark.read.csv(self.source, header=True, inferSchema=False, schema=self.schema)

  def _write_source(self, data):
    """Overwrite the Delta data at ``self.sink`` with ``data``, replacing the schema."""
    data.write.format("delta").mode("overwrite").option("mergeSchema", "true").option("overwriteSchema", "true").save(self.sink)

  @staticmethod
  def _supplement_data(data):
    """Encode ``month``/``day`` as doubles and add a ``heat_index`` column.

    * ``month``: parsed with the ``MMM`` pattern and replaced by its month
      number cast to double.
    * ``day``: ``sun``..``fri`` map to 0.0..5.0; every other value maps to 6.0.
    * ``heat_index``: polynomial of ``temperature`` and ``relative_humidity``.
    """

    @udf("double")
    def _calculate_heat_index(t, h):
      # A null input would raise TypeError inside the UDF and fail the whole
      # job; propagate the null instead.
      if t is None or h is None:
        return None
      # Apply the Celsius -> Fahrenheit formula before evaluating the polynomial.
      f = ((t * 9/5) + 32)
      t2 = math.pow(f, 2)
      h2 = math.pow(h, 2)
      c = [ 0.363445176, 0.988622465, 4.777114035, -0.114037667, -0.000850208, -0.020716198, 0.000687678, 0.000274954, 0]
      heat_index =  c[0] + (c[1]*f) + (c[2]*h) + (c[3]*f*h) + (c[4]*t2) + (c[5]*h2) + (c[6]*t2*h) + (c[7]*f*h2) + (c[8]*t2*h2)
      # Convert the result back with the Fahrenheit -> Celsius formula.
      return ((heat_index - 32) * 5/9)

    return (data.withColumn("month", F.from_unixtime(F.unix_timestamp(F.col("month"), 'MMM'), 'MM').cast(DoubleType()))
                .withColumn("day", when(F.col("day") == "sun", 0.0).when(F.col("day") == "mon", 1.0)
                                  .when(F.col("day") == "tue", 2.0).when(F.col("day") == "wed", 3.0)
                                  .when(F.col("day") == "thu", 4.0).when(F.col("day") == "fri", 5.0)
                                  .otherwise(6.0)
                           )
                .withColumn("heat_index", _calculate_heat_index(F.col("temperature"), F.col("relative_humidity")))
           )

  def _create_table(self):
    """Create the database and table if missing, then delete the staged source file."""
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {self.database};")
    spark.sql(f"""CREATE TABLE IF NOT EXISTS {self.database}.{self.table_name} USING DELTA LOCATION '{self.sink}';""")
    dbutils.fs.rm(self.source)

  def register_data(self):
    """Run the full ingest: download, move, read, enrich, write, register."""
    self._acquire_raw()
    self._transfer_local()
    data = self._read_source()
    supplemented = self._supplement_data(data)
    self._write_source(supplemented)
    self._create_table()

  def get_data(self):
    """Return the registered table as a Spark DataFrame."""
    return spark.table(f"{self.database}.{self.table_name}")

  def get_data_as_pandas(self):
    """Return the registered table collected into a pandas DataFrame."""
    return self.get_data().toPandas()

In [4]:
class PathHelper:
  """Formats an experiment title into the experiment name used by this notebook."""

  def __init__(self, experiment_name):
    self.experiment_name = experiment_name

  def name_generator(self):
    """Return the experiment name prefixed with ``//``."""
    return "//{}".format(self.experiment_name)

In [5]:

@dataclass
class Registry:
  """One entry of the model-registry log written by ``RegistryLogging``."""
  # Name the model is registered under.
  model_name: str
  # Version number recorded as the production version for this entry.
  production_version: int
  # True when this entry corresponds to a newly promoted production model.
  updated: bool
  # Timestamp string; RegistryLogging formats it as '%Y-%m-%d %H:%M:%S'.
  training_time: str
    
class RegistryStructure:
  """Turns a registry entry object into a single-row Spark DataFrame."""

  def __init__(self, data):
    self.data = data

  def generate_row(self):
    """Build the one-row Spark DataFrame for ``self.data``.

    The entry's attributes become columns; ``training_time`` is converted to
    a timestamp and ``production_version`` is cast to ``long``.
    """
    record = vars(self.data)
    pandas_row = pd.DataFrame([record])
    row = spark.createDataFrame(pandas_row)
    row = row.withColumn("training_time", F.to_timestamp(F.col("training_time")))
    return row.withColumn("production_version", F.col("production_version").cast("long"))

In [6]:
class RegistryLogging:
  """Appends one registry entry to a Delta location and registers it as a table.

  Relies on the Databricks notebook global ``spark``.

  Args:
    database: Database that holds the log table.
    table: Name of the log table.
    delta_location: Delta location the entries are appended to.
    model_name: Model name stored in the entry.
    production_version: Production version stored in the entry.
    updated: Whether the production model changed for this entry.
  """

  def __init__(self, database, table, delta_location, model_name, production_version, updated):
    self.database = database
    self.table = table
    self.delta_location = delta_location
    # The entry is stamped with the time this object is constructed.
    self.entry_data = Registry(model_name, production_version, updated, self._get_time())

  @staticmethod
  def _get_time():
    """Return the current local time formatted as ``%Y-%m-%d %H:%M:%S``."""
    # Uses neither instance nor class state, hence a staticmethod rather than
    # a classmethod whose first parameter was misleadingly named ``self``.
    return datetime.today().strftime('%Y-%m-%d %H:%M:%S')

  def _check_exists(self):
    """Return whether ``database.table`` exists in the Spark catalog."""
    return spark._jsparkSession.catalog().tableExists(self.database, self.table)

  def write_entry(self):
    """Append the entry to ``delta_location`` and create the table if missing."""
    log_row = RegistryStructure(self.entry_data).generate_row()
    log_row.write.format("delta").mode("append").save(self.delta_location)
    if not self._check_exists():
      spark.sql(f"""CREATE TABLE IF NOT EXISTS {self.database}.{self.table} USING DELTA LOCATION '{self.delta_location}';""")
    

class ModelRegistration:
  """Promotes the best run of an MLflow experiment to the Production stage.

  Args:
    experiment_name: Name of the MLflow experiment holding the runs.
    experiment_title: Name of the registered model in the model registry.
    model_name: Artifact path the model was logged under inside each run.
    metric: Name of the run metric used to rank runs.
    direction: ``"ASC"`` when a lower metric is better; any other value is
      treated as higher-is-better. Also used as the sort order when
      searching runs.
  """

  def __init__(self, experiment_name, experiment_title, model_name, metric, direction):
    self.experiment_name = experiment_name
    self.experiment_title = experiment_title
    self.model_name = model_name
    self.metric = metric
    self.direction = direction
    self.client = MlflowClient()
    self.experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

  def _get_best_run_info(self, key):
    """Return column ``key`` of the top run when ordered by the metric."""
    # search_runs documents a list of experiment IDs, so wrap the single ID.
    run_data = mlflow.search_runs([self.experiment_id], order_by=[f"metrics.{self.metric} {self.direction}"])
    return run_data.head(1)[key].values[0]

  def _get_registered_status(self):
    """Return the registered-model record for ``experiment_title``."""
    return self.client.get_registered_model(name=self.experiment_title)

  def _get_production_entry(self):
    """Return the latest-version entry whose stage is ``Production``.

    Raises:
      IndexError: If the model is registered but nothing is in Production.
    """
    return [x for x in self._get_registered_status().latest_versions if x.current_stage == "Production"][0]

  def _get_current_prod(self):
    """Return the run ID behind the current Production version."""
    return self._get_production_entry().run_id

  def _get_prod_version(self):
    """Return the current Production version number as an int."""
    return int(self._get_production_entry().version)

  def _get_metric(self, run_id):
    """Return the tracked metric value of ``run_id`` (None if not logged)."""
    return mlflow.get_run(run_id).data.metrics.get(self.metric)

  def _find_best(self):
    """Return the run ID that should be in Production.

    That is the current Production run when it is strictly better than the
    best experiment run, otherwise the best experiment run.
    """
    try:
      current_prod_id = self._get_current_prod()
      prod_metric = self._get_metric(current_prod_id)
    except (mlflow.exceptions.RestException, IndexError):
      # No model is registered, or none is in Production: the best run wins
      # outright. A numeric sentinel would only be correct for one direction
      # and would block the first registration of a higher-is-better metric.
      return self._get_best_run_info('run_id')

    best_id = self._get_best_run_info('run_id')
    best_metric = self._get_metric(best_id)

    if self.direction == "ASC":
      prod_is_better = prod_metric < best_metric
    else:
      prod_is_better = prod_metric > best_metric
    return current_prod_id if prod_is_better else best_id

  def _generate_artifact_path(self, run_id):
    """Return the ``runs:/`` URI of the model artifact logged by ``run_id``."""
    return f"runs:/{run_id}/{self.model_name}"

  def register_best(self, registration_message, logging_location, log_db, log_table):
    """Promote the best run to Production if needed and log the outcome.

    Args:
      registration_message: Description stored on a newly registered version.
      logging_location: Delta location of the registry log.
      log_db: Database of the registry log table.
      log_table: Name of the registry log table.

    Returns:
      ``"upgraded prod"`` when a new version was promoted, else ``"no change"``.
    """
    best_id = self._find_best()
    try:
      current_prod = self._get_current_prod()
      current_prod_version = self._get_prod_version()
    except (mlflow.exceptions.RestException, IndexError):
      # -1 marks "nothing in Production yet".
      current_prod = -1
      current_prod_version = -1
    updated = current_prod != best_id

    if updated:
      register_new = mlflow.register_model(self._generate_artifact_path(best_id), self.experiment_title)
      self.client.update_registered_model(name=register_new.name,
                                          description="Forest Fire Prediction for the National Park")
      self.client.update_model_version(name=register_new.name,
                                       version=register_new.version,
                                       description=registration_message)
      self.client.transition_model_version_stage(name=register_new.name,
                                                 version=register_new.version,
                                                 stage="Production")
      if current_prod_version > 0:
        # Retire the version that was in Production before this promotion.
        self.client.transition_model_version_stage(name=register_new.name,
                                                   version=current_prod_version,
                                                   stage="Archived")
      RegistryLogging(log_db, log_table, logging_location, self.experiment_title, int(register_new.version), updated).write_entry()
      return "upgraded prod"
    else:
      RegistryLogging(log_db, log_table, logging_location, self.experiment_title, int(current_prod_version), updated).write_entry()
      return "no change"

  def get_model_as_udf(self):
    """Return the current Production model wrapped as a Spark UDF."""
    prod_id = self._get_current_prod()
    artifact_uri = self._generate_artifact_path(prod_id)
    return mlflow.pyfunc.spark_udf(spark, model_uri=artifact_uri)


In [7]:
class DataPrep:
    """Splits a pandas DataFrame into features/label and train/test partitions.

    Args:
        data: pandas DataFrame containing the features and the label column.
        label: Name of the label column.
        test_size: Passed to scikit-learn's ``train_test_split``.
        random_state: Optional seed passed to ``train_test_split`` so a split
            can be reproduced. The default ``None`` keeps the split unseeded.
    """

    def __init__(self, data, label, test_size=0.3, random_state=None):
        self.data = data
        self.label = label
        self.test_size = test_size
        self.random_state = random_state

    def split_features(self):
        """Return a ``Data(X, y)`` tuple: all non-label columns and the label column."""
        Data = namedtuple('Data', 'X y')
        features = self.data.drop(columns=[self.label])
        target = self.data[self.label]
        return Data(features, target)

    def train_test_split(self, stratify_column):
        """Split into train and test sets, stratified on a feature column.

        Args:
            stratify_column: Name of the feature column used for stratification.

        Returns:
            A ``TrainTest`` named tuple with fields ``X_train``, ``X_test``,
            ``y_train``, ``y_test`` plus the unsplit ``X`` and ``y``.
        """
        TrainTest = namedtuple('TrainTest', 'X_train X_test y_train y_test X y')
        split_data = self.split_features()
        # Inside this method the bare name resolves to the module-level
        # scikit-learn function, not to this method.
        X_train, X_test, y_train, y_test = train_test_split(
            split_data.X,
            split_data.y,
            stratify=split_data.X[stratify_column],
            test_size=self.test_size,
            random_state=self.random_state,
        )
        return TrainTest(X_train, X_test, y_train, y_test, split_data.X, split_data.y)

class ModelScoring:
    """Scores predictions with a single selectable metric.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
        param_count: Number of parameters used as the BIC penalty term.
        algorithm: Metric to compute, ``'rmse'`` or ``'bic'``.
    """

    def __init__(self, y_test, y_pred, param_count, algorithm):
        self.y_test = y_test
        self.y_pred = y_pred
        self.n = len(y_test)
        self.param_count = param_count
        self.algorithm = algorithm

    def _mse_calc(self):
        """Return the mean squared error of the predictions."""
        return mean_squared_error(self.y_test, self.y_pred)

    def _bic(self):
        """Return ``n * ln(MSE) + param_count * ln(n)``."""
        return self.n * np.log(self._mse_calc()) + self.param_count * np.log(self.n)

    def _rmse(self):
        """Return the square root of the mean squared error."""
        return np.sqrt(self._mse_calc())

    def evaluate(self):
        """Compute and return only the metric named by ``self.algorithm``.

        Raises:
            KeyError: If ``self.algorithm`` is not ``'rmse'`` or ``'bic'``.
        """
        # Dispatch on the method objects and call just the selected one.
        # Evaluating every metric up front would recompute the MSE for each
        # of them and emit a log(0) warning from BIC on a perfect fit even
        # when RMSE was requested.
        scorers = {'rmse': self._rmse, 'bic': self._bic}
        return scorers[self.algorithm]()

class Logging:
  """Logs one fitted model, its parameters and its score as an MLflow run.

  Args:
    metric: Metric name; used both to score and as the logged metric key.
    model_name: Artifact path the model is logged under.
    run_name: Name given to the MLflow run.
  """

  def __init__(self, metric, model_name, run_name):
    self.metric, self.model_name, self.run_name = metric, model_name, run_name
    # Empty until the first call to log_mlflow infers the model signature.
    self.signature = {}

  def log_mlflow(self, trial, data, model, params):
    """Record the trial in MLflow and return its score on the test split.

    Args:
      trial: Object exposing ``number``; logged as ``trial_number``.
      data: Object exposing ``X_train``, ``X_test`` and ``y_test``.
      model: Fitted model exposing ``predict``.
      params: Dict of hyperparameters; its size is the scoring parameter count.
    """
    if not self.signature:
      train_predictions = model.predict(data.X_train)
      self.signature = infer_signature(data.X_train, train_predictions)

    with mlflow.start_run(run_name=self.run_name):
      mlflow.log_params(params)
      mlflow.sklearn.log_model(model, self.model_name, signature=self.signature)
      scorer = ModelScoring(data.y_test, model.predict(data.X_test), len(params), self.metric)
      scores = scorer.evaluate()
      mlflow.log_metric(self.metric, scores)
      mlflow.log_param("trial_number", trial.number)

    return scores
      
class RandomForestTuning:
    """Tunes a ``RandomForestRegressor`` with Optuna, logging each trial to MLflow.

    Args:
        data: pandas DataFrame containing the features and the label column.
        label: Name of the label column.
        stratify_column: Feature column used to stratify the train/test split.
        trials: Number of Optuna trials to run.
        model_name: Artifact path each model is logged under.
        run_name: Name given to every MLflow run.
        test_size: Passed through to the train/test split.
        metric: Metric to minimise, as understood by ``ModelScoring``.
    """

    def __init__(self, data, label, stratify_column, trials, model_name, run_name, test_size=0.3, metric='rmse'):
        self.data = data
        self.label = label
        self.stratify_column = stratify_column
        self.trials = trials
        self.model_name = model_name
        self.run_name = run_name
        self.test_size = test_size
        self.metric = metric

    @staticmethod
    def _random_forest_model(**kwargs):
        """Build an unfitted forest from the tuned hyperparameters (all cores)."""
        model = RandomForestRegressor(n_estimators=kwargs['n_estimators'],
                                      max_depth=kwargs['max_depth'],
                                      min_samples_split=kwargs['min_samples_split'],
                                      min_samples_leaf=kwargs['min_samples_leaf'],
                                      max_leaf_nodes=kwargs['max_leaf_nodes'],
                                      min_impurity_decrease=kwargs['min_impurity_decrease'],
                                      max_features=kwargs['max_features'],
                                      n_jobs=-1
                                     )
        return model

    def _run_trial(self, trial):
        """Optuna objective: sample parameters, fit, log to MLflow, return the score."""
        logger = Logging(self.metric, self.model_name, self.run_name)

        # A fresh split is drawn for every trial.
        splits = DataPrep(self.data, self.label, self.test_size).train_test_split(self.stratify_column)
        # ``step`` is passed by keyword and the log-uniform draw uses
        # ``suggest_float(..., log=True)``: positional ``step`` and
        # ``suggest_loguniform`` are deprecated in current Optuna releases.
        params = {
                 'n_estimators': trial.suggest_int("n_estimators", 50, 2000, step=10),
                 'max_depth': trial.suggest_int("max_depth", 2, 24, step=1),
                 'min_samples_split': trial.suggest_int("min_samples_split", 2, 100, step=1),
                 'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 50, step=1),
                 'max_leaf_nodes': trial.suggest_int("max_leaf_nodes", 4, 500, step=1),
                 'min_impurity_decrease': trial.suggest_float("min_impurity_decrease", 1e-22, 1e-1, log=True),
                 'max_features': trial.suggest_categorical("max_features", ['sqrt', 'log2'])
                }

        model = self._random_forest_model(**params).fit(splits.X_train, splits.y_train)
        scores = logger.log_mlflow(trial, splits, model, params)
        return scores

    def run(self, experiment_name):
        """Run the study under ``experiment_name`` and return the Optuna study."""
        mlflow.set_experiment(experiment_name)

        study = optuna.create_study(direction='minimize')
        study.optimize(self._run_trial, self.trials)
        return study

In [8]:
# --- Experiment and tuning settings ---
EXPERIMENT_TITLE = "Forest_Fire_Model_5"
EXPERIMENT_NAME = PathHelper(EXPERIMENT_TITLE).name_generator()
RUN_NAME = "initial_prod_release"
MODEL_NAME = "random_forest_model"
OPTIMIZER_ITERATIONS = 5
TEST_PERCENTAGE = 0.5

# --- Data source, staging and Delta locations ---
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv"
LOCAL_FILE = "file:/databricks/driver/forestfires.csv"
SOURCE = "dbfs:/home/benjamin.wilson@databricks.com/demo/firedata/fire.csv"
SINK = "/home/benjamin.wilson@databricks.com/demo/fire"

# --- Catalog names and registry-log location ---
DATABASE = "ben_demo"
TABLE_NAME = "fire_regression"
LOG_TABLE = f"{TABLE_NAME}_logs"
LOGGING_LOCATION = f"{SINK}_{LOG_TABLE}"

# Schema of the raw CSV: ``month`` and ``day`` arrive as strings, every other
# column is a double. Column order follows the file.
FIRE_SCHEMA = StructType([
  StructField(column, StringType() if column in ('month', 'day') else DoubleType())
  for column in (
    'x_coord',
    'y_coord',
    'month',
    'day',
    'fine_fuel_moisture_code',
    'duff_moisture_code',
    'drought_code',
    'initial_spread_index',
    'temperature',
    'relative_humidity',
    'windspeed',
    'rain_amount',
    'area',
  )
])

In [10]:
# Every later cell needs `fire_data`, so it must be defined here for
# Restart & Run All to work; with this cell fully commented out the notebook
# fails with a NameError on a fresh kernel.
data_handler = DataIngest(DATA_URL, LOCAL_FILE, SOURCE, SINK, FIRE_SCHEMA, DATABASE, TABLE_NAME)

# The ingest downloads the CSV and overwrites the Delta table, so it stays
# disabled by default. Set to True for the first run or to refresh the table.
REFRESH_SOURCE_DATA = False
if REFRESH_SOURCE_DATA:
  data_handler.register_data()

fire_data = data_handler.get_data_as_pandas()

In [None]:
# Initial tuning run using the settings from the configuration cell.
random_forest_tune = RandomForestTuning(
  data=fire_data,
  label='area',
  stratify_column='x_coord',
  trials=OPTIMIZER_ITERATIONS,
  model_name=MODEL_NAME,
  run_name=RUN_NAME,
  test_size=TEST_PERCENTAGE,
  metric='rmse',
)

trial_rf = random_forest_tune.run(EXPERIMENT_NAME)

In [None]:
# Promote the best run of the experiment; lower RMSE is better ("ASC").
registry = ModelRegistration(
  experiment_name=EXPERIMENT_NAME,
  experiment_title=EXPERIMENT_TITLE,
  model_name=MODEL_NAME,
  metric="rmse",
  direction="ASC",
)

registry.register_best(
  registration_message="initial run",
  logging_location=LOGGING_LOCATION,
  log_db=DATABASE,
  log_table=LOG_TABLE,
)

In [None]:
# Show the registry log ordered by training time.
display(
  spark.table(f"{DATABASE}.{LOG_TABLE}")
       .orderBy(F.col("training_time"))
)

In [None]:
# Second tuning round: more trials and a smaller hold-out set.
RUN_NAME = "updated_prod_release"
OPTIMIZER_ITERATIONS = 50
TEST_PERCENTAGE = 0.1

random_forest_tune_update = RandomForestTuning(fire_data, 'area', 'x_coord', OPTIMIZER_ITERATIONS, MODEL_NAME, RUN_NAME, TEST_PERCENTAGE, 'rmse')
trial_rf_update = random_forest_tune_update.run(EXPERIMENT_NAME)

registry_update = ModelRegistration(EXPERIMENT_NAME, EXPERIMENT_TITLE, MODEL_NAME, "rmse", "ASC")
# The message becomes the description of the new model version, so it names
# this release instead of repeating "initial run" from the first registration.
registry_update.register_best("updated prod release", LOGGING_LOCATION, DATABASE, LOG_TABLE)

In [None]:
# Show the registry log ordered by training time.
display(
  spark.table(f"{DATABASE}.{LOG_TABLE}")
       .orderBy(F.col("training_time"))
)

In [None]:
# Passive retrain attempt: few trials and a large hold-out set.
RUN_NAME = "passive_retrain_attempt"
OPTIMIZER_ITERATIONS = 5
TEST_PERCENTAGE = 0.6

random_forest_tune_update = RandomForestTuning(fire_data, 'area', 'x_coord', OPTIMIZER_ITERATIONS, MODEL_NAME, RUN_NAME, TEST_PERCENTAGE, 'rmse')
trial_rf_update = random_forest_tune_update.run(EXPERIMENT_NAME)

registry_update = ModelRegistration(EXPERIMENT_NAME, EXPERIMENT_TITLE, MODEL_NAME, "rmse", "ASC")
# The message becomes the description of any newly registered version, so it
# names this attempt instead of repeating "initial run".
registry_update.register_best("passive retrain attempt", LOGGING_LOCATION, DATABASE, LOG_TABLE)

In [None]:
# Show the registry log ordered by training time.
display(
  spark.table(f"{DATABASE}.{LOG_TABLE}")
       .orderBy(F.col("training_time"))
)

In [None]:
# Score the registered table with the current Production model.
# The table name comes from the configuration constants rather than a
# hard-coded copy, so it cannot drift from the table the ingest registered.
raw_data = spark.table(f"{DATABASE}.{TABLE_NAME}")
# Every column except the label "area" is passed to the model, in table order.
predicted_data = raw_data.withColumn("prediction", registry.get_model_as_udf()(*[x for x in raw_data.columns if x != "area"]))
display(predicted_data)