<font color="#CA3532"><h1 align="left">Master Data Analytics. EDEM.</h1></font>
<font color="#6E6E6E"><h2 align="left">Herramientas MLOps.</h2></font> 
<font color="#6E6E6E"><h2 align="left">Tarea 2. Usando MLFlow como model registry.</h2></font> 
#### Daniel Ruiz Riquelme

In [8]:
import mlflow
from mlflow import log_metric, log_param, log_artifact

In [None]:
!pwd

In [None]:
# 1. Install MLFlow
# !pip install mlflow[extras]
# !pip freeze | grep mlflow

In [None]:
# 2. Setup MLFlow Tracking Server
# Run the following command in a terminal to start the MLFlow Tracking Server
# Configure mlflow to store all in a local directory
# !mlflow server --backend-store-uri ./mlruns --default-artifact-root ./mlruns

In [9]:
# 3. Create a new experiment
# This is a local tracking server. You can also use a remote tracking server. See https://mlflow.org/docs/latest/tracking.html for more details.
# Remote tracking server will be simulated later

import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("mlflow-model-training-iris")

2024/06/22 11:41:52 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-model-training-iris' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/396890877033717685', creation_time=1719049312593, experiment_id='396890877033717685', last_update_time=1719049312593, lifecycle_stage='active', name='mlflow-model-training-iris', tags={}>

In [10]:
# 4. Log a model experiment result

# Lets first create a simple function to train a model
# The function will receive a set of hyperparameters and return the model and the accuracy
import datetime

def get_data():
    from sklearn import datasets

    # X, y = datasets.make_classification(n_samples=1000, n_features=4, n_informative=4, n_redundant=0, n_classes=3, n_clusters_per_class=1, class_sep=0.5, random_state=40)
    # use iris dataset
    X, y = datasets.load_breast_cancer(return_X_y=True)
    return X, y

def train_model(hyperparameters):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    import mlflow.sklearn
    X, y = get_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
    model = RandomForestClassifier(**hyperparameters)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    return model, accuracy

# Now lets train a model with some hyperparameters
hyperparameters = {"n_estimators": 10, "max_depth": 5}
model, accuracy = train_model(hyperparameters)

# Now lets log the model and the accuracy
date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
with mlflow.start_run(run_name=f"mlflow-training-{date}"):
    
    # Hyperparameters are logged as parameters
    for hyperparameter, value in hyperparameters.items():
        mlflow.log_param(hyperparameter, value)
    
    # Accuracy is logged as a metric 
    mlflow.log_metric("accuracy", accuracy)
    
    # Model is logged as an artifact
    mlflow.sklearn.log_model(model, "model")

In [11]:
# 5. Run a hyperparameter search and log the results
import optuna
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = get_data()

def optimize_rf(trial):
    
    with mlflow.start_run(run_name=f"optuna-hp-{trial.number}-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"):
        # Set the hyperparameter values that we want to optimize
        n_estimators = trial.suggest_int('n_estimators', 1, 100)
        max_depth = trial.suggest_int('max_depth', 2, 10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
        max_features = trial.suggest_float('max_features', 0.1, 1.0)
        
        
        # Create a random forest classifier using the suggested hyperparameters
        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    max_features=max_features)
        
        # Use cross-validation to evaluate the performance of the classifier
        scores = cross_val_score(rf, X, y, cv=5)
        
        # Log the hyperparameters and cross-validation scores to MLflow
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_param('max_depth', max_depth)
        mlflow.log_param('min_samples_split', min_samples_split)
        mlflow.log_param('min_samples_leaf', min_samples_leaf)
        mlflow.log_param('max_features', max_features)
        mlflow.log_metric('mean_cv_score', scores.mean())
    
    # Return the mean of the cross-validation scores as the objective value
    return scores.mean()

# Create an Optuna study
study = optuna.create_study()

# Run the optimization loop
study.optimize(optimize_rf, n_trials=100)

# Get the best hyperparameter values
best_params = study.best_params

[32m[I 2024-06-22 11:42:00,977][0m A new study created in memory with name: no-name-2530a62f-62e8-4222-8f95-0fbe3de9b69a[0m
[32m[I 2024-06-22 11:42:01,323][0m Trial 0 finished with value: 0.9508306163639186 and parameters: {'n_estimators': 50, 'max_depth': 4, 'min_samples_split': 11, 'min_samples_leaf': 9, 'max_features': 0.5052479158064491}. Best is trial 0 with value: 0.9508306163639186.[0m
[32m[I 2024-06-22 11:42:01,914][0m Trial 1 finished with value: 0.943797546964757 and parameters: {'n_estimators': 86, 'max_depth': 7, 'min_samples_split': 11, 'min_samples_leaf': 15, 'max_features': 0.6902952164736533}. Best is trial 1 with value: 0.943797546964757.[0m
[32m[I 2024-06-22 11:42:02,424][0m Trial 2 finished with value: 0.9473218444340941 and parameters: {'n_estimators': 70, 'max_depth': 4, 'min_samples_split': 11, 'min_samples_leaf': 13, 'max_features': 0.7579088562601933}. Best is trial 1 with value: 0.943797546964757.[0m
[32m[I 2024-06-22 11:42:02,853][0m Trial 3 fini

In [18]:
# 6. Register a model version using the best hyperparameters

with mlflow.start_run(run_name=f"optuna-hp-final"):
    # Create the final model using the best hyperparameters
    final_model = RandomForestClassifier(**best_params)

    # Train the final model on the entire dataset
    final_model.fit(X, y)

    # Log the model to the "Models" section
    mlflow.sklearn.log_model(final_model, "random_forest_model", registered_model_name="random_forest_model")



Registered model 'random_forest_model' already exists. Creating a new version of this model...
2024/06/22 11:45:29 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random_forest_model, version 2
Created version '2' of model 'random_forest_model'.


In [21]:
# 7. Retrieve a model version and use it for prediction

import mlflow.pyfunc

model_name = "random_forest_model"
model_version = 2

model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{model_version}"
)

model.predict(X[:5])

array([0, 0, 0, 0, 0])

In [24]:
import mlflow
logged_model = 'runs:/3def78a0e9e345f793e0a7bba55b7919/random_forest_model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
import pandas as pd
loaded_model.predict(X[:10])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# EXTRA: Deploy the model to a REST API endpoint with mlflow models serve
# Set up env variables
# export $(grep -v '^#' .env | xargs)
# export MLFLOW_TRACKING_URI=http://localhost:5000

In [None]:
# Using python_function backend
# mlflow models serve --model-uri s3://mlflow/1/e18a1a1695e844258403040ae6e6cb4b/artifacts/random_forest_model --env-manager=local -p 5002
# Try it out with curl

# curl -d '{"inputs":[[ 1.48722421,  1.02475923,  6.03974507, -1.72832624]]}' -H 'Content-Type: application/json'  localhost:5002/invocations

In [None]:
# Using MLServer backend
# mlflow models serve --model-uri s3://mlflow/1/e18a1a1695e844258403040ae6e6cb4b/artifacts/random_forest_model --env-manager=local -p 5002 --enable-mlserver
# Try it out with curl

# curl -d '{"inputs":[[ 1.48722421,  1.02475923,  6.03974507, -1.72832624]]}' -H 'Content-Type: application/json'  localhost:5002/invocations



In [None]:
# Undeploy mlflow docker compose
# docker-compose down