Tracking con PySpark usando el sistema local de archivos --- 0:00 min
===

* 0:00 min | Ultima modificación: Abril 4, 2022 | YouTube

Código base
---

In [5]:
def load_data():

    import pandas as pd

    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    df = pd.read_csv(url, sep=";")

    y = df["quality"]
    x = df.copy()
    x.pop("quality")

    return x, y


def make_train_test_split(x, y):
    #
    # Se aprovecha PySpark para realizar el particionamiento de datos y 
    # mantener la compatibilidad con los ejemplos anteriores.
    #
    from sklearn.model_selection import train_test_split

    (x_train, x_test, y_train, y_test) = train_test_split(
        x,
        y,
        test_size=0.25,
        random_state=123456,
    )
    return x_train, x_test, y_train, y_test


def eval_metrics(y_true, y_pred):

    #
    # En este ejemplo se puede hacer con sklearn debido a que la muestra es 
    # pequeña y se desea hacer el ejemplo compatible con el resto de tutoriales
    # de esta sección.
    #
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    return mse, mae, r2


def report(estimator, mse, mae, r2):

    print(estimator, ":", sep="")
    print(f"  MSE: {mse}")
    print(f"  MAE: {mae}")
    print(f"  R2: {r2}")

MLflow Tracking
---

In [6]:
def make_experiment(experiment_name, units, verbose=0):

    import os

    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
    
    import tensorflow as tf

    import mlflow
    import mlflow.sklearn

    x, y = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(x, y)

    FEATURES = 11


    #
    # Establece el directorio de tracking. Esta es la dirección absoluta al
    # directorio actual en este ejemplo.
    #
    if not os.path.exists("mlruns"):
        os.makedirs("mlruns")
    mlflow.set_tracking_uri("file:///datalake/mlflow/mlruns")
    print("Tracking directory:", mlflow.get_tracking_uri())

    mlflow.tensorflow.autolog(
        every_n_iter=1,
        log_models=True,
        disable=False,
        exclusive=False,
        disable_for_unsupported_versions=False,
        silent=False,
        registered_model_name=None,
    )

    #
    # Almancena las corridas  en el experimento indicado
    #
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run() as run:

        run = mlflow.active_run()
        print("Active run_id: {}".format(run.info.run_id))

        model = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(
                    units,
                    activation="relu",
                    input_shape=(FEATURES,),
                ),
                tf.keras.layers.Dense(1),
            ]
        )
        
        mlflow.log_param("units", units)
        

        model.compile(
            optimizer=tf.keras.optimizers.Adam(),
            loss="mean_squared_error",
            metrics=[
                "mean_squared_error",
                "mean_absolute_error",
            ]
        )


        model.fit(
            x_train,
            y_train,
            epochs=1000,
            verbose=verbose,
            validation_split=0.2,
        )

        results = model.evaluate(
            x_test,
            y_test,
            verbose=verbose,
        )
        
        
        y_pred = model.predict(x_test)
        mse, mae, r2 = eval_metrics(y_test, y_pred=y_pred)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        
        
    #
    # Reporta el mejor modelo encontrado en la corrida
    #
    y_pred = model.predict(x_test)
    mse, mae, r2 = eval_metrics(y_test, y_pred=y_pred)
    if verbose > 0:
        report(model, mse, mae, r2)

In [7]:
import numpy as np

#
# Se tantea para diferentes cantidades de neuronas en la capa oculta. Por 
# simplicidad no se varian otros parámetros como la tasa de aprendizaje o su
# esquema de modificación
#
for units in range(1, 5):
    print(f"Runing for H={units} ...")
    make_experiment(
        units=units,
        experiment_name="red-wine",
        verbose=0,
    )
    print()

Runing for H=1 ...
Tracking directory: file:///datalake/mlflow/mlruns
Active run_id: 2f6719e2b77648228786700d3d479286
INFO:tensorflow:Assets written to: /tmp/tmp8ogh4xoi/model/data/model/assets

Runing for H=2 ...
Tracking directory: file:///datalake/mlflow/mlruns
Active run_id: e57e691adc8e483e87672c2c3343b191
INFO:tensorflow:Assets written to: /tmp/tmp17y4wrnf/model/data/model/assets

Runing for H=3 ...
Tracking directory: file:///datalake/mlflow/mlruns
Active run_id: ff7278c9570745e58c6451046ec35445
INFO:tensorflow:Assets written to: /tmp/tmpk9liyoli/model/data/model/assets

Runing for H=4 ...
Tracking directory: file:///datalake/mlflow/mlruns
Active run_id: 17a068cdf22a4fbb8c1812bbfcb8700e
INFO:tensorflow:Assets written to: /tmp/tmpzay4hipz/model/data/model/assets



Código base
---

In [1]:
def load_data():

    import pandas as pd

    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    df = pd.read_csv(url, sep=";")

    y = df["quality"]
    x = df.copy()
    x.pop("quality")

    return x, y


def make_train_test_split(x, y):

    from sklearn.model_selection import train_test_split

    (x_train, x_test, y_train, y_test) = train_test_split(
        x,
        y,
        test_size=0.25,
        random_state=123456,
    )
    return x_train, x_test, y_train, y_test


def eval_metrics(df):

    from pyspark.mllib.evaluation import RegressionMetrics as rmtrcs
    
    metrics = rmtrcs(df.rdd.map(lambda x: (x.quality, x.prediction)))
    
    mse = metrics.meanSquaredError
    mae = metrics.meanAbsoluteError
    r2 = metrics.r2
    
    return mse, mae, r2


def report(estimator, mse, mae, r2):

    print(estimator, ":", sep="")
    print(f"  MSE: {mse}")
    print(f"  MAE: {mae}")
    print(f"  R2: {r2}")

MLflow Tracking
---

In [2]:
def make_experiment(experiment_name, regParam, elasticNetParam, verbose=1):

    import os

    import mlflow
    import pandas as pd
    from pyspark.ml.feature import VectorAssembler
    from pyspark.ml.regression import LinearRegression
    from pyspark.sql import SparkSession

    x, y = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(x, y)

    
    pdf_train = pd.concat([x_train, pd.to_numeric(y_train, downcast='float')], axis="columns")
    pdf_test = pd.concat([x_test, pd.to_numeric(y_test, downcast='float')], axis="columns")
    

    #
    # Spark
    #
    spark = SparkSession.builder.getOrCreate()

    df_train = spark.createDataFrame(pdf_train)
    df_test = spark.createDataFrame(pdf_test)

    vectorAssembler = VectorAssembler(
        inputCols=list(pdf_train.columns[:-1]),
        outputCol="features",
    )
    df_train = vectorAssembler.transform(df_train)
    df_test = vectorAssembler.transform(df_test)
    
    
    lr =  LinearRegression(
        featuresCol = 'features',
        labelCol = 'quality',
        predictionCol = 'prediction',
        maxIter = 1000,
        regParam = regParam,
        elasticNetParam = elasticNetParam,
        fitIntercept=True,
        standardization=True
    )
    
    
    if not os.path.exists("mlruns"):
        os.makedirs("mlruns")
    mlflow.set_tracking_uri("file:///datalake/mlflow/mlruns")
    print("Tracking directory:", mlflow.get_tracking_uri())

    #
    # Establece el directorio de tracking. Esta es la dirección absoluta al
    # directorio actual en este ejemplo.
    #
    mlflow.pyspark.ml.autolog()

    #
    # Almancena las corridas  en el experimento indicado
    #
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run() as run:

        run = mlflow.active_run()
        print("Active run_id: {}".format(run.info.run_id))

        model = lr.fit(df_train)
    
    df_test = model.transform(df_test)
    
    mse, mae, r2 = eval_metrics(df_test)
    if verbose > 0:
        report(model, mse, mae, r2)

    mlflow.log_metric("mse", mse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)
    
    spark.stop()

In [3]:
import numpy as np

#
# Se realizar el primer tanteo
#
make_experiment(
    experiment_name="red-wine",
    regParam=0.00001,
    elasticNetParam=0.00001,
    verbose=1,
)

22/04/04 17:55:26 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022/04/04 17:55:31 INFO mlflow.tracking.fluent: Experiment with name 'red-wine' does not exist. Creating a new experiment.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Tracking directory: file:///datalake/mlflow/mlruns
Active run_id: ee8934436ecd4bc69ef1e6f12afb8bd2


22/04/04 17:55:33 WARN util.Instrumentation: [c8be1cb5] regParam is zero, which might cause numerical instability and overfitting.
22/04/04 17:55:35 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/04/04 17:55:35 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/04/04 17:55:35 WARN netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/04/04 17:55:35 WARN netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
                                                                                

LinearRegressionModel: uid=LinearRegression_cca6c59a820a, numFeatures=11:
  MSE: 0.4552639630531903
  MAE: 0.5292753254282959
  R2: -1.1144132381735132


In [None]:
#
# Se realizar el segundo tanteo
#
make_experiment(
    experiment_name="red-wine",
    regParam=0.0005,
    elasticNetParam=0.0001,
    verbose=1,
)

MLflow ui
---

Para visualizar la interfase use:

```bash
mlflow ui
```

**Nota:** En docker usar:

```bash
mlflow ui --host 0.0.0.0 
``` 

con: 

http://127.0.0.1:5001


![assets/mlflow-tracking-1-sklearn-part-0.png](assets/mlflow-tracking-2-tensorflow-part-0.png)

**Detalles de la corrida**

![assets/mlflow-tracking-1-sklearn-part-1.png](assets/mlflow-tracking-2-tensorflow-part-1.png)
![assets/mlflow-tracking-1-sklearn-part-2.png](assets/mlflow-tracking-2-tensorflow-part-2.png)
![assets/mlflow-tracking-1-sklearn-part-3.png](assets/mlflow-tracking-2-tensorflow-part-3.png)

Chequeo
---

In [8]:
def check_estimator():

    import mlflow

    x, y = load_data()
    x_train, x_test, y_train, y_test = make_train_test_split(x, y)

    # NOTA: este parámetro es copiado directamente de la interfase de MLflow
    estimator_path = "runs:/17a068cdf22a4fbb8c1812bbfcb8700e/model"
    estimator = mlflow.pyfunc.load_model(estimator_path)
    mse, mae, r2 = eval_metrics(y_test, y_pred=estimator.predict(x_test))
    report(estimator, mse, mae, r2)


#
# Debe coincidir con el mejor modelo encontrado en la celdas anteriores
#
check_estimator()



mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.keras
  run_id: 17a068cdf22a4fbb8c1812bbfcb8700e
:
  MSE: 0.46573855418702975
  MAE: 0.5218748867511749
  R2: 0.3317715066006244


In [None]:
# -----------------------------------------------------------------------------
# No se borran las corridas para comparar resultados con otras librerías
# -----------------------------------------------------------------------------
# %%bash
# rm -rf outputs mlruns models