# Modelo de bosque aleatorio
En este cuaderno aplico un modelo de bosque aleatorio (*random forest*) al ejercicio de clasificación del proyecto. Haré una optimización de hiperparámetros empleando Optuna. Empiezo por importar las librerías y los servicios necesarios.

In [0]:
import warnings
warnings.filterwarnings("ignore")
import os
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,classification_report

In [0]:
!apt install git
!pip install mlflow requests

In [0]:
import mlflow

In [0]:
import re

# Splits an https URL into the scheme prefix and everything after it.
pat = re.compile(r"(https://)(.*)")


def build_clone_url(repo_url, token):
    """Return *repo_url* with *token* embedded as HTTPS credentials.

    Args:
        repo_url: Repository URL; must start with ``https://``.
        token: Access token to embed. When empty, the URL is returned
            unchanged instead of producing ``https://@host/...``.

    Returns:
        The URL to hand to ``git clone``.

    Raises:
        ValueError: If *repo_url* is not an ``https://`` URL.
    """
    match = pat.match(repo_url)
    if match is None:
        raise ValueError(f"Expected an https:// repository URL, got {repo_url!r}")
    if not token:
        return repo_url
    scheme, rest = match.groups()
    return f"{scheme}{token}@{rest}"


token = ""  # Paste the GitHub access token between the quotes; do not commit it.
repo_url = "https://github.com/juramireza/proyecto_mlds_6"
url_token = build_clone_url(repo_url, token)
# The clone cell reads the URL from this environment variable.
os.environ["GITHUB"] = url_token

In [0]:
# Clone the repository using the URL stored in the GITHUB environment variable.
!git clone $GITHUB

# Make the cloned repository the working directory for the cells that follow.
%cd proyecto_mlds_6

In [0]:
# Set the commit identity and the default branch name in the global git configuration.
!git config --global user.email "jdortizc@unal.edu.co"
!git config --global user.name "jdoc"
!git config --global init.defaultBranch master
# NOTE(review): the working directory is already a clone at this point, so this init is presumably redundant — confirm.
!git init

In [0]:
# List the configured remotes.
# NOTE(review): the clone URL was built with the token embedded, so it may be echoed in this cell's output — confirm before sharing the notebook.
!git remote -v

In [0]:
# Sanity check: current directory, its contents (hidden files included) and the working-tree state.
!pwd
!ls -a
!git status

In [0]:
# Commit for the final model.
# NOTE(review): no `git add` appears before this cell, so it presumably relies on changes staged elsewhere — confirm.
!git commit -m "Modelo final del proyecto: bosque aleatorio"

## Comandos para usar *mlflow* mediante *ngrok*

In [0]:
!mkdir mlruns/final_model

!git add /content/proyecto_mlds_6/mlruns/final_model
!git commit -m "Carpeta para guardar los datos de MLFLow del modelo final"

In [0]:
# Shell command that starts the mlflow tracking server in the background
# (trailing "&") on port 5000, with a SQLite backend store and the
# mlruns/final_model folder as default artifact root.
command = (
    "\nmlflow server "
    "        --backend-store-uri sqlite:///tracking.db "
    "        --default-artifact-root file:mlruns/final_model "
    "        -p 5000 &\n"
)
# system_raw hands the command to the shell without capturing its output.
get_ipython().system_raw(command)

In [0]:
# Install pyngrok, which provides the `ngrok` module used below to open the tunnel.
!pip install pyngrok

In [0]:
# Paste the ngrok auth token between the quotes.
token = ""
# Expose it to the shell cell that registers the token with ngrok.
os.environ.update(NGROK_TOKEN=token)

In [0]:
# Register the token held in the NGROK_TOKEN environment variable with the ngrok CLI.
!ngrok authtoken $NGROK_TOKEN

In [0]:
from pyngrok import ngrok
# Open an HTTP tunnel to local port 5000, the port the mlflow server was started on.
# As the last expression of the cell, the returned value is shown as the cell output.
ngrok.connect(5000, "http")

## Construcción del modelo y registro en mlflow

In [0]:
from mlflow.models import infer_signature

# Point the client at the tracking server listening on port 5000.
mlflow.set_tracking_uri("http://localhost:5000")

# create_experiment raises when the name is already taken, so reuse the
# existing experiment on re-runs instead of failing.
experiment = mlflow.get_experiment_by_name("final_model")
if experiment is None:
    exp_id = mlflow.create_experiment(name="final_model", artifact_location="mlruns/final_model/")
else:
    exp_id = experiment.experiment_id

# The run is intentionally left open here: the training cell logs into it
# and closes it.
run = mlflow.start_run(experiment_id=exp_id, run_name="final_model")

In [0]:
# Train the final random forest, evaluate it on the test split and log
# parameters, metrics, the confusion-matrix figure and the model itself to
# the active MLflow run. The run is always closed: as FAILED when anything
# in the cell raises, as finished otherwise.
try:
    # Load the train/test splits from the cloned repository.
    X_train = pd.read_csv('/content/proyecto_mlds_6/data/train_test_data/X_train.csv')
    y_train = pd.read_csv('/content/proyecto_mlds_6/data/train_test_data/y_train.csv')
    X_test = pd.read_csv('/content/proyecto_mlds_6/data/train_test_data/X_test.csv')
    y_test = pd.read_csv('/content/proyecto_mlds_6/data/train_test_data/y_test.csv')

    # Fixed hyperparameters of the final model.
    params = {'n_estimators': 73,
              'criterion': 'gini',
              'max_depth': 50,
              'min_samples_split': 20,
              'min_samples_leaf': 2,
              'min_weight_fraction_leaf': 2.5214105645479093e-05,
              'ccp_alpha': 1.8151142421551003e-05,
              'max_samples': 0.959999447846922}
    # squeeze("columns") turns a single-column label frame into the 1-D target
    # scikit-learn expects; a frame with several columns passes through unchanged.
    model = RandomForestClassifier(**params).fit(X_train, y_train.squeeze("columns"))

    # Infer the model signature from the test inputs and their predictions.
    y_pred = model.predict(X_test)
    signature = infer_signature(X_test, y_pred)

    # Log parameters and metrics using the MLflow APIs.
    mlflow.log_params(params)
    mlflow.log_metrics({"accuracy": accuracy_score(y_test, y_pred),
                        "f1": f1_score(y_test, y_pred)})

    # Confusion matrix, drawn explicitly on the axes created here.
    mc = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(mc, annot=True, fmt="d", cmap="crest", linewidths=.5,
                annot_kws={"size": 16}, ax=ax)
    ax.set_xlabel("Etiquetas predichas")
    ax.set_ylabel("Etiquetas reales")
    ax.set_title("Matriz de confusión para el bosque aleatorio")
    fig.show()
    fig.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png", "confusion_matrix")

    # Log the sklearn model and register it under the name "final_model".
    mlflow.sklearn.log_model(sk_model=model,
                             artifact_path="final_model",
                             signature=signature,
                             registered_model_name="final_model")
except Exception:
    # Do not leave the run active, otherwise the next start_run fails.
    mlflow.end_run(status="FAILED")
    raise
else:
    mlflow.end_run()

In [0]:
# Print accuracy and F1 on the test split, one metric per line.
for etiqueta, metrica in (('Exactitud: ', accuracy_score), ('Métrica F1: ', f1_score)):
    print(etiqueta, metrica(y_test, y_pred))

In [0]:
# Per-class report for the test-set predictions.
report = classification_report(y_test, y_pred)
print(report)

## Despliegue del modelo con *mlflow*

In [0]:
# Set the tracking server address in the environment so that commands
# launched from this notebook can read it.
os.environ.update(MLFLOW_TRACKING_URI="http://localhost:5000")

In [0]:
# Shell command that serves version 1 of the registered "final_model" on
# port 8001 using the local environment, in the background (trailing "&").
serve_args = [
    "mlflow", "models", "serve",
    "-m", "'models:/final_model/1'",
    "-p", "8001",
    "--env-manager", "'local'",
    "&",
]
command = "\n" + " ".join(serve_args) + "\n"
get_ipython().system_raw(command)

In [0]:
import requests
import json

In [0]:
# Request predictions for rows drawn from the test set.

# Number of rows to predict.
n = 3
# Draw "n" random rows from the test features.
data_request = X_test.sample(n)
data_request

In [0]:
# True labels for the sampled rows, for comparison with the predictions.
# data_request.index holds index labels, so look them up by label (.loc)
# rather than by position (.iloc).
y_test.loc[data_request.index]

In [0]:
# Wrap the sampled rows under the "dataframe_split" key and serialise them to JSON.
# Note that data_request is rebound from a DataFrame to a JSON string here.
payload = {"dataframe_split": data_request.to_dict(orient="split")}
data_request = json.dumps(payload)
display(data_request)

In [0]:
# Send the request to the model server on port 8001. The timeout keeps the
# cell from hanging indefinitely if the server is not answering.
r = requests.post(
    "http://localhost:8001/invocations",
    data=data_request,
    headers={"Content-Type": "application/json"},
    timeout=30,
)
# Surface HTTP errors explicitly instead of failing later while decoding JSON.
r.raise_for_status()
print(r.json())

## Realizo el 'push' en *git*

In [0]:
# Show the working-tree state before staging anything.
!git status

In [0]:
# Three separate commits follow.
# 1) Database files: tracking.db is the mlflow backend store; hp.db is presumably
#    the hyperparameter-search database — TODO confirm.
!git add hp.db
!git add tracking.db
!git commit -m "Archivos modificados por MLFlow"
# 2) The confusion-matrix image saved by the training cell.
!git add confusion_matrix.png
!git commit -m "Matriz de confusión del modelo final"
# 3) The folder used as mlflow artifact root for the final model.
!git add mlruns/final_model/
!git commit -m "Carpeta donde se aloja el modelo final para mlflow"

In [0]:
# Check what is still uncommitted after the commits above.
!git status

In [0]:
# NOTE(review): nothing is staged between the previous commits and this one, so it presumably has nothing to record unless changes were staged manually — confirm.
!git commit -m "Realizo el despliegue del modelo final desde 'mlflow'"

In [0]:
# Push the master branch to the "origin" remote.
!git push origin master

In [0]:
# Final check of the working-tree state after the push.
!git status