## Example mlflow
Save experiments to a local MLflow tracking store and compare the results of different experiments

Introductory level

### -1) Install mlflow

In [None]:
### 1) Install mlflow

In [None]:
!pip show mlflow

### 0. Packages

In [None]:
# ml packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.datasets import fetch_california_housing
import seaborn as sns
from dotenv import load_dotenv, find_dotenv
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor


# evaluate
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error


# mlflow
import mlflow

### 4) Connect to mlflow
In this part you choose which MLflow tracking server to connect to:
- locally
- cluster cloud
- etc

In [None]:
# Connect to MLflow: use the local 'mlruns' directory as the tracking location
path_local_artifacts_mlflow = 'mlruns'
mlflow.set_tracking_uri(uri=path_local_artifacts_mlflow)

### 5) Set the experiment
If the experiment doesn't exist, it will be created automatically

In [None]:
# Select the experiment under which all following runs are recorded
experiment_name = '1_mlflow_example'
mlflow.set_experiment(experiment_name=experiment_name)

In [None]:
# Look the experiment up by name and display it to confirm it was created/set
mlflow.get_experiment_by_name(experiment_name)

### 6) Train models and save its results in mlflow

#### 6.1) Load Data

In [None]:
# Load the California housing dataset, unpacked into features (data_X) and target (data_y);
# as_frame=True requests pandas objects instead of plain arrays
data_X, data_y = fetch_california_housing(return_X_y=True, as_frame=True)

In [None]:
# Preview the first rows of the features
data_X.head()

In [None]:
# Preview the first values of the target
data_y.head()

#### 6.2 EDA

In [None]:
# EDA: summary statistics of the feature columns
data_X.describe()

In [None]:
# Pairwise correlations between the features
corr = data_X.corr()
tick_positions = range(len(corr))

# Draw the correlation matrix as a heat map with Matplotlib
plt.figure(figsize=(8, 4))
heatmap = plt.imshow(corr, cmap='coolwarm', interpolation='none', aspect='auto')

# Write each coefficient, formatted to two decimals, in the centre of its cell
for (row, col), value in np.ndenumerate(corr.values):
    plt.text(col, row, f'{value:.2f}', ha='center', va='center', color='w')

# Colour bar, axis labels and title
plt.colorbar(heatmap, fraction=0.046, pad=0.04)
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.index)
plt.title('Correlation Matrix')

# Save the figure to disk (it is logged as an MLflow artifact further down), then show it
name_correleation_matrix = 'correlation_matrix.png'
plt.savefig(name_correleation_matrix)
plt.show()

#### 6.3 split data

In [None]:
# Split into train and test sets: 25% of the rows are held out for testing,
# and random_state is fixed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size = 0.25, random_state = 0)

In [None]:
# Feature scaling: fit the scaler on the training split only,
# then apply that same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### 6.4 Values of target

In [None]:
# Mean and standard deviation of the target in each split, rounded to two decimals
y_train_mean, y_train_std = round(y_train.mean(), 2), round(y_train.std(), 2)
y_test_mean, y_test_std = round(y_test.mean(), 2), round(y_test.std(), 2)

# Report both splits side by side
print('statistics target')
print(f'--train-- mean = {y_train_mean}, std = {y_train_std}')
print(f'--test-- mean = {y_test_mean}, std = {y_test_std}')

In [None]:
# Overlay the train and test target histograms; each legend entry carries that split's statistics
histogram_specs = (
    (y_train, 'black', f'--train-- mean = {y_train_mean}, std = {y_train_std}'),
    (y_test, 'orange', f'--test-- mean = {y_test_mean}, std = {y_test_std}'),
)
for target_values, bar_color, legend_label in histogram_specs:
    plt.hist(target_values, color=bar_color, alpha=0.5, label=legend_label)

plt.title('histogram of target train vs test')
plt.legend()

# Save the figure to disk (it is logged as an MLflow artifact further down), then show it
name_histograms_target = 'histograms_target_train_test.png'
plt.savefig(name_histograms_target)
plt.show()

### 7. Train models, evaluate it and save results
Train all the models on the same training set and evaluate them on the same test set

Models trained (each trained model is saved in a separate run)
- linear regression
- decision tree
- random forest (small)
- random forest (medium)
- random forest (default)
- nn mlp (using sklearn)

### 7.0 Auxiliary functions

#### 7.1 Auxiliary functions to train/evaluate models

In [None]:
def evaluate_model(y_true, y_predicted):
    """
    Given "y_true" and "y_predicted" calculate metrics of performance (r2, rmse, mae)

    The three metrics are printed and then returned.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_predicted : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(r2_metric, rmse_metric, mae_metric)``.
    """
    r2_metric = r2_score(y_true, y_predicted)

    # RMSE is computed as sqrt(MSE) instead of passing ``squared=False``: that keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6, so this form works on both
    # old and new versions. For a single target the value is the same.
    rmse_metric = np.sqrt(mean_squared_error(y_true, y_predicted))

    mae_metric = mean_absolute_error(y_true, y_predicted)

    print("r2: ", r2_metric)
    print("rmse: ", rmse_metric)
    print("mae_metric: ", mae_metric)
    return r2_metric, rmse_metric, mae_metric

#### 7.1 linear regression (lr)

In [None]:
# Fit a linear regression on the training split (fit() returns the fitted estimator)
lr = LinearRegression().fit(X_train, y_train)

# Predict on the test split and compute r2 / rmse / mae
y_test_predicted = lr.predict(X_test)
r2_lr, rmse_lr, mae_lr = evaluate_model(y_test, y_test_predicted)

In [None]:
# Record the linear-regression results in their own MLflow run.
# The context manager ends the run even if one of the logging calls raises, so a
# failure here cannot leave an active run behind that would block the next start_run().
run_name = "run-lr"
with mlflow.start_run(run_name = run_name) as run:
    # save metrics
    mlflow.log_metric("r2", r2_lr)
    mlflow.log_metric("rmse", rmse_lr)
    mlflow.log_metric("mae", mae_lr)

    # save graphs
    mlflow.log_artifact(name_correleation_matrix)
    mlflow.log_artifact(name_histograms_target)

    # save model (as a plain artifact, not in the model registry)
    model_name = 'model.pkl'
    joblib.dump(lr, model_name) # save locally
    mlflow.log_artifact(model_name) # attach the local file to the run

#### 7.2 decision tree (tree)

In [None]:
# Hyperparameters of the decision tree
max_depth_tree = 15
min_samples_split_tree = 10
min_samples_leaf_tree = 10

# Build and fit the tree in one expression (fit() returns the fitted estimator);
# random_state is fixed so the fitted tree is reproducible
tree = DecisionTreeRegressor(
    max_depth=max_depth_tree,
    min_samples_split=min_samples_split_tree,
    min_samples_leaf=min_samples_leaf_tree,
    random_state=42,
).fit(X_train, y_train)

# Predict on the test split and compute r2 / rmse / mae
y_test_predicted = tree.predict(X_test)
r2_tree, rmse_tree, mae_tree = evaluate_model(y_test, y_test_predicted)

In [None]:
# Record the decision-tree results in their own MLflow run.
# The context manager ends the run even if one of the logging calls raises, so a
# failure here cannot leave an active run behind that would block the next start_run().
run_name = "run-tree"
with mlflow.start_run(run_name = run_name) as run:
    # save parameters
    mlflow.log_param("max_depth", max_depth_tree)
    mlflow.log_param("min_samples_split", min_samples_split_tree)
    mlflow.log_param("min_samples_leaf", min_samples_leaf_tree)

    # save metrics
    mlflow.log_metric("r2", r2_tree)
    mlflow.log_metric("rmse", rmse_tree)
    mlflow.log_metric("mae", mae_tree)

    # save graphs
    mlflow.log_artifact(name_correleation_matrix)
    mlflow.log_artifact(name_histograms_target)

    # save model (as a plain artifact, not in the model registry)
    model_name = 'model.pkl'
    joblib.dump(tree, model_name) # save locally
    mlflow.log_artifact(model_name) # attach the local file to the run

#### 7.2 Extra - explore the effect of changing "max_depth"
- Explore the effect of changing the maximum depth of the tree

- Instead of doing hyperparameter tuning, the idea is to change only the value of "max_depth" and SEE IN MLFLOW the effect of changing a single hyperparameter

In [None]:
### parameters
# One tree is trained per value in this list; the other two settings stay fixed
max_depth_tree = [1,2,3,4,5,6,7,8,9,10]
min_samples_split_tree = 10
min_samples_leaf_tree = 20


# train one model per depth and record each one in its own MLflow run
for iter_max_depth in max_depth_tree:

    #### TRAIN AND EVALUATE
    tree = DecisionTreeRegressor(max_depth = iter_max_depth,
                                 min_samples_split = min_samples_split_tree,
                                 min_samples_leaf = min_samples_leaf_tree,
                                 random_state = 42
                                )
    tree.fit(X_train, y_train)
    y_test_predicted = tree.predict(X_test)
    r2_tree, rmse_tree, mae_tree = evaluate_model(y_test, y_test_predicted)


    #### SAVE MLFLOW
    # The context manager ends the run even if a logging call raises; without it a
    # failure would leave the run active and the next iteration's start_run() would fail.
    run_name = f"run-tree-depth-{iter_max_depth}"
    with mlflow.start_run(run_name = run_name) as run:
        # save parameters
        mlflow.log_param("max-depth-tree", iter_max_depth)
        mlflow.log_param("split-tree", min_samples_split_tree)
        mlflow.log_param("leaf-tree", min_samples_leaf_tree)

        # save metrics
        mlflow.log_metric("r2", r2_tree)
        mlflow.log_metric("rmse", rmse_tree)
        mlflow.log_metric("mae", mae_tree)

        # save graphs
        mlflow.log_artifact(name_correleation_matrix)
        mlflow.log_artifact(name_histograms_target)

        # save model (as a plain artifact, not in the model registry)
        model_name = 'model.pkl'
        joblib.dump(tree, model_name) # save locally
        mlflow.log_artifact(model_name) # attach the local file to the run

#### 7.3 random forest (small) (rf_small)

In [None]:
# Hyperparameters of the small random forest
n_estimators_rf_small = 5
max_depth_rf_small = 50
min_samples_split_rf_small = 10
min_samples_leaf_rf_small = 10

# Build and fit the forest in one expression (fit() returns the fitted estimator);
# random_state is fixed so the fitted forest is reproducible
rf_small = RandomForestRegressor(
    n_estimators=n_estimators_rf_small,
    max_depth=max_depth_rf_small,
    min_samples_split=min_samples_split_rf_small,
    min_samples_leaf=min_samples_leaf_rf_small,
    random_state=42,
).fit(X_train, y_train)

# Predict on the test split and compute r2 / rmse / mae
y_test_predicted = rf_small.predict(X_test)
r2_rf_small, rmse_rf_small, mae_rf_small = evaluate_model(y_test, y_test_predicted)

In [None]:
# Record the small random-forest results in their own MLflow run.
# The context manager ends the run even if one of the logging calls raises, so a
# failure here cannot leave an active run behind that would block the next start_run().
run_name = "run-rf-small"
with mlflow.start_run(run_name = run_name) as run:
    # save parameters
    mlflow.log_param("n_estimators", n_estimators_rf_small)
    mlflow.log_param("max_depth", max_depth_rf_small)
    mlflow.log_param("min_samples_split", min_samples_split_rf_small)
    mlflow.log_param("min_samples_leaf", min_samples_leaf_rf_small)

    # save metrics
    mlflow.log_metric("r2", r2_rf_small)
    mlflow.log_metric("rmse", rmse_rf_small)
    mlflow.log_metric("mae", mae_rf_small)

    # save graphs
    mlflow.log_artifact(name_correleation_matrix)
    mlflow.log_artifact(name_histograms_target)

    # save model (as a plain artifact, not in the model registry)
    model_name = 'model.pkl'
    joblib.dump(rf_small, model_name) # save locally
    mlflow.log_artifact(model_name) # attach the local file to the run

#### 7.4 random forest (medium) (rf_medium)

In [None]:
# Hyperparameters of the medium random forest
n_estimators_rf_medium = 30
max_depth_rf_medium = 50
min_samples_split_rf_medium = 10
min_samples_leaf_rf_medium = 10

# Build and fit the forest in one expression (fit() returns the fitted estimator);
# random_state is fixed so the fitted forest is reproducible
rf_medium = RandomForestRegressor(
    n_estimators=n_estimators_rf_medium,
    max_depth=max_depth_rf_medium,
    min_samples_split=min_samples_split_rf_medium,
    min_samples_leaf=min_samples_leaf_rf_medium,
    random_state=42,
).fit(X_train, y_train)

# Predict on the test split and compute r2 / rmse / mae
y_test_predicted = rf_medium.predict(X_test)
r2_rf_medium, rmse_rf_medium, mae_rf_medium = evaluate_model(y_test, y_test_predicted)

In [None]:
# Record the medium random-forest results in their own MLflow run.
# The context manager ends the run even if one of the logging calls raises, so a
# failure here cannot leave an active run behind that would block the next start_run().
run_name = "run-rf-medium"
with mlflow.start_run(run_name = run_name) as run:
    # save parameters
    mlflow.log_param("n_estimators", n_estimators_rf_medium)
    mlflow.log_param("max_depth", max_depth_rf_medium)
    mlflow.log_param("min_samples_split", min_samples_split_rf_medium)
    mlflow.log_param("min_samples_leaf", min_samples_leaf_rf_medium)

    # save metrics
    mlflow.log_metric("r2", r2_rf_medium)
    mlflow.log_metric("rmse", rmse_rf_medium)
    mlflow.log_metric("mae", mae_rf_medium)

    # save graphs
    mlflow.log_artifact(name_correleation_matrix)
    mlflow.log_artifact(name_histograms_target)

    # save model (as a plain artifact, not in the model registry)
    model_name = 'model.pkl'
    joblib.dump(rf_medium, model_name) # save locally
    mlflow.log_artifact(model_name) # attach the local file to the run

#### 7.5 random forest (default) (rf_default)

In [None]:
# Hyperparameters of the largest random forest
n_estimators_rf_default = 100
max_depth_rf_default = 50
min_samples_split_rf_default = 10
min_samples_leaf_rf_default = 10

# Build and fit the forest in one expression (fit() returns the fitted estimator);
# random_state is fixed so the fitted forest is reproducible
rf_default = RandomForestRegressor(
    n_estimators=n_estimators_rf_default,
    max_depth=max_depth_rf_default,
    min_samples_split=min_samples_split_rf_default,
    min_samples_leaf=min_samples_leaf_rf_default,
    random_state=42,
).fit(X_train, y_train)

# Predict on the test split and compute r2 / rmse / mae
y_test_predicted = rf_default.predict(X_test)
r2_rf_default, rmse_rf_default, mae_rf_default = evaluate_model(y_test, y_test_predicted)

In [None]:
# Record the largest random-forest results in their own MLflow run.
# The context manager ends the run even if one of the logging calls raises, so a
# failure here cannot leave an active run behind that would block the next start_run().
run_name = "run-rf-default"
with mlflow.start_run(run_name = run_name) as run:
    # save parameters
    mlflow.log_param("n_estimators", n_estimators_rf_default)
    mlflow.log_param("max_depth", max_depth_rf_default)
    mlflow.log_param("min_samples_split", min_samples_split_rf_default)
    mlflow.log_param("min_samples_leaf", min_samples_leaf_rf_default)

    # save metrics
    mlflow.log_metric("r2", r2_rf_default)
    mlflow.log_metric("rmse", rmse_rf_default)
    mlflow.log_metric("mae", mae_rf_default)

    # save graphs
    mlflow.log_artifact(name_correleation_matrix)
    mlflow.log_artifact(name_histograms_target)

    # save model (as a plain artifact, not in the model registry)
    model_name = 'model.pkl'
    joblib.dump(rf_default, model_name) # save locally
    mlflow.log_artifact(model_name) # attach the local file to the run

#### 7.6 NN MLP (mlp-sk)

In [None]:
# Hyperparameters of the MLP regressor
hidden_layer_sizes_nn_mlp = [200, 100, 50, 25]
activation_nn_mlp = 'relu'
learning_rate_init_nn_mlp = 0.001
max_iter_nn_mlp = 200
early_stopping_nn_mlp = True
validation_fraction_nn_mlp = 0.1

# Build and fit the network in one expression (fit() returns the fitted estimator);
# random_state is fixed so the fitted network is reproducible
nn_mlp = MLPRegressor(
    hidden_layer_sizes=hidden_layer_sizes_nn_mlp,
    activation=activation_nn_mlp,
    learning_rate_init=learning_rate_init_nn_mlp,
    max_iter=max_iter_nn_mlp,
    early_stopping=early_stopping_nn_mlp,
    validation_fraction=validation_fraction_nn_mlp,
    random_state=42,
).fit(X_train, y_train)

# Predict on the test split and compute r2 / rmse / mae
y_test_predicted = nn_mlp.predict(X_test)
r2_nn_mlp, rmse_nn_mlp, mae_nn_mlp = evaluate_model(y_test, y_test_predicted)

In [None]:
# Record the MLP results in their own MLflow run.
# The context manager ends the run even if one of the logging calls raises, so a
# failure here cannot leave an active run behind that would block the next start_run().
run_name = "run-mlp-sk"
with mlflow.start_run(run_name = run_name) as run:
    # save parameters
    # Every value is read from the variable that configured the model, so the logged
    # parameters cannot drift from the trained network when a hyperparameter is edited.
    # str() of the list yields the same text that was previously hard-coded.
    mlflow.log_param("hidden_layer_sizes_nn_mlp", str(hidden_layer_sizes_nn_mlp))
    mlflow.log_param("activation_nn_mlp", activation_nn_mlp)
    mlflow.log_param("learning_rate_init_nn_mlp", learning_rate_init_nn_mlp)
    mlflow.log_param("max_iter_nn_mlp", max_iter_nn_mlp)
    mlflow.log_param("early_stopping_nn_mlp", early_stopping_nn_mlp)
    mlflow.log_param("validation_fraction_nn_mlp", validation_fraction_nn_mlp)

    # save metrics
    mlflow.log_metric("r2", r2_nn_mlp)
    mlflow.log_metric("rmse", rmse_nn_mlp)
    mlflow.log_metric("mae", mae_nn_mlp)

    # save graphs
    mlflow.log_artifact(name_correleation_matrix)
    mlflow.log_artifact(name_histograms_target)

    # save model (as a plain artifact, not in the model registry)
    model_name = 'model.pkl'
    joblib.dump(nn_mlp, model_name) # save locally
    mlflow.log_artifact(model_name) # attach the local file to the run

### 8. Delete files saved locally

In [None]:
### delete local files
# Files that are already gone are skipped, so re-running this cell
# does not raise FileNotFoundError.
for local_file in (model_name, name_correleation_matrix, name_histograms_target):
    if os.path.exists(local_file):
        os.remove(local_file)