## 0. Libraries 📚

In [None]:
import pandas as pd
import ast
import joblib

## 1. Load data 📥

In [None]:
# Load the ground-truth table. The two columns below hold Python literals
# serialised as text; string cells are parsed with ast.literal_eval and any
# non-string cell (e.g. a missing value) becomes an empty list.
diagnoses_df = pd.read_csv("data/ground_truth_df.csv")
for list_column in ('Codigos_diagnosticos', 'Diagnosticos_estandar'):
    diagnoses_df[list_column] = diagnoses_df[list_column].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else []
    )
diagnoses_df

## 2. Models using embeddings as input

In [None]:
from sentence_transformers import SentenceTransformer
import os
import numpy as np


EMB_PATH = "X_embeddings.npy"

# Reuse the cached embeddings only when they line up row-for-row with the
# current dataframe; a cache left over from a different CSV would otherwise be
# silently paired with the wrong labels or fail later in train_test_split.
X_embeddings = np.load(EMB_PATH) if os.path.isfile(EMB_PATH) else None

if X_embeddings is None or X_embeddings.shape[0] != len(diagnoses_df):
    embeddings_model = SentenceTransformer('intfloat/multilingual-e5-large')
    # Every description is prefixed with "query: " before encoding.
    texts = ["query: " + t for t in diagnoses_df['Descripcion_diagnosticos']]
    X_embeddings = embeddings_model.encode(texts, convert_to_tensor=False, show_progress_bar=True)
    # Persist the freshly computed embeddings for later runs.
    np.save(EMB_PATH, X_embeddings)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each row's list of standard diagnoses into one row of the label matrix
# `y`; `mlb` is kept so the columns can be mapped back to diagnosis names.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(diagnoses_df['Diagnosticos_estandar'])

In [None]:
from sklearn.model_selection import train_test_split

# Fractions of the full dataset reserved for validation and test.
val_size, test_size = 0.15, 0.15

# First hold out the test split from the full dataset.
X_temp, X_test, Y_temp, Y_test = train_test_split(X_embeddings, y, test_size=test_size, random_state=42)

# Then split train+val into train and val. The fraction is rescaled because it
# is now taken from the remaining (1 - test_size) share of the data.
X_train, X_val, Y_train, Y_val = train_test_split(X_temp, Y_temp, test_size=val_size/(1-test_size), random_state=42)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

def evaluate(model, X, Y_true, label=""):
    """Print micro-averaged F1, precision and recall of `model` on one split.

    Args:
        model: Object exposing ``predict(X)``.
        X: Features passed to ``model.predict``.
        Y_true: Ground-truth labels the predictions are compared against.
        label: Name of the split, used only in the printed header.
    """
    predictions = model.predict(X)
    print(f"\n=== Evaluation on {label} set ===")

    # (caption, metric function) pairs; captions are padded so values line up.
    report_rows = (
        ("F1 score (micro):  ", f1_score),
        ("Precision (micro): ", precision_score),
        ("Recall (micro):    ", recall_score),
    )
    for caption, metric in report_rows:
        value = metric(Y_true, predictions, average='micro', zero_division=0)
        print(f"{caption}{value:.4f}")

### 2.1. XGBoost per diagnosis

In [None]:
from xgboost import XGBClassifier
from sklearn.base import clone
from tqdm import tqdm
import numpy as np

class CustomOvR_XGBoost:
    """One-vs-rest multilabel classifier: one binary model per label column.

    Label columns that are constant in the training data (all 0 or all 1) get
    no model; their constant value is stored and emitted as-is by `predict`.
    """

    def __init__(self, base_model=None):
        """Store the estimator template that is cloned for every label.

        Args:
            base_model: Optional estimator template (a clone of it is kept).
                When None, an XGBClassifier with the defaults below is used.
        """
        # Compare with None explicitly: truth-testing an estimator may invoke
        # its __len__/__bool__, which says nothing about whether a template
        # was actually supplied.
        if base_model is not None:
            self.base_model = clone(base_model)
        else:
            self.base_model = XGBClassifier(
                # Hyper-parameters
                n_estimators       = 300,
                learning_rate      = 0.1,
                max_depth          = 8,
                subsample          = 0.8,
                colsample_bytree   = 0.8,

                # Fixed settings for multilabel on GPU
                objective          = "binary:logistic",
                eval_metric        = "logloss",
                tree_method        = "hist",
                device             = "cuda",
                n_jobs             = -1,
                random_state       = 42,
                verbosity          = 1
            )
        self.models = []             # per label: fitted model, or None if the label is constant
        self._constant_targets = []  # per label: stored constant value, or None if a model exists

    # -------- training --------
    def fit(self, X, Y, eval_set=None, verbose=False):
        """Fit one clone of the template per label column of `Y`.

        Args:
            X: Training features, passed unchanged to every per-label fit.
            Y: 2-D label matrix; column k is the binary target of label k.
            eval_set: Optional list of ``(X_eval, Y_eval)`` pairs holding full
                label matrices; column k is sliced out for label k.
            verbose: Forwarded to the per-label ``fit`` only when `eval_set`
                is given.

        Returns:
            self
        """
        # Accept anything array-like (e.g. a DataFrame) for the label matrices.
        Y = np.asarray(Y)
        if eval_set is not None:
            eval_set = [(eval_X, np.asarray(eval_y)) for eval_X, eval_y in eval_set]

        self.models.clear()
        self._constant_targets.clear()

        for k in tqdm(range(Y.shape[1]), desc="Training XGBoost classifiers"):
            yk = Y[:, k]

            # Empty or constant column: nothing to learn, remember the value.
            if np.all(yk == 0) or np.all(yk == 1):
                self.models.append(None)  # placeholder keeps indices aligned with columns
                # A zero-row column has no first element; default it to 0.
                self._constant_targets.append(int(yk[0]) if yk.size else 0)
                continue

            model = clone(self.base_model)
            if eval_set is not None:
                eval_set_k = [(eval_X, eval_y[:, k]) for eval_X, eval_y in eval_set]
                model.fit(X, yk, eval_set=eval_set_k, verbose=verbose)
            else:
                model.fit(X, yk)
            self.models.append(model)
            self._constant_targets.append(None)

        return self

    # -------- prediction --------
    def predict(self, X):
        """Return an int8 array of shape (X.shape[0], n_labels) of 0/1 predictions."""
        out = np.zeros((X.shape[0], len(self.models)), dtype=np.int8)

        for k, model in enumerate(self.models):
            if model is None:
                # Constant label: emit the value stored during fit.
                out[:, k] = self._constant_targets[k]
            else:
                # Threshold the positive-class probability at 0.5.
                out[:, k] = (model.predict_proba(X)[:, 1] > 0.5).astype(np.int8)

        return out

#### 2.1.1. Baseline XGBoost per diagnosis

In [None]:
### Baseline Embeddings to xgboost per diagnosis
# Default-configured wrapper: one XGBoost model per label column. The
# validation split is handed to every per-label fit as eval_set; the trailing
# semicolon keeps the notebook from echoing the returned object.
clf = CustomOvR_XGBoost()
clf.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], verbose=False);

In [None]:
# Print micro-averaged F1, precision and recall of the baseline per-diagnosis
# XGBoost model on the train, validation and test splits.
evaluate(clf, X_train, Y_train, label="Train")
evaluate(clf, X_val, Y_val, label="Validation")
evaluate(clf, X_test, Y_test, label="Test")

In [None]:
import os

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the fitted model is not lost to a FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/baseline_xgboost_per_diagnosis.pkl")

#### 2.1.2. Optuna XGBoost per diagnosis

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# --- Optuna objective --------------------------------------------------------
def objective(trial):
    """Optuna objective: fit a per-diagnosis XGBoost ensemble and score it on validation.

    Args:
        trial: Optuna trial from which the hyper-parameters are sampled.

    Returns:
        Micro-averaged F1 of the fitted CustomOvR_XGBoost on (X_val, Y_val).
    """

    # Hyper-parameter search space
    params = {
        "n_estimators":      trial.suggest_int("n_estimators", 100, 800),
        "learning_rate":     trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "max_depth":         trial.suggest_int("max_depth", 3, 12),
        "subsample":         trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree":  trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma":             trial.suggest_float("gamma", 0.0, 5.0),
        "min_child_weight":  trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha":         trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda":        trial.suggest_float("reg_lambda", 0.0, 10.0),

        # Fixed settings for multilabel on GPU
        "objective":    "binary:logistic",
        "eval_metric":  "logloss",
        "tree_method":  "hist",
        "device":       "cuda",
        "n_jobs":       -1,
        "random_state": 42,
        "verbosity":    1,
    }

    # Log the sampled configuration so long runs can be followed from the output.
    print(
        f"🔍 Trial {trial.number} | "
        f"n_estimators={params['n_estimators']}, "
        f"learning_rate={params['learning_rate']:.5f}, "
        f"max_depth={params['max_depth']}, "
        f"subsample={params['subsample']:.3f}, "
        f"colsample_bytree={params['colsample_bytree']:.3f}, "
        f"gamma={params['gamma']:.3f}, "
        f"min_child_weight={params['min_child_weight']}, "
        f"reg_alpha={params['reg_alpha']:.3f}, "
        f"reg_lambda={params['reg_lambda']:.3f}"
    )

    # Model training: the sampled configuration is the template used for every label.
    base_model = XGBClassifier(**params)
    model = CustomOvR_XGBoost(base_model=base_model)
    model.fit(
        X_train, Y_train,
        eval_set=[(X_val, Y_val)],
        verbose=False
    )

    # Validation metric
    y_pred = model.predict(X_val)
    f1_micro = f1_score(Y_val, y_pred, average="micro", zero_division=0)

    # Higher is better: the study that runs this objective uses direction="maximize".
    return f1_micro

import os

# The SQLite file lives under ./optuna; SQLite does not create missing parent
# directories, so make sure the folder exists first.
os.makedirs("optuna", exist_ok=True)

# NOTE(review): the objective never reports intermediate values, so the
# MedianPruner configured here has nothing to prune on.
# NOTE(review): study_name looks copy-pasted from the multilabel study; it is
# left unchanged so a study already stored in the database is still found.
study = optuna.create_study(
    study_name="multilabel_xgboost",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    storage="sqlite:///optuna/XGBoost_per_diagnosis.db",
    load_if_exists=True
)

TOTAL_TRIALS = 50
# Count only trials that actually finished: failed or interrupted trials remain
# in study.trials and would otherwise eat into the budget when resuming.
finished_trials = study.get_trials(
    deepcopy=False,
    states=(optuna.trial.TrialState.COMPLETE, optuna.trial.TrialState.PRUNED),
)
remaining_trials = max(TOTAL_TRIALS - len(finished_trials), 0)
study.optimize(objective, n_trials=remaining_trials, n_jobs=1)

In [None]:
# Summarise the study: number, hyper-parameters and objective value
# (validation micro-F1) of the best trial, then show the optimisation history
# as the cell output.
print("✅ Best trial:", study.best_trial.number)
print("🏆 Best configuration:", study.best_params)
print("🔝 Best F1 val:", study.best_value)
optuna.visualization.plot_optimization_history(study)

In [None]:
# Train: refit the per-diagnosis ensemble with the best configuration found.
best = study.best_params  # read the best trial's parameters once
tuned_names = (
    "n_estimators", "learning_rate", "max_depth", "subsample",
    "colsample_bytree", "gamma", "min_child_weight", "reg_alpha", "reg_lambda",
)
params = {name: best[name] for name in tuned_names}
# Fixed settings for multilabel on GPU
params.update({
    "objective":    "binary:logistic",
    "eval_metric":  "logloss",
    "tree_method":  "hist",
    "device":       "cuda",
    "n_jobs":       -1,
    "random_state": 42,
    "verbosity":    1,
})
base_model = XGBClassifier(**params)
clf = CustomOvR_XGBoost(base_model=base_model)
# No eval_set is passed for this final fit; the trailing semicolon hides the
# returned object in the notebook.
clf.fit(X_train, Y_train);

In [None]:
# Print micro-averaged F1, precision and recall of the tuned per-diagnosis
# XGBoost model on the train, validation and test splits.
evaluate(clf, X_train, Y_train, label="Train")
evaluate(clf, X_val, Y_val, label="Validation")
evaluate(clf, X_test, Y_test, label="Test")

In [None]:
import os

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the fitted model is not lost to a FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/optimized_xgboost_per_diagnosis.pkl")

### 2.2. Multilabel XGBoost

#### 2.2.1. Baseline Multilabel XGBoost

In [None]:
### Baseline Embeddings to multilabel xgboost

from xgboost import XGBClassifier

# A single XGBClassifier fitted directly on the full label matrix, as opposed
# to the per-label wrapper used in the previous section.
clf = XGBClassifier(
    # Hyper-parameters
    n_estimators       = 300,
    learning_rate      = 0.1,
    max_depth          = 8,
    subsample          = 0.8,
    colsample_bytree   = 0.8,

    # Fixed settings for multilabel on GPU
    objective          = "binary:logistic",
    eval_metric        = "logloss",
    tree_method        = "hist",
    device             = "cuda",
    n_jobs             = -1,
    random_state       = 42,
    verbosity          = 1
)

# The validation split is supplied as eval_set for this fit.
# NOTE(review): the trailing "4.22" marker is unexplained (possibly a timing) — confirm or remove.
clf.fit(X_train, Y_train, verbose=False, eval_set=[(X_val, Y_val)])#4.22

In [None]:
# Print micro-averaged F1, precision and recall of the baseline multilabel
# XGBoost model on the train, validation and test splits.
evaluate(clf, X_train, Y_train, label="Train")
evaluate(clf, X_val, Y_val, label="Validation")
evaluate(clf, X_test, Y_test, label="Test")

In [None]:
import os

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the fitted model is not lost to a FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/baseline_xgboost_multilabel.pkl")

#### 2.2.2. Optuna XGBoost multilabel

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# --- Optuna objective --------------------------------------------------------
def objective(trial):
    """Optuna objective: fit one multilabel XGBClassifier and score it on validation.

    Args:
        trial: Optuna trial from which the hyper-parameters are sampled.

    Returns:
        Micro-averaged F1 of the fitted model on (X_val, Y_val).
    """

    # Hyper-parameter search space
    params = {
        "n_estimators":      trial.suggest_int("n_estimators", 100, 800),
        "learning_rate":     trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "max_depth":         trial.suggest_int("max_depth", 3, 12),
        "subsample":         trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree":  trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma":             trial.suggest_float("gamma", 0.0, 5.0),
        "min_child_weight":  trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha":         trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda":        trial.suggest_float("reg_lambda", 0.0, 10.0),

        # Fixed settings for multilabel on GPU
        "objective":    "binary:logistic",
        "eval_metric":  "logloss",
        "tree_method":  "hist",
        "device":       "cuda",
        "n_jobs":       -1,
        "random_state": 42,
        "verbosity":    1,
    }

    # Log the sampled configuration so long runs can be followed from the output.
    print(
        f"🔍 Trial {trial.number} | "
        f"n_estimators={params['n_estimators']}, "
        f"learning_rate={params['learning_rate']:.5f}, "
        f"max_depth={params['max_depth']}, "
        f"subsample={params['subsample']:.3f}, "
        f"colsample_bytree={params['colsample_bytree']:.3f}, "
        f"gamma={params['gamma']:.3f}, "
        f"min_child_weight={params['min_child_weight']}, "
        f"reg_alpha={params['reg_alpha']:.3f}, "
        f"reg_lambda={params['reg_lambda']:.3f}"
    )

    # Model training
    model = XGBClassifier(**params)
    model.fit(
        X_train, Y_train,
        eval_set=[(X_val, Y_val)],
        verbose=False
    )

    # Validation metric
    y_pred = model.predict(X_val)
    f1_micro = f1_score(Y_val, y_pred, average="micro", zero_division=0)

    # Higher is better: the study that runs this objective uses direction="maximize".
    return f1_micro

import os

# The SQLite file lives under ./optuna; SQLite does not create missing parent
# directories, so make sure the folder exists first.
os.makedirs("optuna", exist_ok=True)

# NOTE(review): the objective never reports intermediate values, so the
# MedianPruner configured here has nothing to prune on.
study = optuna.create_study(
    study_name="multilabel_xgboost",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    storage="sqlite:///optuna/XGBoost_multilabel.db",
    load_if_exists=True
)

TOTAL_TRIALS = 50
# Count only trials that actually finished: failed or interrupted trials remain
# in study.trials and would otherwise eat into the budget when resuming.
finished_trials = study.get_trials(
    deepcopy=False,
    states=(optuna.trial.TrialState.COMPLETE, optuna.trial.TrialState.PRUNED),
)
remaining_trials = max(TOTAL_TRIALS - len(finished_trials), 0)
study.optimize(objective, n_trials=remaining_trials, n_jobs=1)

In [None]:
# Summarise the study: number, hyper-parameters and objective value
# (validation micro-F1) of the best trial, then show the optimisation history
# as the cell output.
print("✅ Best trial:", study.best_trial.number)
print("🏆 Best configuration:", study.best_params)
print("🔝 Best F1 val:", study.best_value)
optuna.visualization.plot_optimization_history(study)

In [None]:
from xgboost import XGBClassifier

# Rebuild the multilabel XGBoost model from the best trial and refit it.
best = study.best_params  # read the best trial's parameters once
tuned_names = (
    "n_estimators", "learning_rate", "max_depth", "subsample",
    "colsample_bytree", "gamma", "min_child_weight", "reg_alpha", "reg_lambda",
)

clf = XGBClassifier(
    # Hyper-parameters selected by the study
    **{name: best[name] for name in tuned_names},

    # Fixed settings for multilabel on GPU
    objective          = "binary:logistic",
    eval_metric        = "logloss",
    tree_method        = "hist",
    device             = "cuda",
    n_jobs             = -1,
    random_state       = 42,
    verbosity          = 1
)

clf.fit(X_train, Y_train, verbose=False, eval_set=[(X_val, Y_val)])

In [None]:
# Print micro-averaged F1, precision and recall of the tuned multilabel
# XGBoost model on the train, validation and test splits.
evaluate(clf, X_train, Y_train, label="Train")
evaluate(clf, X_val, Y_val, label="Validation")
evaluate(clf, X_test, Y_test, label="Test")

In [None]:
import os

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the fitted model is not lost to a FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/optimized_xgboost_multilabel.pkl")

### 2.3. Random Forest per diagnosis

In [None]:
### Baseline Embeddings to random forest per diagnosis

from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from tqdm import tqdm
import numpy as np

class CustomOvR_RandomForest:
    """One-vs-rest multilabel classifier: one independent model per label column."""

    def __init__(self, base_model=None):
        """Store the estimator template that `fit` clones for every label.

        Args:
            base_model: Optional estimator template. When None, a
                RandomForestClassifier with 100 trees is used.
        """
        if base_model is None:
            self.base_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
        else:
            self.base_model = base_model
        self.models = []  # one fitted model per label column, in column order

    def fit(self, X, Y):
        """Fit one clone of the template per column of `Y`.

        Args:
            X: Training features, passed unchanged to every per-label fit.
            Y: 2-D label matrix; column i is the target of label i.

        Returns:
            self, so calls can be chained like the sibling XGBoost wrapper.
        """
        # Accept anything array-like (e.g. a DataFrame) for the label matrix.
        Y = np.asarray(Y)
        self.models = []
        for i in tqdm(range(Y.shape[1]), desc="Training RF classifiers"):
            model = clone(self.base_model)
            model.fit(X, Y[:, i])
            self.models.append(model)
        return self

    def predict(self, X):
        """Return per-label predictions stacked as (n_samples, n_labels)."""
        if not self.models:
            # Without models the stacked result would collapse to shape (0,);
            # return an explicit (n_samples, 0) array instead.
            return np.zeros((np.shape(X)[0], 0), dtype=np.int8)
        preds = [model.predict(X) for model in self.models]
        # preds is (n_labels, n_samples); transpose to one row per sample.
        return np.array(preds).T

#### 2.3.1. Baseline Random Forest per diagnosis

In [None]:
# Train
# Baseline: one 25-tree random forest per label column.
clf = CustomOvR_RandomForest(RandomForestClassifier(n_estimators=25, n_jobs=-1, random_state=42))
clf.fit(X_train, Y_train)

In [None]:
# Print micro-averaged F1, precision and recall of the baseline per-diagnosis
# random forest on the train, validation and test splits.
evaluate(clf, X_train, Y_train, label="Train")
evaluate(clf, X_val, Y_val, label="Validation")
evaluate(clf, X_test, Y_test, label="Test")

In [None]:
import os

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the fitted model is not lost to a FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/baseline_random_forest_per_diagnosis.pkl")

#### 2.3.2. Optuna Random Forest per diagnosis

In [None]:
# ---------------------------- Imports ----------------------------------------
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Make sure your wrapper class is available in this environment:
# from mymodule import CustomOvR_RandomForest

# ------------------------- Optuna objective ----------------------------------
def objective(trial):
    """Optuna objective for the per-diagnosis random forest.

    Samples a RandomForest configuration, trains it inside a
    CustomOvR_RandomForest wrapper on (X_train, Y_train) and returns the
    micro-averaged F1 score obtained on (X_val, Y_val).
    """

    # Hyper-parameter search space
    sampled = {
        "n_estimators":      trial.suggest_int("n_estimators", 5, 50),
        "max_depth":         trial.suggest_int("max_depth", 5, 25),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 6),
        "min_samples_leaf":  trial.suggest_int("min_samples_leaf", 1, 4),
        "max_features":      trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap":         trial.suggest_categorical("bootstrap", [True, False]),
        "class_weight":      trial.suggest_categorical("class_weight", [None, "balanced"]),
    }
    # Fixed settings: all CPU cores, fixed seed, silent training
    rf_params = {**sampled, "n_jobs": -1, "random_state": 42, "verbose": 0}

    # One-line log of the sampled values, in the order they were drawn
    summary = ", ".join(f"{name}={value}" for name, value in sampled.items())
    print(f"🔍 Trial {trial.number:02d} | {summary}")

    # Train one forest per label from the sampled template
    ovr_model = CustomOvR_RandomForest(base_model=RandomForestClassifier(**rf_params))
    ovr_model.fit(X_train, Y_train)

    # Validation score; the study maximises it (direction="maximize")
    return f1_score(Y_val, ovr_model.predict(X_val), average="micro", zero_division=0)


# ----------------------------- Optuna study ----------------------------------
import os

# The SQLite file lives under ./optuna; SQLite does not create missing parent
# directories, so make sure the folder exists first.
os.makedirs("optuna", exist_ok=True)

# NOTE(review): the objective never reports intermediate values, so the
# MedianPruner configured here has nothing to prune on.
study = optuna.create_study(
    study_name="Multimodel_random_forest",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    storage="sqlite:///optuna/RandomForest_per_diagnosis.db",
    load_if_exists=True
)

# ------------------------ Launch the optimization ----------------------------
TOTAL_TRIALS = 25           # adjust for more or less exploration
# Count only trials that actually finished: failed or interrupted trials remain
# in study.trials and would otherwise eat into the budget when resuming.
finished_trials = study.get_trials(
    deepcopy=False,
    states=(optuna.trial.TrialState.COMPLETE, optuna.trial.TrialState.PRUNED),
)
remaining_trials = max(TOTAL_TRIALS - len(finished_trials), 0)
study.optimize(objective, n_trials=remaining_trials, n_jobs=1)

In [None]:
# Summarise the study: number, hyper-parameters and objective value
# (validation micro-F1) of the best trial, then show the optimisation history
# as the cell output.
print("✅ Best trial:", study.best_trial.number)
print("🏆 Best configuration:", study.best_params)
print("🔝 Best F1 val:", study.best_value)
optuna.visualization.plot_optimization_history(study)

In [None]:
### Optimized embeddings to random forest per diagnosis

from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

# Rebuild the per-diagnosis random forest from the best trial's configuration.
best = study.best_params  # read the best trial's parameters once
tuned_names = (
    "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf",
    "max_features", "bootstrap", "class_weight",
)
params = {name: best[name] for name in tuned_names}
# Fixed settings: all CPU cores, fixed seed, silent training
params.update({"n_jobs": -1, "random_state": 42, "verbose": 0})

# Train
base_rf = RandomForestClassifier(**params)
clf = CustomOvR_RandomForest(base_model=base_rf)
clf.fit(X_train, Y_train)

In [None]:
# Print micro-averaged F1, precision and recall of the tuned per-diagnosis
# random forest on the train, validation and test splits.
evaluate(clf, X_train, Y_train, label="Train")
evaluate(clf, X_val, Y_val, label="Validation")
evaluate(clf, X_test, Y_test, label="Test")

In [None]:
import os

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the fitted model is not lost to a FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/optimized_random_forest_per_diagnosis.pkl")

### 2.4. Multilabel Random Forest

#### 2.4.1. Baseline Multilabel Random Forest

In [None]:
### Baseline Embeddings to multilabel random forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report  # opcional

# A single random forest fitted directly on the full label matrix, as opposed
# to the per-label wrapper used in the previous section.
clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,        # use all CPU cores
    random_state=42,   # for reproducibility
    verbose=1
)

# 2. Train
clf.fit(X_train, Y_train)

In [None]:
# Print micro-averaged F1, precision and recall of the baseline multilabel
# random forest on the train, validation and test splits.
evaluate(clf, X_train, Y_train, label="Train")
evaluate(clf, X_val, Y_val, label="Validation")
evaluate(clf, X_test, Y_test, label="Test")

In [None]:
import os

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the fitted model is not lost to a FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/baseline_random_forest_multilabel.pkl")

#### 2.4.2. Optuna Random Forest multilabel

In [None]:
# ----------------------------- Imports ---------------------------------------
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# --------------------------- Optuna objective --------------------------------
def objective(trial):
    """Optuna objective for the multilabel random forest.

    Samples a RandomForestClassifier configuration, fits it on
    (X_train, Y_train) and returns the micro-averaged F1 score obtained on
    (X_val, Y_val).
    """

    # Hyper-parameters (reduced search space)
    sampled = {
        "n_estimators":      trial.suggest_int("n_estimators", 50, 150),
        "max_depth":         trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 4),
        "min_samples_leaf":  trial.suggest_int("min_samples_leaf", 1, 4),
        "max_features":      trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "bootstrap":         trial.suggest_categorical("bootstrap", [True]),
        "class_weight":      trial.suggest_categorical("class_weight", [None, "balanced"]),
    }
    # Fixed settings: all CPU cores, fixed seed, silent training
    params = {**sampled, "n_jobs": -1, "random_state": 42, "verbose": 0}

    # One-line log of the sampled values, in the order they were drawn
    summary = ", ".join(f"{name}={value}" for name, value in sampled.items())
    print(f"🔍 Trial {trial.number:02d} | {summary}")

    # Model training
    forest = RandomForestClassifier(**params)
    forest.fit(X_train, Y_train)

    # Validation score; Optuna maximises it when direction="maximize"
    return f1_score(Y_val, forest.predict(X_val), average="micro", zero_division=0)


# ---------------------------- Optuna study -----------------------------------
import os

# The SQLite file lives under ./optuna; SQLite does not create missing parent
# directories, so make sure the folder exists first.
os.makedirs("optuna", exist_ok=True)

# NOTE(review): the objective never reports intermediate values, so the
# MedianPruner configured here has nothing to prune on.
study = optuna.create_study(
    study_name="multilabel_random_forest",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    storage="sqlite:///optuna/RandomForest_multilabel.db",
    load_if_exists=True
)

# ----------------------- Launch the optimization -----------------------------
TOTAL_TRIALS = 50
# Count only trials that actually finished: failed or interrupted trials remain
# in study.trials and would otherwise eat into the budget when resuming.
finished_trials = study.get_trials(
    deepcopy=False,
    states=(optuna.trial.TrialState.COMPLETE, optuna.trial.TrialState.PRUNED),
)
remaining_trials = max(TOTAL_TRIALS - len(finished_trials), 0)
study.optimize(objective, n_trials=remaining_trials, n_jobs=1)

In [None]:
# Summarise the study: number, hyper-parameters and objective value
# (validation micro-F1) of the best trial, then show the optimisation history
# as the cell output.
print("✅ Best trial:", study.best_trial.number)
print("🏆 Best configuration:", study.best_params)
print("🔝 Best F1 val:", study.best_value)
optuna.visualization.plot_optimization_history(study)

In [None]:
### Optimized embeddings to multilabel random forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report  # opcional


# Rebuild the multilabel random forest from the best trial's configuration.
best = study.best_params  # read the best trial's parameters once
tuned_names = (
    "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf",
    "max_features", "bootstrap", "class_weight",
)
# Hyper-parameters selected by the study
params = {name: best[name] for name in tuned_names}
# Fixed settings: all CPU cores, fixed seed, silent training
params.update({"n_jobs": -1, "random_state": 42, "verbose": 0})
clf = RandomForestClassifier(**params)

# 2. Train
clf.fit(X_train, Y_train)

In [None]:
# Print micro-averaged F1, precision and recall of the tuned multilabel
# random forest on the train, validation and test splits.
evaluate(clf, X_train, Y_train, label="Train")
evaluate(clf, X_val, Y_val, label="Validation")
evaluate(clf, X_test, Y_test, label="Test")

In [None]:
import os

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so the fitted model is not lost to a FileNotFoundError.
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/optimized_random_forest_multilabel.pkl")