In [None]:
# Google Colab requirements..
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/pipe/mtuci-itprog-pipe

In [None]:
# Install requirements
!pip install mlflow
!pip install optuna
!pip install python-dotenv

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import mlflow
import optuna
from optuna.trial import Trial

In [6]:
df = pd.read_csv("bank.csv", sep=";")

y = df[["y"]].apply(
    lambda data: int(data.y == "yes"),
    axis=1
)
X = df.drop("y", axis=1)

splitted = train_test_split(X, y, test_size=0.33)
X_train, X_test, y_train, y_test = splitted

num_cols = list(X_train.select_dtypes(exclude=["object"]).columns)
cat_cols = list(X_train.select_dtypes(include=["object"]).columns)

numerical = Pipeline(steps=[
    ("imputer", SimpleImputer()),  # default strategy="mean"
    # ("power_trans", PowerTransformer()),  # приводит к ошибке: https://github.com/scikit-learn/scikit-learn/issues/27499
    ("scaler", StandardScaler()),
])
categorical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore",
                          sparse_output=False)),
])

ct = ColumnTransformer(
    [
        ("numerical", numerical, num_cols),
        ("categorical", categorical, cat_cols),
    ],
    remainder='drop',
    n_jobs=-1
)
pipe = Pipeline(steps=[
    ("ct", ct),
    ("XGBClassifier", XGBClassifier()),
])

In [7]:
def cls_objective(trial: Trial):
    params = {
        "XGBClassifier__n_estimators": trial.suggest_int(
            "XGBClassifier__n_estimators", 1, 5
        ),
        "XGBClassifier__learning_rate": trial.suggest_float(
            "XGBClassifier__learning_rate", 0.1, 0.8
        ),
        "XGBClassifier__max_depth": trial.suggest_int(
            "XGBClassifier__max_depth", 2, 7
        ),
        "XGBClassifier__gamma": trial.suggest_int(
            "XGBClassifier__gamma", 0.0, 20.0
        ),
    }
    pipe.set_params(**params)
    scores = cross_val_score(
        pipe, X_train, y=y_train,
        cv=5,
        scoring="f1_macro",
    ).mean()
    return scores

study = optuna.create_study(direction="maximize") # previous: direction=maximize
study.optimize(
    cls_objective,
    n_trials=10
)

print(f"\nBest value (f1 macro): {study.best_value:.4f}")

[I 2024-05-31 01:49:18,806] A new study created in memory with name: no-name-b4325c83-9634-4677-b2e9-79c00a3102af
[I 2024-05-31 01:49:26,044] Trial 0 finished with value: 0.7545166626392967 and parameters: {'XGBClassifier__n_estimators': 5, 'XGBClassifier__learning_rate': 0.49593444952697685, 'XGBClassifier__max_depth': 4, 'XGBClassifier__gamma': 14}. Best is trial 0 with value: 0.7545166626392967.
[I 2024-05-31 01:49:27,788] Trial 1 finished with value: 0.7658978947786762 and parameters: {'XGBClassifier__n_estimators': 2, 'XGBClassifier__learning_rate': 0.7483388367840691, 'XGBClassifier__max_depth': 7, 'XGBClassifier__gamma': 9}. Best is trial 1 with value: 0.7658978947786762.
[I 2024-05-31 01:49:29,532] Trial 2 finished with value: 0.7189307092217987 and parameters: {'XGBClassifier__n_estimators': 3, 'XGBClassifier__learning_rate': 0.38836343643838056, 'XGBClassifier__max_depth': 4, 'XGBClassifier__gamma': 17}. Best is trial 1 with value: 0.7658978947786762.
[I 2024-05-31 01:49:31,2


Best value (f1 macro): 0.7659


In [9]:
pipe.set_params(**study.best_params)
pipe.fit(X_train, y_train)

with mlflow.start_run():
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_value", study.best_value)
    mlflow.sklearn.log_model(pipe, "best_model_optuna__1")



In [10]:
# Тут находятся результаты:
#
# https://dagshub.com/leonzag997/mtuci-bank-mlflow-v1/experiments
#
#

import os
from dotenv import load_dotenv

load_dotenv(".env")
user = os.getenv("MLFLOW_TRACKING_USERNAME", "leonzag997")
project = os.getenv("MLFLOW_TRACKING_PROJECTNAME", "mtuci-bank-mlflow-v1")

def log_mlflow():
    uri = f"https://dagshub.com/{user}/{project}.mlflow"
    mlflow.set_tracking_uri(uri)

    with mlflow.start_run(run_name="MLflow on Colab"):
        mlflow.log_params(study.best_params)
        mlflow.log_metric("best_value", study.best_value)

        pipe.set_params(**study.best_params)
        pipe.fit(X_train, y_train)
        mlflow.sklearn.log_model(pipe, "best_model_optuna__1")

if os.getenv("MLFLOW_TRACKING_PASSWORD"):
    log_mlflow()
else:
    print(f"Можно посмотреть -> https://dagshub.com/{user}/{project}.mlflow")