## MLflow Integration for Alcoholism Prediction Model

In [None]:
# Enable auto-reloading of modules.
# Loads IPython's `autoreload` extension and selects mode 2, which re-imports
# modules before each cell is executed, so edits to imported source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Set up path and import required modules
import os
import sys
# Put the parent directory on the module search path so the `DSML` imports
# below can resolve (presumably this notebook sits one level below the project
# root -- confirm). The path is resolved against the current working directory.
sys.path.append(os.path.abspath(".."))

# NOTE(review): `RAW_DATA_DIR` and `target` are not referenced in the visible
# cells; the label column is hard-coded as 'Alcoholic' where the data is
# loaded -- confirm whether `target` was meant to be used there.
from DSML.config import RAW_DATA_DIR, categorical, target
from DSML.preproc import get_raw_data

In [None]:
# Track git information for experiment versioning.
# Prints the current branch and commit so a notebook run can be tied back to
# the code state it was executed from.
from DSML.helpers import get_active_branch_name, get_git_commit_hash

# NOTE(review): the branch helper is given ".." while the commit helper is
# called with no argument -- confirm both resolve the same repository.
branch = get_active_branch_name("..")
commit = get_git_commit_hash()
print(f"Current branch: {branch}")
print(f"Current commit: {commit}")

In [None]:
# Load the raw dataset through the project's loader
df = get_raw_data()

# Split off the label: `pop` removes the column from `df` in place, so `X`
# (the same object as `df`) is left holding the feature columns only.
y = df.pop('Alcoholic')
X = df

# Positional indices of the categorical features, as passed to the CatBoost
# training helpers; configured names that are absent from X are skipped.
categorical_indices = [
    X.columns.get_loc(name)
    for name in categorical
    if name in X.columns
]

In [None]:
# Set up MLflow tracking
import mlflow
from DSML.train import run_hyperopt, get_or_create_experiment

# Point the client at the tracking server on localhost:5000.
# (Plain string literal: there is nothing to interpolate, so no f-prefix.)
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Create or get the experiment that collects the hyperparameter-search runs,
# and make it the active experiment for the runs that follow.
experiment_id = get_or_create_experiment("alcohol_hyperparam_tuning")
mlflow.set_experiment(experiment_id=experiment_id)

# Run hyperparameter optimization; the returned path is loaded with joblib in
# the cross-validation cell to recover the best parameters.
best_params_path = run_hyperopt(X, y, categorical_indices)

In [None]:
# Perform cross-validation with the best parameters found by the search
import joblib
from DSML.train import train_cv

# `best_params_path` comes from the hyperparameter-optimization cell.
params = joblib.load(best_params_path)
print("Best parameters:", params)

# Number of CV folds, passed to train_cv as `n`.
n_folds = 5
# The returned path is read back as a CSV by the plotting cell.
cv_output_path = train_cv(X, y, categorical_indices, params, n=n_folds)

In [None]:
# Plot cross-validation results
import pandas as pd
from DSML.train import plot_error_scatter

# Per-iteration CV metrics written by train_cv
cv_results = pd.read_csv(cv_output_path)

# Plot F1 scores.
# No x / y / err columns are passed here, so this call relies on
# plot_error_scatter's defaults (presumably the F1 mean/std columns -- confirm).
# The fold count in the title is taken from `n_folds` so it cannot drift from
# the value actually used for cross-validation.
plot_error_scatter(
    df_plot=cv_results,
    name="Mean F1 Score",
    title=f"Cross-Validation (N={n_folds}) Mean F1 score with Error Bands",
    xtitle="Training Steps",
    ytitle="Performance Score",
    yaxis_range=[0.5, 1]
)

# Plot logloss: mean test logloss per iteration, with its standard deviation
# as the error band.
plot_error_scatter(
    df_plot=cv_results,
    x="iterations",
    y="test-Logloss-mean",
    err="test-Logloss-std",
    name="Mean logloss",
    title=f"Cross-Validation (N={n_folds}) Mean Logloss with Error Bands",
    xtitle="Training Steps",
    ytitle="Logloss"
)

In [None]:
# Train final model with MLflow tracking
from DSML.train import train

# Switch the active experiment so the full-training run is logged separately
# from the hyperparameter-search runs; `experiment_id` is rebound here.
experiment_id = get_or_create_experiment("alcohol_full_training")
mlflow.set_experiment(experiment_id=experiment_id)

# Fit on the full dataset with the tuned parameters; the CV results frame is
# handed to `train` as well. Returns the paths of the saved model and of its
# parameters.
model_path, model_params_path = train(X, y, categorical_indices, params, cv_results=cv_results)

In [None]:
# Load model from MLflow and make predictions
import json
from DSML.predict import predict
from mlflow.client import MlflowClient

client = MlflowClient(mlflow.get_tracking_uri())

# Get latest model version
# NOTE(review): `get_latest_versions` returns one entry per registry stage and
# is marked deprecated in recent MLflow releases; `[0]` is the newest version
# only while stages are unused -- confirm, and consider `search_model_versions`.
model_info = client.get_latest_versions('alcohol-class')[0]

# Get model metadata. The run is fetched once and both views (the plain
# dictionary and the logged-model history tag) are derived from that single
# response instead of querying the tracking server twice.
run = client.get_run(model_info.run_id)
run_data_dict = run.data.to_dictionary()
log_model_meta = json.loads(run.data.tags['mlflow.log-model.history'])

# Load and use model: the last component of the registered source path is the
# artifact folder the model was logged under within its run.
_, artifact_folder = os.path.split(model_info.source)
model_uri = f"runs:/{model_info.run_id}/{artifact_folder}"
loaded_model = mlflow.catboost.load_model(model_uri)

# Make predictions.
# The run's logged params are kept under their own name so the tuned
# hyperparameters held in `params` (used by the training cells) are not
# overwritten when cells are re-run out of order.
predict_params = run_data_dict["params"]
# Feature column names are recovered from the input signature logged with the model.
predict_params["feature_columns"] = [inp["name"] for inp in json.loads(log_model_meta[0]['signature']['inputs'])]
preds_path = predict(loaded_model, X, predict_params)
print(f"Predictions saved to: {preds_path}")

In [None]:
# Report which registered versions carry the "champion" and "challenger"
# aliases, when both are present.
from DSML.resolve import get_model_by_alias

champ_model = get_model_by_alias(client, alias="champion")
chall_model = get_model_by_alias(client, alias="challenger")

# Guard first: unless both lookups returned something truthy there is
# nothing to compare.
if not (champ_model and chall_model):
    print("No champion/challenger models set yet")
else:
    print("Champion model version:", champ_model.version)
    print("Challenger model version:", chall_model.version)