## Configs

In [1]:
import os 
import pandas as pd 
# private key to access the storage account 
# os.environ['GOOGLE_APPLICATION_CREDENTIALS']='./keyfile.json' # Specify the gcs authentification file path 


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import mlflow
import mlflow.xgboost

mlflow.set_tracking_uri('http://localhost:5000') # Change to Mlflow URI  (loadbalancer ip)
print("MLflow Version:", mlflow.__version__)
print("MLflow Tracking URI:", mlflow.get_tracking_uri())
print("XGBoost version:",xgb.__version__)
client = mlflow.tracking.MlflowClient()



  from pandas import MultiIndex, Int64Index


MLflow Version: 1.24.0
MLflow Tracking URI: http://localhost:5000
XGBoost version: 1.5.2


## Train and register a model

In [3]:
def build_data(data_path):
    data = pd.read_csv(data_path)
    train, test = train_test_split(data, test_size=0.3, random_state=1)

    # The predicted column is "quality" which is a scalar from [3, 9]
    X_train, X_test = train.drop(["quality"], axis=1), test.drop(["quality"], axis=1)
    y_train, y_test = train["quality"], test["quality"]

    return X_train, X_test, y_train, y_test 


In [4]:
def train(data_path, max_depth, min_child_weight, estimators, model_name):
    X_train, X_test, y_train, y_test = build_data(data_path)
    with mlflow.start_run() as run:
        # Start mlflow session
        run_id = run.info.run_uuid
        experiment_id = run.info.experiment_id
        print("MLflow:")
        print("  run_id:", run_id)
        print("  experiment_id:", experiment_id)
        print("  experiment_name:", client.get_experiment(experiment_id).name)

        # MLflow params
        print("Parameters:")
        print("  max_depth:", max_depth)
        print("  min_child_weight:", min_child_weight)
        print("  estimators:", estimators)
        
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("min_child_weight", min_child_weight)
        mlflow.log_param("estimators", estimators)

        # Create and fit model
        model = xgb.XGBRegressor(
                 max_depth=max_depth,
                 min_child_weight=min_child_weight,
                 random_state=42) 
        model.fit(X_train, y_train)
        
        # MLflow metrics
        predictions = model.predict(X_test)
        print("predictions:",predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        
        print("Metrics:")
        print("  rmse:", rmse)
        print("  mae:", mae)
        print("  r2:", r2)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        # Log model
        mlflow.xgboost.log_model(model, "xgboost-model", registered_model_name = model_name)


In [13]:
data_path = './wine-quality-white.csv'
experiment_name = 'test_xgboost'
model_name = 'xgb_0'
max_depth = 10
min_child_weight = 1
estimators = 100
train(data_path, max_depth , min_child_weight, estimators, model_name)


MLflow:
  run_id: 79a1effc42a14c84832b940f54f3c94c
  experiment_id: 0
  experiment_name: Default
Parameters:
  max_depth: 10
  min_child_weight: 1
  estimators: 100


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


predictions: [4.7992506 5.165539  6.6657195 ... 6.088301  5.998179  6.1720996]
Metrics:
  rmse: 0.6374228524913463
  mae: 0.42470703043905245
  r2: 0.45379700585493965


S3UploadFailedError: Failed to upload /var/folders/xv/n51qjph14_52lj9y4706w6sc0000gn/T/tmppdrj3p61/model/requirements.txt to mlflow/0/79a1effc42a14c84832b940f54f3c94c/artifacts/xgboost-model/requirements.txt: An error occurred (AccessDenied) when calling the PutObject operation: Access Denied

## Load the model from mlflow and make predictions

In [7]:
# Predict on a Pandas DataFrame

import pandas as pd

test_samples =pd.read_csv(data_path).head(5).drop(columns=['quality'])
loaded_model = mlflow.pyfunc.load_model("runs:/a0928931dff54a829b881be2a3e41d00/xgboost-model")
loaded_model.predict(test_samples)

array([6.0004535, 5.994977 , 5.629604 , 5.9799094, 5.9799094],
      dtype=float32)