# 🧪 Creating an Experiment in MLflow with DagsHub
---

In [8]:
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import mlflow.catboost
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import pathlib
import lightgbm as lgb
import mlflow

In [14]:
def readDataFrame(filename):
    """Load a CSV file and split it into train and test partitions.

    The ``temp`` column is used as the regression target. Both ``temp`` and
    ``device`` are removed from the feature matrix.

    Parameters
    ----------
    filename
        Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split`` called with ``test_size=0.33`` and
        ``random_state=42``.
    """
    frame = pd.read_csv(filename)
    target = frame["temp"]
    features = frame.drop(columns=["temp", "device"])
    return train_test_split(features, target, test_size=0.33, random_state=42)

In [15]:
# Read the raw CSV and keep the four train/test partitions as notebook-level variables
X_train, X_test, y_train, y_test = readDataFrame(filename='../data/raw/data.csv')

# DagsHub Setting

```bash
pip install mlflow==2.16.1 dagshub==0.3.35 jupyter==1.1.1 xgboost==2.1.1 hyperopt==0.2.7
```


In [16]:
import dagshub
import mlflow


# Initialise the DagsHub integration for this repository URL.
# NOTE(review): mlflow=True presumably makes dagshub configure the MLflow tracking
# settings for the repo - confirm against the dagshub client documentation.
dagshub.init(url="https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos", mlflow=True)

# Read back the tracking URI MLflow currently reports.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

# Re-apply that same URI explicitly, then select the experiment by name.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="temp-prediction-experiment")

<Experiment: artifact_location='mlflow-artifacts:/1e4101b3473346aca857fb50f17ee548', creation_time=1726799975789, experiment_id='0', last_update_time=1726799975789, lifecycle_stage='active', name='temp-prediction-experiment', tags={}>

In [17]:
# NumPy versions of the feature frames, used when building the MLflow datasets
X_train_np, X_test_np = X_train.to_numpy(), X_test.to_numpy()

# Targets as NumPy arrays; ravel() guarantees a flat 1-D vector
y_train_np, y_test_np = (
    y_train.to_numpy().ravel(),
    y_test.to_numpy().ravel(),
)

In [18]:
# Wrap each NumPy split (features plus targets) as an MLflow dataset object.
# NOTE(review): "testidation_dataset" looks like a mangled "validation_dataset";
# the name is left unchanged in case other cells refer to it.
training_dataset = mlflow.data.from_numpy(
    X_train_np,
    targets=y_train_np,
    name="Environmental-Sensor-Telemetry-Data-1",
)
testidation_dataset = mlflow.data.from_numpy(
    X_test_np,
    targets=y_test_np,
    name="Environmental-Sensor-Telemetry-Data-2",
)

# Model approach 

Random Forest

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Define the random forest baseline
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train on the training split
rf_model.fit(X_train, y_train)

# Make predictions on the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Register the model and its test RMSE with MLflow
with mlflow.start_run():
    mlflow.sklearn.log_model(rf_model, "random_forest_model")
    # y_pred_rf was produced from X_test, so it has to be scored against y_test.
    # Scoring against y_train compares arrays of different length and the wrong
    # ground truth.
    mlflow.log_metric("rmse", np.sqrt(mean_squared_error(y_test, y_pred_rf)))

2024/09/19 21:39:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '911a82f3e45b46f684df7d7c9c553f3b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/09/19 21:39:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run trusting-dog-664 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/911a82f3e45b46f684df7d7c9c553f3b.
2024/09/19 21:39:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.


KeyboardInterrupt: 

LightGBM

In [None]:
# Create the LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train)
# The validation set is built from the held-out split. Validating on the same
# rows used for fitting would make early stopping uninformative.
testid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Define parameters
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.1,
    'num_leaves': 31
}

# Train the model inside an MLflow run
with mlflow.start_run():
    # `valid_sets` is the lgb.train argument that receives evaluation datasets.
    # Early stopping is requested through the callback, which is the supported
    # mechanism in LightGBM 4 (the `early_stopping_rounds` keyword of lgb.train
    # is no longer accepted there).
    lgb_model = lgb.train(
        lgb_params,
        train_data,
        valid_sets=[testid_data],
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )
    y_pred_lgb = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

    # Log the test RMSE and the trained booster
    mlflow.log_metric("rmse", np.sqrt(mean_squared_error(y_test, y_pred_lgb)))
    mlflow.lightgbm.log_model(lgb_model, "lightgbm_model")


CatBoost

In [None]:
# Define the CatBoost model
cat_model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, silent=True)

# Train the model inside an MLflow run
with mlflow.start_run():
    # `eval_set` is the fit() argument for the evaluation data used by early stopping
    cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10)

    # Predict on the held-out test split
    y_pred_cat = cat_model.predict(X_test)

    # Log the test RMSE and the trained model
    mlflow.log_metric("rmse", np.sqrt(mean_squared_error(y_test, y_pred_cat)))
    mlflow.catboost.log_model(cat_model, "catboost_model")


## Hyperparameter-tuning objective functions for each model

In [20]:
# Turn on MLflow autologging for scikit-learn before the tuning code in this cell runs
mlflow.sklearn.autolog()

def objective_rf(params):
    """Hyperopt objective: fit and score one random forest configuration.

    Opens a nested MLflow run, tags it as ``random_forest``, logs the sampled
    ``params`` as received, trains a ``RandomForestRegressor`` on the notebook's
    ``X_train``/``y_train`` and logs the RMSE obtained on ``X_test``/``y_test``.

    Parameters
    ----------
    params : dict
        Must contain ``n_estimators``, ``max_depth``, ``min_samples_split``
        and ``min_samples_leaf``; each value is cast to ``int`` before being
        handed to the estimator.

    Returns
    -------
    dict
        ``{'loss': <rmse>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "random_forest")
        mlflow.log_params(params)

        # The estimator receives integer versions of the sampled values
        int_settings = {
            key: int(params[key])
            for key in (
                'n_estimators',
                'max_depth',
                'min_samples_split',
                'min_samples_leaf',
            )
        }
        forest = RandomForestRegressor(random_state=42, **int_settings)
        forest.fit(X_train, y_train)

        # Score on the held-out split and record the result
        predictions = forest.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

# Search space for RandomForest: each hyperparameter is drawn with hp.quniform
# (step q=1) between the listed lower and upper bounds.
_rf_ranges = {
    'n_estimators': (50, 300),
    'max_depth': (5, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4),
}
search_space_rf = {
    name: hp.quniform(name, low, high, 1)
    for name, (low, high) in _rf_ranges.items()
}

# Run the TPE search (10 evaluations) under one named run, then log the winning values on it
with mlflow.start_run(run_name="Random Forest Hyper-parameter Optimization", nested=True):
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials(),
    )
    mlflow.log_params(best_params_rf)


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]


2024/09/19 21:40:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run secretive-yak-463 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/50d30f227360448c8cff0d8ba828c626.

2024/09/19 21:40:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:46<06:59, 46.65s/trial, best loss: 0.6721304164191635]


2024/09/19 21:45:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run abrasive-whale-948 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/7ffc21c2a4e842f9b70e9657d841b9de.

2024/09/19 21:45:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



 20%|██        | 2/10 [06:15<28:19, 212.41s/trial, best loss: 0.4214303877275547]


2024/09/19 21:48:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run unleashed-cat-305 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/421762f543694fd6b85ad4386afcb43c.

2024/09/19 21:48:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



 30%|███       | 3/10 [09:43<24:34, 210.71s/trial, best loss: 0.42068944952655474]


2024/09/19 21:52:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run magnificent-hog-648 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/92f9149957714172b168e137410a6e16.

2024/09/19 21:52:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



 40%|████      | 4/10 [13:24<21:28, 214.76s/trial, best loss: 0.4203800175476811] 


2024/09/19 21:55:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run languid-mink-223 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/b47e0e86d65f4054a5ce8e52ee602c01.

2024/09/19 21:55:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



 50%|█████     | 5/10 [16:18<16:39, 199.87s/trial, best loss: 0.4203800175476811]


2024/09/19 21:58:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run dashing-stoat-510 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/09bc795ae4b84c97b24f9aab69067239.

2024/09/19 21:58:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



 60%|██████    | 6/10 [19:04<12:34, 188.52s/trial, best loss: 0.4203800175476811]


2024/09/19 21:58:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run kindly-rook-529 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/6d13000596544a3390dde73dfee09a50.

2024/09/19 21:58:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



 70%|███████   | 7/10 [19:25<06:41, 133.84s/trial, best loss: 0.4203800175476811]


2024/09/19 22:03:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run fun-squid-315 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/18c44cc844d24bb28080c3261afb5b03.

2024/09/19 22:03:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



 80%|████████  | 8/10 [24:17<06:08, 184.12s/trial, best loss: 0.4203800175476811]


2024/09/19 22:07:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run tasteful-skink-328 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/0bc6b0f88fb24f12b0f6eda8f239455c.

2024/09/19 22:07:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [27:56<03:14, 194.91s/trial, best loss: 0.4203800175476811]


2024/09/19 22:09:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run popular-vole-180 at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/a4ab6b040f1b47628d59952814ffdd6b.

2024/09/19 22:09:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.



100%|██████████| 10/10 [30:13<00:00, 181.36s/trial, best loss: 0.4203800175476811]


2024/09/19 22:09:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest Hyper-parameter Optimization at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0/runs/b48d9754526b499a8ab365d4078a6ffb.
2024/09/19 22:09:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/ioSoyPato/proyecto-final-ciencia-datos.mlflow/#/experiments/0.


In [3]:
# Turn on MLflow autologging for LightGBM before the tuning code in this cell runs
mlflow.lightgbm.autolog()

def objective_lgb(params):
    """Hyperopt objective: fit and score one LightGBM configuration.

    Opens a nested MLflow run, tags it as ``lightgbm``, logs the parameters
    actually passed to LightGBM, trains a booster on the notebook's
    ``X_train``/``y_train`` with early stopping monitored on
    ``X_test``/``y_test``, and logs the resulting RMSE.

    Parameters
    ----------
    params : dict
        LightGBM training parameters sampled by hyperopt. ``num_leaves`` and
        ``bagging_freq`` (when present) are converted to ``int``.

    Returns
    -------
    dict
        ``{'loss': <rmse>, 'status': STATUS_OK}``.
    """
    # hp.quniform yields floats (e.g. 31.0). LightGBM declares these parameters
    # as integers and rejects float-formatted values, so cast them on a copy and
    # leave the caller's dict untouched.
    lgb_params = dict(params)
    for key in ('num_leaves', 'bagging_freq'):
        if key in lgb_params:
            lgb_params[key] = int(lgb_params[key])

    with mlflow.start_run(nested=True):
        # Set model tag
        mlflow.set_tag("model_family", "lightgbm")

        # Log the parameters exactly as they are given to LightGBM
        mlflow.log_params(lgb_params)

        # Create LightGBM datasets
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

        # Train LightGBM model. Early stopping goes through the callback because
        # the `early_stopping_rounds` keyword of lgb.train is not accepted by
        # LightGBM 4.
        lgb_model = lgb.train(
            params=lgb_params,
            train_set=train_data,
            num_boost_round=100,
            valid_sets=[valid_data],
            callbacks=[lgb.early_stopping(stopping_rounds=10)],
        )

        # Predict on validation dataset using the best iteration found
        y_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Log RMSE metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

# Search space for LightGBM: integer-stepped draws for the leaf count and
# bagging frequency, a log-uniform learning rate, uniform sampling fractions.
search_space_lgb = {
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'feature_fraction': hp.uniform('feature_fraction', 0.7, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1.0),
    'bagging_freq': hp.quniform('bagging_freq', 1, 10, 1),
}

# Run the TPE search (10 evaluations) under one named run, then log the winning values on it
_lgb_search_kwargs = dict(
    fn=objective_lgb,
    space=search_space_lgb,
    algo=tpe.suggest,
    max_evals=10,
)
with mlflow.start_run(run_name="LightGBM Hyper-parameter Optimization", nested=True):
    best_params_lgb = fmin(trials=Trials(), **_lgb_search_kwargs)
    mlflow.log_params(best_params_lgb)


In [4]:
# The MLflow catboost flavor documents save/log/load helpers but no autolog().
# Calling a missing attribute raises AttributeError and aborts the whole cell,
# so autologging is enabled only when the installed MLflow provides it.
# objective_cat logs its parameters and RMSE explicitly either way.
if hasattr(mlflow.catboost, "autolog"):
    mlflow.catboost.autolog()

def objective_cat(params):
    """Hyperopt objective: fit and score one CatBoost configuration.

    Opens a nested MLflow run, tags it as ``catboost``, logs the sampled
    ``params`` as received, trains a ``CatBoostRegressor`` on the notebook's
    ``X_train``/``y_train`` with ``X_test``/``y_test`` as evaluation set, and
    logs the RMSE obtained on ``X_test``/``y_test``.

    Parameters
    ----------
    params : dict
        Must contain ``learning_rate`` and ``depth``; ``depth`` is cast to
        ``int`` before being handed to the model.

    Returns
    -------
    dict
        ``{'loss': <rmse>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "catboost")
        mlflow.log_params(params)

        # Fixed settings plus the two sampled hyperparameters
        booster_settings = dict(
            iterations=1000,
            learning_rate=params['learning_rate'],
            depth=int(params['depth']),
            silent=True,
        )
        regressor = CatBoostRegressor(**booster_settings)
        regressor.fit(
            X_train,
            y_train,
            eval_set=(X_test, y_test),
            early_stopping_rounds=10,
        )

        # Score on the held-out split and record the result
        predictions = regressor.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

# Search space for CatBoost: integer-stepped tree depth and a log-uniform learning rate
search_space_cat = {
    'depth': hp.quniform('depth', 4, 10, 1),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
}

# Run the TPE search (10 evaluations) under one named run, then log the winning values on it
_cat_search_kwargs = dict(
    fn=objective_cat,
    space=search_space_cat,
    algo=tpe.suggest,
    max_evals=10,
)
with mlflow.start_run(run_name="CatBoost Hyper-parameter Optimization", nested=True):
    best_params_cat = fmin(trials=Trials(), **_cat_search_kwargs)
    mlflow.log_params(best_params_cat)
