## Importing Libraries











































































































































































In [1]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from mlflow.models import infer_signature
from pprint import pprint
from typing import Union, Dict, List, Tuple
import numpy as np
import pandas as pd
import yaml
import boto3
import joblib
import os
import math
import mlflow
import optuna
import warnings
import shutil

# reading the credentials file; a malformed file aborts the notebook right here
# instead of surfacing later as an undefined `content`
with open("../credentials.yaml") as credentials_file:
    try:
        content = yaml.safe_load(credentials_file)
    except yaml.YAMLError as e:
        raise RuntimeError("could not parse '../credentials.yaml'") from e

# the first line of the VERSION file identifies the code version
with open("VERSION", "r") as version_file:
    CODE_VERSION = version_file.readline().strip()

# the placeholder value means that no EC2 host was configured, so the local
# tracking server is used instead
if content["EC2"] != "YOUR_EC2_INSTANCE_URL":
    mlflow.set_tracking_uri(f"http://{content['EC2']}:5000")
else:
    mlflow.set_tracking_uri("http://127.0.0.1:5000")

print(f"Tracking Server URI: '{mlflow.get_tracking_uri()}'")

# constants shared by every cell below
SEED = 42
DATASET_PATH = "../data/Preprocessed_ObesityDataSet.csv"
ARTIFACTS_OUTPUT_PATH = os.path.join("..", "models", "artifacts")
FEATURES_OUTPUT_PATH = os.path.join("..", "models", "features")
FEATURE_SELECTION_EXPERIMENT_NAME = "feature-selection-experimentation"
HYPERPARAMETER_TUNING_EXPERIMENT_NAME = "hyperparameters-tuning-experimentation"

# silencing every python warning for the rest of the session
warnings.filterwarnings("ignore")

Tracking Server URI: 'http://127.0.0.1:5000'


## Loading Essentials

In [2]:
def custom_combiner(feature, category):
    """Build a column name of the form ``<feature>_<category type name>_<category>``.

    NOTE(review): this looks like a feature-name combiner for the one-hot
    encoders loaded below; presumably it has to be defined in the notebook so
    the pickled encoders can be restored -- confirm.
    """
    name_parts = (str(feature), type(category).__name__, str(category))
    return "_".join(name_parts)

# downloading the preprocessed dataset, the artifacts, and the features from
# the aws s3 bucket; everything is skipped while the bucket entry still holds
# its placeholder value
if content["S3"] != "YOUR_S3_BUCKET_URL":
    # configuring AWS credentials as environment variables
    # NOTE(review): presumably exported so the `aws` CLI calls below can
    # authenticate -- confirm
    os.environ["AWS_ACCESS_KEY_ID"] = content["AWS_ACCESS_KEY"]
    os.environ["AWS_SECRET_ACCESS_KEY"] = content["AWS_SECRET_KEY"]

    # downloading the preprocessed dataset into DATASET_PATH
    s3 = boto3.client(
        "s3",
        aws_access_key_id=content["AWS_ACCESS_KEY"],
        aws_secret_access_key=content["AWS_SECRET_KEY"]
    )
    s3.download_file(content["S3"], "Preprocessed_ObesityDataSet.csv", DATASET_PATH)

    # downloading the `artifacts` prefix of the bucket into ARTIFACTS_OUTPUT_PATH
    # (IPython shell escape; the {...} expressions are interpolated by IPython)
    !aws s3 cp --recursive s3://{content["S3"]}/artifacts {ARTIFACTS_OUTPUT_PATH}

    # downloading the `features` prefix of the bucket into FEATURES_OUTPUT_PATH
    !aws s3 cp --recursive s3://{content["S3"]}/features {FEATURES_OUTPUT_PATH}

In [3]:
# loading features
# NOTE: joblib.load unpickles arbitrary python objects, so these files must
# only ever come from a trusted location
X_train = joblib.load(os.path.join(FEATURES_OUTPUT_PATH, "X_train.pkl"))
y_train = joblib.load(os.path.join(FEATURES_OUTPUT_PATH, "y_train.pkl"))

X_valid = joblib.load(os.path.join(FEATURES_OUTPUT_PATH, "X_valid.pkl"))
y_valid = joblib.load(os.path.join(FEATURES_OUTPUT_PATH, "y_valid.pkl"))

# loading artifacts
sc = joblib.load(os.path.join(ARTIFACTS_OUTPUT_PATH, "features_sc.pkl"))
ohe = joblib.load(os.path.join(ARTIFACTS_OUTPUT_PATH, "features_ohe.pkl"))
ohe_label = joblib.load(os.path.join(ARTIFACTS_OUTPUT_PATH, "label_ohe.pkl"))

# loading feature columns: only the header is needed, so no data row is parsed
# (nrows=0) instead of reading the whole dataset and discarding it
FEATURES_NAME = pd.read_csv(DATASET_PATH, sep=",", nrows=0).columns.tolist()

## Feature Selection Experimentation

In [4]:
# baseline estimators used by the feature selection experiment; every one of
# them receives SEED, and the boosting/forest libraries are kept quiet
dt = DecisionTreeClassifier(random_state=SEED)
rf = RandomForestClassifier(verbose=0, random_state=SEED)
xg = XGBClassifier(random_state=SEED)
lg = LGBMClassifier(
    objective="multiclass",
    verbose=-1,
    random_state=SEED,
)
cb = CatBoostClassifier(
    allow_writing_files=False,
    verbose=0,
    random_seed=SEED,
)

In [5]:
def apply_feature_selection(
    model: Union[DecisionTreeClassifier, RandomForestClassifier, XGBClassifier, LGBMClassifier, CatBoostClassifier],
    number_features: int,
    X_train: np.ndarray,
    y_train: np.array,
    X_valid: np.ndarray,
    y_valid: np.array,
) -> Dict:
    """Select features sequentially, refit ``model`` on them and score it.

    Args:
        model: estimator handed to the selector and then refitted in place on
            the reduced training data.
        number_features: how many features the selector has to keep.
        X_train: training features.
        y_train: training targets.
        X_valid: validation features.
        y_valid: validation targets.

    Returns:
        A dictionary with the weighted F1 scores (``train_f1``, ``valid_f1``),
        the names of the kept columns (``features``, looked up in the
        module-level ``FEATURES_NAME``), the fitted ``model`` and the inferred
        mlflow ``model_signature``.
    """
    # fitting the selector (cv=3) on the training split only
    selector = SequentialFeatureSelector(model, n_features_to_select=number_features, cv=3)
    selector.fit(X=X_train, y=y_train)

    # positions of the columns kept by the selector
    kept_positions = np.argwhere(selector.get_support()).ravel()

    train_subset = selector.transform(X_train)
    valid_subset = selector.transform(X_valid)

    # refitting the model on the reduced training data
    model.fit(train_subset, y_train)

    # weighted f1 on the training split
    train_predictions = model.predict(train_subset)
    train_f1 = f1_score(y_true=y_train, y_pred=train_predictions, average="weighted")

    # weighted f1 on the validation split
    valid_predictions = model.predict(valid_subset)
    valid_f1 = f1_score(y_true=y_valid, y_pred=valid_predictions, average="weighted")

    # signature inferred from the reduced training input and its predictions
    model_signature = infer_signature(
        model_input=train_subset,
        model_output=train_predictions
    )

    # everything the caller logs to mlflow
    return {
        "train_f1": train_f1,
        "valid_f1": valid_f1,
        "features": [FEATURES_NAME[position] for position in kept_positions.tolist()],
        "model": model,
        "model_signature": model_signature
    }

def set_configurations_mlflow(
    model: Union[DecisionTreeClassifier, RandomForestClassifier, XGBClassifier, LGBMClassifier, CatBoostClassifier],
    y_train: np.array,
    y_valid: np.array,
) -> Tuple[np.array, np.array, str, str]:
    """Derive the per-model mlflow settings and the target layout the model is given.

    Args:
        model: one of the supported estimators.
        y_train: training targets; reduced with ``np.argmax(..., axis=1)`` for
            the decision tree, lightgbm and catboost models, returned
            untouched otherwise.
        y_valid: validation targets, handled exactly like ``y_train``.

    Returns:
        ``(y_train, y_valid, run_name, flavor)`` where ``run_name`` names the
        mlflow run and ``flavor`` is the mlflow flavor used to log the model.

    Raises:
        TypeError: if ``model`` is not one of the supported estimator types.
    """
    # each branch sets: run name, mlflow flavor, and whether the targets are
    # collapsed to class indices with argmax
    if isinstance(model, DecisionTreeClassifier):
        run_name, flavor, use_class_indices = "decision_tree", "sklearn", True
    elif isinstance(model, RandomForestClassifier):
        run_name, flavor, use_class_indices = "random_forest", "sklearn", False
    elif isinstance(model, XGBClassifier):
        run_name, flavor, use_class_indices = "xgboost", "xgboost", False
    elif isinstance(model, LGBMClassifier):
        run_name, flavor, use_class_indices = "lightgbm", "lightgbm", True
    elif isinstance(model, CatBoostClassifier):
        run_name, flavor, use_class_indices = "catboost", "catboost", True
    else:
        # failing loudly instead of hitting an unbound local further down
        raise TypeError(f"unsupported model type: {type(model).__name__}")

    if use_class_indices:
        y_train = np.argmax(y_train, axis=1)
        y_valid = np.argmax(y_valid, axis=1)

    # autolog options shared by every flavor; `disable=True` is passed, so the
    # models, metrics and parameters are logged explicitly by the callers
    autolog_options = {
        "log_models": False,
        "log_model_signatures": False,
        "log_input_examples": True,
        "log_datasets": False,
        "silent": True,
        "disable": True,
    }
    if flavor == "sklearn":
        mlflow.sklearn.autolog(log_post_training_metrics=False, **autolog_options)
    elif flavor == "xgboost":
        mlflow.xgboost.autolog(**autolog_options)
    elif flavor == "lightgbm":
        mlflow.lightgbm.autolog(**autolog_options)
    # catboost: no autolog call is made for this flavor

    return y_train, y_valid, run_name, flavor

def run_feature_selection_experiment(
    models: List,
    min_features: int,
    max_features: int,
    experiment_id: str
) -> None:
    """Run the feature selection for every model and feature count, logging to mlflow.

    One parent run is opened per model; inside it, one nested run is opened
    for each feature count in ``[min_features, max_features]``. The data is
    read from the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``.

    Args:
        models: estimators to evaluate.
        min_features: smallest number of features to select (inclusive).
        max_features: largest number of features to select (inclusive).
        experiment_id: id of the mlflow experiment that receives the runs.
    """
    for estimator in models:
        # target layout, run name and flavor expected for this estimator
        model_y_train, model_y_valid, run_name, flavor = set_configurations_mlflow(
            model=estimator,
            y_train=y_train,
            y_valid=y_valid
        )

        # parent run of the current model
        with mlflow.start_run(experiment_id=experiment_id, run_name=run_name):
            pprint(f"Starting the run for the {run_name} model!\n")

            feature_counts = range(min_features, max_features + 1)
            for experiment_number, n_features in enumerate(feature_counts):
                nested_run_name = f"{run_name}_experiment_{experiment_number}"

                # one nested run per number of selected features
                with mlflow.start_run(
                    experiment_id=experiment_id,
                    run_name=nested_run_name,
                    nested=True
                ):
                    results = apply_feature_selection(
                        model=estimator,
                        number_features=n_features,
                        X_train=X_train,
                        y_train=model_y_train,
                        X_valid=X_valid,
                        y_valid=model_y_valid
                    )
                    fitted_model = results["model"]

                    # logging the trained model with its flavor, followed by
                    # the parameters reported by the model
                    if flavor in ("sklearn", "xgboost", "lightgbm", "catboost"):
                        getattr(mlflow, flavor).log_model(
                            fitted_model,
                            run_name,
                            signature=results["model_signature"]
                        )
                        if flavor == "catboost":
                            model_params = fitted_model.get_all_params()
                        elif flavor == "lightgbm":
                            model_params = fitted_model.get_params()
                        else:
                            model_params = fitted_model.get_params(deep=True)
                        mlflow.log_params(model_params)

                    # logging the training and validation scores
                    for metric_name in ("train_f1", "valid_f1"):
                        mlflow.log_metric(metric_name, results[metric_name])

                    # logging the artifacts (original dataset, encoders objects, and features)
                    for local_path in (DATASET_PATH, ARTIFACTS_OUTPUT_PATH, FEATURES_OUTPUT_PATH):
                        mlflow.log_artifact(local_path)

                    # logging the names of the selected features
                    mlflow.log_param("features", results["features"])
                    

In [6]:
# NOTE(review): the catboost baseline is not part of this list -- confirm that
# leaving it out of the feature selection is intended
models = [dt, rf, xg, lg]

# selecting between 20% and 50% of the available features; at least one
# feature is always requested, since the selector cannot select zero features
n_available_features = X_train.shape[1]
min_features = max(1, math.floor(n_available_features * 0.2))
max_features = math.floor(n_available_features * 0.5)

# creating the mlflow experiment, or reusing it when the cell is re-run
# (creating an experiment whose name already exists raises an error)
existing_experiment = mlflow.get_experiment_by_name(FEATURE_SELECTION_EXPERIMENT_NAME)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(
        name=FEATURE_SELECTION_EXPERIMENT_NAME,
        tags={"version": "v2", "code_version": CODE_VERSION}
    )
else:
    experiment_id = existing_experiment.experiment_id

# running the feature selection experiments
run_feature_selection_experiment(
    models=models,
    min_features=min_features,
    max_features=max_features,
    experiment_id=experiment_id
)

'Starting the run for the decision_tree model!\n'


2024/11/06 12:14:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run decision_tree_experiment_0 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/74c0ea1eaff9420fb57f8b48f92175d8.
2024/11/06 12:14:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728086036465241217.
2024/11/06 12:14:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run decision_tree_experiment_1 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/22b1d33bec4647d489dd9dbea2dbb670.
2024/11/06 12:14:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728086036465241217.
2024/11/06 12:14:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run decision_tree_experiment_2 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/2ef1ae77efde49d28f000ead6a1dc822.
2024/11/06 12:14:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0

'Starting the run for the random_forest model!\n'


2024/11/06 12:20:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest_experiment_0 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/7ddc34b06185443a8df5fd81991f84e3.
2024/11/06 12:20:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728086036465241217.
2024/11/06 12:25:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest_experiment_1 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/46fd92b3984d4f409e9bd97240b1cdb8.
2024/11/06 12:25:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728086036465241217.
2024/11/06 14:00:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest_experiment_2 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/a887192938fe4b62b2e4808f17f5a74a.
2024/11/06 14:00:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0

'Starting the run for the xgboost model!\n'


2024/11/06 15:08:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost_experiment_0 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/0cdbe3021f14495e8049a2834a363af8.
2024/11/06 15:08:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728086036465241217.
2024/11/06 15:11:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost_experiment_1 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/1c45b66a68f24bd8a9d54fef51e10498.
2024/11/06 15:11:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728086036465241217.
2024/11/06 15:12:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost_experiment_2 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/d78bd6255cbe4537a2cf669a005c726a.
2024/11/06 15:12:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experi

'Starting the run for the lightgbm model!\n'


2024/11/06 15:54:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run lightgbm_experiment_0 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/061d24b31fb149e9a7fd2ac06db1d197.
2024/11/06 15:54:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728086036465241217.
2024/11/06 15:56:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run lightgbm_experiment_1 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/ae2970b4ba064c469ea7bcd7e04700d4.
2024/11/06 15:56:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/728086036465241217.
2024/11/06 15:59:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run lightgbm_experiment_2 at: http://127.0.0.1:5000/#/experiments/728086036465241217/runs/cdca6710eeb243b3a8818a5b6d5097bf.
2024/11/06 15:59:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/exp

## Hyperparameters Tuning

In [7]:
class Objective:
    """Optuna objective that trains one model per trial and logs it to mlflow.

    The instance is bound to a model family through ``run_name`` and to a
    fixed subset of feature columns. Calling it with a trial opens a nested
    mlflow run, samples the hyperparameters, fits the model, logs the scores,
    the model and its parameters, and returns the validation weighted F1.
    """

    # every model family this objective knows how to build
    _SUPPORTED_RUN_NAMES = ("decision_tree", "random_forest", "xgboost", "lightgbm", "catboost")
    # model families that are trained on argmax-ed targets
    _CLASS_INDEX_RUN_NAMES = ("decision_tree", "lightgbm", "catboost")

    def __init__(
        self,
        run_name: str,
        experiment_id: str,
        X_train: np.ndarray,
        y_train: np.array,
        X_valid: np.ndarray,
        y_valid: np.array,
        indexes: List
    ) -> None:
        """Store the data restricted to the requested feature columns.

        Args:
            run_name: model family; one of ``decision_tree``,
                ``random_forest``, ``xgboost``, ``lightgbm`` or ``catboost``.
            experiment_id: mlflow experiment that receives the nested runs.
            X_train: training features, indexed as ``X[:, columns]``.
            y_train: training targets; reduced with ``np.argmax(axis=1)`` for
                the decision tree, lightgbm and catboost families.
            X_valid: validation features, indexed like ``X_train``.
            y_valid: validation targets, handled like ``y_train``.
            indexes: columns to keep, given as names found in the module-level
                ``FEATURES_NAME`` and/or as integer column positions.

        Raises:
            ValueError: if ``run_name`` is not supported or a feature name is
                not present in ``FEATURES_NAME``.
        """
        if run_name not in self._SUPPORTED_RUN_NAMES:
            raise ValueError(
                f"unsupported run_name {run_name!r}; expected one of {self._SUPPORTED_RUN_NAMES}"
            )

        self.run_name = run_name
        self.experiment_id = experiment_id
        self.indexes_name = indexes
        self.indexes = [self._column_position(feature) for feature in indexes]

        if self.run_name in self._CLASS_INDEX_RUN_NAMES:
            y_train = np.argmax(y_train, axis=1)
            y_valid = np.argmax(y_valid, axis=1)
        self.y_train = y_train
        self.y_valid = y_valid

        # keeping only the selected feature columns
        self.X_train = X_train[:, self.indexes]
        self.X_valid = X_valid[:, self.indexes]

    @staticmethod
    def _column_position(feature) -> int:
        """Translate a feature name or an integer position into a column position."""
        # integer positions are used as they are; bool is excluded because it
        # is a subclass of int
        if isinstance(feature, (int, np.integer)) and not isinstance(feature, bool):
            return int(feature)
        try:
            return FEATURES_NAME.index(feature)
        except ValueError:
            raise ValueError(f"unknown feature name: {feature!r}") from None

    def _build_model(self, trial: optuna.trial.Trial):
        """Sample the hyperparameters of the configured family and build the unfitted model."""
        if self.run_name == "decision_tree":
            params = {
                "max_depth": trial.suggest_int("max_depth", 2, 32, step=2),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 8, step=1),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 6, step=1),
                "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", 0, 0.5, step=0.1),
                "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 16, step=2),
                "random_state": SEED
            }
            return DecisionTreeClassifier(**params)

        if self.run_name == "random_forest":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("max_depth", 10, 50),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
                "random_state": SEED
            }
            return RandomForestClassifier(**params)

        if self.run_name == "xgboost":
            params = {
                "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
                "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
                "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
                "random_state": SEED
            }
            return XGBClassifier(**params)

        if self.run_name == "lightgbm":
            params = {
                "objective": "multiclass",
                "verbosity": -1,
                "random_state": SEED,
                "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 2, 256),
                "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
                "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            }
            return LGBMClassifier(**params)

        if self.run_name == "catboost":
            params = {
                "random_seed": SEED,
                "verbose": 0,
                "allow_writing_files": False,
                "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
                "depth": trial.suggest_int("depth", 1, 12),
                "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
                "bootstrap_type": trial.suggest_categorical(
                    "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
                )
            }
            return CatBoostClassifier(**params)

        # only reachable if run_name was changed after construction
        raise ValueError(f"unsupported run_name {self.run_name!r}")

    def _log_model(self, model, signature) -> None:
        """Log the fitted model with its mlflow flavor, then the parameters it reports."""
        if self.run_name in ["decision_tree", "random_forest"]:
            # sklearn flavor
            mlflow.sklearn.log_model(model, self.run_name, signature=signature)
            mlflow.log_params(model.get_params(deep=True))
        elif self.run_name == "xgboost":
            mlflow.xgboost.log_model(model, self.run_name, signature=signature)
            mlflow.log_params(model.get_params())
        elif self.run_name == "lightgbm":
            mlflow.lightgbm.log_model(model, self.run_name, signature=signature)
            mlflow.log_params(model.get_params())
        elif self.run_name == "catboost":
            mlflow.catboost.log_model(model, self.run_name, signature=signature)
            mlflow.log_params(model.get_all_params())

    def __call__(
        self,
        trial: optuna.trial.Trial
    ) -> float:
        """Run one trial inside a nested mlflow run.

        Args:
            trial: optuna trial used to sample the hyperparameters.

        Returns:
            The weighted F1 score of the fitted model on the validation data.
        """
        with mlflow.start_run(experiment_id=self.experiment_id, nested=True):
            model = self._build_model(trial)
            model.fit(X=self.X_train, y=self.y_train)

            # calculating the training f1 score
            train_prediction = model.predict(self.X_train)
            train_f1 = f1_score(
                y_true=self.y_train,
                y_pred=train_prediction,
                average="weighted"
            )

            # calculating the validation f1 score
            valid_prediction = model.predict(self.X_valid)
            valid_f1 = f1_score(
                y_true=self.y_valid,
                y_pred=valid_prediction,
                average="weighted"
            )

            # logging the training and validation scores
            mlflow.log_metric("train_f1", train_f1)
            mlflow.log_metric("valid_f1", valid_f1)

            # inferring the signature of the trained model
            signature = infer_signature(
                model_input=self.X_train,
                model_output=train_prediction
            )

            # saving the trained model and its parameters
            self._log_model(model, signature)

        return valid_f1

In [8]:
# creating the mlflow experiment, or reusing it when the cell is re-run
# (creating an experiment whose name already exists raises an error)
existing_hpt_experiment = mlflow.get_experiment_by_name(HYPERPARAMETER_TUNING_EXPERIMENT_NAME)
if existing_hpt_experiment is None:
    hpt_experiment_id = mlflow.create_experiment(
        name=HYPERPARAMETER_TUNING_EXPERIMENT_NAME,
        tags={"version": "v2", "code_version": CODE_VERSION}
    )
else:
    hpt_experiment_id = existing_hpt_experiment.experiment_id

#### Decision Tree

In [None]:
dt_run_name = "decision_tree"
# names of the feature columns the decision tree is tuned on
dt_features_indexes = ["x0_str_Male", "x0_str_Frequently", "x0_int_1.1", "x0_int_1.3", "BMI", "PAL", "BMR"]

# parent run; every optuna trial is logged as a nested run inside it
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=dt_run_name):
    objective = Objective(
        run_name=dt_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=dt_features_indexes
    )

    # seeding the sampler so the sequence of suggested trials is repeatable
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-11-06 16:40:17,598] A new study created in memory with name: no-name-ad2d8604-f5a2-46df-a458-db1100a59474
2024/11/06 16:40:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run casual-wolf-623 at: http://127.0.0.1:5000/#/experiments/197479581450729743/runs/fe5c5ec69e9346418776a4abb720de9e.
2024/11/06 16:40:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/197479581450729743.
[I 2024-11-06 16:40:20,001] Trial 0 finished with value: 0.1386782580697094 and parameters: {'max_depth': 18, 'min_samples_split': 2, 'min_samples_leaf': 6, 'min_weight_fraction_leaf': 0.4, 'max_leaf_nodes': 12}. Best is trial 0 with value: 0.1386782580697094.
2024/11/06 16:40:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run enthused-sloth-789 at: http://127.0.0.1:5000/#/experiments/197479581450729743/runs/8a293f71b7a94d119341a26a532fa349.
2024/11/06 16:40:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http

#### Random Forest

In [10]:
rf_run_name = "random_forest"
# names of the feature columns the random forest is tuned on
rf_features_indexes = [
    "x0_str_Male",
    "x0_str_q2",
    "x0_str_yes",
    "x0_str_Sometimes",
    "x0_str_yes.2",
    "x0_str_infrequent_sklearn",
    "x0_str_Public_Transportation",
    "x0_int_1",
    "Height",
    "Weight",
    "FCVC",
    "NCP",
    "FAF",
    "BMI",
    "BSA",
    "diff_W_IBW",
    "BMR"
]

# parent run; every optuna trial is logged as a nested run inside it
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=rf_run_name):
    objective = Objective(
        run_name=rf_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=rf_features_indexes
    )

    # seeding the sampler so the sequence of suggested trials is repeatable
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-11-06 16:46:42,312] A new study created in memory with name: no-name-bb751741-07b0-4321-801c-4b3123fdc7ef
2024/11/06 16:46:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run upset-moth-249 at: http://127.0.0.1:5000/#/experiments/197479581450729743/runs/4709f9c08f3444e3ae517a45c7cea337.
2024/11/06 16:46:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/197479581450729743.
[I 2024-11-06 16:46:51,117] Trial 0 finished with value: 0.977467562557924 and parameters: {'n_estimators': 878, 'max_depth': 45, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.977467562557924.
2024/11/06 16:46:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run bouncy-mole-259 at: http://127.0.0.1:5000/#/experiments/197479581450729743/runs/e18d0ba414f04add9fa4fb6c526b7570.
2024/11/06 16:46:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/19747958

#### XGBoost

In [11]:
xgb_run_name = "xgboost"
# names of the feature columns xgboost is tuned on
xg_features_indexes = [
    "x0_str_Male",
    "x0_str_q2",
    "x0_str_q4",
    "x0_str_Frequently",
    "x0_str_Sometimes",
    "x0_str_no",
    "x0_str_yes.2",
    "x0_str_yes.3",
    "x0_str_infrequent_sklearn",
    "x0_str_Public_Transportation",
    "x0_str_Walking",
    "x0_str_infrequent_sklearn.1",
    "x0_int_1.1",
    "FCVC",
    "FAF",
    "TUE",
    "BMI",
    "BMR"
]

# parent run; every optuna trial is logged as a nested run inside it
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=xgb_run_name):
    objective = Objective(
        run_name=xgb_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=xg_features_indexes
    )

    # seeding the sampler so the sequence of suggested trials is repeatable
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-11-06 16:57:11,822] A new study created in memory with name: no-name-12937710-0e87-4670-a487-27cd96edaf27
2024/11/06 16:57:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run intrigued-cat-610 at: http://127.0.0.1:5000/#/experiments/197479581450729743/runs/06c435686ba444a4aef31056536b7eef.
2024/11/06 16:57:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/197479581450729743.
[I 2024-11-06 16:57:18,834] Trial 0 finished with value: 0.9777432708479324 and parameters: {'booster': 'dart', 'lambda': 0.36965381945117254, 'alpha': 0.014236766440999022}. Best is trial 0 with value: 0.9777432708479324.
2024/11/06 16:57:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run sincere-penguin-237 at: http://127.0.0.1:5000/#/experiments/197479581450729743/runs/9da7178477004784b4e70a9f4714e507.
2024/11/06 16:57:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/197

#### LightGBM

In [12]:
lg_run_name = "lightgbm"
# names of the feature columns lightgbm is tuned on
lg_features_indexes = [
    "x0_str_Male",
    "x0_str_q2",
    "x0_str_yes",
    "x0_str_Frequently",
    "x0_str_no",
    "x0_str_Sometimes.1",
    "x0_str_infrequent_sklearn",
    "x0_str_Walking",
    "x0_str_infrequent_sklearn.1",
    "NCP",
    "BMI",
    "BMR"
]

# parent run; every optuna trial is logged as a nested run inside it
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=lg_run_name):
    objective = Objective(
        run_name=lg_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=lg_features_indexes
    )

    # seeding the sampler so the sequence of suggested trials is repeatable
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-11-06 17:02:09,763] A new study created in memory with name: no-name-313973f0-b856-4ca1-aacd-a58be02a5556
2024/11/06 17:02:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run powerful-skink-616 at: http://127.0.0.1:5000/#/experiments/197479581450729743/runs/da1bb7cb71ac4cbc9eb799acece3613a.
2024/11/06 17:02:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/197479581450729743.
[I 2024-11-06 17:02:11,617] Trial 0 finished with value: 0.958221882387447 and parameters: {'lambda_l1': 4.004218223285692, 'lambda_l2': 0.00021553023687030393, 'num_leaves': 108, 'feature_fraction': 0.78322327631569, 'bagging_fraction': 0.721636988160743, 'bagging_freq': 4, 'min_child_samples': 60}. Best is trial 0 with value: 0.958221882387447.
2024/11/06 17:02:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run valuable-goose-461 at: http://127.0.0.1:5000/#/experiments/197479581450729743/runs/d1280e780d2e40c1abe64b28fdad137d.
20

#### CatBoost

In [None]:
cb_run_name = "catboost"
# Objective looks every entry up by name in FEATURES_NAME, so the column
# positions are translated into their names before being handed over
# NOTE(review): catboost is not part of the feature selection run above --
# confirm where these positions come from
cb_features_indexes = [FEATURES_NAME[position] for position in (3, 14, 18, 25, 26, 30, 35)]

# parent run; every optuna trial is logged as a nested run inside it
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=cb_run_name):
    objective = Objective(
        run_name=cb_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=cb_features_indexes
    )

    # seeding the sampler so the sequence of suggested trials is repeatable
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

In [None]:
# cleaning up only what was downloaded from S3: this is the same guard as the
# download step, so purely local data is never deleted
if content["S3"] != "YOUR_S3_BUCKET_URL":
    # removing downloaded dataset from local (skipped if it is already gone,
    # so the cell can be re-run)
    if os.path.exists(DATASET_PATH):
        os.remove(DATASET_PATH)

    # removing the local artifacts and features
    shutil.rmtree(ARTIFACTS_OUTPUT_PATH, ignore_errors=True)
    shutil.rmtree(FEATURES_OUTPUT_PATH, ignore_errors=True)

## Registering Best Models

In [24]:
# run that produced the model to register
run_id = "b61c0fdc2f4044e5b557c8d96a21e75c"
# display name of that mlflow run, kept for reference only
run_name = "angry-bird-888"
name = "lightgbm"
tags = {"version": "2.0", "type": "baseline hyperparameter", "model": name}

# the tuning runs log their model under an artifact path equal to the model
# name ("lightgbm"), not under the display name of the run, so the model name
# is what has to follow the run id in the uri
result = mlflow.register_model(
    model_uri=f"runs:/{run_id}/{name}",
    name=name,
    tags=tags,
    await_registration_for=150,
)

Registered model 'lightgbm' already exists. Creating a new version of this model...
2024/11/06 22:40:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 150 seconds for model version to finish creation. Model name: lightgbm, version 2
Created version '2' of model 'lightgbm'.
