## Importing Libraries











































































































































































In [1]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from mlflow.models import infer_signature
from pprint import pprint
from typing import Union, Dict, List, Tuple
import numpy as np
import pandas as pd
import yaml
import boto3
import joblib
import os
import math
import mlflow
import optuna
import warnings
import shutil

warnings.filterwarnings("ignore")

In [2]:
def read_yaml_file(path, file):
    """Load a YAML file and return its parsed content.

    Args:
        path: Directory that contains the file.
        file: Name of the YAML file inside ``path``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.

    Raises:
        yaml.YAMLError: If the content cannot be parsed as YAML.
    """
    yaml_path = os.path.join(path, file)

    with open(yaml_path) as stream:
        # parsing errors (yaml.YAMLError) propagate unchanged to the caller
        return yaml.safe_load(stream)

# folder that holds the YAML configuration files, expressed relative to the working directory
CONFIG_PATH = os.path.join(os.pardir, "src", "config")

In [3]:
def _one_level_up(path: str) -> str:
    """Turn a path anchored at the current directory (``./x``) into one anchored at its parent (``../x``).

    Only the leading current-directory marker is rewritten, so dots that are part
    of a folder or file name (e.g. ``data.v2`` or ``file.csv``) are left untouched.
    Paths that do not start with ``./`` (or are not exactly ``.``) are returned as is.
    """
    if path == "." or path.startswith(("./", ".\\")):
        return "." + path
    return path


# reading the code version from the first line of the VERSION file
with open("VERSION", "r") as f:
    CODE_VERSION = f.readline().strip()

credentials_config = read_yaml_file(
    path=CONFIG_PATH,
    file="credentials.yaml"
)

general_settings = read_yaml_file(
    path=CONFIG_PATH,
    file="settings.yaml"
)

# using the EC2 tracking server only when its address replaced the placeholder value;
# otherwise falling back to a tracking server running on this machine
if credentials_config["EC2"] != "YOUR_EC2_INSTANCE_URL":
    mlflow.set_tracking_uri(f"http://{credentials_config['EC2']}:5000")
else:
    mlflow.set_tracking_uri("http://127.0.0.1:5000")

print(f"Tracking Server URI: '{mlflow.get_tracking_uri()}'")

SEED = 42

# NOTE(review): the settings paths are presumably written as "./<folder>" relative to
# the project root — confirm against settings.yaml
_data_path = _one_level_up(general_settings["DATA_PATH"])
ARTIFACTS_OUTPUT_PATH = _one_level_up(general_settings["ARTIFACTS_PATH"])
FEATURES_OUTPUT_PATH = _one_level_up(general_settings["FEATURES_PATH"])
RAW_FILE_PATH = os.path.join(_data_path, general_settings["RAW_FILE_NAME"])
PROCESSED_RAW_FILE = "Preprocessed_" + general_settings["RAW_FILE_NAME"]
PROCESSED_RAW_FILE_PATH = os.path.join(_data_path, PROCESSED_RAW_FILE)
FEATURE_SELECTION_EXPERIMENT_NAME = "feature-selection-experimentation"
HYPERPARAMETER_TUNING_EXPERIMENT_NAME = "hyperparameters-tuning-experimentation"

Tracking Server URI: 'http://127.0.0.1:5000'


## Loading Essentials

In [4]:
# downloading the preprocessed dataset from the aws s3 bucket
# (the whole cell is skipped while the S3 entry still holds its placeholder value)
if credentials_config["S3"] != "YOUR_S3_BUCKET_URL":
    # configuring AWS credentials
    # (exported as environment variables in addition to being passed to boto3 below;
    # presumably so the `aws` CLI calls at the end of this cell can use them)
    os.environ["AWS_ACCESS_KEY_ID"] = credentials_config["AWS_ACCESS_KEY"]
    os.environ["AWS_SECRET_ACCESS_KEY"] = credentials_config["AWS_SECRET_KEY"]

    # downloading preprocessed dataset
    s3 = boto3.client(
        "s3",
        aws_access_key_id=credentials_config["AWS_ACCESS_KEY"],
        aws_secret_access_key=credentials_config["AWS_SECRET_KEY"]
    )
    # arguments are (bucket, object key, local destination path)
    s3.download_file(
        credentials_config["S3"],
        PROCESSED_RAW_FILE,
        PROCESSED_RAW_FILE_PATH
    )

    # downloading artifacts from the aws s3 bucket
    !aws s3 cp --recursive s3://{credentials_config["S3"]}/artifacts {ARTIFACTS_OUTPUT_PATH}

    # downloading features from the aws s3 bucket
    !aws s3 cp --recursive s3://{credentials_config["S3"]}/features {FEATURES_OUTPUT_PATH}

In [5]:
# loading features
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files must
# only come from a trusted location (the project's own folders / bucket)
X_train = joblib.load(os.path.join(FEATURES_OUTPUT_PATH, "X_train.pkl"))
y_train = joblib.load(os.path.join(FEATURES_OUTPUT_PATH, "y_train.pkl"))

X_valid = joblib.load(os.path.join(FEATURES_OUTPUT_PATH, "X_valid.pkl"))
y_valid = joblib.load(os.path.join(FEATURES_OUTPUT_PATH, "y_valid.pkl"))

# loading artifacts
sc = joblib.load(os.path.join(ARTIFACTS_OUTPUT_PATH, "features_sc.pkl"))
ohe = joblib.load(os.path.join(ARTIFACTS_OUTPUT_PATH, "features_ohe.pkl"))
ohe_label = joblib.load(os.path.join(ARTIFACTS_OUTPUT_PATH, "label_ohe.pkl"))

# loading feature columns
# (nrows=0 parses only the header row, which is all that is needed for the column names)
FEATURES_NAME = pd.read_csv(PROCESSED_RAW_FILE_PATH, sep=",", nrows=0).columns.tolist()

## Feature Selection Experimentation

In [6]:
# creating the baseline models
dt = DecisionTreeClassifier(random_state=SEED)
rf = RandomForestClassifier(random_state=SEED, verbose=0)
xg = XGBClassifier(random_state=SEED)
lg = LGBMClassifier(random_state=SEED, verbose=-1, objective="multiclass")
cb = CatBoostClassifier(random_seed=SEED, verbose=0, allow_writing_files=False)

In [7]:
def apply_feature_selection(
    model: Union[DecisionTreeClassifier, RandomForestClassifier, XGBClassifier, LGBMClassifier, CatBoostClassifier],
    number_features: int,
    X_train: np.ndarray,
    y_train: np.array,
    X_valid: np.ndarray,
    y_valid: np.array,
) -> Dict:
    """Select ``number_features`` features with a sequential selector and score the model on them.

    The selector is fitted on the training data with 3-fold cross-validation, the
    given ``model`` is then (re)fitted in place on the reduced training set and
    evaluated with the weighted F1 score on both the training and validation sets.

    Args:
        model: Estimator used by the selector and fitted on the selected features.
        number_features: How many features the selector has to keep.
        X_train: Training features.
        y_train: Training targets.
        X_valid: Validation features.
        y_valid: Validation targets.

    Returns:
        Dict with the keys ``train_f1``, ``valid_f1``, ``features`` (names of the
        selected columns, looked up in ``FEATURES_NAME``), ``model`` (the fitted
        estimator) and ``model_signature`` (the inferred MLflow signature).
    """
    # fitting the sequential feature selector
    selector = SequentialFeatureSelector(
        model,
        n_features_to_select=number_features,
        cv=3
    )
    selector.fit(X=X_train, y=y_train)

    # column positions of the features that were kept
    kept_positions = np.argwhere(selector.get_support()).ravel()

    X_train_kept = selector.transform(X_train)
    X_valid_kept = selector.transform(X_valid)

    # fitting the model on the reduced feature set and predicting both splits
    model.fit(X_train_kept, y_train)
    train_pred = model.predict(X_train_kept)
    valid_pred = model.predict(X_valid_kept)

    # the signature describes the reduced input and the training predictions
    signature = infer_signature(
        model_input=X_train_kept,
        model_output=train_pred
    )

    # everything the caller wants to log in mlflow
    return {
        "train_f1": f1_score(y_true=y_train, y_pred=train_pred, average="weighted"),
        "valid_f1": f1_score(y_true=y_valid, y_pred=valid_pred, average="weighted"),
        "features": [FEATURES_NAME[position] for position in kept_positions.tolist()],
        "model": model,
        "model_signature": signature
    }

def set_configurations_mlflow(
    model: Union[DecisionTreeClassifier, RandomForestClassifier, XGBClassifier, LGBMClassifier, CatBoostClassifier],
    y_train: np.array,
    y_valid: np.array,
) -> Tuple[np.array, np.array, str, str]:
    """Derive the run name, MLflow flavor and target layout for a model.

    For decision tree, LightGBM and CatBoost models the targets are reduced with
    ``np.argmax(..., axis=1)``; for random forest and XGBoost they are returned
    unchanged. The autolog of the matching MLflow flavor is then disabled.

    Args:
        model: One of the supported estimators.
        y_train: Training targets (2-D, presumably one-hot encoded — confirm).
        y_valid: Validation targets, same layout as ``y_train``.

    Returns:
        Tuple ``(y_train, y_valid, run_name, flavor)``.

    Raises:
        TypeError: If ``model`` is not an instance of a supported estimator class.
    """
    # run name, mlflow flavor and whether the targets must be turned into class indices
    if isinstance(model, DecisionTreeClassifier):
        run_name, flavor, needs_class_indices = "decision_tree", "sklearn", True
    elif isinstance(model, RandomForestClassifier):
        run_name, flavor, needs_class_indices = "random_forest", "sklearn", False
    elif isinstance(model, XGBClassifier):
        run_name, flavor, needs_class_indices = "xgboost", "xgboost", False
    elif isinstance(model, LGBMClassifier):
        run_name, flavor, needs_class_indices = "lightgbm", "lightgbm", True
    elif isinstance(model, CatBoostClassifier):
        run_name, flavor, needs_class_indices = "catboost", "catboost", True
    else:
        # failing loudly instead of hitting an UnboundLocalError further below
        raise TypeError(f"Unsupported model type: {type(model).__name__}")

    # reshaping the target values (if needed)
    if needs_class_indices:
        y_train = np.argmax(y_train, axis=1)
        y_valid = np.argmax(y_valid, axis=1)

    # disabling the current flavor's autolog (disable=True), since the models,
    # parameters and metrics are logged explicitly by the caller
    if flavor == "sklearn":
        mlflow.sklearn.autolog(
            log_models=False,
            log_post_training_metrics=False,
            log_model_signatures=False,
            log_input_examples=True,
            log_datasets=False,
            silent=True,
            disable=True
        )
    elif flavor == "xgboost":
        mlflow.xgboost.autolog(
            log_models=False,
            log_model_signatures=False,
            log_input_examples=True,
            log_datasets=False,
            silent=True,
            disable=True
        )
    elif flavor == "lightgbm":
        mlflow.lightgbm.autolog(
            log_models=False,
            log_model_signatures=False,
            log_input_examples=True,
            log_datasets=False,
            silent=True,
            disable=True
        )
    # there is no autolog implemented for catboost, so nothing to configure there

    return y_train, y_valid, run_name, flavor

def run_feature_selection_experiment(
    models: List,
    min_features: int,
    max_features: int,
    experiment_id: str
) -> None:
    """Run the feature selection for every model and every feature count, logging to MLflow.

    Each model gets a parent run; inside it, one nested run is created per number
    of features in ``[min_features, max_features]``. Every nested run logs the
    fitted model, its parameters, the train/validation F1 scores, the dataset and
    artifact folders, and the names of the selected features.

    The training and validation data are read from the notebook-level variables
    ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.

    Args:
        models: Estimators to evaluate.
        min_features: Smallest number of features to select (inclusive).
        max_features: Largest number of features to select (inclusive).
        experiment_id: MLflow experiment that receives the runs.
    """
    known_flavors = ("sklearn", "xgboost", "lightgbm", "catboost")

    for model in models:
        # target layout, run name and flavor depend on the model family
        new_y_train, new_y_valid, run_name, flavor = set_configurations_mlflow(
            model=model,
            y_train=y_train,
            y_valid=y_valid
        )

        # parent run of the current model
        with mlflow.start_run(experiment_id=experiment_id, run_name=run_name):
            pprint(f"Starting the run for the {run_name} model!\n")

            for n_features in range(min_features, max_features + 1):
                experiment_number = n_features - min_features

                # one nested run per number of selected features
                with mlflow.start_run(
                    experiment_id=experiment_id,
                    run_name=f"{run_name}_experiment_{experiment_number}",
                    nested=True
                ):
                    results = apply_feature_selection(
                        model=model,
                        number_features=n_features,
                        X_train=X_train,
                        y_train=new_y_train,
                        X_valid=X_valid,
                        y_valid=new_y_valid
                    )
                    fitted_model = results["model"]

                    if flavor in known_flavors:
                        # logging the trained model with the flavor-specific module
                        getattr(mlflow, flavor).log_model(
                            fitted_model,
                            run_name,
                            signature=results["model_signature"]
                        )

                        # logging the model's parameters
                        if flavor == "catboost":
                            model_params = fitted_model.get_all_params()
                        elif flavor == "lightgbm":
                            model_params = fitted_model.get_params()
                        else:
                            model_params = fitted_model.get_params(deep=True)
                        mlflow.log_params(model_params)

                    # logging the training and validation scores
                    for metric_name in ("train_f1", "valid_f1"):
                        mlflow.log_metric(metric_name, results[metric_name])

                    # logging the artifacts (original dataset, encoders objects, and features)
                    for local_path in (
                        PROCESSED_RAW_FILE_PATH,
                        ARTIFACTS_OUTPUT_PATH,
                        FEATURES_OUTPUT_PATH,
                    ):
                        mlflow.log_artifact(local_path)

                    # logging the names of the best features
                    mlflow.log_param("features", results["features"])
                    

In [8]:
models = [dt, rf, xg, lg]
# trying every feature count between 20% and 50% of the available columns
min_features = math.floor(X_train.shape[1] * 0.2)
max_features = math.floor(X_train.shape[1] * 0.5)

# reusing the mlflow experiment when it already exists (create_experiment raises
# for an existing name, which would break a re-run of this cell)
_existing_experiment = mlflow.get_experiment_by_name(FEATURE_SELECTION_EXPERIMENT_NAME)
if _existing_experiment is None:
    experiment_id = mlflow.create_experiment(
        name=FEATURE_SELECTION_EXPERIMENT_NAME,
        tags={"version": "v2", "code_version": CODE_VERSION}
    )
else:
    experiment_id = _existing_experiment.experiment_id

# running the feature selection experiments
run_feature_selection_experiment(
    models=models,
    min_features=min_features,
    max_features=max_features,
    experiment_id=experiment_id
)

'Starting the run for the decision_tree model!\n'


2024/11/20 11:17:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run decision_tree_experiment_0 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/6687f89b846643f287093963cb0196e8.
2024/11/20 11:17:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616296920000275133.
2024/11/20 11:17:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run decision_tree_experiment_1 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/f2ef64eff0a84f96a8a351fc37c54832.
2024/11/20 11:17:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616296920000275133.
2024/11/20 11:18:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run decision_tree_experiment_2 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/1543c6e1b7404b53b8c0f5745cbe2b03.
2024/11/20 11:18:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0

'Starting the run for the random_forest model!\n'


2024/11/20 11:53:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest_experiment_0 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/95c281dbaba644bf8af4271f92125da2.
2024/11/20 11:53:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616296920000275133.
2024/11/20 12:20:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest_experiment_1 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/73f09cea70b5414d86dfc4cf5de04c5a.
2024/11/20 12:20:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616296920000275133.
2024/11/20 12:50:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run random_forest_experiment_2 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/a8388ec3bea74e8083af815862e2d24b.
2024/11/20 12:50:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0

'Starting the run for the xgboost model!\n'


2024/11/20 19:02:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost_experiment_0 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/87a12401dd6444428b4eba8f29a46366.
2024/11/20 19:02:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616296920000275133.
2024/11/20 19:06:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost_experiment_1 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/209c0fc9aefb4638a532acdf1949f08b.
2024/11/20 19:06:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616296920000275133.
2024/11/20 19:10:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run xgboost_experiment_2 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/5f31ed20fce84864a6a2bb5cf0a6890f.
2024/11/20 19:10:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experi

'Starting the run for the lightgbm model!\n'


2024/11/20 20:10:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run lightgbm_experiment_0 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/6d2a3ad1320d407a9ee87d9ed0f6ec87.
2024/11/20 20:10:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616296920000275133.
2024/11/20 20:15:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run lightgbm_experiment_1 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/b0135fd999f248c491f0190a78d95380.
2024/11/20 20:15:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/616296920000275133.
2024/11/20 20:21:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run lightgbm_experiment_2 at: http://127.0.0.1:5000/#/experiments/616296920000275133/runs/b5e903001f0f4c2ea1010a66d934a0d5.
2024/11/20 20:21:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/exp

## Hyperparameter Tuning

In [6]:
class Objective:
    """Optuna objective that tunes one model family and logs every trial to MLflow.

    Each call trains a model with the hyperparameters suggested by the trial,
    scores it with the weighted F1 on the training and validation sets, logs the
    metrics, the model and its parameters in a nested MLflow run, and returns the
    validation F1 (the value Optuna optimizes).

    Args:
        run_name: Model family to tune; one of ``decision_tree``, ``random_forest``,
            ``xgboost``, ``lightgbm`` or ``catboost``.
        experiment_id: MLflow experiment that receives the nested runs.
        X_train: Training features (2-D, indexed as ``X[:, columns]``).
        y_train: Training targets.
        X_valid: Validation features.
        y_valid: Validation targets.
        indexes: Features to train on, given either as names present in
            ``FEATURES_NAME`` or as integer column positions.

    Raises:
        ValueError: If ``run_name`` is not supported, or a feature name is not in
            ``FEATURES_NAME``.
    """

    _SUPPORTED_RUN_NAMES = ("decision_tree", "random_forest", "xgboost", "lightgbm", "catboost")

    def __init__(
        self,
        run_name: str,
        experiment_id: str,
        X_train: np.ndarray,
        y_train: np.array,
        X_valid: np.ndarray,
        y_valid: np.array,
        indexes: List
    ) -> None:
        # failing early instead of hitting an unbound `model` inside a trial
        if run_name not in self._SUPPORTED_RUN_NAMES:
            raise ValueError(
                f"Unsupported run_name '{run_name}'; expected one of {self._SUPPORTED_RUN_NAMES}"
            )

        self.run_name = run_name
        self.experiment_id = experiment_id
        self.X_train = X_train
        self.y_train = y_train
        self.X_valid = X_valid
        self.y_valid = y_valid
        self.indexes_name = indexes
        # integer entries are taken as column positions, anything else is looked up by name
        self.indexes = [
            int(i) if isinstance(i, (int, np.integer)) else FEATURES_NAME.index(i)
            for i in indexes
        ]

        # these model families are trained on class indices instead of the 2-D targets
        if self.run_name in ["decision_tree", "lightgbm", "catboost"]:
            self.y_train = np.argmax(self.y_train, axis=1)
            self.y_valid = np.argmax(self.y_valid, axis=1)

        # keeping only the requested feature columns
        self.X_train = self.X_train[:, self.indexes]
        self.X_valid = self.X_valid[:, self.indexes]

    def _build_model(self, trial: optuna.trial.Trial):
        """Create the estimator of ``self.run_name`` with the trial's suggested hyperparameters."""
        if self.run_name == "decision_tree":
            params = {
                "max_depth": trial.suggest_int("max_depth", 2, 32, step=2),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 8, step=1),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 6, step=1),
                "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", 0, 0.5, step=0.1),
                "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 16, step=2),
                "random_state": SEED
            }
            return DecisionTreeClassifier(**params)

        if self.run_name == "random_forest":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                "max_depth": trial.suggest_int("max_depth", 10, 50),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
                "random_state": SEED
            }
            return RandomForestClassifier(**params)

        if self.run_name == "xgboost":
            params = {
                "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
                "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
                "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
                "random_state": SEED
            }
            return XGBClassifier(**params)

        if self.run_name == "lightgbm":
            params = {
                "objective": "multiclass",
                "verbosity": -1,
                "random_state": SEED,
                "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 2, 256),
                "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
                "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            }
            return LGBMClassifier(**params)

        # remaining supported family (validated in __init__): catboost
        params = {
            "random_seed": SEED,
            "verbose": 0,
            "allow_writing_files": False,
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
            "depth": trial.suggest_int("depth", 1, 12),
            "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
            "bootstrap_type": trial.suggest_categorical(
                "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
            )
        }
        return CatBoostClassifier(**params)

    def _log_model(self, model, signature) -> None:
        """Log the fitted model with its MLflow flavor, followed by the model's parameters."""
        if self.run_name in ["decision_tree", "random_forest"]:
            # sklearn flavor
            mlflow.sklearn.log_model(
                model,
                self.run_name,
                signature=signature
            )
            mlflow.log_params(model.get_params(deep=True))
        elif self.run_name == "xgboost":
            mlflow.xgboost.log_model(
                model,
                self.run_name,
                signature=signature
            )
            mlflow.log_params(model.get_params())
        elif self.run_name == "lightgbm":
            mlflow.lightgbm.log_model(
                model,
                self.run_name,
                signature=signature
            )
            mlflow.log_params(model.get_params())
        elif self.run_name == "catboost":
            mlflow.catboost.log_model(
                model,
                self.run_name,
                signature=signature
            )
            mlflow.log_params(model.get_all_params())

    def __call__(
        self,
        trial: optuna.trial.Trial
    ) -> float:
        """Train and evaluate one trial inside a nested MLflow run.

        Args:
            trial: Optuna trial that suggests the hyperparameters.

        Returns:
            Weighted F1 score on the validation set.
        """
        with mlflow.start_run(experiment_id=self.experiment_id, nested=True):
            model = self._build_model(trial)
            model.fit(X=self.X_train, y=self.y_train)

            # calculating the training f1 score
            train_prediction = model.predict(self.X_train)
            train_f1 = f1_score(
                y_true=self.y_train,
                y_pred=train_prediction,
                average="weighted"
            )

            # calculating the validation f1 score
            valid_prediction = model.predict(self.X_valid)
            valid_f1 = f1_score(
                y_true=self.y_valid,
                y_pred=valid_prediction,
                average="weighted"
            )

            # logging the training and validation scores
            mlflow.log_metric("train_f1", train_f1)
            mlflow.log_metric("valid_f1", valid_f1)

            # inferring the signature of the trained model
            signature = infer_signature(
                model_input=self.X_train,
                model_output=train_prediction
            )

            # saving the trained model and its parameters
            self._log_model(model, signature)

        return valid_f1

In [7]:
# reusing the mlflow experiment when it already exists (create_experiment raises
# for an existing name, which would break a re-run of this cell)
_existing_hpt_experiment = mlflow.get_experiment_by_name(HYPERPARAMETER_TUNING_EXPERIMENT_NAME)
if _existing_hpt_experiment is None:
    hpt_experiment_id = mlflow.create_experiment(
        name=HYPERPARAMETER_TUNING_EXPERIMENT_NAME,
        tags={"version": "v2", "code_version": CODE_VERSION}
    )
else:
    hpt_experiment_id = _existing_hpt_experiment.experiment_id

#### Decision Tree

In [8]:
dt_run_name = "decision_tree"
# feature names the decision tree is tuned on (resolved against FEATURES_NAME by Objective)
dt_features_indexes = ['Gender_x0_Male', 'Age_x0_q2', 'Age_x0_q4', 'CAEC_x0_no', 'SCC_x0_yes', 'MTRANS_x0_Motorbike', 'Weight', 'BMI', 'BSA', 'IBW', 'diff_W_IBW']

# parent run; every trial is logged as a nested run by Objective
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=dt_run_name):
    objective = Objective(
        run_name=dt_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=dt_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters is reproducible
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-11-22 19:09:54,158] A new study created in memory with name: no-name-49f8d98d-7224-4d1b-ad08-32e21aa69b1f
2024/11/22 19:09:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run resilient-gnu-378 at: http://127.0.0.1:5000/#/experiments/671155644889420432/runs/45be4d5ccf854637b721146f40ff109c.
2024/11/22 19:09:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/671155644889420432.
[I 2024-11-22 19:09:57,360] Trial 0 finished with value: 0.18715324624236512 and parameters: {'max_depth': 24, 'min_samples_split': 3, 'min_samples_leaf': 6, 'min_weight_fraction_leaf': 0.4, 'max_leaf_nodes': 4}. Best is trial 0 with value: 0.18715324624236512.
2024/11/22 19:09:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run victorious-auk-663 at: http://127.0.0.1:5000/#/experiments/671155644889420432/runs/35ee3b984e9a4536a3a35a5feefed699.
2024/11/22 19:09:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: h

#### Random Forest

In [9]:
rf_run_name = "random_forest"
# feature names the random forest is tuned on (resolved against FEATURES_NAME by Objective)
rf_features_indexes = ['Gender_x0_Male', 'Age_x0_q4', 'family_history_with_overweight_x0_yes', 'FAVC_x0_yes', 'CAEC_x0_Frequently', 'CAEC_x0_no', 'SCC_x0_yes', 'MTRANS_x0_Public_Transportation', 'MTRANS_x0_Walking', 'HH_x0_good', 'Weight', 'FCVC', 'NCP', 'CH2O', 'TUE', 'BMI', 'IBW', 'BMR']

# parent run; every trial is logged as a nested run by Objective
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=rf_run_name):
    objective = Objective(
        run_name=rf_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=rf_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters is reproducible
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-11-22 19:19:03,438] A new study created in memory with name: no-name-19487391-6d40-4f3d-a543-809d4fb7630f
2024/11/22 19:19:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run youthful-moth-731 at: http://127.0.0.1:5000/#/experiments/671155644889420432/runs/c947ee0ecacc4b80817e777c46d01292.
2024/11/22 19:19:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/671155644889420432.
[I 2024-11-22 19:19:24,708] Trial 0 finished with value: 0.8883746094712421 and parameters: {'n_estimators': 283, 'max_depth': 36, 'min_samples_split': 30, 'min_samples_leaf': 12}. Best is trial 0 with value: 0.8883746094712421.
2024/11/22 19:20:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run nervous-slug-38 at: http://127.0.0.1:5000/#/experiments/671155644889420432/runs/071ca70b80d24975bc35c0f631771b87.
2024/11/22 19:20:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/6

#### XGBoost

In [10]:
xgb_run_name = "xgboost"
# feature names the XGBoost model is tuned on (resolved against FEATURES_NAME by Objective)
xg_features_indexes = ['Gender_x0_Male', 'Age_x0_q2', 'Age_x0_q3', 'Age_x0_q4', 'family_history_with_overweight_x0_yes', 'FAVC_x0_yes', 'CAEC_x0_no', 'SCC_x0_yes', 'CALC_x0_no', 'MTRANS_x0_Bike', 'MTRANS_x0_Walking', 'SWC_x0_1', 'IS_x0_1', 'Weight', 'FCVC', 'NCP', 'FAF', 'BMI']

# parent run; every trial is logged as a nested run by Objective
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=xgb_run_name):
    objective = Objective(
        run_name=xgb_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=xg_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters is reproducible
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-11-22 20:17:51,276] A new study created in memory with name: no-name-c1be4e4b-b25b-4906-a167-95e46f7a3636
2024/11/22 20:18:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run auspicious-tern-649 at: http://127.0.0.1:5000/#/experiments/671155644889420432/runs/8ddb9b6240be46dd9713bacb29a5e0e9.
2024/11/22 20:18:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/671155644889420432.
[I 2024-11-22 20:18:43,003] Trial 0 finished with value: 0.9035556524958557 and parameters: {'booster': 'dart', 'lambda': 0.07302538946518439, 'alpha': 8.23310152799334e-05}. Best is trial 0 with value: 0.9035556524958557.
2024/11/22 20:19:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run fortunate-snail-239 at: http://127.0.0.1:5000/#/experiments/671155644889420432/runs/0f681ee6fcc94696bfc852a7c38f825a.
2024/11/22 20:19:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/6

#### LightGBM

In [11]:
lg_run_name = "lightgbm"
# feature names the LightGBM model is tuned on (resolved against FEATURES_NAME by Objective)
lg_features_indexes = ['Gender_x0_Male', 'Age_x0_q3', 'Age_x0_q4', 'FAVC_x0_yes', 'CAEC_x0_Frequently', 'SCC_x0_yes', 'CALC_x0_no', 'EVEMM_x0_1', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'BMI', 'PAL', 'IBW']

# parent run; every trial is logged as a nested run by Objective
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=lg_run_name):
    objective = Objective(
        run_name=lg_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=lg_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters is reproducible
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

[I 2024-11-22 21:10:34,045] A new study created in memory with name: no-name-ddd88c29-eb2c-42d6-8fbc-5f62428de58c
2024/11/22 21:10:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run crawling-yak-961 at: http://127.0.0.1:5000/#/experiments/671155644889420432/runs/f5dcf06d4a4a4f42b8820b3268c258a8.
2024/11/22 21:10:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/671155644889420432.
[I 2024-11-22 21:10:36,794] Trial 0 finished with value: 0.9079476108757573 and parameters: {'lambda_l1': 4.949635896107615, 'lambda_l2': 0.041536400328527935, 'num_leaves': 144, 'feature_fraction': 0.5783552142749421, 'bagging_fraction': 0.5326933733005973, 'bagging_freq': 5, 'min_child_samples': 53}. Best is trial 0 with value: 0.9079476108757573.
2024/11/22 21:10:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run fun-deer-475 at: http://127.0.0.1:5000/#/experiments/671155644889420432/runs/1f4551cc7bb643c28f1ddbeda85b28e8.
2024/11

#### CatBoost

In [None]:
cb_run_name = "catboost"
# Objective looks features up by name in FEATURES_NAME, so the column positions
# are translated to their names first (same convention as the other model cells)
cb_features_indexes = [FEATURES_NAME[i] for i in [3, 14, 18, 25, 26, 30, 35]]

# parent run; every trial is logged as a nested run by Objective
with mlflow.start_run(experiment_id=hpt_experiment_id, run_name=cb_run_name):
    objective = Objective(
        run_name=cb_run_name,
        experiment_id=hpt_experiment_id,
        X_train=X_train,
        y_train=y_train,
        X_valid=X_valid,
        y_valid=y_valid,
        indexes=cb_features_indexes
    )

    # seeding the sampler so the sequence of suggested hyperparameters is reproducible
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=100)

In [17]:
# cleaning up only when the files were actually downloaded from the S3 bucket,
# i.e. under the same condition that guards the download cell
if credentials_config["S3"] != "YOUR_S3_BUCKET_URL":
    # removing downloaded dataset from local
    os.remove(PROCESSED_RAW_FILE_PATH)

    # removing the local artifacts and features
    shutil.rmtree(ARTIFACTS_OUTPUT_PATH)
    shutil.rmtree(FEATURES_OUTPUT_PATH)

## Registering Best Models

In [None]:
# run that holds the model to register
run_id = "100d86e2471240dbb4b2199533bf6a55"
run_name = "lightgbm_experiment_5"
name = "lightgbm"
tags = {"version": "1.0", "type": "baseline", "model": name}

# the model URI is "runs:/<run_id>/<artifact_path>"; in this notebook the models are
# logged under the model family's name (e.g. "lightgbm"), not under the nested run's
# name ("lightgbm_experiment_5"), so the artifact path is `name`
artifact_path = name

result = mlflow.register_model(
    model_uri=f"runs:/{run_id}/{artifact_path}",
    name=name,
    tags=tags,
    await_registration_for=150,
)