# Model registration and versioning with MLFlow

In [1]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score,roc_auc_score, average_precision_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split, KFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier

from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

In [2]:
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/inouyewilliam/Master-Thesis.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "inouyewilliam"
os.environ["MLFLOW_TRACKING_PASSWORD"] ="b185d44c9fe85ded477875ff2ba1b4d229006006"


mlflow.set_tracking_uri("https://dagshub.com/inouyewilliam/Master-Thesis.mlflow")

In [3]:
def evaluate_model(model, X, y, X_test, y_test):
    # Evaluate the model using cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5)
    cv_score = np.mean(cv_scores)
    
    # Get the model predictions and probabilities
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate the evaluation metrics
    roc_auc = roc_auc_score(y_test, y_proba)
    avg_precision= average_precision_score(y_test, y_proba)
    accuracy= accuracy_score(y_test, y_pred)
    precision= precision_score(y_test, y_pred)
    recall= recall_score(y_test, y_pred)
    f1= f1_score(y_test, y_pred)
    
    # Return evaluation metrics
    return cv_score,roc_auc,avg_precision,accuracy,precision,recall,f1
    

In [None]:
if __name__ == "__main__":
    
    warnings.filterwarnings("ignore")
    np.random.seed(42)
    
    # Read the preprocess csv file
    csv = (
            "C:/Users/willi/Python/Spotify_Project/Data/preprocess_data.csv"
        )
    try:
            data = pd.read_csv(csv, sep=",")
            
    except Exception as e:
            logger.exception(
                "Unable to download training & test CSV. Error: %s", e
            )

    # Split the data into training and test sets. (0.8, 0.2) split.
        
    X = data.drop("mood", axis=1)
    y = data["mood"]
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Perform Feature Selection to find the best K
    
    def select_k_best(X, y, estimator, k_values=[2, 5, 7, 10, 12, 15]):
        best_k = 0
        best_score = float('-inf')
        best_selector = None
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        for k in k_values:
            pipeline = Pipeline([
                ("scaler", StandardScaler()),
                ("selector", SelectKBest(k=k)),
                ("model", estimator)])
            scores = cross_val_score(pipeline, X, y, cv=cv)
            if scores.mean() > best_score:
                best_k = k
                best_score = scores.mean()
                best_selector = pipeline.named_steps["selector"]
                best_selector.fit(X, y)
                selected_features = X.columns[best_selector.get_support()]
                print(f"Best k: {best_k}")
                print(f"Selected features: {list(selected_features)}")
        return best_k
    
    estimator = LGBMClassifier()
    best_k = select_k_best(X_train, y_train, estimator, k_values=[2, 5, 7, 10, 12, 15])
    

    with mlflow.start_run():
        
        # Build a training Pipeline
     
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(f_classif, k= best_k)),
            ("model", LGBMClassifier())])
        
        
        # Hyperparameter Optimization
         
        param_distributions = {
        "model__max_depth": sp_randint(3, 10),
        "model__n_estimators": sp_randint(50, 200),
        "model__num_leaves": sp_randint(2, 50),
        "model__learning_rate": sp_uniform(0.001, 0.1)
        }
        
        
        random_search = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=50,
                                   cv=5, n_jobs=-1, verbose=2)
        
        random_search.fit(X_train, y_train)
        
        # Model Evaluation
        (cv_score,roc_auc,average_precision,accuracy,precision,recall,f1) = evaluate_model(random_search, X, y, X_test, y_test)

        print("cv_score: %s" % cv_score)
        print("best params: %s" % random_search.best_params_)
        print("roc_auc: %s" % roc_auc)
        print("average_precision: %s" % average_precision)
        print("accuracy: %s" % accuracy)
        print("precision: %s" % precision)
        print("recall: %s" % recall)
        print("f1 score: %s" % f1)

        
        mlflow.log_params(random_search.best_params_)
        mlflow.log_metric("mean_cv_score", cv_score)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("average_precision", average_precision)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1 score", f1)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme


        # Model registry does not work with file store
        if tracking_url_type_store != "file":

            # Register the model
            
            mlflow.sklearn.log_model(random_search, "model", registered_model_name="LgbmModel")
        else:
            mlflow.sklearn.log_model(random_search, "model")
            #cv_score: 0.6911111111111111

In [4]:
if __name__ == "__main__":
    
    warnings.filterwarnings("ignore")
    np.random.seed(42)
    
    # Read the preprocess csv file
    csv = (
            "C:/Users/willi/Python/Spotify_Project/Data/preprocess_data.csv"
        )
    try:
            data = pd.read_csv(csv, sep=",")
            
    except Exception as e:
            logger.exception(
                "Unable to download training & test CSV. Error: %s", e
            )

    # Split the data into training and test sets. (0.8, 0.2) split.
        
    X = data.drop("mood", axis=1)
    y = data["mood"]
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
    # Perform Feature Selection to find the best K
    
    def select_k_best(X, y, estimator, k_values=[2, 5, 7, 10, 12, 15]):
        best_k = 0
        best_score = float('-inf')
        best_selector = None
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        for k in k_values:
            pipeline = Pipeline([
                ("scaler", StandardScaler()),
                ("selector", SelectKBest(k=k)),
                ("model", estimator)])
            scores = cross_val_score(pipeline, X, y, cv=cv)
            if scores.mean() > best_score:
                best_k = k
                best_score = scores.mean()
                best_selector = pipeline.named_steps["selector"]
                best_selector.fit(X, y)
                selected_features = X.columns[best_selector.get_support()]
                print(f"Best k: {best_k}")
                print(f"Selected features: {list(selected_features)}")
        return best_k
    
    estimator = ExtraTreesClassifier()
    best_k = select_k_best(X_train, y_train, estimator, k_values=[2, 5, 7, 10, 12, 15])
    

    with mlflow.start_run():
        
        # Build a training Pipeline
     
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(f_classif, k= best_k)),
            ("model", ExtraTreesClassifier())])
        
        
        # Hyperparameter Optimization
         
        param_distributions = {
        "model__n_estimators": sp_randint(50, 200),
        "model__max_depth": sp_randint(3, 10),
        "model__min_samples_split": sp_randint(2, 10),
        "model__min_samples_leaf": sp_randint(1, 10),
        "model__bootstrap": [True, False],
        "model__criterion": ["gini", "entropy"]
        }
        
        
        random_search = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=50,
                                   cv=5, n_jobs=-1, verbose=2)
        
        random_search.fit(X_train, y_train)
        
        # Model Evaluation
        (cv_score,roc_auc,average_precision,accuracy,precision,recall,f1) = evaluate_model(random_search, X, y, X_test, y_test)

        print("cv_score: %s" % cv_score)
        print("best params: %s" % random_search.best_params_)
        print("roc_auc: %s" % roc_auc)
        print("average_precision: %s" % average_precision)
        print("accuracy: %s" % accuracy)
        print("precision: %s" % precision)
        print("recall: %s" % recall)
        print("f1 score: %s" % f1)

        
        mlflow.log_params(random_search.best_params_)
        mlflow.log_metric("mean_cv_score", cv_score)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("average_precision", average_precision)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1 score", f1)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme


        # Model registry does not work with file store
        if tracking_url_type_store != "file":

            # Register the model
            
            mlflow.sklearn.log_model(random_search, "model", registered_model_name="ExtraTreeModel")
        else:
            mlflow.sklearn.log_model(random_search, "model")
            #cv_score: 0.7033333333333334

Best k: 2
Selected features: ['danceability', 'energy']
Best k: 5
Selected features: ['danceability', 'energy', 'loudness', 'duration_ms', 'time_signature']
Best k: 10
Selected features: ['popularity', 'genres', 'explicit', 'danceability', 'energy', 'loudness', 'mode', 'tempo', 'duration_ms', 'time_signature']
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
cv_score: 0.7050000000000001
best params: {'model__bootstrap': True, 'model__criterion': 'gini', 'model__max_depth': 8, 'model__min_samples_leaf': 5, 'model__min_samples_split': 8, 'model__n_estimators': 60}
roc_auc: 0.7775030138171927
average_precision: 0.7737945715989483
accuracy: 0.6972222222222222
precision: 0.

Registered model 'ExtraTreeModel' already exists. Creating a new version of this model...
2023/05/09 17:13:37 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: ExtraTreeModel, version 8
Created version '8' of model 'ExtraTreeModel'.


In [5]:
if __name__ == "__main__":
    
    warnings.filterwarnings("ignore")
    np.random.seed(42)
    # Read the preprocess csv file
    csv = (
            "C:/Users/willi/Python/Spotify_Project/Data/preprocess_data.csv"
        )
    try:
            data = pd.read_csv(csv, sep=",")
            
    except Exception as e:
            logger.exception(
                "Unable to download training & test CSV. Error: %s", e
            )

    # Split the data into training and test sets. (0.8, 0.2) split.
        
    X = data.drop("mood", axis=1)
    y = data["mood"]
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

       
    # Perform Feature Selection to find the best K
    
    def select_k_best(X, y, estimator, k_values=[2, 5, 7, 10, 12, 15]):
        best_k = 0
        best_score = float('-inf')
        best_selector = None
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        for k in k_values:
            pipeline = Pipeline([
                ("scaler", StandardScaler()),
                ("selector", SelectKBest(k=k)),
                ("model", estimator)])
            scores = cross_val_score(pipeline, X, y, cv=cv)
            if scores.mean() > best_score:
                best_k = k
                best_score = scores.mean()
                best_selector = pipeline.named_steps["selector"]
                best_selector.fit(X, y)
                selected_features = X.columns[best_selector.get_support()]
                print(f"Best k: {best_k}")
                print(f"Selected features: {list(selected_features)}")
        return best_k
    
    estimator = XGBClassifier()
    best_k = select_k_best(X_train, y_train, estimator, k_values=[2, 5, 7, 10, 12, 15])
    

    with mlflow.start_run():
        
        # Build a training Pipeline
     
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(f_classif, k= best_k)),
            ("model", XGBClassifier())])
        
        
        # Hyperparameter Optimization
         
        param_distributions = {
        "model__max_depth": sp_randint(3, 10),
        "model__n_estimators": sp_randint(50, 200),
        "model__learning_rate": sp_uniform(0.001, 0.1),
        "model__subsample": sp_uniform(0.5, 0.5),
        "model__colsample_bytree": sp_uniform(0.5, 0.5),
        "model__reg_lambda": sp_uniform(0.1, 1)
        }
        
        
        random_search = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=50,
                                   cv=5, n_jobs=-1, verbose=2)
        
        random_search.fit(X_train, y_train)
        
        # Model Evaluation
        (cv_score,roc_auc,average_precision,accuracy,precision,recall,f1) = evaluate_model(random_search, X, y, X_test, y_test)

        print("cv_score: %s" % cv_score)
        print("best params: %s" % random_search.best_params_)
        print("roc_auc: %s" % roc_auc)
        print("average_precision: %s" % average_precision)
        print("accuracy: %s" % accuracy)
        print("precision: %s" % precision)
        print("recall: %s" % recall)
        print("f1 score: %s" % f1)

        
        mlflow.log_params(random_search.best_params_)
        mlflow.log_metric("mean_cv_score", cv_score)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("average_precision", average_precision)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1 score", f1)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme


        # Model registry does not work with file store
        if tracking_url_type_store != "file":

            # Register the model
            
            mlflow.sklearn.log_model(random_search, "model", registered_model_name="XGBModel")
        else:
            mlflow.sklearn.log_model(random_search, "model")
            #cv_score: 0.6994444444444445

Best k: 2
Selected features: ['danceability', 'energy']
Best k: 5
Selected features: ['danceability', 'energy', 'loudness', 'duration_ms', 'time_signature']
Best k: 7
Selected features: ['explicit', 'danceability', 'energy', 'loudness', 'tempo', 'duration_ms', 'time_signature']
Best k: 10
Selected features: ['popularity', 'genres', 'explicit', 'danceability', 'energy', 'loudness', 'mode', 'tempo', 'duration_ms', 'time_signature']




Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
cv_score: 0.6994444444444445
best params: {'model__colsample_bytree': 0.9678174971104738, 'model__learning_rate': 0.07953406511139437, 'model__max_depth': 6, 'model__n_estimators': 161, 'model__reg_lambda': 0.2014715428660321, 'model__subsample': 0.8317508845540279}
roc_auc: 0.7941949244227382
average_precision: 0.7870229633665965
accuracy: 0.7083333333333334
precision: 0.7091836734693877
recall: 0.7433155080213903
f1 score: 0.7258485639686684


Registered model 'XGBModel' already exists. Creating a new version of this model...
2023/05/09 17:30:19 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: XGBModel, version 5
Created version '5' of model 'XGBModel'.


In [None]:
if __name__ == "__main__":
    
    warnings.filterwarnings("ignore")
    np.random.seed(42)
    
    # Read the preprocess csv file
    csv = (
            "C:/Users/willi/Python/Spotify_Project/Data/preprocess_data.csv"
        )
    try:
            data = pd.read_csv(csv, sep=",")
            
    except Exception as e:
            logger.exception(
                "Unable to download training & test CSV. Error: %s", e
            )

    # Split the data into training and test sets. (0.8, 0.2) split.
        
    X = data.drop("mood", axis=1)
    y = data["mood"]
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Perform Feature Selection to find the best K
    
    def select_k_best(X, y, estimator, k_values=[2, 5, 7, 10, 12, 15]):
        best_k = 0
        best_score = float('-inf')
        best_selector = None
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        for k in k_values:
            pipeline = Pipeline([
                ("scaler", StandardScaler()),
                ("selector", SelectKBest(k=k)),
                ("model", estimator)])
            scores = cross_val_score(pipeline, X, y, cv=cv)
            if scores.mean() > best_score:
                best_k = k
                best_score = scores.mean()
                best_selector = pipeline.named_steps["selector"]
                best_selector.fit(X, y)
                selected_features = X.columns[best_selector.get_support()]
                print(f"Best k: {best_k}")
                print(f"Selected features: {list(selected_features)}")
        return best_k
    
    estimator = RandomForestClassifier()
    best_k = select_k_best(X_train, y_train, estimator, k_values=[2, 5, 7, 10, 12, 15])
    

    with mlflow.start_run():
        
        # Build a training Pipeline
     
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(f_classif, k= best_k)),
            ("model", RandomForestClassifier())])
        
        
        # Hyperparameter Optimization
         
        param_distributions = {
        "model__n_estimators": sp_randint(50, 200),
        "model__max_depth": sp_randint(3, 10),
        "model__min_samples_split": sp_randint(2, 20),
        "model__min_samples_leaf": sp_randint(1, 10),
        }

        
        
        random_search = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=50,
                                   cv=5, n_jobs=-1, verbose=2)
        
        random_search.fit(X_train, y_train)
        
        # Model Evaluation
        (cv_score,roc_auc,average_precision,accuracy,precision,recall,f1) = evaluate_model(random_search, X, y, X_test, y_test)

        print("cv_score: %s" % cv_score)
        print("best params: %s" % random_search.best_params_)
        print("roc_auc: %s" % roc_auc)
        print("average_precision: %s" % average_precision)
        print("accuracy: %s" % accuracy)
        print("precision: %s" % precision)
        print("recall: %s" % recall)
        print("f1 score: %s" % f1)

        
        mlflow.log_params(random_search.best_params_)
        mlflow.log_metric("mean_cv_score", cv_score)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("average_precision", average_precision)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1 score", f1)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme


        # Model registry does not work with file store
        if tracking_url_type_store != "file":

            # Register the model
            
            mlflow.sklearn.log_model(random_search, "model", registered_model_name="RandomForestModel")
        else:
            mlflow.sklearn.log_model(random_search, "model")
            #cv_score: 0.6955555555555556

In [None]:
if __name__ == "__main__":
    
    warnings.filterwarnings("ignore")
    np.random.seed(42)
    
    # Read the preprocess csv file
    csv = (
            "C:/Users/willi/Python/Spotify_Project/Data/preprocess_data.csv"
        )
    try:
            data = pd.read_csv(csv, sep=",")
            
    except Exception as e:
            logger.exception(
                "Unable to download training & test CSV. Error: %s", e
            )

    # Split the data into training and test sets. (0.8, 0.2) split.
        
    X = data.drop("mood", axis=1)
    y = data["mood"]
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Perform Feature Selection to find the best K
    
    def select_k_best(X, y, estimator, k_values=[2, 5, 7, 10, 12, 15]):
        best_k = 0
        best_score = float('-inf')
        best_selector = None
        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        for k in k_values:
            pipeline = Pipeline([
                ("scaler", StandardScaler()),
                ("selector", SelectKBest(k=k)),
                ("model", estimator)])
            scores = cross_val_score(pipeline, X, y, cv=cv)
            if scores.mean() > best_score:
                best_k = k
                best_score = scores.mean()
                best_selector = pipeline.named_steps["selector"]
                best_selector.fit(X, y)
                selected_features = X.columns[best_selector.get_support()]
                print(f"Best k: {best_k}")
                print(f"Selected features: {list(selected_features)}")
        return best_k
    
    estimator = GradientBoostingClassifier()
    best_k = select_k_best(X_train, y_train, estimator, k_values=[2, 5, 7, 10, 12, 15])
    

    with mlflow.start_run():
        
        # Build a training Pipeline
     
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(f_classif, k = best_k)),
            ("model", GradientBoostingClassifier())])
        
        
        # Hyperparameter Optimization
         
        param_distributions = {
        "model__n_estimators": sp_randint(50, 200),
        "model__max_depth": sp_randint(3, 10),
        "model__min_samples_split": sp_randint(2, 20),
        "model__min_samples_leaf": sp_randint(1, 10),
        "model__learning_rate": sp_uniform(0.001, 0.1)
        }

        
        
        random_search = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=50,
                                   cv=5, n_jobs=-1, verbose=2)
        
        random_search.fit(X_train, y_train)
        
        # Model Evaluation
        (cv_score,roc_auc,average_precision,accuracy,precision,recall,f1) = evaluate_model(random_search, X, y, X_test, y_test)

        print("cv_score: %s" % cv_score)
        print("best params: %s" % random_search.best_params_)
        print("roc_auc: %s" % roc_auc)
        print("average_precision: %s" % average_precision)
        print("accuracy: %s" % accuracy)
        print("precision: %s" % precision)
        print("recall: %s" % recall)
        print("f1 score: %s" % f1)

        
        mlflow.log_params(random_search.best_params_)
        mlflow.log_metric("mean_cv_score", cv_score)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("average_precision", average_precision)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1 score", f1)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme


        # Model registry does not work with file store
        if tracking_url_type_store != "file":

            # Register the model
            
            mlflow.sklearn.log_model(random_search, "model", registered_model_name="GradientBoostingModel")
        else:
            mlflow.sklearn.log_model(random_search, "model")
            #cv_score: 0.6955555555555556