#Imports

In [16]:
# mlflow is used to track experiments, log and serve machine learning models
import mlflow

# mlflow.sklearn allows direct integration between scikit-learn models and MLflow (for logging and saving models)
import mlflow.sklearn

# matplotlib.pyplot is used to create basic plots and visualizations (like ROC curves or confusion matrices)
import matplotlib.pyplot as plt

# GaussianNB is the Naive Bayes classifier for continuous variables based on a Gaussian distribution
from sklearn.naive_bayes import GaussianNB

# Evaluation metrics for classification models (e.g., accuracy, recall, f1-score, etc.)
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score,
    roc_auc_score, log_loss, confusion_matrix, roc_curve
)

# Import the train_test_split function from scikit-learn
from sklearn.model_selection import train_test_split

# Usage example only (kept commented out): the real split is performed later, in the
# "Splitting the data" section, once `predictors` and `target` have been defined.
# X_train, X_test, y_train, y_test = train_test_split(predictors, target,
#                                         test_size=0.3, random_state=123)  # 70% train / 30% test; the fixed seed "random_state=123" ensures experiment reproducibility
# seaborn is a visualization library that makes creating more aesthetically pleasing statistical plots easier (like heatmaps)
import seaborn as sns

# pandas is used for reading, manipulating, and analyzing data in table format (DataFrames)
import pandas as pd

# numpy is used for efficient array manipulation and numerical operations
import numpy as np

#These packages cover the entire pipeline: modeling, evaluation, visualization, and model registration with MLflow.

#Loading the dataset

In [17]:
# Read the credit dataset; the relative path means 'Credit.csv' must be in the working directory
credit = pd.read_csv('Credit.csv')
# Last expression of the cell: displays the (rows, columns) tuple
credit.shape

(1000, 21)

In [18]:
credit.head(5)  # Preview the first 5 rows of the raw dataset (categorical columns are still text here)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,'critical/other existing credit',radio/tv,1169,'no known savings',>=7,4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,'existing paid',radio/tv,5951,<100,1<=X<4,2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,<100,4<=X<7,2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,<0,42,'existing paid',furniture/equipment,7882,<100,4<=X<7,2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,<0,24,'delayed previously','new car',4870,<100,1<=X<4,3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


#Converting string variables to numerical variables

In [19]:
# Encode every text column of the DataFrame as integer category codes
for col in credit.columns:  # 'col' takes each column name in turn

    # Detect text columns with the dtype helper instead of `dtype == 'object'`:
    # it matches the classic object dtype as well as pandas' dedicated string dtype
    # (the default for text columns from pandas 3.0), where the equality check would
    # silently skip every column. Numeric and categorical columns are left untouched.
    if pd.api.types.is_string_dtype(credit[col].dtype):

        # .astype('category') -> converts the text into categories (sorted labels)
        # .cat.codes -> replaces each value by the integer position of its category
        # NOTE: this overwrites the column in `credit`; the original strings are only
        # recoverable by re-running the cell that loads the CSV.
        credit[col] = credit[col].astype('category').cat.codes

In [20]:
credit.head()  # Same preview after encoding: the former text columns now hold integer codes

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,2,6,1,7,1169,0,3,4,3,2,...,2,67,1,1,2,3,1,1,1,1
1,1,48,3,7,5951,3,0,2,0,2,...,2,22,1,1,1,3,1,0,1,0
2,0,12,1,4,2096,3,1,2,3,2,...,2,49,1,1,1,2,2,0,1,1
3,2,42,3,5,7882,3,1,2,3,1,...,0,45,1,0,1,3,2,0,1,1
4,2,24,2,1,4870,3,0,3,3,2,...,1,53,1,0,2,3,2,0,1,0


#Setting the predictors and the target variable

In [21]:
# Features: every row of the first 20 columns (positions 0-19; the stop index 20 is exclusive),
# exported as a NumPy array. These are the independent variables the model predicts from.
predictors = credit.iloc[:, :20].to_numpy()

# Label: every row of the column at position 20 (the 21st column, 'class'),
# also exported as a NumPy array. This is the dependent variable the model must learn to predict.
target = credit.iloc[:, 20].to_numpy()

In [22]:
predictors  # Display the feature matrix as a NumPy array (large arrays are printed in truncated form)

array([[ 2,  6,  1, ...,  1,  1,  1],
       [ 1, 48,  3, ...,  1,  0,  1],
       [ 0, 12,  1, ...,  2,  0,  1],
       ...,
       [ 0, 12,  3, ...,  1,  0,  1],
       [ 2, 45,  3, ...,  1,  1,  1],
       [ 1, 45,  1, ...,  1,  0,  1]])

#Splitting the data into train and test

In [23]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed seed (random_state=123) makes the shuffle, and so the experiment, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=123,
)

#Implementing a Naive Bayes experiment with hyperparameters

In [24]:
mlflow.set_experiment("nb_experiment")  # Select the experiment (by name) that groups the runs below

# var_smoothing values to compare; one MLflow run is recorded per value
smoothing_values = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]

for smoothing in smoothing_values:
    # The context manager opens a new run and closes it when the block exits,
    # so no explicit mlflow.end_run() is needed after it.
    with mlflow.start_run() as run:
        # Training
        naive_bayes = GaussianNB(var_smoothing=smoothing)
        naive_bayes.fit(X_train, y_train)
        predictions = naive_bayes.predict(X_test)        # hard class labels
        probs = naive_bayes.predict_proba(X_test)[:, 1]  # probability of class 1 (second column)

        # Metrics computed from the hard class labels
        accuracy = accuracy_score(y_test, predictions)
        recall = recall_score(y_test, predictions)
        precision = precision_score(y_test, predictions)
        f1 = f1_score(y_test, predictions)

        # Metrics that need probability estimates: feeding them hard 0/1 labels
        # collapses the ROC curve to a single operating point and turns log loss into
        # a clipped penalty per misclassification, and the AUC shown in the ROC legend
        # would not match the curve drawn from `probs`.
        auc = roc_auc_score(y_test, probs)
        logloss = log_loss(y_test, probs)

        # Logging hyperparameter and metrics (metric keys kept unchanged so runs stay comparable)
        mlflow.log_param("var_smoothing", smoothing)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("f1", f1)
        mlflow.log_metric("auc", auc)
        mlflow.log_metric("log", logloss)

        # Confusion matrix: saved as a PNG in the working directory, then attached to the run
        cm = confusion_matrix(y_test, predictions)
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f"Confusion Matrix - smoothing={smoothing}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        conf_file = f"confusion_{smoothing}.png"
        fig.savefig(conf_file)
        mlflow.log_artifact(conf_file)
        plt.close(fig)  # release the figure; one is created per run

        # ROC curve, drawn from the same probabilities used for the AUC in its legend
        fpr, tpr, _ = roc_curve(y_test, probs)
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
        ax.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance-level diagonal
        ax.set_xlabel("FPR (False Positives)")
        ax.set_ylabel("TPR (True Positives)")
        ax.set_title(f"ROC Curve - smoothing={smoothing}")
        ax.legend()
        roc_file = f"roc_{smoothing}.png"
        fig.savefig(roc_file)
        mlflow.log_artifact(roc_file)
        plt.close(fig)

        # Logging the model
        mlflow.sklearn.log_model(naive_bayes, name="NB_Model")

        # Print the model execution ID
        print(f"Run finished with var_smoothing={smoothing}. ID:", run.info.run_id)

2025/09/08 16:20:37 INFO mlflow.tracking.fluent: Experiment with name 'nb_experiment' does not exist. Creating a new experiment.


Run finished with var_smoothing=1e-09. ID: 7298e21a4efb48a3be06626704e1eec3




Run finished with var_smoothing=1e-08. ID: 4475c5bbc23f4fbdb164fbbd000db059




Run finished with var_smoothing=1e-07. ID: 5ddf70403a0f4116a7774aa9772dae7e




Run finished with var_smoothing=1e-06. ID: 07531377c9f6430a895b3aee7bed5e92




Run finished with var_smoothing=1e-05. ID: d303e49388124926a54178dbe9f5451e


In [25]:
#To view the newly trained models in MLflow, go to the anaconda prompt and type:

#(base) C:\Users\Your_User>mlflow ui

#After this, you will see the following result:
# INFO:waitress:Serving on http://127.0.0.1:5000

#From this, open the http link (Ctrl + click it in the terminal, or paste it into your browser) to access the MLflow page with the experiments already run and versioned.