# Setting up Mlflow

In [1]:
# Install the following librairies (it is better to create a venv (or conda) virtual environment first and install these librairies in it)
!pip install mlflow
!pip install --upgrade jinja2
!pip install --upgrade Flask
!pip install setuptools
import pandas as pd
import numpy as np



In [None]:
# starts an MLflow server locally.
!mlflow server --host 127.0.0.1 --port 8080

## Using the MLflow Client API


- Initiate a new Experiment.

- Start Runs within an Experiment.

- Document parameters, metrics, and tags for your Runs.

- Log artifacts linked to runs, such as models, tables, plots, and more.



In [3]:
from mlflow import MlflowClient
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
!pip install catboost
!pip install xgboost
!pip install lightgbm
from catboost import CatBoostClassifier
from xgboost import XGBClassifier



In [4]:
# In order to connect to the tracking server, we’ll need to use the uri that we assigned the server when we started it.

client = MlflowClient(tracking_uri="http://127.0.0.1:8080")

#it allows programmatic interaction with the MLflow tracking server.

We now have a client interface to the tracking server that can both send data to and retrieve data from the tracking server.



In [5]:
all_experiments = client.search_experiments()

print(all_experiments)


[<Experiment: artifact_location='file:///d:/Projet_MLOps/Projet_MLOPS/mlruns/546722830652984112', creation_time=1760305213585, experiment_id='546722830652984112', last_update_time=1760305213585, lifecycle_stage='active', name='Credit_Default_Model', tags={}>, <Experiment: artifact_location='mlflow-artifacts:/876832469169906264', creation_time=1760277868539, experiment_id='876832469169906264', last_update_time=1760277868539, lifecycle_stage='active', name='Credit_Default_Model', tags={'mlflow.note.content': 'Projet de prédiction de défauts de paiements pour '
                        'prêts personnels en banque de détail. Le but est de '
                        'construire un modèle qui estime la probabilité de '
                        "défaut pour chaque client, afin d'aider la banque à "
                        'anticiper les pertes et maintenir la stabilité '
                        'financière.',
 'model_type': 'classification',
 'objective': 'default probability and expected loss e

### Importation des données

In [6]:
!pip install -q gdown

In [7]:
import gdown
import pandas as pd

file_id = "1YrHUOpWLZdA88tOfus6s8CI8XUDqax3x"
url = f"https://drive.google.com/uc?id={file_id}"

filename = "Loan_Data.csv"
gdown.download(url, filename, quiet=False)


df = pd.read_csv(filename)
df.head()

Downloading...
From: https://drive.google.com/uc?id=1YrHUOpWLZdA88tOfus6s8CI8XUDqax3x
To: d:\Projet_MLOps\Projet_MLOPS\Loan_Data.csv
100%|██████████| 547k/547k [00:00<00:00, 1.71MB/s]


Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


### Exploration des données

In [8]:
df.isna().sum()

customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64

In [9]:
# Compter le nombre de lignes en double
nombre_de_doublons = df.duplicated().sum()

print(f"Nombre de lignes en double dans le DataFrame : {nombre_de_doublons}")

Nombre de lignes en double dans le DataFrame : 0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


In [11]:
df.shape

(10000, 8)

### create an experiment

In [None]:
# Description détaillée de l'expérience
experiment_description = (
    "Projet de prédiction de défauts de paiements pour prêts personnels en banque de détail. "
    "Le but est de construire un modèle qui estime la probabilité de défaut pour chaque client, "
    "afin d'aider la banque à anticiper les pertes et maintenir la stabilité financière."
)

# Tags pour faciliter la recherche et l'organisation dans MLflow
experiment_tags = {
    "project_name": "credit-default-prediction",
    "sector": "retail-banking",
    "team": "risk-analytics",
    "model_type": "classification",
    "objective": "default probability and expected loss estimation",
    "mlflow.note.content": experiment_description,
}

# Création de l'expérience dans MLflow
credit_default_experiment = client.create_experiment(
    name="Credit_Default_Model", tags=experiment_tags
)

In [13]:
# Recherche des expériences avec le tag project_name = 'credit-default-prediction'
credit_experiments = client.search_experiments(
    filter_string="tags.`project_name` = 'credit-default-prediction'"
)

# Affiche tous les attributs de la première expérience trouvée
print(vars(credit_experiments[0]))

{'_experiment_id': '876832469169906264', '_name': 'Credit_Default_Model', '_artifact_location': 'mlflow-artifacts:/876832469169906264', '_lifecycle_stage': 'active', '_tags': {'mlflow.note.content': "Projet de prédiction de défauts de paiements pour prêts personnels en banque de détail. Le but est de construire un modèle qui estime la probabilité de défaut pour chaque client, afin d'aider la banque à anticiper les pertes et maintenir la stabilité financière.", 'model_type': 'classification', 'objective': 'default probability and expected loss estimation', 'project_name': 'credit-default-prediction', 'sector': 'retail-banking', 'team': 'risk-analytics'}, '_creation_time': 1760277868539, '_last_update_time': 1760277868539}


### Logging our first runs with MLflow

In [14]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [15]:
# This function call sets the global tracking URI for the current session.
# It’s a convenient way to configure the tracking server URI without creating a separate client instance.

mlflow.set_tracking_uri("http://127.0.0.1:8080")


In [16]:
# Définit l'expérience active sur "Credit_Default_Model" et retourne ses métadonnées
credit_experiment = mlflow.set_experiment("Credit_Default_Model")

# Définit un nom pour ce run spécifique d'entraînement
run_name = "credit_rf_test1"

# Définir un chemin d'artifact pour sauvegarder le modèle
artifact_path = "rf_credit1"


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

#### model: random forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
# 1. Séparation des variables et target (on retire customer_id et la cible default)
X = df.drop(columns=["customer_id", "default"])
y = df["default"]

# 2. Split train/test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Paramètres du modèle
params = {
    "n_estimators": 50,
    "max_depth": 8,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 888,
}

# 4. Entraînement du modèle de classification
rf = RandomForestClassifier(**params)
rf.fit(X_train, y_train)

# 5. Prédiction des classes et probabilités
y_pred = rf.predict(X_val)
y_pred_proba = rf.predict_proba(X_val)[:, 1]  # Probabilité de défaut (classe 1)

# 6. Calcul des métriques classification
metrics = {
    "accuracy": accuracy_score(y_val, y_pred),
    "precision": precision_score(y_val, y_pred),
    "recall": recall_score(y_val, y_pred),
    "f1": f1_score(y_val, y_pred),
    "auc": roc_auc_score(y_val, y_pred_proba),
}

# 7. MLflow : tracking du run & logging des métriques et du modèle
with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(
        sk_model=rf, input_example=X_val, artifact_path=artifact_path
    )




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run credit_rf_test1 at: http://127.0.0.1:8080/#/experiments/546722830652984112/runs/2c79cd56c1c34b44956ae8870056a2a1
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/546722830652984112


#### model: Logistic regression

In [23]:
# Définit l'expérience active sur "Credit_Default_Model" et retourne ses métadonnées
credit_experiment = mlflow.set_experiment("Credit_Default_Model")

# Définit un nom pour ce run spécifique d'entraînement (adapté pour logistic regression)
run_name = run_name

# Définir un chemin d'artifact pour sauvegarder le modèle (nom explicite)
artifact_path = artifact_path

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import mlflow
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# 1. Définition de l'expérience dans MLflow
credit_experiment = mlflow.set_experiment("Credit_Default_Model")

# 2. Nom spécifique du run
run_name = "credit_logreg_test2"

# 3. Chemin d’enregistrement de l’artifact du modèle
artifact_path = "logreg_credit2"

# 4. Séparation des features et de la cible
X = df.drop(columns=["customer_id", "default"])
y = df["default"]

# 5. Split train/test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Standardisation des données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 7. Initialisation du modèle de régression logistique SANS pénalité
params = {
    "penalty": None,        # pas de pénalisation/régularisation
    "solver": "lbfgs",
    "max_iter": 300,
    "random_state": 888,
}

logreg = LogisticRegression(**params)
logreg.fit(X_train_scaled, y_train)

# 1. Prédiction des classes et probabilités
y_pred = logreg.predict(X_val_scaled)
y_pred_proba = logreg.predict_proba(X_val_scaled)[:, 1]

# 2. Calcul des métriques classification
metrics = {
    "accuracy": accuracy_score(y_val, y_pred),
    "precision": precision_score(y_val, y_pred),
    "recall": recall_score(y_val, y_pred),
    "f1": f1_score(y_val, y_pred),
    "auc": roc_auc_score(y_val, y_pred_proba),
}

# 3. Tracking dans MLflow (run & log)
with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(
        sk_model=logreg,
        input_example=X_val_scaled,    # Données standardisées
        artifact_path=artifact_path
    )



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

🏃 View run credit_logreg_test2 at: http://127.0.0.1:8080/#/experiments/546722830652984112/runs/cca36b5ce2da4d6c80733e476318ee08
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/546722830652984112


#### Model: CatBoost

In [25]:
# Définit l'expérience active sur "Credit_Default_Model" et retourne ses métadonnées
credit_experiment = mlflow.set_experiment("Credit_Default_Model")

# Définit un nom explicite pour ce run spécifique d'entraînement (ici : CatBoost classifier)
run_name = "credit_catboost_test1"

# Définir un chemin d'artifact pour sauvegarder le modèle CatBoost
artifact_path = "catboost_credit1"


In [27]:
# Séparation des features/cible
X = df.drop(columns=["customer_id", "default"])
y = df["default"]

# Split train/test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Paramètres du modèle CatBoost
params = {
    "iterations": 600,
    "learning_rate": 0.1,
    "depth": 8,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "random_seed": 888,
    "verbose": False
}

# Entraînement
cat_model = CatBoostClassifier(**params)
cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))

# Prédictions
y_pred = cat_model.predict(X_val)
y_pred_proba = cat_model.predict_proba(X_val)[:, 1]

# Métriques classification.
metrics = {
    "accuracy": accuracy_score(y_val, y_pred),
    "precision": precision_score(y_val, y_pred),
    "recall": recall_score(y_val, y_pred),
    "f1": f1_score(y_val, y_pred),
    "auc": roc_auc_score(y_val, y_pred_proba),
}

# MLflow tracking
credit_experiment = mlflow.set_experiment("Credit_Default_Model")
run_name = "credit_catboost_test2"
artifact_path = "catboost_credit2"

with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.catboost.log_model(
        cat_model, artifact_path=artifact_path
    )




🏃 View run credit_catboost_test2 at: http://127.0.0.1:8080/#/experiments/546722830652984112/runs/97ef307a9520488c867ca0ae9c4bd8e9
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/546722830652984112


## Model: Xgboost

In [28]:
import mlflow
import mlflow.xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

# ======================================================
# 🔹 1. Définition de l'expérience MLflow
# ======================================================
credit_experiment = mlflow.set_experiment("Credit_Default_Model")

# Nom du run.
run_name = "credit_xgboost_test2"

# Chemin pour sauvegarder le modèle.
artifact_path = "xgboost_credit2"

# ======================================================
# 🔹 2. Préparation des données
# ======================================================
X = df.drop(columns=["customer_id", "default"])
y = df["default"]

# Split train/test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ======================================================
# 🔹 3. Paramètres du modèle XGBoost
# ======================================================
params = {
    "n_estimators": 1000,
    "learning_rate": 0.1,
    "max_depth": 8,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "random_state": 888,
    "use_label_encoder": False
}

# ======================================================
# 🔹 4. Entraînement du modèle
# ======================================================
xgb_model = XGBClassifier(**params)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# ======================================================
# 🔹 5. Évaluation du modèle
# ======================================================
y_pred = xgb_model.predict(X_val)
y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]

metrics = {
    "accuracy": accuracy_score(y_val, y_pred),
    "precision": precision_score(y_val, y_pred),
    "recall": recall_score(y_val, y_pred),
    "f1": f1_score(y_val, y_pred),
    "auc": roc_auc_score(y_val, y_pred_proba)
}

# ======================================================
# 🔹 6. Tracking avec MLflow
# ======================================================
with mlflow.start_run(run_name=run_name):
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.xgboost.log_model(
        xgb_model, artifact_path=artifact_path
    )



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  self.get_booster().save_model(fname)


🏃 View run credit_xgboost_test2 at: http://127.0.0.1:8080/#/experiments/546722830652984112/runs/557e837ee0a54d3fbfef8942f0f1d043
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/546722830652984112


## Model: LightGBM

In [None]:
import mlflow
import mlflow.lightgbm
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

# ======================================================
# 🔹 1. Définition de l'expérience MLflow
# ======================================================
credit_experiment = mlflow.set_experiment("Credit_Default_Model")

run_name = "credit_lightgbm_test2"
artifact_path = "lightgbm_credit2"

# ======================================================
# 🔹 2. Préparation des données
# ======================================================
X = df.drop(columns=["customer_id", "default"])
y = df["default"]

# Split train/test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ======================================================
# 🔹 3. Paramètres du modèle LightGBM
# ======================================================
params = {
    "n_estimators": 500,
    "learning_rate": 0.1,
    "max_depth": 8,
    "objective": "binary",
    "metric": "auc",
    "random_state": 888,
    "boosting_type": "gbdt",
    "verbosity": -1
}

# ======================================================
# 🔹 4. Entraînement du modèle
# ======================================================
lgb_model = lgb.LGBMClassifier(**params)
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="auc")

# ======================================================
# 🔹 5. Évaluation du modèle
# ======================================================
y_pred = lgb_model.predict(X_val)
y_pred_proba = lgb_model.predict_proba(X_val)[:, 1]

metrics = {
    "accuracy": accuracy_score(y_val, y_pred),
    "precision": precision_score(y_val, y_pred),
    "recall": recall_score(y_val, y_pred),
    "f1": f1_score(y_val, y_pred),
    "auc": roc_auc_score(y_val, y_pred_proba)
}

# ======================================================
# 🔹 6. Tracking avec MLflow
# ======================================================
with mlflow.start_run(run_name=run_name):
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.lightgbm.log_model(
        lgb_model, artifact_path=artifact_path 
    )




🏃 View run credit_lightgbm_test2 at: http://127.0.0.1:8080/#/experiments/546722830652984112/runs/19ea8e0bb5564301b1a9ab0dcc3747d4
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/546722830652984112
